/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <unistd.h>

#include "af_unix.h"
#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "conf.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "mount_utils.h"
#include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 */
static int append_null_to_list(void ***list)
{
        int newentry = 0;

        if (*list)
                for (; (*list)[newentry]; newentry++)
                        ;

        *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
        (*list)[newentry + 1] = NULL;
        return newentry;
}

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
        if (!list)
                return false;

        for (int i = 0; list[i]; i++)
                if (strequal(list[i], entry))
                        return true;

        return false;
}

/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
        size_t len;
        char *prefixed;

        len = strlen(entry);
        prefixed = must_realloc(NULL, len + 6);

        memcpy(prefixed, "name=", STRLITERALLEN("name="));
        memcpy(prefixed + STRLITERALLEN("name="), entry, len);
        prefixed[len + 5] = '\0';

        return prefixed;
}

/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
                                   char *entry)
{
        int newentry;
        char *copy;

        if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
                ERROR("Refusing to use ambiguous controller \"%s\"", entry);
                ERROR("It is both a named and kernel subsystem");
                return;
        }

        newentry = append_null_to_list((void ***)clist);

        if (strnequal(entry, "name=", 5))
                copy = must_copy_string(entry);
        else if (string_in_list(klist, entry))
                copy = must_copy_string(entry);
        else
                copy = cg_legacy_must_prefix_named(entry);

        (*clist)[newentry] = copy;
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
        if (!ops->hierarchies)
                return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

        for (int i = 0; ops->hierarchies[i]; i++) {
                if (!controller) {
                        /* This is the empty unified hierarchy. */
                        if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
                                return ops->hierarchies[i];

                        continue;
                }

                /*
                 * Handle controllers with significant implementation changes
                 * from cgroup to cgroup2.
                 */
                if (pure_unified_layout(ops)) {
                        if (strequal(controller, "devices")) {
                                if (ops->unified->bpf_device_controller)
                                        return ops->unified;

                                break;
                        } else if (strequal(controller, "freezer")) {
                                if (ops->unified->freezer_controller)
                                        return ops->unified;

                                break;
                        }
                }

                if (string_in_list(ops->hierarchies[i]->controllers, controller))
                        return ops->hierarchies[i];
        }

        if (controller)
                WARN("There is no useable %s controller", controller);
        else
                WARN("There is no empty unified cgroup hierarchy");

        return ret_set_errno(NULL, ENOENT);
}

/* Taken over and modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
        bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
        bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
        return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}
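
/*
 * Illustrative example (not part of the original source): with NBITS == 32,
 * set_bit(33, bitarr) sets bit 1 of bitarr[1] (33 / 32 == 1, 33 % 32 == 1),
 * i.e. bitarr[1] |= 0x2, and is_set(33, bitarr) then returns true.
 */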

/* Create cpumask from cpulist aka turn:
 *
 *      0,2-3
 *
 * into bit array
 *
 *      1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
        __do_free uint32_t *bitarr = NULL;
        char *token;
        size_t arrlen;

        arrlen = BITS_TO_LONGS(nbits);
        bitarr = calloc(arrlen, sizeof(uint32_t));
        if (!bitarr)
                return ret_set_errno(NULL, ENOMEM);

        lxc_iterate_parts(token, buf, ",") {
                errno = 0;
                unsigned end, start;
                char *range;

                start = strtoul(token, NULL, 0);
                end = start;
                range = strchr(token, '-');
                if (range)
                        end = strtoul(range + 1, NULL, 0);

                if (!(start <= end))
                        return ret_set_errno(NULL, EINVAL);

                if (end >= nbits)
                        return ret_set_errno(NULL, EINVAL);

                while (start <= end)
                        set_bit(start++, bitarr);
        }

        return move_ptr(bitarr);
}
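
/*
 * Illustrative example (not part of the original source): for the cpulist
 * "0,2-3" with nbits == 4, lxc_cpumask() sets bits 0, 2 and 3, i.e.
 * bitarr[0] == 0xD. Feeding that mask through lxc_cpumask_to_cpulist()
 * below yields the expanded list "0,2,3".
 */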

/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
        __do_free_string_list char **cpulist = NULL;
        char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
        int ret;

        for (size_t i = 0; i <= nbits; i++) {
                if (!is_set(i, bitarr))
                        continue;

                ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
                if (ret < 0)
                        return NULL;

                ret = lxc_append_string(&cpulist, numstr);
                if (ret < 0)
                        return ret_set_errno(NULL, ENOMEM);
        }

        if (!cpulist)
                return ret_set_errno(NULL, ENOMEM);

        return lxc_string_join(",", (const char **)cpulist, false);
}

static ssize_t get_max_cpus(char *cpulist)
{
        char *c1, *c2;
        char *maxcpus = cpulist;
        size_t cpus = 0;

        c1 = strrchr(maxcpus, ',');
        if (c1)
                c1++;

        c2 = strrchr(maxcpus, '-');
        if (c2)
                c2++;

        if (!c1 && !c2)
                c1 = maxcpus;
        else if (c1 > c2)
                c2 = c1;
        else if (c1 < c2)
                c1 = c2;
        else if (!c1 && c2)
                c1 = c2;

        errno = 0;
        cpus = strtoul(c1, NULL, 0);
        if (errno != 0)
                return -1;

        return cpus;
}
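
/*
 * Illustrative example (not part of the original source): for the cpulist
 * "0-3,8-11", c1 ends up pointing at "8-11" (after the last ',') and c2 at
 * "11" (after the last '-'); c2 points further into the string, so c1 is
 * advanced to c2 and strtoul() parses "11", i.e. get_max_cpus() returns the
 * highest cpu index in the list.
 */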

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
        return h->version == CGROUP2_SUPER_MAGIC;
}

/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
        if (!l1 || !l2)
                return false;

        for (int i = 0; l1[i]; i++)
                if (string_in_list(l2, l1[i]))
                        return true;

        return false;
}

/* For a null-terminated list of controllers @clist, return true if any of those
 * controllers is already listed in the null-terminated list of hierarchies
 * @hlist. Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
        if (!hlist)
                return false;

        for (int i = 0; hlist[i]; i++)
                if (controller_lists_intersect(hlist[i]->controllers, clist))
                        return true;

        return false;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_found(struct hierarchy **hlist, char *entry)
{
        if (!hlist)
                return false;

        for (int i = 0; hlist[i]; i++)
                if (string_in_list(hlist[i]->controllers, entry))
                        return true;

        return false;
}

/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
        struct hierarchy **hlist;

        if (!ops->cgroup_use)
                return true;

        hlist = ops->hierarchies;
        for (char **cur = ops->cgroup_use; cur && *cur; cur++)
                if (!controller_found(hlist, *cur))
                        return log_error(false, "No %s controller mountpoint found", *cur);

        return true;
}
/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list.
 */
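/*
 * Illustrative mountinfo line for a legacy hierarchy (field values are made
 * up for the example):
 *
 *      32 25 0:28 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid - cgroup cgroup rw,cpu,cpuacct
 *
 * Skipping four spaces lands on the mountpoint field, from which the
 * controller list "cpu,cpuacct" is derived.
 */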
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
                                        int type)
{
        /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
         * for legacy hierarchies.
         */
        __do_free_string_list char **aret = NULL;
        int i;
        char *p2, *tok;
        char *p = line, *sep = ",";

        for (i = 0; i < 4; i++) {
                p = strchr(p, ' ');
                if (!p)
                        return NULL;
                p++;
        }

        /* Note, if we change how mountinfo works, then our caller will need to
         * verify /sys/fs/cgroup/ in this field.
         */
        if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
                return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

        p += 15;
        p2 = strchr(p, ' ');
        if (!p2)
                return log_error(NULL, "Corrupt mountinfo");
        *p2 = '\0';

        if (type == CGROUP_SUPER_MAGIC) {
                __do_free char *dup = NULL;

                /* strdup() here for v1 hierarchies. Otherwise
                 * lxc_iterate_parts() will destroy mountpoints such as
                 * "/sys/fs/cgroup/cpu,cpuacct".
                 */
                dup = must_copy_string(p);
                if (!dup)
                        return NULL;

                lxc_iterate_parts(tok, dup, sep)
                        must_append_controller(klist, nlist, &aret, tok);
        }
        *p2 = ' ';

        return move_ptr(aret);
}

static char **cg_unified_make_empty_controller(void)
{
        __do_free_string_list char **aret = NULL;
        int newentry;

        newentry = append_null_to_list((void ***)&aret);
        aret[newentry] = NULL;
        return move_ptr(aret);
}

static char **cg_unified_get_controllers(int dfd, const char *file)
{
        __do_free char *buf = NULL;
        __do_free_string_list char **aret = NULL;
        char *sep = " \t\n";
        char *tok;

        buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
        if (!buf)
                return NULL;

        lxc_iterate_parts(tok, buf, sep) {
                int newentry;
                char *copy;

                newentry = append_null_to_list((void ***)&aret);
                copy = must_copy_string(tok);
                aret[newentry] = copy;
        }

        return move_ptr(aret);
}
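
/*
 * Illustrative example (not part of the original source): if
 * cgroup.controllers contains "cpuset cpu io memory pids",
 * cg_unified_get_controllers() returns the NULL-terminated list
 * {"cpuset", "cpu", "io", "memory", "pids", NULL}.
 */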

static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
                                         char **controllers)
{
        if (!ops->cgroup_use)
                return true;

        for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
                bool found = false;

                for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
                        if (!strequal(*cur_use, *cur_ctrl))
                                continue;

                        found = true;
                        break;
                }

                if (found)
                        continue;

                return false;
        }

        return true;
}

static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
                         char *container_base_path, int type)
{
        __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
        __do_free struct hierarchy *new = NULL;
        __do_free_string_list char **controllers = clist;
        int idx;

        if (abspath(container_base_path))
                return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");

        if (!controllers && type != CGROUP2_SUPER_MAGIC)
                return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");

        dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
                          PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
        if (dfd_mnt < 0)
                return syserrno(-errno, "Failed to open %s", mountpoint);

        if (!is_empty_string(container_base_path)) {
                dfd_base = open_at(dfd_mnt, container_base_path,
                                   PROTECT_OPATH_DIRECTORY,
                                   PROTECT_LOOKUP_BENEATH_XDEV, 0);
                if (dfd_base < 0)
                        return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
        }

        if (!controllers) {
                /*
                 * We assume that the cgroup we're currently in has been
                 * delegated to us and we are free to delegate all of the
                 * controllers listed in cgroup.controllers further down the
                 * hierarchy.
                 */
                if (dfd_base < 0)
                        controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
                else
                        controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
                if (!controllers)
                        controllers = cg_unified_make_empty_controller();
                if (!controllers[0])
                        TRACE("No controllers are enabled for delegation");
        }

        /* Exclude all controllers that cgroup use does not want. */
        if (!cgroup_use_wants_controllers(ops, controllers))
                return log_trace(0, "Skipping cgroup hierarchy with non-requested controllers");

        new = zalloc(sizeof(*new));
        if (!new)
                return ret_errno(ENOMEM);

        new->version = type;
        new->controllers = move_ptr(controllers);
        new->mountpoint = mountpoint;
        new->container_base_path = container_base_path;
        new->cgfd_con = -EBADF;
        new->cgfd_limit = -EBADF;
        new->cgfd_mon = -EBADF;

        TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
              mountpoint, container_base_path);
        for (char *const *it = new->controllers; it && *it; it++)
                TRACE("The detected hierarchy contains the %s controller", *it);

        idx = append_null_to_list((void ***)&ops->hierarchies);
        if (dfd_base < 0)
                new->dfd_base = dfd_mnt;
        else
                new->dfd_base = move_fd(dfd_base);
        new->dfd_mnt = move_fd(dfd_mnt);
        if (type == CGROUP2_SUPER_MAGIC)
                ops->unified = new;
        (ops->hierarchies)[idx] = move_ptr(new);
        return 0;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
        char *p = line, *sret = NULL;
        size_t len;
        char *p2;

        for (int i = 0; i < 4; i++) {
                p = strchr(p, ' ');
                if (!p)
                        return NULL;
                p++;
        }

        if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
                return NULL;

        p2 = strchr(p + 15, ' ');
        if (!p2)
                return NULL;
        *p2 = '\0';

        len = strlen(p);
        sret = must_realloc(NULL, len + 1);
        memcpy(sret, p, len);
        sret[len] = '\0';

        return sret;
}

/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
        char *p2, *sret;
        size_t len;

        p2 = strchr(p, '\n');
        if (!p2)
                return NULL;

        len = p2 - p;
        sret = must_realloc(NULL, len + 1);
        memcpy(sret, p, len);
        sret[len] = '\0';

        return sret;
}

/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
        __do_free char *tmp = NULL;
        char *tok, *eol;
        size_t len;

        eol = strchr(cgline, ':');
        if (!eol)
                return false;

        len = eol - cgline;
        tmp = must_realloc(NULL, len + 1);
        memcpy(tmp, cgline, len);
        tmp[len] = '\0';

        lxc_iterate_parts(tok, tmp, ",")
                if (strequal(tok, c))
                        return true;

        return false;
}
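
/*
 * Illustrative example (not part of the original source): for the
 * /proc/self/cgroup line "4:cpu,cpuacct:/lxc/c1", @cgline points at
 * "cpu,cpuacct:/lxc/c1" and controller_in_clist(cgline, "cpuacct") returns
 * true.
 */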

static inline char *trim(char *s)
{
        size_t len;

        len = strlen(s);
        while ((len > 1) && (s[len - 1] == '\n'))
                s[--len] = '\0';

        return s;
}

/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
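/*
 * Illustrative /proc/self/cgroup excerpt (paths are made up for the
 * example):
 *
 *      12:cpu,cpuacct:/lxc/c1
 *      0::/lxc/c1
 *
 * Asking for controller "cpu" with a legacy type returns "lxc/c1" (the
 * leading '/' is stripped via deabs()), while the "0::" line is matched
 * when type is CGROUP2_SUPER_MAGIC.
 */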
static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
                                          char *controller, int type)
{
        char *base_cgroup = basecginfo;

        for (;;) {
                bool is_cgv2_base_cgroup = false;

                /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
                if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
                        is_cgv2_base_cgroup = true;

                base_cgroup = strchr(base_cgroup, ':');
                if (!base_cgroup)
                        return NULL;
                base_cgroup++;

                if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
                        __do_free char *copy = NULL;

                        base_cgroup = strchr(base_cgroup, ':');
                        if (!base_cgroup)
                                return NULL;
                        base_cgroup++;

                        copy = copy_to_eol(base_cgroup);
                        if (!copy)
                                return NULL;
                        trim(copy);

                        if (!relative) {
                                base_cgroup = prune_init_scope(copy);
                                if (!base_cgroup)
                                        return NULL;
                        } else {
                                base_cgroup = copy;
                        }

                        if (abspath(base_cgroup))
                                base_cgroup = deabs(base_cgroup);

                        /* We're allowing base_cgroup to be "". */
                        return strdup(base_cgroup);
                }

                base_cgroup = strchr(base_cgroup, '\n');
                if (!base_cgroup)
                        return NULL;
                base_cgroup++;
        }
}

static void must_append_string(char ***list, char *entry)
{
        int newentry;
        char *copy;

        newentry = append_null_to_list((void ***)list);
        copy = must_copy_string(entry);
        (*list)[newentry] = copy;
}

static int get_existing_subsystems(char ***klist, char ***nlist)
{
        __do_free char *line = NULL;
        __do_fclose FILE *f = NULL;
        size_t len = 0;

        f = fopen("/proc/self/cgroup", "re");
        if (!f)
                return -1;

        while (getline(&line, &len, f) != -1) {
                char *p, *p2, *tok;
                p = strchr(line, ':');
                if (!p)
                        continue;
                p++;
                p2 = strchr(p, ':');
                if (!p2)
                        continue;
                *p2 = '\0';

                /* If the kernel has cgroup v2 support, then /proc/self/cgroup
                 * contains an entry of the form:
                 *
                 *      0::/some/path
                 *
                 * In this case we use "cgroup2" as controller name.
                 */
                if ((p2 - p) == 0) {
                        must_append_string(klist, "cgroup2");
                        continue;
                }

                lxc_iterate_parts(tok, p, ",") {
                        if (strnequal(tok, "name=", 5))
                                must_append_string(nlist, tok);
                        else
                                must_append_string(klist, tok);
                }
        }

        return 0;
}

static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
                                              char **nlist)
{
        int k;
        char **it;

        TRACE("basecginfo is:");
        TRACE("%s", basecginfo);

        for (k = 0, it = klist; it && *it; it++, k++)
                TRACE("kernel subsystem %d: %s", k, *it);

        for (k = 0, it = nlist; it && *it; it++, k++)
                TRACE("named subsystem %d: %s", k, *it);
}

static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
        if (!path_prune || !hierarchies)
                return 0;

        for (int i = 0; hierarchies[i]; i++) {
                struct hierarchy *h = hierarchies[i];
                int ret;

                ret = cgroup_tree_prune(h->dfd_base, path_prune);
                if (ret < 0)
                        SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
                else
                        TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

                free_equal(h->container_limit_path, h->container_full_path);
        }

        return 0;
}

struct generic_userns_exec_data {
        struct hierarchy **hierarchies;
        const char *path_prune;
        struct lxc_conf *conf;
        uid_t origuid; /* target uid in parent namespace */
        char *path;
};

static int cgroup_tree_remove_wrapper(void *data)
{
        struct generic_userns_exec_data *arg = data;
        uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
        gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
        int ret;

        if (!lxc_drop_groups() && errno != EPERM)
                return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

        ret = setresgid(nsgid, nsgid, nsgid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
                                       (int)nsgid, (int)nsgid, (int)nsgid);

        ret = setresuid(nsuid, nsuid, nsuid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
                                       (int)nsuid, (int)nsuid, (int)nsuid);

        return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
                                                struct lxc_handler *handler)
{
        int ret;

        if (!ops) {
                ERROR("Called with uninitialized cgroup operations");
                return;
        }

        if (!ops->hierarchies)
                return;

        if (!handler) {
                ERROR("Called with uninitialized handler");
                return;
        }

        if (!handler->conf) {
                ERROR("Called with uninitialized conf");
                return;
        }

        if (!ops->container_limit_cgroup) {
                WARN("Uninitialized limit cgroup");
                return;
        }

        ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
        if (ret < 0)
                WARN("Failed to detach bpf program from cgroup");

        if (!lxc_list_empty(&handler->conf->id_map)) {
                struct generic_userns_exec_data wrap = {
                        .conf = handler->conf,
                        .path_prune = ops->container_limit_cgroup,
                        .hierarchies = ops->hierarchies,
                        .origuid = 0,
                };
                ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
                                    &wrap, "cgroup_tree_remove_wrapper");
        } else {
                ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
        }
        if (ret < 0)
                SYSWARN("Failed to destroy cgroups");
}

#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
                                    bool am_initialized)
{
        __do_free char *cpulist = NULL, *isolcpus = NULL,
                       *offlinecpus = NULL, *posscpus = NULL;
        __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
                           *possmask = NULL;
        int ret;
        ssize_t i;
        ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
        bool flipped_bit = false;

        posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
        if (!posscpus)
                return log_error_errno(false, errno, "Failed to read file \"cpuset.cpus\"");

        /* Get maximum number of cpus found in possible cpuset. */
        maxposs = get_max_cpus(posscpus);
        if (maxposs < 0 || maxposs >= INT_MAX - 1)
                return false;

        if (file_exists(__ISOL_CPUS)) {
                isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
                if (!isolcpus)
                        return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

                if (isdigit(isolcpus[0])) {
                        /* Get maximum number of cpus found in isolated cpuset. */
                        maxisol = get_max_cpus(isolcpus);
                        if (maxisol < 0 || maxisol >= INT_MAX - 1)
                                return false;
                }

                if (maxposs < maxisol)
                        maxposs = maxisol;
                maxposs++;
        } else {
                TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
        }

        if (file_exists(__OFFLINE_CPUS)) {
                offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
                if (!offlinecpus)
                        return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

                if (isdigit(offlinecpus[0])) {
                        /* Get maximum number of cpus found in offline cpuset. */
                        maxoffline = get_max_cpus(offlinecpus);
                        if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
                                return false;
                }

                if (maxposs < maxoffline)
                        maxposs = maxoffline;
                maxposs++;
        } else {
                TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
        }

        if ((maxisol == 0) && (maxoffline == 0)) {
                cpulist = move_ptr(posscpus);
                goto copy_parent;
        }

        possmask = lxc_cpumask(posscpus, maxposs);
        if (!possmask)
                return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

        if (maxisol > 0) {
                isolmask = lxc_cpumask(isolcpus, maxposs);
                if (!isolmask)
                        return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
        }

        if (maxoffline > 0) {
                offlinemask = lxc_cpumask(offlinecpus, maxposs);
                if (!offlinemask)
                        return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
        }

        for (i = 0; i <= maxposs; i++) {
                if ((isolmask && !is_set(i, isolmask)) ||
                    (offlinemask && !is_set(i, offlinemask)) ||
                    !is_set(i, possmask))
                        continue;

                flipped_bit = true;
                clear_bit(i, possmask);
        }

        if (!flipped_bit) {
                cpulist = move_ptr(posscpus);
                TRACE("No isolated or offline cpus present in cpuset");
        } else {
                cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
                TRACE("Removed isolated or offline cpus from cpuset");
        }
        if (!cpulist)
                return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
        if (!am_initialized) {
                ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
                if (ret < 0)
                        return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);

                TRACE("Copied cpu settings of parent cgroup");
        }

        return true;
}

static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
        char mems[PATH_MAX];
        ssize_t bytes;
        char v;

        /*
         * Determine whether the base cgroup has cpuset
         * inheritance turned on.
         */
        bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
        if (bytes < 0)
                return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

        /*
         * Initialize cpuset.cpus and remove any isolated
         * and offline cpus.
         */
        if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
                return syserrno(false, "Failed to initialize cpuset.cpus");

        /* Read cpuset.mems from parent... */
        bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
        if (bytes < 0)
                return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);

        /* ... and copy to first cgroup in the tree... */
        bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
        if (bytes < 0)
                return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);

        /* ... and finally turn on cpuset inheritance. */
        bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
        if (bytes < 0)
                return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

        return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}

static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
                                bool cpuset_v1, bool eexist_ignore)
{
        __do_close int dfd_final = -EBADF;
        int dfd_cur = dfd_base;
        int ret = 0;
        size_t len;
        char *cur;
        char buf[PATH_MAX];

        if (is_empty_string(path))
                return ret_errno(EINVAL);

        len = strlcpy(buf, path, sizeof(buf));
        if (len >= sizeof(buf))
                return ret_errno(E2BIG);

        lxc_iterate_parts(cur, buf, "/") {
                /*
                 * Even though we vetted the paths when we parsed the config
                 * we're paranoid here and check that the path is neither
                 * absolute nor walks upwards.
                 */
                if (abspath(cur))
                        return syserrno_set(-EINVAL, "No absolute paths allowed");

                if (strnequal(cur, "..", STRLITERALLEN("..")))
                        return syserrno_set(-EINVAL, "No upward walking paths allowed");

                ret = mkdirat(dfd_cur, cur, mode);
                if (ret < 0) {
                        if (errno != EEXIST)
                                return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);

                        ret = -EEXIST;
                }
                TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

                dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
                if (dfd_final < 0)
                        return syserrno(-errno, "Failed to open%s directory %d(%s)",
                                        !ret ? " newly created" : "", dfd_base, cur);
                if (dfd_cur != dfd_base)
                        close(dfd_cur);
                else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
                        return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
                /*
                 * Leave dfd_final pointing to the last fd we opened so
                 * it will be automatically zapped if we return early.
                 */
                dfd_cur = dfd_final;
        }

        /* The final cgroup must be successfully created by us. */
        if (ret) {
                if (ret != -EEXIST || !eexist_ignore)
                        return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
        }

        return move_fd(dfd_final);
}
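
/*
 * Illustrative example (not part of the original source): called with
 * path = "lxc.payload.c1/inner", __cgroup_tree_create() mkdirat()s
 * "lxc.payload.c1" and then "lxc.payload.c1/inner" beneath @dfd_base and
 * returns an O_PATH fd to the final "inner" directory. Pre-existing
 * intermediate directories are tolerated; a pre-existing final directory
 * is an error unless @eexist_ignore is set.
 */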

static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
                               struct hierarchy *h, const char *cgroup_limit_dir,
                               const char *cgroup_leaf, bool payload)
{
        __do_close int fd_limit = -EBADF, fd_final = -EBADF;
        __do_free char *path = NULL, *limit_path = NULL;
        bool cpuset_v1 = false;

        /*
         * The legacy cpuset controller needs massaging in case inheriting
         * settings from its immediate ancestor cgroup hasn't been turned on.
         */
        cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

        if (payload && cgroup_leaf) {
                /* With isolation both parts need to not already exist. */
                fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
                if (fd_limit < 0)
                        return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

                TRACE("Created limit cgroup %d->%d(%s)",
                      fd_limit, h->dfd_base, cgroup_limit_dir);

                /*
                 * With isolation the devices legacy cgroup needs to be
                 * initialized early, as it typically contains an 'a' (all)
                 * line, which is not possible once a subdirectory has been
                 * created.
                 */
                if (string_in_list(h->controllers, "devices") &&
                    !ops->setup_limits_legacy(ops, conf, true))
                        return log_error(false, "Failed to setup legacy device limits");

                limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
                path = must_make_path(limit_path, cgroup_leaf, NULL);

                /*
                 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
                 * cgroup the container actually resides in, is below fd_limit.
                 */
                fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
                if (fd_final < 0) {
                        /* Ensure we don't leave any garbage behind. */
                        if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
                                SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
                        else
                                TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
                }
        } else {
                path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);

                fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
        }
        if (fd_final < 0)
                return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

        if (payload) {
                h->cgfd_con = move_fd(fd_final);
                h->container_full_path = move_ptr(path);

                if (fd_limit < 0)
                        h->cgfd_limit = h->cgfd_con;
                else
                        h->cgfd_limit = move_fd(fd_limit);

                if (limit_path)
                        h->container_limit_path = move_ptr(limit_path);
                else
                        h->container_limit_path = h->container_full_path;
        } else {
                h->cgfd_mon = move_fd(fd_final);
        }

        return true;
}

static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
                                   bool payload)
{
        bool prune = true;

        if (payload) {
                /* Check whether we actually created the cgroup to prune. */
                if (h->cgfd_limit < 0)
                        prune = false;

                free_equal(h->container_full_path, h->container_limit_path);
                close_equal(h->cgfd_con, h->cgfd_limit);
        } else {
                /* Check whether we actually created the cgroup to prune. */
                if (h->cgfd_mon < 0)
                        prune = false;

                close_prot_errno_disarm(h->cgfd_mon);
        }

        /* We didn't create this cgroup. */
        if (!prune)
                return;

        if (cgroup_tree_prune(h->dfd_base, path_prune))
                SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
        else
                TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
                                                struct lxc_handler *handler)
{
        int len;
        char pidstr[INTTYPE_TO_STRLEN(pid_t)];
        const struct lxc_conf *conf;

        if (!ops) {
                ERROR("Called with uninitialized cgroup operations");
                return;
        }

        if (!ops->hierarchies)
                return;

        if (!handler) {
                ERROR("Called with uninitialized handler");
                return;
        }

        if (!handler->conf) {
                ERROR("Called with uninitialized conf");
                return;
        }
        conf = handler->conf;

        if (!ops->monitor_cgroup) {
                WARN("Uninitialized monitor cgroup");
                return;
        }

        len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
        if (len < 0)
                return;

        for (int i = 0; ops->hierarchies[i]; i++) {
                __do_close int fd_pivot = -EBADF;
                __do_free char *pivot_path = NULL;
                struct hierarchy *h = ops->hierarchies[i];
                bool cpuset_v1 = false;
                int ret;

                /* Monitor might have died before we entered the cgroup. */
                if (handler->monitor_pid <= 0) {
                        WARN("No valid monitor process found while destroying cgroups");
                        goto cgroup_prune_tree;
                }

                if (conf->cgroup_meta.monitor_pivot_dir)
                        pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
                else if (conf->cgroup_meta.dir)
                        pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
                else
                        pivot_path = must_make_path(CGROUP_PIVOT, NULL);

                cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

                fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
                if (fd_pivot < 0) {
                        SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
                        continue;
                }

                ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
                if (ret != 0) {
                        SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
                        continue;
                }

cgroup_prune_tree:
                ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
                if (ret < 0)
                        SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
                else
                        TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
        }
}

/*
 * Check that lxc.cgroup.dir is not set when any of the split monitor,
 * container or namespace directory options is used, and that monitor and
 * container directories are always set together.
 *
 * Returns true if the configuration is valid, false otherwise.
 */
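/*
 * Illustrative configuration (key names taken from the error messages
 * below; values are made up):
 *
 *      lxc.cgroup.dir.monitor = lxc.monitor/c1
 *      lxc.cgroup.dir.payload = lxc.payload/c1
 *
 * Setting lxc.cgroup.dir alongside either of these is rejected.
 */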
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
        const char *monitor_dir = conf->cgroup_meta.monitor_dir,
                   *container_dir = conf->cgroup_meta.container_dir,
                   *namespace_dir = conf->cgroup_meta.namespace_dir;

        /* none of the new options are set, all is fine */
        if (!monitor_dir && !container_dir && !namespace_dir)
                return true;

        /* some are set, make sure lxc.cgroup.dir is not also set */
        if (conf->cgroup_meta.dir)
                return log_error_errno(false, EINVAL,
                                       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

        /* make sure both monitor and payload are set */
        if (!monitor_dir || !container_dir)
                return log_error_errno(false, EINVAL,
                                       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

        /* namespace_dir may be empty */
        return true;
}

__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
        __do_free char *monitor_cgroup = NULL;
        int idx = 0;
        int i;
        size_t len;
        char *suffix = NULL;
        struct lxc_conf *conf;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (ops->monitor_cgroup)
                return ret_set_errno(false, EEXIST);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        conf = handler->conf;

        if (!check_cgroup_dir_config(conf))
                return false;

        if (conf->cgroup_meta.monitor_dir) {
                monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
        } else if (conf->cgroup_meta.dir) {
                monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
                                             DEFAULT_MONITOR_CGROUP_PREFIX,
                                             handler->name,
                                             CGROUP_CREATE_RETRY, NULL);
        } else if (ops->cgroup_pattern) {
                __do_free char *cgroup_tree = NULL;

                cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
                if (!cgroup_tree)
                        return ret_set_errno(false, ENOMEM);

                monitor_cgroup = must_concat(&len, cgroup_tree, "/",
                                             DEFAULT_MONITOR_CGROUP,
                                             CGROUP_CREATE_RETRY, NULL);
        } else {
                monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
                                             handler->name,
                                             CGROUP_CREATE_RETRY, NULL);
        }
        if (!monitor_cgroup)
                return ret_set_errno(false, ENOMEM);

        if (!conf->cgroup_meta.monitor_dir) {
                suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
                *suffix = '\0';
        }
        do {
                if (idx && suffix)
                        sprintf(suffix, "-%d", idx);

                for (i = 0; ops->hierarchies[i]; i++) {
                        if (cgroup_tree_create(ops, handler->conf,
                                               ops->hierarchies[i],
                                               monitor_cgroup, NULL, false))
                                continue;

                        DEBUG("Failed to create cgroup \"%s\"", monitor_cgroup);
                        for (int j = 0; j <= i; j++)
                                cgroup_tree_prune_leaf(ops->hierarchies[j],
                                                       monitor_cgroup, false);

                        idx++;
                        break;
                }
        } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

        if (idx == 1000 || (!suffix && idx != 0))
                return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");

        ops->monitor_cgroup = move_ptr(monitor_cgroup);
        return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
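/*
 * Illustrative example (assuming DEFAULT_PAYLOAD_CGROUP_PREFIX expands to
 * "lxc.payload."): for a container named "c1" without explicit cgroup
 * configuration, the payload cgroup tried first is "lxc.payload.c1", then
 * "lxc.payload.c1-1", "lxc.payload.c1-2", ... up to "lxc.payload.c1-999"
 * before giving up with ERANGE.
 */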
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
        __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
        char *limit_cgroup;
        int idx = 0;
        int i;
        size_t len;
        char *suffix = NULL;
        struct lxc_conf *conf;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (ops->container_cgroup || ops->container_limit_cgroup)
                return ret_set_errno(false, EEXIST);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        conf = handler->conf;

        if (!check_cgroup_dir_config(conf))
                return false;

        if (conf->cgroup_meta.container_dir) {
                __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
                if (!__limit_cgroup)
                        return ret_set_errno(false, ENOMEM);

                if (conf->cgroup_meta.namespace_dir) {
                        container_cgroup = must_make_path(__limit_cgroup,
                                                          conf->cgroup_meta.namespace_dir,
                                                          NULL);
                        limit_cgroup = __limit_cgroup;
                } else {
                        /* explicit paths but without isolation */
                        limit_cgroup = move_ptr(__limit_cgroup);
                        container_cgroup = limit_cgroup;
                }
        } else if (conf->cgroup_meta.dir) {
                limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
                                           DEFAULT_PAYLOAD_CGROUP_PREFIX,
                                           handler->name,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        } else if (ops->cgroup_pattern) {
                __do_free char *cgroup_tree = NULL;

                cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
                if (!cgroup_tree)
                        return ret_set_errno(false, ENOMEM);

                limit_cgroup = must_concat(&len, cgroup_tree, "/",
                                           DEFAULT_PAYLOAD_CGROUP,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        } else {
                limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
                                           handler->name,
                                           CGROUP_CREATE_RETRY, NULL);
                container_cgroup = limit_cgroup;
        }
        if (!limit_cgroup)
                return ret_set_errno(false, ENOMEM);

        if (!conf->cgroup_meta.container_dir) {
                suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
                *suffix = '\0';
        }
        do {
                if (idx && suffix)
                        sprintf(suffix, "-%d", idx);

                for (i = 0; ops->hierarchies[i]; i++) {
                        if (cgroup_tree_create(ops, handler->conf,
                                               ops->hierarchies[i], limit_cgroup,
                                               conf->cgroup_meta.namespace_dir,
                                               true))
                                continue;

                        DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
                        for (int j = 0; j <= i; j++)
                                cgroup_tree_prune_leaf(ops->hierarchies[j],
                                                       limit_cgroup, true);

                        idx++;
                        break;
                }
        } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

        if (idx == 1000 || (!suffix && idx != 0))
                return log_error_errno(false, ERANGE, "Failed to create container cgroup");

        ops->container_cgroup = move_ptr(container_cgroup);
        if (__limit_cgroup)
                ops->container_limit_cgroup = move_ptr(__limit_cgroup);
        else
                ops->container_limit_cgroup = ops->container_cgroup;
        INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
             ops->container_cgroup, ops->container_limit_cgroup);
        return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
                                              struct lxc_handler *handler)
{
        int monitor_len, transient_len = 0;
        char monitor[INTTYPE_TO_STRLEN(pid_t)],
             transient[INTTYPE_TO_STRLEN(pid_t)];

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->monitor_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
        if (monitor_len < 0)
                return false;

        if (handler->transient_pid > 0) {
                transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
                if (transient_len < 0)
                        return false;
        }

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                int ret;

                ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
                if (ret)
                        return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

                TRACE("Moved monitor into cgroup %d", h->cgfd_mon);

                if (handler->transient_pid <= 0)
                        continue;

                ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
                if (ret)
                        return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

                TRACE("Moved transient process into cgroup %d", h->cgfd_mon);

                /*
                 * We don't keep the fds for non-unified hierarchies around
                 * mainly because we don't make use of them anymore after the
                 * core cgroup setup is done but also because there are quite a
                 * lot of them.
                 */
                if (!is_unified_hierarchy(h))
                        close_prot_errno_disarm(h->cgfd_mon);
        }
        handler->transient_pid = -1;

        return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
                                              struct lxc_handler *handler)
{
        int len;
        char pidstr[INTTYPE_TO_STRLEN(pid_t)];

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->container_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!handler || !handler->conf)
                return ret_set_errno(false, EINVAL);

        len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
        if (len < 0)
                return false;

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                int ret;

                if (is_unified_hierarchy(h) &&
                    (handler->clone_flags & CLONE_INTO_CGROUP))
                        continue;

                ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
                if (ret != 0)
                        return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);

                TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
        }

        return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
                      gid_t chown_gid, mode_t chmod_mode)
{
        int ret;

        ret = fchownat(dirfd, path, chown_uid, chown_gid,
                       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
        if (ret < 0)
                return log_warn_errno(-1,
                                      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
                                      dirfd, path, (int)chown_uid,
                                      (int)chown_gid);

        ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
        if (ret < 0)
                return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
                                      dirfd, path, (int)chmod_mode);

        return 0;
}

/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
        int ret;
        uid_t destuid;
        struct generic_userns_exec_data *arg = data;
        uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
        gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

        if (!lxc_drop_groups() && errno != EPERM)
                return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

        ret = setresgid(nsgid, nsgid, nsgid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
                                       (int)nsgid, (int)nsgid, (int)nsgid);

        ret = setresuid(nsuid, nsuid, nsuid);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
                                       (int)nsuid, (int)nsuid, (int)nsuid);

        destuid = get_ns_uid(arg->origuid);
        if (destuid == LXC_INVALID_UID)
                destuid = 0;

        for (int i = 0; arg->hierarchies[i]; i++) {
                int dirfd = arg->hierarchies[i]->cgfd_con;

                (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

                /*
                 * Failures to chown() these are inconvenient but not
                 * detrimental. We leave these owned by the container launcher,
                 * so that container root can write to the files to attach. We
                 * chmod() them 664 so that container systemd can write to the
                 * files (which systemd in wily insists on doing).
                 */

                if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
                        (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

                (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

                if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
                        continue;

                for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
                        (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
        }

        return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
                                      struct lxc_conf *conf)
{
        struct generic_userns_exec_data wrap;

        if (!ops)
                return ret_set_errno(false, ENOENT);

        if (!ops->hierarchies)
                return true;

        if (!ops->container_cgroup)
                return ret_set_errno(false, ENOENT);

        if (!conf)
                return ret_set_errno(false, EINVAL);

        if (lxc_list_empty(&conf->id_map))
                return true;

        wrap.origuid = geteuid();
        wrap.path = NULL;
        wrap.hierarchies = ops->hierarchies;
        wrap.conf = conf;

        if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
                return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

        return true;
}

__cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
{
        if (!ops)
                return;

        if (!ops->hierarchies)
                return;

        for (int i = 0; ops->hierarchies[i]; i++) {
                struct hierarchy *h = ops->hierarchies[i];
                /*
                 * We don't keep the fds for non-unified hierarchies around
                 * mainly because we don't make use of them anymore after the
                 * core cgroup setup is done but also because there are quite a
                 * lot of them.
                 */
                if (!is_unified_hierarchy(h))
                        close_prot_errno_disarm(h->cgfd_con);
        }

        /*
         * The checking for freezer support should obviously be done at cgroup
         * initialization time but that doesn't work reliably. The freezer
         * controller has been demoted (rightly so) to a simple file located in
         * each non-root cgroup. At the time when the container is created we
         * might still be located in /sys/fs/cgroup and so checking for
         * cgroup.freeze won't tell us anything because this file doesn't exist
         * in the root cgroup. We could then iterate through /sys/fs/cgroup and
         * find an already existing cgroup and then check within that cgroup
         * for the existence of cgroup.freeze but that will only work on
         * systemd based hosts. Other init systems might not manage cgroups and
         * so no cgroup will exist. So we defer until we have created cgroups
         * for our container which means we check here.
         */
        if (pure_unified_layout(ops) &&
            !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
                       AT_SYMLINK_NOFOLLOW)) {
                TRACE("Unified hierarchy supports freezer");
                ops->unified->freezer_controller = 1;
        }
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
{
        switch (cgroup_automount_type) {
        case LXC_AUTO_CGROUP_RO:
                return true;
        case LXC_AUTO_CGROUP_RW:
                return true;
        case LXC_AUTO_CGROUP_MIXED:
                return true;
        }

        return false;
}

/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
                                       char *controllerpath, char *cgpath,
                                       const char *container_cgroup)
{
        __do_free char *sourcepath = NULL;
        int ret, remount_flags;
        int flags = MS_BIND;

        if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
            (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
                ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
                                               controllerpath, controllerpath);

                remount_flags = add_required_remount_flags(controllerpath,
                                                           controllerpath,
                                                           flags | MS_REMOUNT);
                ret = mount(controllerpath, controllerpath, "cgroup",
                            remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
                            NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

                INFO("Remounted %s read-only", controllerpath);
        }

        sourcepath = must_make_path(h->mountpoint, h->container_base_path,
                                    container_cgroup, NULL);
        if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
                flags |= MS_RDONLY;

        ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
        if (ret < 0)
                return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
                                       h->controllers[0], cgpath);
        INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

        if (flags & MS_RDONLY) {
                remount_flags = add_required_remount_flags(sourcepath, cgpath,
                                                           flags | MS_REMOUNT);
                ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
                if (ret < 0)
                        return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
                INFO("Remounted %s read-only", cgpath);
        }

        INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
        return 0;
}

/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
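/*
 * Editorial note: fs_prepare(), fs_set_property() and fs_attach() used below
 * are LXC's wrappers around the new mount API
 * (fsopen()/fsconfig()/fsmount()/move_mount()); when that API is unavailable
 * the code falls back to classic mount(2) via safe_mount().
 */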
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
                            struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
                            const char *hierarchy_mnt)
{
        __do_close int fd_fs = -EBADF;
        unsigned int flags = 0;
        char *fstype;
        int ret;

        if (dfd_mnt_cgroupfs < 0)
                return ret_errno(EINVAL);

        flags |= MOUNT_ATTR_NOSUID;
        flags |= MOUNT_ATTR_NOEXEC;
        flags |= MOUNT_ATTR_NODEV;
        flags |= MOUNT_ATTR_RELATIME;

        if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
            (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
                flags |= MOUNT_ATTR_RDONLY;

        if (is_unified_hierarchy(h))
                fstype = "cgroup2";
        else
                fstype = "cgroup";

        if (can_use_mount_api()) {
                fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
                if (fd_fs < 0)
                        return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

                if (!is_unified_hierarchy(h)) {
                        for (const char **it = (const char **)h->controllers; it && *it; it++) {
                                if (strnequal(*it, "name=", STRLITERALLEN("name=")))
                                        ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
                                else
                                        ret = fs_set_property(fd_fs, *it, "");
                                if (ret < 0)
                                        return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
                        }
                }

                ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
                                PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
                                flags);
        } else {
                __do_free char *controllers = NULL, *target = NULL;
                unsigned int old_flags = 0;
                const char *rootfs_mnt;

                if (!is_unified_hierarchy(h)) {
                        controllers = lxc_string_join(",", (const char **)h->controllers, false);
                        if (!controllers)
                                return ret_errno(ENOMEM);
                }

                rootfs_mnt = get_rootfs_mnt(rootfs);
                ret = mnt_attributes_old(flags, &old_flags);
                if (ret)
                        return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

                target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
                ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
        }
        if (ret < 0)
                return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
                                       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

        DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
              fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
        return 0;
}
1910
1911 static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1912 struct lxc_rootfs *rootfs,
1913 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
1914 {
1915 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1916 dfd_mnt_cgroupfs, hierarchy_mnt);
1917 }
1918
1919 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1920 struct lxc_rootfs *rootfs,
1921 int dfd_mnt_cgroupfs,
1922 const char *hierarchy_mnt)
1923 {
1924 switch (cgroup_automount_type) {
1925 case LXC_AUTO_CGROUP_FULL_RO:
1926 break;
1927 case LXC_AUTO_CGROUP_FULL_RW:
1928 break;
1929 case LXC_AUTO_CGROUP_FULL_MIXED:
1930 break;
1931 default:
1932 return 0;
1933 }
1934
1935 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1936 dfd_mnt_cgroupfs, hierarchy_mnt);
1937 }
1938
1939 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1940 struct lxc_handler *handler, int cg_flags)
1941 {
1942 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1943 __do_free char *cgroup_root = NULL;
1944 int cgroup_automount_type;
1945 bool in_cgroup_ns = false, wants_force_mount = false;
1946 struct lxc_conf *conf = handler->conf;
1947 struct lxc_rootfs *rootfs = &conf->rootfs;
1948 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1949 int ret;
1950
1951 if (!ops)
1952 return ret_set_errno(false, ENOENT);
1953
1954 if (!ops->hierarchies)
1955 return true;
1956
1957 if (!conf)
1958 return ret_set_errno(false, EINVAL);
1959
1960 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1961 return log_trace(true, "No cgroup mounts requested");
1962
1963 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1964 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1965 wants_force_mount = true;
1966 }
1967
1968 switch (cg_flags) {
1969 case LXC_AUTO_CGROUP_RO:
1970 TRACE("Read-only cgroup mounts requested");
1971 break;
1972 case LXC_AUTO_CGROUP_RW:
1973 TRACE("Read-write cgroup mounts requested");
1974 break;
1975 case LXC_AUTO_CGROUP_MIXED:
1976 TRACE("Mixed cgroup mounts requested");
1977 break;
1978 case LXC_AUTO_CGROUP_FULL_RO:
1979 TRACE("Full read-only cgroup mounts requested");
1980 break;
1981 case LXC_AUTO_CGROUP_FULL_RW:
1982 TRACE("Full read-write cgroup mounts requested");
1983 break;
1984 case LXC_AUTO_CGROUP_FULL_MIXED:
1985 TRACE("Full mixed cgroup mounts requested");
1986 break;
1987 default:
1988 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1989 }
1990 cgroup_automount_type = cg_flags;
1991
1992 if (!wants_force_mount) {
1993 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
1994
1995 /*
1996 * Most recent distro versions currently ship init systems that
1997 * support cgroup2 but do not mount it by default unless
1998 * explicitly told to, even if the host is cgroup2-only. That
1999 * means such containers will often fail to boot. Fix this by
2000 * pre-mounting cgroup2 by default. We will likely need to keep
2001 * doing this for a few years until all distros have switched
2002 * over to cgroup2, at which point we can safely assume that
2003 * their init systems will mount it themselves.
2004 */
2005 if (pure_unified_layout(ops))
2006 wants_force_mount = true;
2007 }
2008
2009 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
2010 in_cgroup_ns = true;
2011
2012 if (in_cgroup_ns && !wants_force_mount)
2013 return log_trace(true, "Mounting cgroups not requested or needed");
2014
2015 /* This is really the codepath that we want. */
2016 if (pure_unified_layout(ops)) {
2017 __do_close int dfd_mnt_unified = -EBADF;
2018
2019 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2020 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2021 if (dfd_mnt_unified < 0)
2022 return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
2023 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2024 /*
2025 * If cgroup namespaces are supported but the container will
2026 * not have CAP_SYS_ADMIN after it has started we need to mount
2027 * the cgroups manually.
2028 *
2029 * Note that here we know that wants_force_mount is true.
2030 * Otherwise we would've returned early above.
2031 */
2032 if (in_cgroup_ns) {
2033 /*
2034 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
2035 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
2036 * 3. cgroup:mixed:force -> See comment above how this
2037 * does not apply so
2038 * cgroup:mixed is equal to
2039 * cgroup:rw when cgroup
2040 * namespaces are supported.
2041 *
2042 * 4. cgroup:rw -> No-op; init system responsible for mounting.
2043 * 5. cgroup:ro -> No-op; init system responsible for mounting.
2044 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
2045 *
2046 * 7. cgroup-full:rw -> Not supported.
2047 * 8. cgroup-full:ro -> Not supported.
2048 * 9. cgroup-full:mixed -> Not supported.
2049 *
2050 * 10. cgroup-full:rw:force -> Not supported.
2051 * 11. cgroup-full:ro:force -> Not supported.
2052 * 12. cgroup-full:mixed:force -> Not supported.
2053 */
2054 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
2055 if (ret < 0)
2056 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
2057
2058 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
2059 } else {
2060 /*
2061 * Either no cgroup namespace is supported (highly
2062 * unlikely unless we're dealing with a Frankenkernel),
2063 * or the user requested to keep the cgroup namespace
2064 * of the host or another container.
2065 */
2066 if (wants_force_mount) {
2067 /*
2068 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
2069 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
2070 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem
2071 * and make the parent directory of the
2072 * container's cgroup read-only but the
2073 * container's cgroup writable.
2074 *
2075 * 10. cgroup-full:rw:force -> Not supported.
2076 * 11. cgroup-full:ro:force -> Not supported.
2077 * 12. cgroup-full:mixed:force -> Not supported.
2078 */
2079 errno = EOPNOTSUPP;
2080 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2081 } else {
2082 errno = EOPNOTSUPP;
2083 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2084 }
2085 }
2086
2087 return syserrno(false, "Failed to mount cgroups");
2088 }
2089
2090 /*
2091 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
2092 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
2093 * DEFAULT_CGROUP_MOUNTPOINT define.
2094 */
2095 if (can_use_mount_api()) {
2096 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
2097 if (fd_fs < 0)
2098 return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");
2099
2100 ret = fs_set_property(fd_fs, "mode", "0755");
2101 if (ret < 0)
2102 return log_error_errno(false, errno, "Failed to set \"mode\" property on tmpfs filesystem context %d", fd_fs);
2103
2104 ret = fs_set_property(fd_fs, "size", "10240k");
2105 if (ret < 0)
2106 return log_error_errno(false, errno, "Failed to set \"size\" property on tmpfs filesystem context %d", fd_fs);
2107
2108 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2109 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
2110 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
2111 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
2112 } else {
2113 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2114 ret = safe_mount(NULL, cgroup_root, "tmpfs",
2115 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2116 "size=10240k,mode=755", rootfs_mnt);
2117 }
2118 if (ret < 0)
2119 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
2120 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2121
2122 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2123 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2124 if (dfd_mnt_tmpfs < 0)
2125 return syserrno(false, "Failed to open %d(%s)", rootfs->dfd_mnt,
2126 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2127
2128 for (int i = 0; ops->hierarchies[i]; i++) {
2129 __do_free char *controllerpath = NULL, *path2 = NULL;
2130 struct hierarchy *h = ops->hierarchies[i];
2131 char *controller = strrchr(h->mountpoint, '/');
2132
2133 if (!controller)
2134 continue;
2135 controller++;
2136
2137 ret = mkdirat(dfd_mnt_tmpfs, controller, 0000);
2138 if (ret < 0)
2139 return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_tmpfs, controller);
2140
2141 if (in_cgroup_ns && wants_force_mount) {
2142 /*
2143 * If cgroup namespaces are supported but the container
2144 * will not have CAP_SYS_ADMIN after it has started we
2145 * need to mount the cgroups manually.
2146 */
2147 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2148 if (ret < 0)
2149 return false;
2150
2151 continue;
2152 }
2153
2154 /* Here is where the ancient kernel section begins. */
2155 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2156 if (ret < 0)
2157 return false;
2158
2159 if (!cg_mount_needs_subdirs(cgroup_automount_type))
2160 continue;
2161
2162 if (!cgroup_root)
2163 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2164
2165 controllerpath = must_make_path(cgroup_root, controller, NULL);
2166 path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
2167 ret = mkdir_p(path2, 0755);
2168 if (ret < 0 && (errno != EEXIST))
2169 return false;
2170
2171 ret = cg_legacy_mount_controllers(cgroup_automount_type, h, controllerpath, path2, ops->container_cgroup);
2172 if (ret < 0)
2173 return false;
2174 }
2175
2176 return true;
2177 }
2178
2179 /* Only root needs to escape to the cgroup of its init. */
2180 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
2181 struct lxc_conf *conf)
2182 {
2183 if (!ops)
2184 return ret_set_errno(false, ENOENT);
2185
2186 if (!ops->hierarchies)
2187 return true;
2188
2189 if (!conf)
2190 return ret_set_errno(false, EINVAL);
2191
2192 if (conf->cgroup_meta.relative || geteuid())
2193 return true;
2194
2195 for (int i = 0; ops->hierarchies[i]; i++) {
2196 __do_free char *fullpath = NULL;
2197 int ret;
2198
2199 fullpath =
2200 must_make_path(ops->hierarchies[i]->mountpoint,
2201 ops->hierarchies[i]->container_base_path,
2202 "cgroup.procs", NULL);
2203 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
2204 if (ret != 0)
2205 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
2206 }
2207
2208 return true;
2209 }
2210
2211 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
2212 {
2213 int i = 0;
2214
2215 if (!ops)
2216 return ret_set_errno(-1, ENOENT);
2217
2218 if (!ops->hierarchies)
2219 return 0;
2220
2221 for (; ops->hierarchies[i]; i++)
2222 ;
2223
2224 return i;
2225 }
2226
2227 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
2228 int n, char ***out)
2229 {
2230 int i;
2231
2232 if (!ops)
2233 return ret_set_errno(false, ENOENT);
2234
2235 if (!ops->hierarchies)
2236 return ret_set_errno(false, ENOENT);
2237
2238 /* sanity check n: every entry up to and including index n must exist */
2239 for (i = 0; i <= n; i++)
2240 if (!ops->hierarchies[i])
2241 return ret_set_errno(false, ENOENT);
2242
2243 *out = ops->hierarchies[n]->controllers;
2244
2245 return true;
2246 }
2247
2248 static int cg_legacy_freeze(struct cgroup_ops *ops)
2249 {
2250 struct hierarchy *h;
2251
2252 h = get_hierarchy(ops, "freezer");
2253 if (!h)
2254 return ret_set_errno(-1, ENOENT);
2255
2256 return lxc_write_openat(h->container_full_path, "freezer.state",
2257 "FROZEN", STRLITERALLEN("FROZEN"));
2258 }
2259
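/*
 * Mainloop callback watching a cgroup.events file. On cgroup2 that file
 * contains "key value" lines such as "frozen 0" or "frozen 1"; we compare
 * each line against the state we are waiting for (passed via @cbdata) and
 * close the mainloop once it shows up.
 */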
2260 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
2261 struct lxc_epoll_descr *descr)
2262 {
2263 __do_free char *line = NULL;
2264 __do_fclose FILE *f = NULL;
2265 int state = PTR_TO_INT(cbdata);
2266 size_t len = 0;
2267 const char *state_string;
2268
2269 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
2270 if (!f)
2271 return LXC_MAINLOOP_ERROR;
2272
2273 if (state == 1)
2274 state_string = "frozen 1";
2275 else
2276 state_string = "frozen 0";
2277
2278 while (getline(&line, &len, f) != -1)
2279 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
2280 return LXC_MAINLOOP_CLOSE;
2281
2282 rewind(f);
2283
2284 return LXC_MAINLOOP_CONTINUE;
2285 }
2286
2287 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
2288 const char *state_string,
2289 int state_num,
2290 const char *epoll_error,
2291 const char *wait_error)
2292 {
2293 __do_close int fd = -EBADF;
2294 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
2295 int ret;
2296 struct lxc_epoll_descr descr;
2297 struct hierarchy *h;
2298
2299 h = ops->unified;
2300 if (!h)
2301 return ret_set_errno(-1, ENOENT);
2302
2303 if (!h->container_full_path)
2304 return ret_set_errno(-1, ENOENT);
2305
2306 if (timeout != 0) {
2307 __do_free char *events_file = NULL;
2308
2309 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
2310 fd = open(events_file, O_RDONLY | O_CLOEXEC);
2311 if (fd < 0)
2312 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
2313
2314 ret = lxc_mainloop_open(&descr);
2315 if (ret)
2316 return log_error_errno(-1, errno, "%s", epoll_error);
2317
2318 /* automatically cleaned up now */
2319 descr_ptr = &descr;
2320
2321 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
2322 if (ret < 0)
2323 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
2324 }
2325
2326 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
2327 if (ret < 0)
2328 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
2329
2330 if (timeout != 0 && lxc_mainloop(&descr, timeout))
2331 return log_error_errno(-1, errno, "%s", wait_error);
2332
2333 return 0;
2334 }
2335
2336 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
2337 {
2338 return cg_unified_freeze_do(ops, timeout, "1", 1,
2339 "Failed to create epoll instance to wait for container freeze",
2340 "Failed to wait for container to be frozen");
2341 }
2342
2343 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2344 {
2345 if (!ops->hierarchies)
2346 return ret_set_errno(-1, ENOENT);
2347
2348 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2349 return cg_legacy_freeze(ops);
2350
2351 return cg_unified_freeze(ops, timeout);
2352 }
2353
2354 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2355 {
2356 struct hierarchy *h;
2357
2358 h = get_hierarchy(ops, "freezer");
2359 if (!h)
2360 return ret_set_errno(-1, ENOENT);
2361
2362 return lxc_write_openat(h->container_full_path, "freezer.state",
2363 "THAWED", STRLITERALLEN("THAWED"));
2364 }
2365
2366 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2367 {
2368 return cg_unified_freeze_do(ops, timeout, "0", 0,
2369 "Failed to create epoll instance to wait for container unfreeze",
2370 "Failed to wait for container to be unfrozen");
2371 }
2372
2373 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2374 {
2375 if (!ops->hierarchies)
2376 return ret_set_errno(-1, ENOENT);
2377
2378 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2379 return cg_legacy_unfreeze(ops);
2380
2381 return cg_unified_unfreeze(ops, timeout);
2382 }
2383
2384 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2385 const char *controller, bool limiting)
2386 {
2387 struct hierarchy *h;
2388
2389 h = get_hierarchy(ops, controller);
2390 if (!h)
2391 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2392 controller ? controller : "(null)");
2393
2394 if (limiting)
2395 return h->container_limit_path
2396 ? h->container_limit_path + strlen(h->mountpoint)
2397 : NULL;
2398
2399 return h->container_full_path
2400 ? h->container_full_path + strlen(h->mountpoint)
2401 : NULL;
2402 }
2403
2404 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2405 const char *controller)
2406 {
2407 return cgfsng_get_cgroup_do(ops, controller, false);
2408 }
2409
2410 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2411 const char *controller)
2412 {
2413 return cgfsng_get_cgroup_do(ops, controller, true);
2414 }
2415
2416 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2417 * which must be freed by the caller.
2418 */
2419 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2420 const char *inpath,
2421 const char *filename)
2422 {
2423 return must_make_path(h->mountpoint, inpath, filename, NULL);
2424 }
2425
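/*
 * Attach @pid to the container's unified cgroup. Because of cgroup2's
 * "no internal processes" rule, a cgroup that has controllers enabled in
 * cgroup.subtree_control may not contain processes itself, so writing to
 * cgroup.procs can fail with EBUSY. We therefore first try the ".lxc"
 * leaf created at container start, then the cgroup itself, and on EBUSY
 * keep probing ".lxc-1", ".lxc-2", ... until an attach succeeds or we
 * give up after 1000 attempts.
 */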
2426 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2427 {
2428 int idx = 1;
2429 int ret;
2430 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2431 ssize_t pidstr_len;
2432
2433 /* Create leaf cgroup. */
2434 ret = mkdirat(unified_fd, ".lxc", 0755);
2435 if (ret < 0 && errno != EEXIST)
2436 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2437
2438 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2439 if (pidstr_len < 0)
2440 return pidstr_len;
2441
2442 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2443 if (ret < 0)
2444 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2445 if (ret == 0)
2446 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2447
2448 /* this is a non-leaf node */
2449 if (errno != EBUSY)
2450 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2451
2452 do {
2453 bool rm = false;
2454 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2455 char *slash = attach_cgroup;
2456
2457 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2458 if (ret < 0)
2459 return ret;
2460
2461 /*
2462 * This shouldn't really happen but the compiler might complain
2463 * that a short write would cause a buffer overrun. So be on
2464 * the safe side.
2465 */
2466 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2467 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2468
2469 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2470 *slash = '\0';
2471
2472 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2473 if (ret < 0 && errno != EEXIST)
2474 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2475 if (ret == 0)
2476 rm = true;
2477
2478 *slash = '/';
2479
2480 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2481 if (ret == 0)
2482 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2483
2484 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2485 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2486
2487 /* this is a non-leaf node */
2488 if (errno != EBUSY)
2489 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2490
2491 idx++;
2492 } while (idx < 1000);
2493
2494 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2495 }
2496
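/*
 * Child half of the user-namespace attach dance (see the wrapper functions
 * further below): this runs inside the container's user namespace, opens
 * write-only fds for ".lxc/cgroup.procs" and "cgroup.procs", and ships both
 * back to the parent over the socketpair so the parent can perform the
 * actual move.
 */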
2497 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2498 int unified_fd, int *sk_fd)
2499 {
2500 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2501 int target_fds[2];
2502 ssize_t ret;
2503
2504 /* Create leaf cgroup. */
2505 ret = mkdirat(unified_fd, ".lxc", 0755);
2506 if (ret < 0 && errno != EEXIST)
2507 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2508
2509 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2510 if (target_fd0 < 0)
2511 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2512 target_fds[0] = target_fd0;
2513
2514 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2515 if (target_fd1 < 0)
2516 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2517 target_fds[1] = target_fd1;
2518
2519 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2520 if (ret <= 0)
2521 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2522 target_fd0, target_fd1);
2523
2524 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2525 }
2526
2527 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2528 int *sk_fd, pid_t pid)
2529 {
2530 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2531 int target_fds[2];
2532 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2533 size_t pidstr_len;
2534 ssize_t ret;
2535
2536 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2537 if (ret <= 0)
2538 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2539 target_fd0 = target_fds[0];
2540 target_fd1 = target_fds[1];
2541
2542 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2543
2544 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2545 if (ret > 0 && ret == pidstr_len)
2546 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2547
2548 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2549 if (ret > 0 && ret == pidstr_len)
2550 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2551
2552 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2553 target_fd0, target_fd1);
2554 }
2555
2556 struct userns_exec_unified_attach_data {
2557 const struct lxc_conf *conf;
2558 int unified_fd;
2559 int sk_pair[2];
2560 pid_t pid;
2561 };
2562
2563 static int cgroup_unified_attach_child_wrapper(void *data)
2564 {
2565 struct userns_exec_unified_attach_data *args = data;
2566
2567 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2568 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2569 return ret_errno(EINVAL);
2570
2571 close_prot_errno_disarm(args->sk_pair[0]);
2572 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2573 &args->sk_pair[1]);
2574 }
2575
2576 static int cgroup_unified_attach_parent_wrapper(void *data)
2577 {
2578 struct userns_exec_unified_attach_data *args = data;
2579
2580 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2581 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2582 return ret_errno(EINVAL);
2583
2584 close_prot_errno_disarm(args->sk_pair[1]);
2585 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2586 args->pid);
2587 }
2588
2589 /* Technically, we're always at a delegation boundary here (this is especially
2590 * true when cgroup namespaces are available). The reasoning is that in order
2591 * for us to have been able to start a container in the first place the root
2592 * cgroup must have been a leaf node. Now, either the container's init system
2593 * has populated the cgroup and kept it as a leaf node or it has created
2594 * subtrees. In the former case we simply attach to the leaf node we created
2595 * when we started the container; in the latter case we create our own
2596 * cgroup for the attaching process.
2597 */
2598 static int __cg_unified_attach(const struct hierarchy *h,
2599 const struct lxc_conf *conf, const char *name,
2600 const char *lxcpath, pid_t pid,
2601 const char *controller)
2602 {
2603 __do_close int unified_fd = -EBADF;
2604 __do_free char *path = NULL, *cgroup = NULL;
2605 int ret;
2606
2607 if (!conf || !name || !lxcpath || pid <= 0)
2608 return ret_errno(EINVAL);
2609
2610 ret = cgroup_attach(conf, name, lxcpath, pid);
2611 if (ret == 0)
2612 return log_trace(0, "Attached to unified cgroup via command handler");
2613 if (ret != -ENOCGROUP2)
2614 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2615
2616 /* Fall back to retrieving the path for the unified cgroup. */
2617 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2618 /* not running */
2619 if (!cgroup)
2620 return 0;
2621
2622 path = must_make_path(h->mountpoint, cgroup, NULL);
2623
2624 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2625 if (unified_fd < 0)
2626 return ret_errno(EBADF);
2627
2628 if (!lxc_list_empty(&conf->id_map)) {
2629 struct userns_exec_unified_attach_data args = {
2630 .conf = conf,
2631 .unified_fd = unified_fd,
2632 .pid = pid,
2633 };
2634
2635 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2636 if (ret < 0)
2637 return -errno;
2638
2639 ret = userns_exec_minimal(conf,
2640 cgroup_unified_attach_parent_wrapper,
2641 &args,
2642 cgroup_unified_attach_child_wrapper,
2643 &args);
2644 } else {
2645 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2646 }
2647
2648 return ret;
2649 }
2650
2651 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2652 const struct lxc_conf *conf,
2653 const char *name, const char *lxcpath,
2654 pid_t pid)
2655 {
2656 int len, ret;
2657 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2658
2659 if (!ops)
2660 return ret_set_errno(false, ENOENT);
2661
2662 if (!ops->hierarchies)
2663 return true;
2664
2665 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2666 if (len < 0)
2667 return false;
2668
2669 for (int i = 0; ops->hierarchies[i]; i++) {
2670 __do_free char *fullpath = NULL, *path = NULL;
2671 struct hierarchy *h = ops->hierarchies[i];
2672
2673 if (h->version == CGROUP2_SUPER_MAGIC) {
2674 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2675 h->controllers[0]);
2676 if (ret < 0)
2677 return false;
2678
2679 continue;
2680 }
2681
2682 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2683 /* not running */
2684 if (!path)
2685 return false;
2686
2687 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2688 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2689 if (ret < 0)
2690 return log_error_errno(false, errno, "Failed to attach %d to %s",
2691 (int)pid, fullpath);
2692 }
2693
2694 return true;
2695 }
2696
2697 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2698 * don't have a cgroup_data set up, so we ask the running container through the
2699 * commands API for the cgroup path.
2700 */
2701 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2702 char *value, size_t len, const char *name,
2703 const char *lxcpath)
2704 {
2705 __do_free char *path = NULL;
2706 __do_free char *controller = NULL;
2707 char *p;
2708 struct hierarchy *h;
2709 int ret = -1;
2710
2711 if (!ops)
2712 return ret_set_errno(-1, ENOENT);
2713
2714 controller = must_copy_string(filename);
2715 p = strchr(controller, '.');
2716 if (p)
2717 *p = '\0';
2718
2719 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2720 /* not running */
2721 if (!path)
2722 return -1;
2723
2724 h = get_hierarchy(ops, controller);
2725 if (h) {
2726 __do_free char *fullpath = NULL;
2727
2728 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2729 ret = lxc_read_from_file(fullpath, value, len);
2730 }
2731
2732 return ret;
2733 }
2734
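/*
 * Parse the access part of a devices cgroup rule, i.e. up to three
 * characters out of "rwm" (read, write, mknod), e.g. the "rwm" in
 * "c 1:3 rwm". Any other character is rejected with EINVAL.
 */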
2735 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2736 {
2737 for (int count = 0; count < 3; count++, val++) {
2738 switch (*val) {
2739 case 'r':
2740 device->access[count] = *val;
2741 break;
2742 case 'w':
2743 device->access[count] = *val;
2744 break;
2745 case 'm':
2746 device->access[count] = *val;
2747 break;
2748 case '\n':
2749 case '\0':
2750 count = 3;
2751 break;
2752 default:
2753 return ret_errno(EINVAL);
2754 }
2755 }
2756
2757 return 0;
2758 }
2759
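/*
 * Parse a full devices cgroup rule of the form
 *
 *     <type> <major>:<minor> <access>
 *
 * where <type> is 'a', 'b', or 'c', major and minor are numbers or '*'
 * (meaning "any", stored as -1), and <access> is handled by
 * device_cgroup_parse_access() above. The bare value "a" is the global
 * allow-/deny-all rule. Whether this is an allow or a deny rule is derived
 * from @key ("devices.allow" vs. "devices.deny").
 */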
2760 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2761 const char *val)
2762 {
2763 int count, ret;
2764 char temp[50];
2765
2766 if (strequal("devices.allow", key))
2767 device->allow = 1; /* allow the device */
2768 else
2769 device->allow = 0; /* deny the device */
2770
2771 if (strequal(val, "a")) {
2772 /* global rule */
2773 device->type = 'a';
2774 device->major = -1;
2775 device->minor = -1;
2776 return 0;
2777 }
2778
2779 switch (*val) {
2780 case 'a':
2781 __fallthrough;
2782 case 'b':
2783 __fallthrough;
2784 case 'c':
2785 device->type = *val;
2786 break;
2787 default:
2788 return -1;
2789 }
2790
2791 val++;
2792 if (!isspace(*val))
2793 return -1;
2794 val++;
2795 if (*val == '*') {
2796 device->major = -1;
2797 val++;
2798 } else if (isdigit(*val)) {
2799 memset(temp, 0, sizeof(temp));
2800 for (count = 0; count < sizeof(temp) - 1; count++) {
2801 temp[count] = *val;
2802 val++;
2803 if (!isdigit(*val))
2804 break;
2805 }
2806 ret = lxc_safe_int(temp, &device->major);
2807 if (ret)
2808 return -1;
2809 } else {
2810 return -1;
2811 }
2812 if (*val != ':')
2813 return -1;
2814 val++;
2815
2816 /* read minor */
2817 if (*val == '*') {
2818 device->minor = -1;
2819 val++;
2820 } else if (isdigit(*val)) {
2821 memset(temp, 0, sizeof(temp));
2822 for (count = 0; count < sizeof(temp) - 1; count++) {
2823 temp[count] = *val;
2824 val++;
2825 if (!isdigit(*val))
2826 break;
2827 }
2828 ret = lxc_safe_int(temp, &device->minor);
2829 if (ret)
2830 return -1;
2831 } else {
2832 return -1;
2833 }
2834 if (!isspace(*val))
2835 return -1;
2836
2837 return device_cgroup_parse_access(device, ++val);
2838 }
2839
2840 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2841 * don't have a cgroup_data set up, so we ask the running container through the
2842 * commands API for the cgroup path.
2843 */
2844 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2845 const char *key, const char *value,
2846 const char *name, const char *lxcpath)
2847 {
2848 __do_free char *path = NULL;
2849 __do_free char *controller = NULL;
2850 char *p;
2851 struct hierarchy *h;
2852 int ret = -1;
2853
2854 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2855 is_empty_string(name) || is_empty_string(lxcpath))
2856 return ret_errno(EINVAL);
2857
2858 controller = must_copy_string(key);
2859 p = strchr(controller, '.');
2860 if (p)
2861 *p = '\0';
2862
2863 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2864 struct device_item device = {};
2865
2866 ret = device_cgroup_rule_parse(&device, key, value);
2867 if (ret < 0)
2868 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2869 key, value);
2870
2871 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2872 if (ret < 0)
2873 return -1;
2874
2875 return 0;
2876 }
2877
2878 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2879 /* not running */
2880 if (!path)
2881 return -1;
2882
2883 h = get_hierarchy(ops, controller);
2884 if (h) {
2885 __do_free char *fullpath = NULL;
2886
2887 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2888 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2889 }
2890
2891 return ret;
2892 }
2893
2894 /* take devices cgroup line
2895 * /dev/foo rwx
2896 * and convert it to a valid
2897 * type major:minor mode
2898 * line. Return <0 on error. The dest buffer passed to convert_devpath()
2899 * below is preallocated and long enough to hold the output.
2900 */
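/*
 * For example, "/dev/null rwm" stats /dev/null (a character device with
 * major 1, minor 3) and yields the rule "c 1:3 rwm" once convert_devpath()
 * has formatted it.
 */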
2901 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2902 const char *devpath)
2903 {
2904 __do_free char *path = NULL;
2905 char *mode = NULL;
2906 int n_parts, ret;
2907 char *p;
2908 struct stat sb;
2909
2910 path = must_copy_string(devpath);
2911
2912 /*
2913 * Read path followed by mode. Ignore any trailing text.
2914 * A ' # comment' would be legal. Technically other text is not
2915 * legal, we could check for that if we cared to.
2916 */
2917 for (n_parts = 1, p = path; *p; p++) {
2918 if (*p != ' ')
2919 continue;
2920 *p = '\0';
2921
2922 if (n_parts != 1)
2923 break;
2924 p++;
2925 n_parts++;
2926
2927 while (*p == ' ')
2928 p++;
2929
2930 mode = p;
2931
2932 if (*p == '\0')
2933 return ret_set_errno(-1, EINVAL);
2934 }
2935
2936 if (!mode)
2937 return ret_errno(EINVAL);
2938
2939 if (device_cgroup_parse_access(device, mode) < 0)
2940 return -1;
2941
2942 ret = stat(path, &sb);
2943 if (ret < 0)
2944 return ret_set_errno(-1, errno);
2945
2946 mode_t m = sb.st_mode & S_IFMT;
2947 switch (m) {
2948 case S_IFBLK:
2949 device->type = 'b';
2950 break;
2951 case S_IFCHR:
2952 device->type = 'c';
2953 break;
2954 default:
2955 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2956 }
2957
2958 device->major = MAJOR(sb.st_rdev);
2959 device->minor = MINOR(sb.st_rdev);
2960 device->allow = 1;
2961
2962 return 0;
2963 }
2964
2965 static int convert_devpath(const char *invalue, char *dest)
2966 {
2967 struct device_item device = {};
2968 int ret;
2969
2970 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2971 if (ret < 0)
2972 return -1;
2973
2974 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2975 device.minor, device.access);
2976 if (ret < 0)
2977 return log_error_errno(ret, -ret,
2978 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2979 device.type, device.major, device.minor,
2980 device.access);
2981
2982 return 0;
2983 }
2984
2985 /* Called from setup_limits - here we have the container's cgroup_data because
2986 * we created the cgroups.
2987 */
2988 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2989 const char *value, bool is_cpuset)
2990 {
2991 __do_free char *controller = NULL;
2992 char *p;
2993 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2994 char converted_value[50];
2995 struct hierarchy *h;
2996
2997 controller = must_copy_string(filename);
2998 p = strchr(controller, '.');
2999 if (p)
3000 *p = '\0';
3001
3002 if (strequal("devices.allow", filename) && value[0] == '/') {
3003 int ret;
3004
3005 ret = convert_devpath(value, converted_value);
3006 if (ret < 0)
3007 return ret;
3008 value = converted_value;
3009 }
3010
3011 h = get_hierarchy(ops, controller);
3012 if (!h)
3013 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
3014
3015 if (is_cpuset) {
3016 int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
3017 if (ret)
3018 return ret;
3019 }
3020 return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
3021 }
3022
3023 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
3024 struct lxc_conf *conf,
3025 bool do_devices)
3026 {
3027 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
3028 struct lxc_list *cgroup_settings = &conf->cgroup;
3029 struct lxc_list *iterator, *next;
3030 struct lxc_cgroup *cg;
3031 bool ret = false;
3032
3033 if (!ops)
3034 return ret_set_errno(false, ENOENT);
3035
3036 if (!conf)
3037 return ret_set_errno(false, EINVAL);
3038
3039 cgroup_settings = &conf->cgroup;
3040 if (lxc_list_empty(cgroup_settings))
3041 return true;
3042
3043 if (!ops->hierarchies)
3044 return ret_set_errno(false, EINVAL);
3045
3046 if (pure_unified_layout(ops))
3047 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
3048
3049 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
3050 if (!sorted_cgroup_settings)
3051 return false;
3052
3053 lxc_list_for_each(iterator, sorted_cgroup_settings) {
3054 cg = iterator->elem;
3055
3056 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
3057 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
3058 if (do_devices && (errno == EACCES || errno == EPERM)) {
3059 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3060 continue;
3061 }
3062 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3063 goto out;
3064 }
3065 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
3066 }
3067 }
3068
3069 ret = true;
3070 INFO("Limits for the legacy cgroup hierarchies have been setup");
3071 out:
3072 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
3073 lxc_list_del(iterator);
3074 free(iterator);
3075 }
3076
3077 return ret;
3078 }
3079
3080 /*
3081 * Some of the parsing logic comes from the original cgroup device v1
3082 * implementation in the kernel.
3083 */
3084 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
3085 struct lxc_conf *conf, const char *key,
3086 const char *val)
3087 {
3088 struct device_item device_item = {};
3089 int ret;
3090
3091 if (strequal("devices.allow", key) && *val == '/')
3092 ret = device_cgroup_rule_parse_devpath(&device_item, val);
3093 else
3094 ret = device_cgroup_rule_parse(&device_item, key, val);
3095 if (ret < 0)
3096 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
3097
3098 /*
3099 * Note that bpf_list_add_device() indicates whether or not it had to
3100 * alter the current device list by returning 1 or 0; both indicate
3101 * success. A negative return value indicates an error.
3102 */
3103 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
3104 if (ret < 0)
3105 return -1;
3106
3107 return 0;
3108 }
3109
3110 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
3111 struct lxc_handler *handler)
3112 {
3113 struct lxc_list *cgroup_settings, *iterator;
3114 struct hierarchy *h;
3115 struct lxc_conf *conf;
3116
3117 if (!ops)
3118 return ret_set_errno(false, ENOENT);
3119
3120 if (!ops->hierarchies)
3121 return true;
3122
3123 if (!ops->container_cgroup)
3124 return ret_set_errno(false, EINVAL);
3125
3126 if (!handler || !handler->conf)
3127 return ret_set_errno(false, EINVAL);
3128 conf = handler->conf;
3129
3130 cgroup_settings = &conf->cgroup2;
3131 if (lxc_list_empty(cgroup_settings))
3132 return true;
3133
3134 if (!pure_unified_layout(ops))
3135 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
3136
3137 if (!ops->unified)
3138 return false;
3139 h = ops->unified;
3140
3141 lxc_list_for_each (iterator, cgroup_settings) {
3142 struct lxc_cgroup *cg = iterator->elem;
3143 int ret;
3144
3145 if (strnequal("devices", cg->subsystem, 7))
3146 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
3147 else
3148 ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
3149 if (ret < 0)
3150 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3151
3152 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
3153 }
3154
3155 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
3156 }
3157
3158 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
3159 {
3160 struct lxc_conf *conf;
3161 struct hierarchy *unified;
3162
3163 if (!ops)
3164 return ret_set_errno(false, ENOENT);
3165
3166 if (!ops->hierarchies)
3167 return true;
3168
3169 if (!ops->container_cgroup)
3170 return ret_set_errno(false, EEXIST);
3171
3172 if (!handler || !handler->conf)
3173 return ret_set_errno(false, EINVAL);
3174 conf = handler->conf;
3175
3176 unified = ops->unified;
3177 if (!unified || !unified->bpf_device_controller ||
3178 !unified->container_full_path ||
3179 lxc_list_empty(&(conf->bpf_devices).device_item))
3180 return true;
3181
3182 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
3183 }
3184
3185 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
3186 {
3187 __do_close int dfd_final = -EBADF;
3188 __do_free char *add_controllers = NULL, *copy = NULL;
3189 size_t full_len = 0;
3190 struct hierarchy *unified;
3191 int dfd_cur, ret;
3192 char *cur;
3193 char **it;
3194
3195 if (!ops->hierarchies || !pure_unified_layout(ops))
3196 return true;
3197
3198 unified = ops->unified;
3199 if (!unified->controllers[0])
3200 return true;
3201
3202 /* For now we simply enable all controllers that we have detected by
3203 * creating a string like "+memory +pids +cpu +io".
3204 * TODO: In the near future we might want to support "-<controller>"
3205 * etc. but whether supporting semantics like this makes sense will
3206 * need some thinking.
3207 */
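/*
 * The string built below is what gets written to each cgroup.subtree_control
 * file along the path, equivalent to something like:
 *
 *     echo "+cpu +io +memory +pids" > .../cgroup.subtree_control
 *
 * (the exact controller set depends on what was detected above).
 */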
3208 for (it = unified->controllers; it && *it; it++) {
3209 full_len += strlen(*it) + 2;
3210 add_controllers = must_realloc(add_controllers, full_len + 1);
3211
3212 if (unified->controllers[0] == *it)
3213 add_controllers[0] = '\0';
3214
3215 (void)strlcat(add_controllers, "+", full_len + 1);
3216 (void)strlcat(add_controllers, *it, full_len + 1);
3217
3218 if (*(it + 1))
3219 (void)strlcat(add_controllers, " ", full_len + 1);
3220 }
3221
3222 copy = strdup(cgroup);
3223 if (!copy)
3224 return false;
3225
3226 /*
3227 * Placing the write to cgroup.subtree_control before the open() is
3228 * intentional because of the cgroup2 delegation model. It enforces
3229 * that leaf cgroups don't have any controllers enabled for delegation.
3230 */
3231 dfd_cur = unified->dfd_base;
3232 lxc_iterate_parts(cur, copy, "/") {
3233 /*
3234 * Even though we vetted the paths when we parsed the config
3235 * we're paranoid here and check that the path is neither
3236 * absolute nor walks upwards.
3237 */
3238 if (abspath(cur))
3239 return syserrno_set(-EINVAL, "No absolute paths allowed");
3240
3241 if (strnequal(cur, "..", STRLITERALLEN("..")))
3242 return syserrno_set(-EINVAL, "No upward walking paths allowed");
3243
3244 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
3245 if (ret < 0)
3246 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3247
3248 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3249
3250 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
3251 if (dfd_final < 0)
3252 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
3253 if (dfd_cur != unified->dfd_base)
3254 close(dfd_cur);
3255 /*
3256 * Leave dfd_final pointing to the last fd we opened so
3257 * it will be automatically zapped if we return early.
3258 */
3259 dfd_cur = dfd_final;
3260 }
3261
3262 return true;
3263 }
3264
3265 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
3266 {
3267 if (!ops)
3268 return ret_set_errno(false, ENOENT);
3269
3270 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
3271 }
3272
3273 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
3274 {
3275 if (!ops)
3276 return ret_set_errno(false, ENOENT);
3277
3278 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
3279 }
3280
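/*
 * Build the list of cgroup files whose ownership must be handed to the
 * container. The kernel publishes the set of safely delegatable files in
 * /sys/kernel/cgroup/delegate (one name per line, e.g. "cgroup.procs",
 * "cgroup.threads", "cgroup.subtree_control"); if that file cannot be read
 * we fall back to a conservative hardcoded set. "cgroup.procs" is excluded
 * from the list here because it is chowned unconditionally anyway.
 */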
3281 static void cg_unified_delegate(char ***delegate)
3282 {
3283 __do_free char *buf = NULL;
3284 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
3285 char *token;
3286 int idx;
3287
3288 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3289 if (!buf) {
3290 for (char **p = standard; p && *p; p++) {
3291 idx = append_null_to_list((void ***)delegate);
3292 (*delegate)[idx] = must_copy_string(*p);
3293 }
3294 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
3295 return;
3296 }
3297
3298 lxc_iterate_parts(token, buf, " \t\n") {
3299 /*
3300 * We always need to chown this for both cgroup and
3301 * cgroup2.
3302 */
3303 if (strequal(token, "cgroup.procs"))
3304 continue;
3305
3306 idx = append_null_to_list((void ***)delegate);
3307 (*delegate)[idx] = must_copy_string(token);
3308 }
3309 }
3310
3311 /* At startup, parse_hierarchies finds all the info we need about cgroup
3312 * mountpoints and current cgroups, and stores it in @d.
3313 */
3314 static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
3315 {
3316 __do_free char *basecginfo = NULL, *line = NULL;
3317 __do_free_string_list char **klist = NULL, **nlist = NULL;
3318 __do_fclose FILE *f = NULL;
3319 int ret;
3320 size_t len = 0;
3321
3322 /* Root spawned containers escape the current cgroup, so use init's
3323 * cgroups as our base in that case.
3324 */
3325 if (!relative && (geteuid() == 0))
3326 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3327 else
3328 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3329 if (!basecginfo)
3330 return ret_set_errno(-1, ENOMEM);
3331
3332 ret = get_existing_subsystems(&klist, &nlist);
3333 if (ret < 0)
3334 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
3335
3336 f = fopen("/proc/self/mountinfo", "re");
3337 if (!f)
3338 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
3339
3340 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
3341
3342 while (getline(&line, &len, f) != -1) {
3343 __do_free char *base_cgroup = NULL, *mountpoint = NULL;
3344 __do_free_string_list char **controller_list = NULL;
3345 int type;
3346 bool writeable;
3347
3348 type = get_cgroup_version(line);
3349 if (type == 0)
3350 continue;
3351
3352 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
3353 continue;
3354
3355 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
3356 if (type == CGROUP2_SUPER_MAGIC)
3357 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3358 else if (type == CGROUP_SUPER_MAGIC)
3359 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3360 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
3361 if (type == CGROUP_SUPER_MAGIC)
3362 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3363 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3364 if (type == CGROUP2_SUPER_MAGIC)
3365 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3366 }
3367
3368 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3369 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3370 continue;
3371
3372 if (type == CGROUP_SUPER_MAGIC)
3373 if (controller_list_is_dup(ops->hierarchies, controller_list)) {
3374 TRACE("Skipping duplicating controller");
3375 continue;
3376 }
3377
3378 mountpoint = cg_hybrid_get_mountpoint(line);
3379 if (!mountpoint) {
3380 WARN("Failed parsing mountpoint from \"%s\"", line);
3381 continue;
3382 }
3383
3384 if (type == CGROUP_SUPER_MAGIC)
3385 base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3386 else
3387 base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
3388 if (!base_cgroup) {
3389 WARN("Failed to find current cgroup");
3390 continue;
3391 }
3392
3393 if (type == CGROUP2_SUPER_MAGIC)
3394 writeable = test_writeable_v2(mountpoint, base_cgroup);
3395 else
3396 writeable = test_writeable_v1(mountpoint, base_cgroup);
3397 if (!writeable) {
3398 TRACE("The %s group is not writeable", base_cgroup);
3399 continue;
3400 }
3401
3402 if (type == CGROUP2_SUPER_MAGIC)
3403 ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
3404 else
3405 ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
3406 if (ret)
3407 return syserrno(ret, "Failed to add cgroup hierarchy");
3408 if (ops->unified && unprivileged)
3409 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3410 }
3411
3412 /* Verify that all controllers listed in lxc.cgroup.use and all crucial
3413 * controllers are accounted for.
3414 */
3415 if (!all_controllers_found(ops))
3416 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
3417
3418 return 0;
3419 }
3420
3421 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
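/*
 * On a cgroup2 (or hybrid) system /proc/<pid>/cgroup contains a line of
 * the form "0::/some/cgroup/path", e.g.:
 *
 *     0::/user.slice/user-1000.slice/session-2.scope
 *
 * We locate that entry, strip the "0::" prefix plus any leading "/", and,
 * for non-relative setups, prune a trailing init.scope.
 */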
3422 static char *cg_unified_get_current_cgroup(bool relative)
3423 {
3424 __do_free char *basecginfo = NULL, *copy = NULL;
3425 char *base_cgroup;
3426
3427 if (!relative && (geteuid() == 0))
3428 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3429 else
3430 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3431 if (!basecginfo)
3432 return NULL;
3433
3434 base_cgroup = strstr(basecginfo, "0::/");
3435 if (!base_cgroup)
3436 return NULL;
3437
3438 base_cgroup = base_cgroup + 3;
3439 copy = copy_to_eol(base_cgroup);
3440 if (!copy)
3441 return NULL;
3442 trim(copy);
3443
3444 if (!relative) {
3445 base_cgroup = prune_init_scope(copy);
3446 if (!base_cgroup)
3447 return NULL;
3448 } else {
3449 base_cgroup = copy;
3450 }
3451
3452 if (abspath(base_cgroup))
3453 base_cgroup = deabs(base_cgroup);
3454
3455 /* We're allowing base_cgroup to be "". */
3456 return strdup(base_cgroup);
3457 }
3458
3459 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3460 bool unprivileged)
3461 {
3462 __do_free char *base_cgroup = NULL;
3463 int ret;
3464
3465 base_cgroup = cg_unified_get_current_cgroup(relative);
3466 if (!base_cgroup)
3467 return ret_errno(EINVAL);
3468
3469 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3470 * we should verify that here. The reason I'm not doing it right away is
3471 * that I'm not convinced that lxc.cgroup.use will be the future since it is
3472 * a global property. I'd much rather have an option that lets you request
3473 * controllers per container.
3474 */
3475
3476 ret = add_hierarchy(ops, NULL,
3477 must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
3478 move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
3479 if (ret)
3480 return syserrno(ret, "Failed to add unified cgroup hierarchy");
3481
3482 if (unprivileged)
3483 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3484
3485 if (bpf_devices_cgroup_supported())
3486 ops->unified->bpf_device_controller = 1;
3487
3488 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3489 return CGROUP2_SUPER_MAGIC;
3490 }
3491
3492 static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
3493 {
3494 __do_close int dfd = -EBADF;
3495 bool relative = conf->cgroup_meta.relative;
3496 int ret;
3497 const char *tmp;
3498
3499 if (ops->dfd_mnt_cgroupfs_host >= 0)
3500 return ret_errno(EINVAL);
3501
3502 /*
3503 * I don't see the need for allowing symlinks here. If users want to
3504 * have their hierarchy available in different locations I strongly
3505 * suggest bind-mounts.
3506 */
3507 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3508 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3509 if (dfd < 0)
3510 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3511
3512 tmp = lxc_global_config_value("lxc.cgroup.use");
3513 if (tmp) {
3514 __do_free char *pin = NULL;
3515 char *chop, *cur;
3516
3517 pin = must_copy_string(tmp);
3518 chop = pin;
3519
3520 lxc_iterate_parts(cur, chop, ",")
3521 must_append_string(&ops->cgroup_use, cur);
3522 }
3523
3524 /*
3525 * Keep dfd referenced by the cleanup function and actually move the fd
3526 * once we know the initialization succeeded. So if we fail we clean up
3527 * the dfd.
3528 */
3529 ops->dfd_mnt_cgroupfs_host = dfd;
3530
3531 if (unified_cgroup_fd(dfd))
3532 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
3533 else
3534 ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
3535 if (ret < 0)
3536 return syserrno(ret, "Failed to initialize cgroups");
3537
3538 /* Transfer ownership to cgroup_ops. */
3539 move_fd(dfd);
3540 return 0;
3541 }
3542
3543 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3544 {
3545 const char *cgroup_pattern;
3546
3547 if (!ops)
3548 return ret_set_errno(-1, ENOENT);
3549
3550 /* copy system-wide cgroup information */
3551 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3552 if (cgroup_pattern && !strequal(cgroup_pattern, ""))
3553 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3554
3555 return 0;
3556 }
3557
3558 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
3559 {
3560 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3561
3562 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3563 if (!cgfsng_ops)
3564 return ret_set_errno(NULL, ENOMEM);
3565
3566 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3567 cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;
3568
3569 if (__cgroup_init(cgfsng_ops, conf))
3570 return NULL;
3571
3572 cgfsng_ops->data_init = cgfsng_data_init;
3573 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3574 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3575 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3576 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3577 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3578 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3579 cgfsng_ops->payload_create = cgfsng_payload_create;
3580 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3581 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3582 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3583 cgfsng_ops->get = cgfsng_get;
3584 cgfsng_ops->set = cgfsng_set;
3585 cgfsng_ops->freeze = cgfsng_freeze;
3586 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3587 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3588 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3589 cgfsng_ops->driver = "cgfsng";
3590 cgfsng_ops->version = "1.0.0";
3591 cgfsng_ops->attach = cgfsng_attach;
3592 cgfsng_ops->chown = cgfsng_chown;
3593 cgfsng_ops->mount = cgfsng_mount;
3594 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3595 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
3596
3597 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3598 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3599 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3600
3601 return move_ptr(cgfsng_ops);
3602 }
3603
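/*
 * Public attach helper, called from __cg_unified_attach() above as the
 * preferred attach path: it fetches the container's cgroup2 fd over the
 * command socket and moves @pid into a leaf cgroup, re-executing the move
 * inside the container's user namespace when an id mapping is in use.
 * Returns -ENOCGROUP2 when no unified cgroup fd is available so callers
 * can fall back to path-based attaching.
 */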
3604 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3605 const char *lxcpath, pid_t pid)
3606 {
3607 __do_close int unified_fd = -EBADF;
3608 int ret;
3609
3610 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3611 return ret_errno(EINVAL);
3612
3613 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3614 if (unified_fd < 0)
3615 return ret_errno(ENOCGROUP2);
3616
3617 if (!lxc_list_empty(&conf->id_map)) {
3618 struct userns_exec_unified_attach_data args = {
3619 .conf = conf,
3620 .unified_fd = unified_fd,
3621 .pid = pid,
3622 };
3623
3624 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3625 if (ret < 0)
3626 return -errno;
3627
3628 ret = userns_exec_minimal(conf,
3629 cgroup_unified_attach_parent_wrapper,
3630 &args,
3631 cgroup_unified_attach_child_wrapper,
3632 &args);
3633 } else {
3634 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3635 }
3636
3637 return ret;
3638 }
3639
3640 /* Connects to the command socket and therefore isn't callable from a command handler. */
3641 int cgroup_get(const char *name, const char *lxcpath,
3642 const char *filename, char *buf, size_t len)
3643 {
3644 __do_close int unified_fd = -EBADF;
3645 ssize_t ret;
3646
3647 if (is_empty_string(filename) || is_empty_string(name) ||
3648 is_empty_string(lxcpath))
3649 return ret_errno(EINVAL);
3650
3651 if ((buf && !len) || (len && !buf))
3652 return ret_errno(EINVAL);
3653
3654 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3655 if (unified_fd < 0)
3656 return ret_errno(ENOCGROUP2);
3657
3658 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3659 if (ret < 0)
3660 SYSERROR("Failed to read cgroup value");
3661
3662 return ret;
3663 }
3664
3665 /* Connects to the command socket and therefore isn't callable from a command handler. */
3666 int cgroup_set(const char *name, const char *lxcpath,
3667 const char *filename, const char *value)
3668 {
3669 __do_close int unified_fd = -EBADF;
3670 ssize_t ret;
3671
3672 if (is_empty_string(filename) || is_empty_string(value) ||
3673 is_empty_string(name) || is_empty_string(lxcpath))
3674 return ret_errno(EINVAL);
3675
3676 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3677 if (unified_fd < 0)
3678 return ret_errno(ENOCGROUP2);
3679
3680 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3681 struct device_item device = {};
3682
3683 ret = device_cgroup_rule_parse(&device, filename, value);
3684 if (ret < 0)
3685 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3686
3687 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3688 } else {
3689 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3690 }
3691
3692 return ret;
3693 }
3694
3695 static int do_cgroup_freeze(int unified_fd,
3696 const char *state_string,
3697 int state_num,
3698 int timeout,
3699 const char *epoll_error,
3700 const char *wait_error)
3701 {
3702 __do_close int events_fd = -EBADF;
3703 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3704 int ret;
3705 struct lxc_epoll_descr descr = {};
3706
3707 if (timeout != 0) {
3708 ret = lxc_mainloop_open(&descr);
3709 if (ret)
3710 return log_error_errno(-1, errno, "%s", epoll_error);
3711
3712 /* automatically cleaned up now */
3713 descr_ptr = &descr;
3714
3715 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3716 if (events_fd < 0)
3717 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3718
3719 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3720 if (ret < 0)
3721 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3722 }
3723
3724 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3725 if (ret < 0)
3726 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3727
3728 if (timeout != 0) {
3729 ret = lxc_mainloop(&descr, timeout);
3730 if (ret)
3731 return log_error_errno(-1, errno, "%s", wait_error);
3732 }
3733
3734 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3735 }
3736
3737 static inline int __cgroup_freeze(int unified_fd, int timeout)
3738 {
3739 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3740 "Failed to create epoll instance to wait for container freeze",
3741 "Failed to wait for container to be frozen");
3742 }
3743
3744 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3745 {
3746 __do_close int unified_fd = -EBADF;
3747 int ret;
3748
3749 if (is_empty_string(name) || is_empty_string(lxcpath))
3750 return ret_errno(EINVAL);
3751
3752 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3753 if (unified_fd < 0)
3754 return ret_errno(ENOCGROUP2);
3755
3756 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3757 ret = __cgroup_freeze(unified_fd, timeout);
3758 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3759 return ret;
3760 }
3761
3762 int __cgroup_unfreeze(int unified_fd, int timeout)
3763 {
3764 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3765 "Failed to create epoll instance to wait for container freeze",
3766 "Failed to wait for container to be frozen");
3767 }
3768
3769 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3770 {
3771 __do_close int unified_fd = -EBADF;
3772 int ret;
3773
3774 if (is_empty_string(name) || is_empty_string(lxcpath))
3775 return ret_errno(EINVAL);
3776
3777 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3778 if (unified_fd < 0)
3779 return ret_errno(ENOCGROUP2);
3780
3781 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3782 ret = __cgroup_unfreeze(unified_fd, timeout);
3783 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3784 return ret;
3785 }