]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
bpf: let bpf_list_add_device() take the device list directly
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "log.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "mount_utils.h"
48 #include "storage/storage.h"
49 #include "string_utils.h"
50 #include "syscall_wrappers.h"
51 #include "utils.h"
52
53 #ifndef HAVE_STRLCPY
54 #include "include/strlcpy.h"
55 #endif
56
57 #ifndef HAVE_STRLCAT
58 #include "include/strlcat.h"
59 #endif
60
61 lxc_log_define(cgfsng, cgroup);
62
/* Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Do not fail. Return the index to the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated).
 */
static int append_null_to_list(void ***list)
{
	int used = 0;

	/* Count how many entries are currently in the array (if any). */
	if (*list)
		while ((*list)[used])
			used++;

	/* Reserve room for one more entry plus the NULL sentinel. */
	*list = must_realloc(*list, (used + 2) * sizeof(void **));
	(*list)[used + 1] = NULL;

	return used;
}
80
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++) {
		if (strequal(*it, entry))
			return true;
	}

	return false;
}
95
/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t entry_len = strlen(entry);
	char *result;

	/* "name=" + entry + terminating NUL byte. */
	result = must_realloc(NULL, STRLITERALLEN("name=") + entry_len + 1);

	memcpy(result, "name=", STRLITERALLEN("name="));
	memcpy(result + STRLITERALLEN("name="), entry, entry_len);
	result[STRLITERALLEN("name=") + entry_len] = '\0';

	return result;
}
113
/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 * we refuse to use because we're not sure which we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	char *copy;
	int slot;

	/* An entry that is both a kernel and a named subsystem is ambiguous. */
	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	slot = append_null_to_list((void ***)clist);

	/* Kernel subsystems and entries already carrying a "name=" prefix are
	 * copied verbatim; everything else is treated as a named subsystem
	 * and gets the "name=" prefix added.
	 */
	if (strnequal(entry, "name=", 5) || string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[slot] = copy;
}
148
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 *
 * A NULL @controller requests the empty unified hierarchy. On a pure
 * cgroup2 layout the legacy "devices" and "freezer" controller names map
 * onto the unified hierarchy, but only when the corresponding support flag
 * (bpf_device_controller / freezer_controller) is set.
 *
 * Returns NULL with errno set to ENOENT when no matching hierarchy exists.
 */
static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (ops->unified->bpf_device_controller)
					return ops->unified;

				/* No bpf device support: give up early. */
				break;
			} else if (strequal(controller, "freezer")) {
				if (ops->unified->freezer_controller)
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
195
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/* Set bit @bit in the bit array @bitarr.
 *
 * Use an unsigned constant for the shift: "1 << 31" shifts into the sign
 * bit of a signed int, which is undefined behaviour.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear bit @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return whether bit @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
215
/* Create cpumask from cpulist aka turn:
 *
 * 0,2-3
 *
 * into bit array
 *
 * 1 0 1 1
 *
 * The returned array has room for @nbits bits (0 .. nbits - 1); any cpu
 * number >= @nbits in @buf is rejected with EINVAL, as are inverted
 * ranges. Returns NULL with errno set on failure. The caller owns the
 * returned allocation. Note that @buf is mangled by lxc_iterate_parts().
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	/* Each comma-separated token is either a single cpu ("0") or an
	 * inclusive range ("2-3").
	 */
	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		/* Reject inverted ranges... */
		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		/* ...and bits beyond the capacity of the array. */
		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
258
259 /* Turn cpumask into simple, comma-separated cpulist. */
260 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
261 {
262 __do_free_string_list char **cpulist = NULL;
263 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
264 int ret;
265
266 for (size_t i = 0; i <= nbits; i++) {
267 if (!is_set(i, bitarr))
268 continue;
269
270 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
271 if (ret < 0)
272 return NULL;
273
274 ret = lxc_append_string(&cpulist, numstr);
275 if (ret < 0)
276 return ret_set_errno(NULL, ENOMEM);
277 }
278
279 if (!cpulist)
280 return ret_set_errno(NULL, ENOMEM);
281
282 return lxc_string_join(",", (const char **)cpulist, false);
283 }
284
/* Return the largest cpu number mentioned in a cpulist string such as
 * "0,3-5" or "0-7". The maximum is the number after whichever of the last
 * ',' or last '-' occurs later in the string, or the whole string when
 * neither separator is present.
 *
 * The old version compared the two candidate pointers even when one of
 * them was NULL; relational comparison of unrelated/NULL pointers is
 * undefined behaviour, and its final "!c1 && c2" branch was unreachable.
 * Only compare when both pointers are valid.
 *
 * Returns the cpu number on success, -1 on parse failure.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	if (after_comma && after_dash)
		last = (after_comma > after_dash) ? after_comma : after_dash;
	else if (after_comma)
		last = after_comma;
	else if (after_dash)
		last = after_dash;
	else
		last = cpulist;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
315
/* Return true if @h is the cgroup2 (unified) hierarchy. */
static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->version == CGROUP2_SUPER_MAGIC;
}
320
/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (char **it = l1; *it; it++) {
		if (string_in_list(l2, *it))
			return true;
	}

	return false;
}
335
336 /* For a null-terminated list of controllers @clist, return true if any of those
337 * controllers is already listed the null-terminated list of hierarchies @hlist.
338 * Realistically, if one is present, all must be present.
339 */
340 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
341 {
342 if (!hlist)
343 return false;
344
345 for (int i = 0; hlist[i]; i++)
346 if (controller_lists_intersect(hlist[i]->controllers, clist))
347 return true;
348
349 return false;
350 }
351
352 /* Return true if the controller @entry is found in the null-terminated list of
353 * hierarchies @hlist.
354 */
355 static bool controller_found(struct hierarchy **hlist, char *entry)
356 {
357 if (!hlist)
358 return false;
359
360 for (int i = 0; hlist[i]; i++)
361 if (string_in_list(hlist[i]->controllers, entry))
362 return true;
363
364 return false;
365 }
366
367 /* Return true if all of the controllers which we require have been found. The
368 * required list is freezer and anything in lxc.cgroup.use.
369 */
370 static bool all_controllers_found(struct cgroup_ops *ops)
371 {
372 struct hierarchy **hlist;
373
374 if (!ops->cgroup_use)
375 return true;
376
377 hlist = ops->hierarchies;
378 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
379 if (!controller_found(hlist, *cur))
380 return log_error(false, "No %s controller mountpoint found", *cur);
381
382 return true;
383 }
384
/* Get the controllers from a mountinfo line There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 *
 * @line is modified temporarily (a space is NUL'ed and restored before
 * returning). Returns a NULL-terminated list of controller names, or NULL
 * when the line does not describe a hierarchy under the default mountpoint.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	/* 15 == STRLITERALLEN("/sys/fs/cgroup/"). */
	if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
		return log_warn(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	/* Temporarily terminate the mountpoint so it can be parsed. */
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	/* Restore the space we NUL'ed above: @line belongs to the caller. */
	*p2 = ' ';

	return move_ptr(aret);
}
438
439 static char **cg_unified_make_empty_controller(void)
440 {
441 __do_free_string_list char **aret = NULL;
442 int newentry;
443
444 newentry = append_null_to_list((void ***)&aret);
445 aret[newentry] = NULL;
446 return move_ptr(aret);
447 }
448
/* Read @file (typically "cgroup.controllers") relative to @dfd and split
 * its space/tab/newline-separated contents into a NULL-terminated list of
 * controller names. Returns NULL when the file cannot be read; the caller
 * owns the returned list.
 */
static char **cg_unified_get_controllers(int dfd, const char *file)
{
	__do_free char *buf = NULL;
	__do_free_string_list char **aret = NULL;
	char *sep = " \t\n";
	char *tok;

	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return move_ptr(aret);
}
471
472 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
473 char **controllers)
474 {
475 if (!ops->cgroup_use)
476 return true;
477
478 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
479 bool found = false;
480
481 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
482 if (!strequal(*cur_use, *cur_ctrl))
483 continue;
484
485 found = true;
486 break;
487 }
488
489 if (found)
490 continue;
491
492 return false;
493 }
494
495 return true;
496 }
497
/* Register a new cgroup hierarchy with @ops.
 *
 * @clist              - controller list; ownership is taken (freed on error
 *                       paths via __do_free_string_list). May be NULL only
 *                       for the unified hierarchy, in which case the list is
 *                       read from cgroup.controllers.
 * @mountpoint         - absolute path of the hierarchy mount; stored as-is.
 * @container_base_path - cgroup path relative to @mountpoint; must not be
 *                       absolute.
 * @type               - CGROUP_SUPER_MAGIC or CGROUP2_SUPER_MAGIC.
 *
 * Returns 0 on success (also when the hierarchy is skipped because
 * lxc.cgroup.use excludes its controllers), negative errno on failure.
 */
static int add_hierarchy(struct cgroup_ops *ops, char **clist, char *mountpoint,
			 char *container_base_path, int type)
{
	__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
	__do_free struct hierarchy *new = NULL;
	__do_free_string_list char **controllers = clist;
	int idx;

	if (abspath(container_base_path))
		return syserrno(-errno, "Container base path must be relative to controller mount");

	if (!controllers && type != CGROUP2_SUPER_MAGIC)
		return syserrno_set(-EINVAL, "Empty controller list for non-unified cgroup hierarchy passed");

	dfd_mnt = open_at(-EBADF, mountpoint, PROTECT_OPATH_DIRECTORY,
			  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
	if (dfd_mnt < 0)
		return syserrno(-errno, "Failed to open %s", mountpoint);

	if (!is_empty_string(container_base_path)) {
		dfd_base = open_at(dfd_mnt, container_base_path,
				   PROTECT_OPATH_DIRECTORY,
				   PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_base < 0)
			return syserrno(-errno, "Failed to open %d(%s)", dfd_base, container_base_path);
	}

	if (!controllers) {
		/*
		 * We assume that the cgroup we're currently in has been delegated to
		 * us and we are free to further delege all of the controllers listed
		 * in cgroup.controllers further down the hierarchy.
		 */
		if (dfd_base < 0)
			controllers = cg_unified_get_controllers(dfd_mnt, "cgroup.controllers");
		else
			controllers = cg_unified_get_controllers(dfd_base, "cgroup.controllers");
		if (!controllers)
			controllers = cg_unified_make_empty_controller();
		if (!controllers[0])
			TRACE("No controllers are enabled for delegation");
	}

	/* Exclude all controllers that cgroup use does not want. */
	if (!cgroup_use_wants_controllers(ops, controllers))
		return log_trace(0, "Skipping cgroup hiearchy with non-requested controllers");

	new = zalloc(sizeof(*new));
	if (!new)
		return ret_errno(ENOMEM);

	new->version = type;
	new->controllers = move_ptr(controllers);
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	/* All cgroup fds start out closed. */
	new->cgfd_con = -EBADF;
	new->cgfd_limit = -EBADF;
	new->cgfd_mon = -EBADF;

	TRACE("Adding cgroup hierarchy with mountpoint %s and base cgroup %s",
	      mountpoint, container_base_path);
	for (char *const *it = new->controllers; it && *it; it++)
		TRACE("The detected hierarchy contains the %s controller", *it);

	idx = append_null_to_list((void ***)&ops->hierarchies);
	/* Without a base cgroup the mount fd doubles as the base fd. */
	if (dfd_base < 0)
		new->dfd_base = dfd_mnt;
	else
		new->dfd_base = move_fd(dfd_base);
	new->dfd_mnt = move_fd(dfd_mnt);
	if (type == CGROUP2_SUPER_MAGIC)
		ops->unified = new;
	(ops->hierarchies)[idx] = move_ptr(new);
	return 0;
}
573
574 /* Get a copy of the mountpoint from @line, which is a line from
575 * /proc/self/mountinfo.
576 */
577 static char *cg_hybrid_get_mountpoint(char *line)
578 {
579 char *p = line, *sret = NULL;
580 size_t len;
581 char *p2;
582
583 for (int i = 0; i < 4; i++) {
584 p = strchr(p, ' ');
585 if (!p)
586 return NULL;
587 p++;
588 }
589
590 if (!strnequal(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15))
591 return NULL;
592
593 p2 = strchr(p + 15, ' ');
594 if (!p2)
595 return NULL;
596 *p2 = '\0';
597
598 len = strlen(p);
599 sret = must_realloc(NULL, len + 1);
600 memcpy(sret, p, len);
601 sret[len] = '\0';
602
603 return sret;
604 }
605
/* Given a multi-line string, return a null-terminated copy of the current line.
 * Returns NULL if no newline terminates the current line.
 */
static char *copy_to_eol(char *p)
{
	char *newline, *line;
	size_t line_len;

	newline = strchr(p, '\n');
	if (!newline)
		return NULL;

	line_len = newline - p;
	line = must_realloc(NULL, line_len + 1);
	memcpy(line, p, line_len);
	line[line_len] = '\0';

	return line;
}
623
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 *
 * The controller field runs up to the next ':'; it is copied so that
 * lxc_iterate_parts() can split it on ',' without mangling @cgline.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	/* Copy just the comma-separated controller list. */
	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",")
		if (strequal(tok, c))
			return true;

	return false;
}
648
/* Strip all trailing newlines from @s in place and return @s.
 *
 * The previous condition used "len > 1", which left a string consisting of
 * a single newline untouched; "len > 0" trims that case as well.
 */
static inline char *trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 0) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}
659
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 *
 * @type selects v1 vs v2 parsing: for CGROUP2_SUPER_MAGIC the entry of the
 * form "0::/some/path" is matched, otherwise the line whose controller
 * field contains @controller. Unless @relative, the init scope is pruned
 * from the result, and a leading '/' is stripped. Returns a strdup'd
 * (possibly empty) cgroup path, or NULL if no matching line is found.
 */
static char *cg_hybrid_get_current_cgroup(bool relative, char *basecginfo,
					  char *controller, int type)
{
	char *base_cgroup = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*base_cgroup == '0'))
			is_cgv2_base_cgroup = true;

		/* Move past the hierarchy-id field. */
		base_cgroup = strchr(base_cgroup, ':');
		if (!base_cgroup)
			return NULL;
		base_cgroup++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(base_cgroup, controller))) {
			__do_free char *copy = NULL;

			/* Move past the controller-list field to the path. */
			base_cgroup = strchr(base_cgroup, ':');
			if (!base_cgroup)
				return NULL;
			base_cgroup++;

			copy = copy_to_eol(base_cgroup);
			if (!copy)
				return NULL;
			trim(copy);

			if (!relative) {
				base_cgroup = prune_init_scope(copy);
				if (!base_cgroup)
					return NULL;
			} else {
				base_cgroup = copy;
			}

			if (abspath(base_cgroup))
				base_cgroup = deabs(base_cgroup);

			/* We're allowing base_cgroup to be "". */
			return strdup(base_cgroup);
		}

		/* No match: advance to the next line. */
		base_cgroup = strchr(base_cgroup, '\n');
		if (!base_cgroup)
			return NULL;
		base_cgroup++;
	}
}
714
/* Append a copy of @entry to the NULL-terminated string list @list.
 * Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int slot = append_null_to_list((void ***)list);

	(*list)[slot] = must_copy_string(entry);
}
724
/* Parse /proc/self/cgroup and append every kernel subsystem to @klist and
 * every named ("name=...") subsystem to @nlist. A cgroup v2 entry
 * ("0::/path") is recorded in @klist as "cgroup2".
 *
 * Returns 0 on success, -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "re");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		/* Skip the hierarchy-id field. */
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		/* Terminate the controller-list field for parsing. */
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, p, ",") {
			if (strnequal(tok, "name=", 5))
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	return 0;
}
768
/* Dump the raw /proc/$$/cgroup contents plus the detected kernel and named
 * subsystem lists at TRACE level.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (int i = 0; klist && klist[i]; i++)
		TRACE("kernel subsystem %d: %s", i, klist[i]);

	for (int i = 0; nlist && nlist[i]; i++)
		TRACE("named subsystem %d: %s", i, nlist[i]);
}
784
785 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
786 {
787 if (!path_prune || !hierarchies)
788 return 0;
789
790 for (int i = 0; hierarchies[i]; i++) {
791 struct hierarchy *h = hierarchies[i];
792 int ret;
793
794 ret = cgroup_tree_prune(h->dfd_base, path_prune);
795 if (ret < 0)
796 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
797 else
798 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
799
800 if (h->container_limit_path != h->container_full_path)
801 free_disarm(h->container_limit_path);
802 free_disarm(h->container_full_path);
803 }
804
805 return 0;
806 }
807
/* Arguments passed into helpers run via userns_exec_1() inside the
 * container's user namespace.
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune;         /* cgroup path to prune */
	struct lxc_conf *conf;          /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;    /* generic path argument for the wrapped helper */
};
815
/* Wrapper executed in the container's user namespace: drop supplementary
 * groups, switch to the container's root (or init) ids, then prune the
 * cgroup tree. Returns -1 on failure to change identity, otherwise the
 * result of cgroup_tree_remove().
 */
static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping, id 0 maps to the container's root;
	 * otherwise fall back to the configured init ids.
	 */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* EPERM is tolerated: we may lack CAP_SETGID in this namespace. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}
838
/* Tear down the container's (payload) cgroups: detach the cgroup2 bpf
 * device program, then prune the limit cgroup tree — inside the
 * container's user namespace when an id mapping is configured, directly
 * otherwise. All failures are logged, none are fatal.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	/* With an id mapping the cgroup directories are owned by the mapped
	 * ids, so prune them from inside the user namespace.
	 */
	if (!lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
886
887 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
888 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
889 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
890 bool am_initialized)
891 {
892 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
893 *offlinecpus = NULL, *posscpus = NULL;
894 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
895 *possmask = NULL;
896 int ret;
897 ssize_t i;
898 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
899 bool flipped_bit = false;
900
901 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
902 if (!posscpus)
903 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
904
905 /* Get maximum number of cpus found in possible cpuset. */
906 maxposs = get_max_cpus(posscpus);
907 if (maxposs < 0 || maxposs >= INT_MAX - 1)
908 return false;
909
910 if (file_exists(__ISOL_CPUS)) {
911 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
912 if (!isolcpus)
913 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
914
915 if (isdigit(isolcpus[0])) {
916 /* Get maximum number of cpus found in isolated cpuset. */
917 maxisol = get_max_cpus(isolcpus);
918 if (maxisol < 0 || maxisol >= INT_MAX - 1)
919 return false;
920 }
921
922 if (maxposs < maxisol)
923 maxposs = maxisol;
924 maxposs++;
925 } else {
926 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
927 }
928
929 if (file_exists(__OFFLINE_CPUS)) {
930 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
931 if (!offlinecpus)
932 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
933
934 if (isdigit(offlinecpus[0])) {
935 /* Get maximum number of cpus found in offline cpuset. */
936 maxoffline = get_max_cpus(offlinecpus);
937 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
938 return false;
939 }
940
941 if (maxposs < maxoffline)
942 maxposs = maxoffline;
943 maxposs++;
944 } else {
945 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
946 }
947
948 if ((maxisol == 0) && (maxoffline == 0)) {
949 cpulist = move_ptr(posscpus);
950 goto copy_parent;
951 }
952
953 possmask = lxc_cpumask(posscpus, maxposs);
954 if (!possmask)
955 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
956
957 if (maxisol > 0) {
958 isolmask = lxc_cpumask(isolcpus, maxposs);
959 if (!isolmask)
960 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
961 }
962
963 if (maxoffline > 0) {
964 offlinemask = lxc_cpumask(offlinecpus, maxposs);
965 if (!offlinemask)
966 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
967 }
968
969 for (i = 0; i <= maxposs; i++) {
970 if ((isolmask && !is_set(i, isolmask)) ||
971 (offlinemask && !is_set(i, offlinemask)) ||
972 !is_set(i, possmask))
973 continue;
974
975 flipped_bit = true;
976 clear_bit(i, possmask);
977 }
978
979 if (!flipped_bit) {
980 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
981 TRACE("No isolated or offline cpus present in cpuset");
982 } else {
983 cpulist = move_ptr(posscpus);
984 TRACE("Removed isolated or offline cpus from cpuset");
985 }
986 if (!cpulist)
987 return log_error_errno(false, errno, "Failed to create cpu list");
988
989 copy_parent:
990 if (!am_initialized) {
991 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
992 if (ret < 0)
993 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
994
995 TRACE("Copied cpu settings of parent cgroup");
996 }
997
998 return true;
999 }
1000
/* Prepare the legacy (v1) cpuset controller for a child cgroup: sanitize
 * cpuset.cpus, copy cpuset.mems from the parent, and enable
 * cgroup.clone_children so deeper cgroups inherit automatically.
 *
 * Returns true on success, false (with errno preserved by syserrno) on
 * any read/write failure.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/*
	 * Determine whether the base cgroup has cpuset
	 * inheritance turned on.
	 */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/*
	 * Initialize cpuset.cpus and remove any isolated
	 * and offline cpus.
	 */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserrno(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* ... and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* ... and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
1039
/* Create the cgroup directory chain @path below @dfd_base, component by
 * component, returning an O_PATH fd to the final directory.
 *
 * @cpuset_v1      - run cpuset v1 initialization on the first component.
 * @eexist_ignore  - tolerate the final component already existing;
 *                   intermediate components may always pre-exist.
 *
 * Returns the fd to the final directory on success, negative errno on
 * failure (including -EEXIST when the final component pre-exists and
 * @eexist_ignore is false).
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserrno_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserrno_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component already existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserrno(-errno, "Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		/* Close intermediate fds; never close the caller's dfd_base. */
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
1101
/* Create the cgroup tree for either the monitor or the payload.
 *
 * For the payload with isolation (@cgroup_leaf set) a separate limit
 * cgroup @cgroup_limit_dir is created with the leaf nested inside it;
 * otherwise a single cgroup serves both roles. On success the relevant
 * fds and cached paths are stored in @h. Returns true on success.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = must_make_path(h->mountpoint, h->container_base_path, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	if (fd_final < 0)
		return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		h->cgfd_con = move_fd(fd_final);
		h->container_full_path = move_ptr(path);

		/* Without isolation the container cgroup doubles as limit. */
		if (fd_limit < 0)
			h->cgfd_limit = h->cgfd_con;
		else
			h->cgfd_limit = move_fd(fd_limit);

		if (limit_path)
			h->container_limit_path = move_ptr(limit_path);
		else
			h->container_limit_path = h->container_full_path;
	} else {
		h->cgfd_mon = move_fd(fd_final);
	}

	return true;
}
1177
1178 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
1179 bool payload)
1180 {
1181 bool prune = true;
1182
1183 if (payload) {
1184 /* Check whether we actually created the cgroup to prune. */
1185 if (h->cgfd_limit < 0)
1186 prune = false;
1187
1188 if (h->container_full_path != h->container_limit_path)
1189 free_disarm(h->container_limit_path);
1190 free_disarm(h->container_full_path);
1191
1192 close_prot_errno_disarm(h->cgfd_con);
1193 close_prot_errno_disarm(h->cgfd_limit);
1194 } else {
1195 /* Check whether we actually created the cgroup to prune. */
1196 if (h->cgfd_mon < 0)
1197 prune = false;
1198
1199 close_prot_errno_disarm(h->cgfd_mon);
1200 }
1201
1202 /* We didn't create this cgroup. */
1203 if (!prune)
1204 return;
1205
1206 if (cgroup_tree_prune(h->dfd_base, path_prune))
1207 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
1208 else
1209 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
1210 }
1211
/*
 * Destroy the monitor cgroup in every hierarchy.
 *
 * Before a monitor cgroup can be removed it must be empty, so the monitor
 * process is first moved out into a pivot cgroup (derived from
 * lxc.cgroup.meta settings or CGROUP_PIVOT). Failures are logged but do not
 * abort processing of the remaining hierarchies.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the pivot cgroup the monitor is parked in while we
		 * remove its old cgroup: explicit pivot dir, the generic
		 * cgroup dir, or the global default. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		/* The pivot cgroup may already exist; that's fine (eexist_ignore). */
		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
1289
/*
 * Check that the new-style lxc.cgroup.dir.{monitor,container,namespace}
 * options are not combined with lxc.cgroup.dir and that, when they are used,
 * both the monitor and payload directories are set.
 *
 * Returns true when the cgroup directory configuration is valid, false
 * otherwise (with errno set to EINVAL).
 */
1296 static bool check_cgroup_dir_config(struct lxc_conf *conf)
1297 {
1298 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
1299 *container_dir = conf->cgroup_meta.container_dir,
1300 *namespace_dir = conf->cgroup_meta.namespace_dir;
1301
1302 /* none of the new options are set, all is fine */
1303 if (!monitor_dir && !container_dir && !namespace_dir)
1304 return true;
1305
1306 /* some are set, make sure lxc.cgroup.dir is not also set*/
1307 if (conf->cgroup_meta.dir)
1308 return log_error_errno(false, EINVAL,
1309 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
1310
1311 /* make sure both monitor and payload are set */
1312 if (!monitor_dir || !container_dir)
1313 return log_error_errno(false, EINVAL,
1314 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
1315
1316 /* namespace_dir may be empty */
1317 return true;
1318 }
1319
1320 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
1321 {
1322 __do_free char *monitor_cgroup = NULL;
1323 int idx = 0;
1324 int i;
1325 size_t len;
1326 char *suffix = NULL;
1327 struct lxc_conf *conf;
1328
1329 if (!ops)
1330 return ret_set_errno(false, ENOENT);
1331
1332 if (!ops->hierarchies)
1333 return true;
1334
1335 if (ops->monitor_cgroup)
1336 return ret_set_errno(false, EEXIST);
1337
1338 if (!handler || !handler->conf)
1339 return ret_set_errno(false, EINVAL);
1340
1341 conf = handler->conf;
1342
1343 if (!check_cgroup_dir_config(conf))
1344 return false;
1345
1346 if (conf->cgroup_meta.monitor_dir) {
1347 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1348 } else if (conf->cgroup_meta.dir) {
1349 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1350 DEFAULT_MONITOR_CGROUP_PREFIX,
1351 handler->name,
1352 CGROUP_CREATE_RETRY, NULL);
1353 } else if (ops->cgroup_pattern) {
1354 __do_free char *cgroup_tree = NULL;
1355
1356 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1357 if (!cgroup_tree)
1358 return ret_set_errno(false, ENOMEM);
1359
1360 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1361 DEFAULT_MONITOR_CGROUP,
1362 CGROUP_CREATE_RETRY, NULL);
1363 } else {
1364 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1365 handler->name,
1366 CGROUP_CREATE_RETRY, NULL);
1367 }
1368 if (!monitor_cgroup)
1369 return ret_set_errno(false, ENOMEM);
1370
1371 if (!conf->cgroup_meta.monitor_dir) {
1372 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1373 *suffix = '\0';
1374 }
1375 do {
1376 if (idx && suffix)
1377 sprintf(suffix, "-%d", idx);
1378
1379 for (i = 0; ops->hierarchies[i]; i++) {
1380 if (cgroup_tree_create(ops, handler->conf,
1381 ops->hierarchies[i],
1382 monitor_cgroup, NULL, false))
1383 continue;
1384
1385 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1386 for (int j = 0; j <= i; j++)
1387 cgroup_tree_prune_leaf(ops->hierarchies[j],
1388 monitor_cgroup, false);
1389
1390 idx++;
1391 break;
1392 }
1393 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1394
1395 if (idx == 1000 || (!suffix && idx != 0))
1396 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1397
1398 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1399 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1400 }
1401
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * Note the aliasing at play: container_cgroup and __limit_cgroup are the
 * owning (__do_free) pointers while limit_cgroup is a non-owning alias. They
 * only differ when lxc.cgroup.dir.container together with
 * lxc.cgroup.dir.namespace requests an isolated (limit + leaf) layout.
 * On success ops->container_cgroup and ops->container_limit_cgroup are set
 * (aliasing each other when no isolation is used).
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Isolation: the container's leaf cgroup lives in a
			 * namespace directory below the limit cgroup. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Cut off the retry-suffix placeholder; each retry rewrites it with
	 * "-<idx>". Explicitly configured directories are used verbatim. */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			/* Roll back all hierarchies created so far before retrying. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1511
/*
 * Move the monitor process — and, when set, the transient startup process —
 * into the previously created monitor cgroup of every hierarchy.
 *
 * Returns true on success, false otherwise (with errno set).
 */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

		TRACE("Moved monitor into cgroup %d", h->cgfd_mon);

		/* Without a transient process there is nothing more to move;
		 * keep cgfd_mon open for later use. */
		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->cgfd_mon);

		TRACE("Moved transient process into cgroup %d", h->cgfd_mon);

		/*
		 * we don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}
1573
1574 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1575 struct lxc_handler *handler)
1576 {
1577 int len;
1578 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1579
1580 if (!ops)
1581 return ret_set_errno(false, ENOENT);
1582
1583 if (!ops->hierarchies)
1584 return true;
1585
1586 if (!ops->container_cgroup)
1587 return ret_set_errno(false, ENOENT);
1588
1589 if (!handler || !handler->conf)
1590 return ret_set_errno(false, EINVAL);
1591
1592 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1593 if (len < 0)
1594 return false;
1595
1596 for (int i = 0; ops->hierarchies[i]; i++) {
1597 struct hierarchy *h = ops->hierarchies[i];
1598 int ret;
1599
1600 if (is_unified_hierarchy(h) &&
1601 (handler->clone_flags & CLONE_INTO_CGROUP))
1602 continue;
1603
1604 ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
1605 if (ret != 0)
1606 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
1607
1608 TRACE("Moved container into %s cgroup via %d", h->container_full_path, h->cgfd_con);
1609 }
1610
1611 return true;
1612 }
1613
1614 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1615 gid_t chown_gid, mode_t chmod_mode)
1616 {
1617 int ret;
1618
1619 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1620 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1621 if (ret < 0)
1622 return log_warn_errno(-1,
1623 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1624 dirfd, path, (int)chown_uid,
1625 (int)chown_gid);
1626
1627 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1628 if (ret < 0)
1629 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1630 dirfd, path, (int)chmod_mode);
1631
1632 return 0;
1633 }
1634
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs inside the container's user namespace via userns_exec_1(); @data is
 * a struct generic_userns_exec_data. Returns 0 on success, -1 on failure.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id mapping, uid/gid 0 inside the namespace is the
	 * container owner; otherwise fall back to the configured init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* EPERM is tolerated: dropping groups can fail in unprivileged setups. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Drop group ids before user ids; the reverse order would lose the
	 * privilege needed for setresgid(). */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original (host) euid into this namespace; fall back to
	 * root if it is unmapped. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		/* Empty path: chown/chmod the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		/* Additional cgroup2 files the container must be able to write. */
		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1694
1695 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1696 struct lxc_conf *conf)
1697 {
1698 struct generic_userns_exec_data wrap;
1699
1700 if (!ops)
1701 return ret_set_errno(false, ENOENT);
1702
1703 if (!ops->hierarchies)
1704 return true;
1705
1706 if (!ops->container_cgroup)
1707 return ret_set_errno(false, ENOENT);
1708
1709 if (!conf)
1710 return ret_set_errno(false, EINVAL);
1711
1712 if (lxc_list_empty(&conf->id_map))
1713 return true;
1714
1715 wrap.origuid = geteuid();
1716 wrap.path = NULL;
1717 wrap.hierarchies = ops->hierarchies;
1718 wrap.conf = conf;
1719
1720 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1721 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1722
1723 return true;
1724 }
1725
1726 __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
1727 {
1728 if (!ops)
1729 return;
1730
1731 if (!ops->hierarchies)
1732 return;
1733
1734 for (int i = 0; ops->hierarchies[i]; i++) {
1735 struct hierarchy *h = ops->hierarchies[i];
1736 /*
1737 * we don't keep the fds for non-unified hierarchies around
1738 * mainly because we don't make use of them anymore after the
1739 * core cgroup setup is done but also because there are quite a
1740 * lot of them.
1741 */
1742 if (!is_unified_hierarchy(h))
1743 close_prot_errno_disarm(h->cgfd_con);
1744 }
1745
1746 /*
1747 * The checking for freezer support should obviously be done at cgroup
1748 * initialization time but that doesn't work reliable. The freezer
1749 * controller has been demoted (rightly so) to a simple file located in
1750 * each non-root cgroup. At the time when the container is created we
1751 * might still be located in /sys/fs/cgroup and so checking for
1752 * cgroup.freeze won't tell us anything because this file doesn't exist
1753 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1754 * find an already existing cgroup and then check within that cgroup
1755 * for the existence of cgroup.freeze but that will only work on
1756 * systemd based hosts. Other init systems might not manage cgroups and
1757 * so no cgroup will exist. So we defer until we have created cgroups
1758 * for our container which means we check here.
1759 */
1760 if (pure_unified_layout(ops) &&
1761 !faccessat(ops->unified->cgfd_con, "cgroup.freeze", F_OK,
1762 AT_SYMLINK_NOFOLLOW)) {
1763 TRACE("Unified hierarchy supports freezer");
1764 ops->unified->freezer_controller = 1;
1765 }
1766 }
1767
1768 /* cgroup-full:* is done, no need to create subdirs */
1769 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1770 {
1771 switch (cgroup_automount_type) {
1772 case LXC_AUTO_CGROUP_RO:
1773 return true;
1774 case LXC_AUTO_CGROUP_RW:
1775 return true;
1776 case LXC_AUTO_CGROUP_MIXED:
1777 return true;
1778 }
1779
1780 return false;
1781 }
1782
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * Returns 0 on success, -1 on failure (with errno left from the failing
 * mount(2) call).
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For ro/mixed mounts the controller mountpoint itself becomes
	 * read-only; a bind mount is required first so it can be remounted. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup directory onto the target path;
	 * with mixed mounts it stays writable even though the parent is ro. */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* MS_RDONLY only takes effect on a remount of the bind mount. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1837
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * Uses the new mount API (fsopen/fsconfig/move_mount) when available and
 * falls back to classic mount(2) otherwise. Returns 0 on success, a negative
 * errno-style value on failure.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* Both the plain and the "full" read-only variants mount ro. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		/* Legacy hierarchies need each controller (or the named
		 * hierarchy) configured on the filesystem context. */
		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		/* Legacy mount(2) path: controllers are passed as mount data. */
		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* flags into their MS_* equivalents. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1916
/* Mount the cgroup hierarchy @h fresh (non-bind) for any automount type. */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1924
1925 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1926 struct lxc_rootfs *rootfs,
1927 int dfd_mnt_cgroupfs,
1928 const char *hierarchy_mnt)
1929 {
1930 switch (cgroup_automount_type) {
1931 case LXC_AUTO_CGROUP_FULL_RO:
1932 break;
1933 case LXC_AUTO_CGROUP_FULL_RW:
1934 break;
1935 case LXC_AUTO_CGROUP_FULL_MIXED:
1936 break;
1937 default:
1938 return 0;
1939 }
1940
1941 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1942 dfd_mnt_cgroupfs, hierarchy_mnt);
1943 }
1944
1945 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1946 struct lxc_handler *handler, int cg_flags)
1947 {
1948 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1949 __do_free char *cgroup_root = NULL;
1950 int cgroup_automount_type;
1951 bool in_cgroup_ns = false, wants_force_mount = false;
1952 struct lxc_conf *conf = handler->conf;
1953 struct lxc_rootfs *rootfs = &conf->rootfs;
1954 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1955 int ret;
1956
1957 if (!ops)
1958 return ret_set_errno(false, ENOENT);
1959
1960 if (!ops->hierarchies)
1961 return true;
1962
1963 if (!conf)
1964 return ret_set_errno(false, EINVAL);
1965
1966 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1967 return log_trace(true, "No cgroup mounts requested");
1968
1969 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1970 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1971 wants_force_mount = true;
1972 }
1973
1974 switch (cg_flags) {
1975 case LXC_AUTO_CGROUP_RO:
1976 TRACE("Read-only cgroup mounts requested");
1977 break;
1978 case LXC_AUTO_CGROUP_RW:
1979 TRACE("Read-write cgroup mounts requested");
1980 break;
1981 case LXC_AUTO_CGROUP_MIXED:
1982 TRACE("Mixed cgroup mounts requested");
1983 break;
1984 case LXC_AUTO_CGROUP_FULL_RO:
1985 TRACE("Full read-only cgroup mounts requested");
1986 break;
1987 case LXC_AUTO_CGROUP_FULL_RW:
1988 TRACE("Full read-write cgroup mounts requested");
1989 break;
1990 case LXC_AUTO_CGROUP_FULL_MIXED:
1991 TRACE("Full mixed cgroup mounts requested");
1992 break;
1993 default:
1994 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1995 }
1996 cgroup_automount_type = cg_flags;
1997
1998 if (!wants_force_mount) {
1999 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
2000
2001 /*
2002 * Most recent distro versions currently have init system that
2003 * do support cgroup2 but do not mount it by default unless
2004 * explicitly told so even if the host is cgroup2 only. That
2005 * means they often will fail to boot. Fix this by pre-mounting
2006 * cgroup2 by default. We will likely need to be doing this a
2007 * few years until all distros have switched over to cgroup2 at
2008 * which point we can safely assume that their init systems
2009 * will mount it themselves.
2010 */
2011 if (pure_unified_layout(ops))
2012 wants_force_mount = true;
2013 }
2014
2015 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
2016 in_cgroup_ns = true;
2017
2018 if (in_cgroup_ns && !wants_force_mount)
2019 return log_trace(true, "Mounting cgroups not requested or needed");
2020
2021 /* This is really the codepath that we want. */
2022 if (pure_unified_layout(ops)) {
2023 __do_close int dfd_mnt_unified = -EBADF;
2024
2025 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2026 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2027 if (dfd_mnt_unified < 0)
2028 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
2029 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2030 /*
2031 * If cgroup namespaces are supported but the container will
2032 * not have CAP_SYS_ADMIN after it has started we need to mount
2033 * the cgroups manually.
2034 *
2035 * Note that here we know that wants_force_mount is true.
2036 * Otherwise we would've returned early above.
2037 */
2038 if (in_cgroup_ns) {
2039 /*
2040 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
2041 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
2042 * 3. cgroup:mixed:force -> See comment above how this
2043 * does not apply so
2044 * cgroup:mixed is equal to
2045 * cgroup:rw when cgroup
2046 * namespaces are supported.
2047
2048 * 4. cgroup:rw -> No-op; init system responsible for mounting.
2049 * 5. cgroup:ro -> No-op; init system responsible for mounting.
2050 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
2051 *
2052 * 7. cgroup-full:rw -> Not supported.
2053 * 8. cgroup-full:ro -> Not supported.
2054 * 9. cgroup-full:mixed -> Not supported.
2055
2056 * 10. cgroup-full:rw:force -> Not supported.
2057 * 11. cgroup-full:ro:force -> Not supported.
2058 * 12. cgroup-full:mixed:force -> Not supported.
2059 */
2060 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
2061 if (ret < 0)
2062 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
2063
2064 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
2065 } else {
2066 /*
2067 * Either no cgroup namespace supported (highly
2068 * unlikely unless we're dealing with a Frankenkernel.
2069 * Or the user requested to keep the cgroup namespace
2070 * of the host or another container.
2071 */
2072 if (wants_force_mount) {
2073 /*
2074 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
2075 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
2076 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
2077 * and make the parent directory of the
2078 * container's cgroup read-only but the
2079 * container's cgroup writable.
2080 *
2081 * 10. cgroup-full:rw:force ->
2082 * 11. cgroup-full:ro:force ->
2083 * 12. cgroup-full:mixed:force ->
2084 */
2085 errno = EOPNOTSUPP;
2086 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2087 } else {
2088 errno = EOPNOTSUPP;
2089 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
2090 }
2091 }
2092
2093 return syserrno(false, "Failed to mount cgroups");
2094 }
2095
2096 /*
2097 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
2098 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
2099 * DEFAULT_CGROUP_MOUNTPOINT define.
2100 */
2101 if (can_use_mount_api()) {
2102 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
2103 if (fd_fs < 0)
2104 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
2105
2106 ret = fs_set_property(fd_fs, "mode", "0755");
2107 if (ret < 0)
2108 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
2109
2110 ret = fs_set_property(fd_fs, "size", "10240k");
2111 if (ret < 0)
2112 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
2113
2114 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2115 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
2116 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
2117 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
2118 } else {
2119 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2120 ret = safe_mount(NULL, cgroup_root, "tmpfs",
2121 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
2122 "size=10240k,mode=755", rootfs_mnt);
2123 }
2124 if (ret < 0)
2125 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
2126 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2127
2128 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
2129 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
2130 if (dfd_mnt_tmpfs < 0)
2131 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
2132 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
2133
2134 for (int i = 0; ops->hierarchies[i]; i++) {
2135 __do_free char *controllerpath = NULL, *path2 = NULL;
2136 struct hierarchy *h = ops->hierarchies[i];
2137 char *controller = strrchr(h->mountpoint, '/');
2138
2139 if (!controller)
2140 continue;
2141 controller++;
2142
2143 ret = mkdirat(dfd_mnt_tmpfs, controller, 0000);
2144 if (ret < 0)
2145 return log_error_errno(false, errno, "Failed to create cgroup mountpoint %d(%s)", dfd_mnt_tmpfs, controller);
2146
2147 if (in_cgroup_ns && wants_force_mount) {
2148 /*
2149 * If cgroup namespaces are supported but the container
2150 * will not have CAP_SYS_ADMIN after it has started we
2151 * need to mount the cgroups manually.
2152 */
2153 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2154 if (ret < 0)
2155 return false;
2156
2157 continue;
2158 }
2159
2160 /* Here is where the ancient kernel section begins. */
2161 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs, dfd_mnt_tmpfs, controller);
2162 if (ret < 0)
2163 return false;
2164
2165 if (!cg_mount_needs_subdirs(cgroup_automount_type))
2166 continue;
2167
2168 if (!cgroup_root)
2169 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
2170
2171 controllerpath = must_make_path(cgroup_root, controller, NULL);
2172 path2 = must_make_path(controllerpath, h->container_base_path, ops->container_cgroup, NULL);
2173 ret = mkdir_p(path2, 0755);
2174 if (ret < 0 && (errno != EEXIST))
2175 return false;
2176
2177 ret = cg_legacy_mount_controllers(cgroup_automount_type, h, controllerpath, path2, ops->container_cgroup);
2178 if (ret < 0)
2179 return false;
2180 }
2181
2182 return true;
2183 }
2184
2185 /* Only root needs to escape to the cgroup of its init. */
2186 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
2187 struct lxc_conf *conf)
2188 {
2189 if (!ops)
2190 return ret_set_errno(false, ENOENT);
2191
2192 if (!ops->hierarchies)
2193 return true;
2194
2195 if (!conf)
2196 return ret_set_errno(false, EINVAL);
2197
2198 if (conf->cgroup_meta.relative || geteuid())
2199 return true;
2200
2201 for (int i = 0; ops->hierarchies[i]; i++) {
2202 __do_free char *fullpath = NULL;
2203 int ret;
2204
2205 fullpath =
2206 must_make_path(ops->hierarchies[i]->mountpoint,
2207 ops->hierarchies[i]->container_base_path,
2208 "cgroup.procs", NULL);
2209 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
2210 if (ret != 0)
2211 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
2212 }
2213
2214 return true;
2215 }
2216
2217 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
2218 {
2219 int i = 0;
2220
2221 if (!ops)
2222 return ret_set_errno(-1, ENOENT);
2223
2224 if (!ops->hierarchies)
2225 return 0;
2226
2227 for (; ops->hierarchies[i]; i++)
2228 ;
2229
2230 return i;
2231 }
2232
2233 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
2234 int n, char ***out)
2235 {
2236 int i;
2237
2238 if (!ops)
2239 return ret_set_errno(false, ENOENT);
2240
2241 if (!ops->hierarchies)
2242 return ret_set_errno(false, ENOENT);
2243
2244 /* sanity check n */
2245 for (i = 0; i < n; i++)
2246 if (!ops->hierarchies[i])
2247 return ret_set_errno(false, ENOENT);
2248
2249 *out = ops->hierarchies[i]->controllers;
2250
2251 return true;
2252 }
2253
2254 static bool cg_legacy_freeze(struct cgroup_ops *ops)
2255 {
2256 struct hierarchy *h;
2257
2258 h = get_hierarchy(ops, "freezer");
2259 if (!h)
2260 return ret_set_errno(-1, ENOENT);
2261
2262 return lxc_write_openat(h->container_full_path, "freezer.state",
2263 "FROZEN", STRLITERALLEN("FROZEN"));
2264 }
2265
2266 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
2267 struct lxc_epoll_descr *descr)
2268 {
2269 __do_free char *line = NULL;
2270 __do_fclose FILE *f = NULL;
2271 int state = PTR_TO_INT(cbdata);
2272 size_t len;
2273 const char *state_string;
2274
2275 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
2276 if (!f)
2277 return LXC_MAINLOOP_ERROR;
2278
2279 if (state == 1)
2280 state_string = "frozen 1";
2281 else
2282 state_string = "frozen 0";
2283
2284 while (getline(&line, &len, f) != -1)
2285 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
2286 return LXC_MAINLOOP_CLOSE;
2287
2288 rewind(f);
2289
2290 return LXC_MAINLOOP_CONTINUE;
2291 }
2292
/*
 * Write @state_string ("0" or "1") to cgroup.freeze on the unified
 * hierarchy and, unless @timeout is 0, wait on an epoll mainloop for
 * cgroup.events (EPOLLPRI) to report the requested state.
 *
 * @timeout:      0 means fire-and-forget; otherwise passed to
 *                lxc_mainloop() as the wait timeout.
 * @state_num:    1 for freeze, 0 for unfreeze; forwarded to the
 *                events callback.
 * @epoll_error:  message logged if the mainloop cannot be created.
 * @wait_error:   message logged if waiting fails or times out.
 * Returns 0 on success, -1 (with errno set) on error.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	/* Set up the waiter before triggering the state change so no
	 * notification can be missed. */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
2341
/* Freeze via cgroup2's cgroup.freeze, optionally waiting @timeout. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2348
2349 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2350 {
2351 if (!ops->hierarchies)
2352 return ret_set_errno(-1, ENOENT);
2353
2354 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2355 return cg_legacy_freeze(ops);
2356
2357 return cg_unified_freeze(ops, timeout);
2358 }
2359
2360 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2361 {
2362 struct hierarchy *h;
2363
2364 h = get_hierarchy(ops, "freezer");
2365 if (!h)
2366 return ret_set_errno(-1, ENOENT);
2367
2368 return lxc_write_openat(h->container_full_path, "freezer.state",
2369 "THAWED", STRLITERALLEN("THAWED"));
2370 }
2371
/* Thaw via cgroup2's cgroup.freeze, optionally waiting @timeout. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2378
2379 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2380 {
2381 if (!ops->hierarchies)
2382 return ret_set_errno(-1, ENOENT);
2383
2384 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2385 return cg_legacy_unfreeze(ops);
2386
2387 return cg_unified_unfreeze(ops, timeout);
2388 }
2389
2390 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2391 const char *controller, bool limiting)
2392 {
2393 struct hierarchy *h;
2394
2395 h = get_hierarchy(ops, controller);
2396 if (!h)
2397 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2398 controller ? controller : "(null)");
2399
2400 if (limiting)
2401 return h->container_limit_path
2402 ? h->container_limit_path + strlen(h->mountpoint)
2403 : NULL;
2404
2405 return h->container_full_path
2406 ? h->container_full_path + strlen(h->mountpoint)
2407 : NULL;
2408 }
2409
2410 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2411 const char *controller)
2412 {
2413 return cgfsng_get_cgroup_do(ops, controller, false);
2414 }
2415
2416 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2417 const char *controller)
2418 {
2419 return cgfsng_get_cgroup_do(ops, controller, true);
2420 }
2421
2422 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2423 * which must be freed by the caller.
2424 */
2425 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2426 const char *inpath,
2427 const char *filename)
2428 {
2429 return must_make_path(h->mountpoint, inpath, filename, NULL);
2430 }
2431
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First tries the ".lxc" leaf cgroup, then the cgroup itself. If the
 * write fails with EBUSY the cgroup is a non-leaf node (cgroup2 "no
 * internal processes" rule), so fall back to creating numbered
 * ".lxc-<idx>" leaf cgroups until one accepts the pid (up to 1000
 * attempts).
 *
 * Returns 0 on success, a negative value (with errno set) on error.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Try ".lxc" first, then the cgroup directory itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily truncate at the '/' so only ".lxc-<idx>" is created. */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		/* Only remove directories we created ourselves. */
		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2502
2503 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2504 int unified_fd, int *sk_fd)
2505 {
2506 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2507 int target_fds[2];
2508 ssize_t ret;
2509
2510 /* Create leaf cgroup. */
2511 ret = mkdirat(unified_fd, ".lxc", 0755);
2512 if (ret < 0 && errno != EEXIST)
2513 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2514
2515 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2516 if (target_fd0 < 0)
2517 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2518 target_fds[0] = target_fd0;
2519
2520 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2521 if (target_fd1 < 0)
2522 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2523 target_fds[1] = target_fd1;
2524
2525 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2526 if (ret <= 0)
2527 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2528 target_fd0, target_fd1);
2529
2530 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2531 }
2532
2533 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2534 int *sk_fd, pid_t pid)
2535 {
2536 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2537 int target_fds[2];
2538 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2539 size_t pidstr_len;
2540 ssize_t ret;
2541
2542 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2543 if (ret <= 0)
2544 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2545 target_fd0 = target_fds[0];
2546 target_fd1 = target_fds[1];
2547
2548 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2549
2550 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2551 if (ret > 0 && ret == pidstr_len)
2552 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2553
2554 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2555 if (ret > 0 && ret == pidstr_len)
2556 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2557
2558 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2559 target_fd0, target_fd1);
2560 }
2561
/* Argument bundle shared by the userns attach parent/child wrappers. */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the unified cgroup directory */
	int sk_pair[2];			/* socketpair: [0] used by parent wrapper, [1] by child wrapper */
	pid_t pid;			/* pid to move into the target cgroup */
};
2568
2569 static int cgroup_unified_attach_child_wrapper(void *data)
2570 {
2571 struct userns_exec_unified_attach_data *args = data;
2572
2573 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2574 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2575 return ret_errno(EINVAL);
2576
2577 close_prot_errno_disarm(args->sk_pair[0]);
2578 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2579 &args->sk_pair[1]);
2580 }
2581
2582 static int cgroup_unified_attach_parent_wrapper(void *data)
2583 {
2584 struct userns_exec_unified_attach_data *args = data;
2585
2586 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2587 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2588 return ret_errno(EINVAL);
2589
2590 close_prot_errno_disarm(args->sk_pair[1]);
2591 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2592 args->pid);
2593 }
2594
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Prefer letting the container's command server attach for us. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	if (ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = must_make_path(h->mountpoint, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* With an id map we must attach from inside the user namespace. */
	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2656
/*
 * Attach @pid to the container's cgroup in every mounted hierarchy.
 *
 * cgroup2 hierarchies go through __cg_unified_attach() (command API
 * first, then manual leaf attach); legacy hierarchies get the pid
 * written directly into their cgroup.procs file.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* No hierarchies means there is nothing to attach to. */
	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->version == CGROUP2_SUPER_MAGIC) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		/* Legacy hierarchy: ask the running container for its path. */
		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		/* not running */
		if (!path)
			return false;

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2702
2703 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2704 * don't have a cgroup_data set up, so we ask the running container through the
2705 * commands API for the cgroup path.
2706 */
2707 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2708 char *value, size_t len, const char *name,
2709 const char *lxcpath)
2710 {
2711 __do_free char *path = NULL;
2712 __do_free char *controller = NULL;
2713 char *p;
2714 struct hierarchy *h;
2715 int ret = -1;
2716
2717 if (!ops)
2718 return ret_set_errno(-1, ENOENT);
2719
2720 controller = must_copy_string(filename);
2721 p = strchr(controller, '.');
2722 if (p)
2723 *p = '\0';
2724
2725 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2726 /* not running */
2727 if (!path)
2728 return -1;
2729
2730 h = get_hierarchy(ops, controller);
2731 if (h) {
2732 __do_free char *fullpath = NULL;
2733
2734 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2735 ret = lxc_read_from_file(fullpath, value, len);
2736 }
2737
2738 return ret;
2739 }
2740
2741 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2742 {
2743 for (int count = 0; count < 3; count++, val++) {
2744 switch (*val) {
2745 case 'r':
2746 device->access[count] = *val;
2747 break;
2748 case 'w':
2749 device->access[count] = *val;
2750 break;
2751 case 'm':
2752 device->access[count] = *val;
2753 break;
2754 case '\n':
2755 case '\0':
2756 count = 3;
2757 break;
2758 default:
2759 return ret_errno(EINVAL);
2760 }
2761 }
2762
2763 return 0;
2764 }
2765
/*
 * Parse a devices cgroup rule of the form "<type> <major>:<minor> <access>"
 * (e.g. "c 1:3 rwm") or the global rule "a" into @device.
 *
 * @key selects allow vs. deny ("devices.allow" sets device->allow = 1).
 * A bare "a" value becomes a global allow-all/deny-all rule; anything
 * else is a local rule with type 'a', 'b' or 'c', a numeric or '*'
 * major and minor, and an access string parsed by
 * device_cgroup_parse_access().
 *
 * Returns 0 on success, -1 (or -EINVAL from the access parser) on
 * malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1; /* allow the device */
	else
		device->allow = 0; /* deny the device */

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;

		if (device->allow) /* allow all devices */
			device->global_rule = LXC_BPF_DEVICE_CGROUP_DENYLIST;
		else /* deny all devices */
			device->global_rule = LXC_BPF_DEVICE_CGROUP_ALLOWLIST;

		/* -1 marks this as a global rule, not a per-device one. */
		device->allow = -1;
		return 0;
	}

	/* local rule */
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* A single space must separate type and major. */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: '*' for any, else a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (!isspace(*val))
		return -1;

	/* The rest (past the separating space) is the access string. */
	return device_cgroup_parse_access(device, ++val);
}
2855
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 *
 * On a pure cgroup2 layout, "devices.*" keys are translated into bpf
 * device rules and forwarded to the container's command server instead
 * of being written to a file.
 *
 * Returns 0 on success, negative on error.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is everything before the first '.'. */
	controller = must_copy_string(key);
	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2909
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	path = must_copy_string(devpath);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		while (*p == ' ')
			p++;

		mode = p;

		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Look up the device node to learn its type and major:minor. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		/* Only block and character devices can be delegated. */
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;
	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;

	return 0;
}
2981
2982 static int convert_devpath(const char *invalue, char *dest)
2983 {
2984 struct device_item device = {};
2985 int ret;
2986
2987 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2988 if (ret < 0)
2989 return -1;
2990
2991 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2992 device.minor, device.access);
2993 if (ret < 0)
2994 return log_error_errno(ret, -ret,
2995 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2996 device.type, device.major, device.minor,
2997 device.access);
2998
2999 return 0;
3000 }
3001
3002 /* Called from setup_limits - here we have the container's cgroup_data because
3003 * we created the cgroups.
3004 */
3005 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
3006 const char *value, bool is_cpuset)
3007 {
3008 __do_free char *controller = NULL;
3009 char *p;
3010 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
3011 char converted_value[50];
3012 struct hierarchy *h;
3013
3014 controller = must_copy_string(filename);
3015 p = strchr(controller, '.');
3016 if (p)
3017 *p = '\0';
3018
3019 if (strequal("devices.allow", filename) && value[0] == '/') {
3020 int ret;
3021
3022 ret = convert_devpath(value, converted_value);
3023 if (ret < 0)
3024 return ret;
3025 value = converted_value;
3026 }
3027
3028 h = get_hierarchy(ops, controller);
3029 if (!h)
3030 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
3031
3032 if (is_cpuset) {
3033 int ret = lxc_write_openat(h->container_full_path, filename, value, strlen(value));
3034 if (ret)
3035 return ret;
3036 }
3037 return lxc_write_openat(h->container_limit_path, filename, value, strlen(value));
3038 }
3039
/*
 * Apply the lxc.cgroup.* (legacy/v1) limits from @conf.
 *
 * @do_devices selects whether this pass applies only the devices
 * controller settings (true) or only the non-devices ones (false);
 * setup runs in two passes. Device-setting failures with EACCES/EPERM
 * are downgraded to warnings since they are expected in some
 * unprivileged setups. On pure cgroup2 systems legacy limits are
 * ignored with a warning.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
	struct lxc_list *cgroup_settings = &conf->cgroup;
	struct lxc_list *iterator, *next;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	if (pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");

	/* Sorted copy; the list nodes are freed in the cleanup loop below. */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Apply only devices settings or only non-devices ones,
		 * depending on which pass this is. */
		if (do_devices == strnequal("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
				goto out;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free the sorted list's nodes (elements are owned by @conf). */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}

	return ret;
}
3096
3097 /*
3098 * Some of the parsing logic comes from the original cgroup device v1
3099 * implementation in the kernel.
3100 */
3101 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
3102 struct lxc_conf *conf, const char *key,
3103 const char *val)
3104 {
3105 struct device_item device_item = {};
3106 int ret;
3107
3108 if (strequal("devices.allow", key) && *val == '/')
3109 ret = device_cgroup_rule_parse_devpath(&device_item, val);
3110 else
3111 ret = device_cgroup_rule_parse(&device_item, key, val);
3112 if (ret < 0)
3113 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
3114
3115 ret = bpf_list_add_device(&conf->devices, &device_item);
3116 if (ret < 0)
3117 return -1;
3118 return 0;
3119 }
3120
/*
 * Apply the lxc.cgroup2.* limits from the handler's config to the
 * unified hierarchy. "devices" keys are collected as bpf device rules
 * via bpf_device_cgroup_prepare(); everything else is written directly
 * into the container's limit cgroup. On non-unified layouts cgroup2
 * limits are ignored with a warning.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct lxc_list *cgroup_settings, *iterator;
	struct hierarchy *h;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	lxc_list_for_each (iterator, cgroup_settings) {
		struct lxc_cgroup *cg = iterator->elem;
		int ret;

		if (strnequal("devices", cg->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
		else
			ret = lxc_write_openat(h->container_limit_path, cg->subsystem, cg->value, strlen(cg->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);

		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
3168
3169 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
3170 {
3171 struct lxc_conf *conf;
3172 struct hierarchy *unified;
3173
3174 if (!ops)
3175 return ret_set_errno(false, ENOENT);
3176
3177 if (!ops->hierarchies)
3178 return true;
3179
3180 if (!ops->container_cgroup)
3181 return ret_set_errno(false, EEXIST);
3182
3183 if (!handler || !handler->conf)
3184 return ret_set_errno(false, EINVAL);
3185 conf = handler->conf;
3186
3187 unified = ops->unified;
3188 if (!unified || !unified->bpf_device_controller ||
3189 !unified->container_full_path || lxc_list_empty(&conf->devices))
3190 return true;
3191
3192 return bpf_cgroup_devices_attach(ops, &conf->devices);
3193 }
3194
3195 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
3196 {
3197 __do_close int dfd_final = -EBADF;
3198 __do_free char *add_controllers = NULL, *copy = NULL;
3199 size_t full_len = 0;
3200 struct hierarchy *unified;
3201 int dfd_cur, ret;
3202 char *cur;
3203 char **it;
3204
3205 if (!ops->hierarchies || !pure_unified_layout(ops))
3206 return true;
3207
3208 unified = ops->unified;
3209 if (!unified->controllers[0])
3210 return true;
3211
3212 /* For now we simply enable all controllers that we have detected by
3213 * creating a string like "+memory +pids +cpu +io".
3214 * TODO: In the near future we might want to support "-<controller>"
3215 * etc. but whether supporting semantics like this make sense will need
3216 * some thinking.
3217 */
3218 for (it = unified->controllers; it && *it; it++) {
3219 full_len += strlen(*it) + 2;
3220 add_controllers = must_realloc(add_controllers, full_len + 1);
3221
3222 if (unified->controllers[0] == *it)
3223 add_controllers[0] = '\0';
3224
3225 (void)strlcat(add_controllers, "+", full_len + 1);
3226 (void)strlcat(add_controllers, *it, full_len + 1);
3227
3228 if ((it + 1) && *(it + 1))
3229 (void)strlcat(add_controllers, " ", full_len + 1);
3230 }
3231
3232 copy = strdup(cgroup);
3233 if (!copy)
3234 return false;
3235
3236 /*
3237 * Placing the write to cgroup.subtree_control before the open() is
3238 * intentional because of the cgroup2 delegation model. It enforces
3239 * that leaf cgroups don't have any controllers enabled for delegation.
3240 */
3241 dfd_cur = unified->dfd_base;
3242 lxc_iterate_parts(cur, copy, "/") {
3243 /*
3244 * Even though we vetted the paths when we parsed the config
3245 * we're paranoid here and check that the path is neither
3246 * absolute nor walks upwards.
3247 */
3248 if (abspath(cur))
3249 return syserrno_set(-EINVAL, "No absolute paths allowed");
3250
3251 if (strnequal(cur, "..", STRLITERALLEN("..")))
3252 return syserrno_set(-EINVAL, "No upward walking paths allowed");
3253
3254 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
3255 if (ret < 0)
3256 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3257
3258 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
3259
3260 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
3261 if (dfd_final < 0)
3262 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
3263 if (dfd_cur != unified->dfd_base)
3264 close(dfd_cur);
3265 /*
3266 * Leave dfd_final pointing to the last fd we opened so
3267 * it will be automatically zapped if we return early.
3268 */
3269 dfd_cur = dfd_final;
3270 }
3271
3272 return true;
3273 }
3274
3275 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
3276 {
3277 if (!ops)
3278 return ret_set_errno(false, ENOENT);
3279
3280 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
3281 }
3282
3283 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
3284 {
3285 if (!ops)
3286 return ret_set_errno(false, ENOENT);
3287
3288 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
3289 }
3290
3291 static void cg_unified_delegate(char ***delegate)
3292 {
3293 __do_free char *buf = NULL;
3294 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
3295 char *token;
3296 int idx;
3297
3298 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3299 if (!buf) {
3300 for (char **p = standard; p && *p; p++) {
3301 idx = append_null_to_list((void ***)delegate);
3302 (*delegate)[idx] = must_copy_string(*p);
3303 }
3304 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
3305 return;
3306 }
3307
3308 lxc_iterate_parts(token, buf, " \t\n") {
3309 /*
3310 * We always need to chown this for both cgroup and
3311 * cgroup2.
3312 */
3313 if (strequal(token, "cgroup.procs"))
3314 continue;
3315
3316 idx = append_null_to_list((void ***)delegate);
3317 (*delegate)[idx] = must_copy_string(token);
3318 }
3319 }
3320
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 */
/*
 * Initialize the cgroup hierarchies for a legacy (v1) or hybrid (v1+v2)
 * layout by walking /proc/self/mountinfo. For every cgroup mount found we
 * determine its controllers, mountpoint, the caller's current cgroup on it,
 * and whether that cgroup is writeable, then register it via add_hierarchy().
 * Also classifies ops->cgroup_layout (LEGACY/UNIFIED/HYBRID) as mounts are
 * discovered.
 *
 * @relative:     do not escape to PID 1's cgroup as base.
 * @unprivileged: container runs with an id mapping; request delegation files.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL, *line = NULL;
	__do_free_string_list char **klist = NULL, **nlist = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	/* Kernel-known (klist) and named-only (nlist) v1 controllers. */
	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "re");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		__do_free char *base_cgroup = NULL, *mountpoint = NULL;
		__do_free_string_list char **controller_list = NULL;
		int type;
		bool writeable;

		/* 0 means "not a cgroup mount"; otherwise the magic number. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only a single unified hierarchy can exist. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* State machine tracking what mix of v1/v2 mounts we've seen. */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		/* For v2 this may legitimately be NULL (no per-mount list). */
		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list)) {
				TRACE("Skipping duplicating controller");
				continue;
			}

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint) {
			WARN("Failed parsing mountpoint from \"%s\"", line);
			continue;
		}

		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(relative, basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup) {
			WARN("Failed to find current cgroup");
			continue;
		}

		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable) {
			TRACE("The %s group is not writeable", base_cgroup);
			continue;
		}

		/* add_hierarchy() takes ownership of the moved pointers. */
		if (type == CGROUP2_SUPER_MAGIC)
			ret = add_hierarchy(ops, NULL, move_ptr(mountpoint), move_ptr(base_cgroup), type);
		else
			ret = add_hierarchy(ops, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
		if (ret)
			return syserrno(ret, "Failed to add cgroup hierarchy");
		if (ops->unified && unprivileged)
			cg_unified_delegate(&(ops->unified)->cgroup2_chown);
	}

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3430
3431 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
3432 static char *cg_unified_get_current_cgroup(bool relative)
3433 {
3434 __do_free char *basecginfo = NULL, *copy = NULL;
3435 char *base_cgroup;
3436
3437 if (!relative && (geteuid() == 0))
3438 basecginfo = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3439 else
3440 basecginfo = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3441 if (!basecginfo)
3442 return NULL;
3443
3444 base_cgroup = strstr(basecginfo, "0::/");
3445 if (!base_cgroup)
3446 return NULL;
3447
3448 base_cgroup = base_cgroup + 3;
3449 copy = copy_to_eol(base_cgroup);
3450 if (!copy)
3451 return NULL;
3452 trim(copy);
3453
3454 if (!relative) {
3455 base_cgroup = prune_init_scope(copy);
3456 if (!base_cgroup)
3457 return NULL;
3458 } else {
3459 base_cgroup = copy;
3460 }
3461
3462 if (abspath(base_cgroup))
3463 base_cgroup = deabs(base_cgroup);
3464
3465 /* We're allowing base_cgroup to be "". */
3466 return strdup(base_cgroup);
3467 }
3468
3469 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3470 bool unprivileged)
3471 {
3472 __do_free char *base_cgroup = NULL;
3473 int ret;
3474
3475 base_cgroup = cg_unified_get_current_cgroup(relative);
3476 if (!base_cgroup)
3477 return ret_errno(EINVAL);
3478
3479 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3480 * we should verify here. The reason I'm not doing it right is that I'm
3481 * not convinced that lxc.cgroup.use will be the future since it is a
3482 * global property. I much rather have an option that lets you request
3483 * controllers per container.
3484 */
3485
3486 ret = add_hierarchy(ops, NULL,
3487 must_copy_string(DEFAULT_CGROUP_MOUNTPOINT),
3488 move_ptr(base_cgroup), CGROUP2_SUPER_MAGIC);
3489 if (ret)
3490 return syserrno(ret, "Failed to add unified cgroup hierarchy");
3491
3492 if (unprivileged)
3493 cg_unified_delegate(&(ops->unified)->cgroup2_chown);
3494
3495 if (bpf_devices_cgroup_supported())
3496 ops->unified->bpf_device_controller = 1;
3497
3498 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3499 return CGROUP2_SUPER_MAGIC;
3500 }
3501
/*
 * Common cgroup driver initialization: open the host cgroupfs mountpoint,
 * record requested controllers from lxc.cgroup.use, and dispatch to the
 * unified or hybrid init path depending on what is mounted.
 *
 * Returns 0 on success, negative errno-style value on failure.
 */
static int __cgroup_init(struct cgroup_ops *ops, struct lxc_conf *conf)
{
	__do_close int dfd = -EBADF;
	bool relative = conf->cgroup_meta.relative;
	int ret;
	const char *tmp;

	/* Guard against double initialization. */
	if (ops->dfd_mnt_cgroupfs_host >= 0)
		return ret_errno(EINVAL);

	/*
	 * I don't see the need for allowing symlinks here. If users want to
	 * have their hierarchy available in different locations I strongly
	 * suggest bind-mounts.
	 */
	dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
		      PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
	if (dfd < 0)
		return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);

	/* Record the comma-separated controllers the user asked for. */
	tmp = lxc_global_config_value("lxc.cgroup.use");
	if (tmp) {
		__do_free char *pin = NULL;
		char *chop, *cur;

		pin = must_copy_string(tmp);
		chop = pin;

		lxc_iterate_parts(cur, chop, ",")
			must_append_string(&ops->cgroup_use, cur);
	}

	/*
	 * Keep dfd referenced by the cleanup function and actually move the fd
	 * once we know the initialization succeeded. So if we fail we clean up
	 * the dfd.
	 */
	ops->dfd_mnt_cgroupfs_host = dfd;

	if (unified_cgroup_fd(dfd))
		ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
	else
		ret = cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
	if (ret < 0)
		return syserrno(ret, "Failed to initialize cgroups");

	/* Transfer ownership to cgroup_ops. */
	move_fd(dfd);
	return 0;
}
3552
3553 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3554 {
3555 const char *cgroup_pattern;
3556
3557 if (!ops)
3558 return ret_set_errno(-1, ENOENT);
3559
3560 /* copy system-wide cgroup information */
3561 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3562 if (cgroup_pattern && !strequal(cgroup_pattern, ""))
3563 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3564
3565 return 0;
3566 }
3567
/*
 * Allocate and initialize the cgfsng cgroup driver: detect the host cgroup
 * layout via __cgroup_init() and wire up the driver's operations table.
 *
 * Returns a heap-allocated struct cgroup_ops owned by the caller, or NULL on
 * failure (errno set to ENOMEM on allocation failure; otherwise whatever
 * __cgroup_init() set).
 */
struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
{
	__do_free struct cgroup_ops *cgfsng_ops = NULL;

	cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
	if (!cgfsng_ops)
		return ret_set_errno(NULL, ENOMEM);

	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
	/* Marks the host cgroupfs fd as "not yet opened" for __cgroup_init(). */
	cgfsng_ops->dfd_mnt_cgroupfs_host = -EBADF;

	if (__cgroup_init(cgfsng_ops, conf))
		return NULL;

	/* Operations table: lifecycle, limits, state, and criu hooks. */
	cgfsng_ops->data_init = cgfsng_data_init;
	cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
	cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
	cgfsng_ops->monitor_create = cgfsng_monitor_create;
	cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
	cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
	cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
	cgfsng_ops->payload_create = cgfsng_payload_create;
	cgfsng_ops->payload_enter = cgfsng_payload_enter;
	cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
	cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
	cgfsng_ops->get = cgfsng_get;
	cgfsng_ops->set = cgfsng_set;
	cgfsng_ops->freeze = cgfsng_freeze;
	cgfsng_ops->unfreeze = cgfsng_unfreeze;
	cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
	cgfsng_ops->setup_limits = cgfsng_setup_limits;
	cgfsng_ops->driver = "cgfsng";
	cgfsng_ops->version = "1.0.0";
	cgfsng_ops->attach = cgfsng_attach;
	cgfsng_ops->chown = cgfsng_chown;
	cgfsng_ops->mount = cgfsng_mount;
	cgfsng_ops->devices_activate = cgfsng_devices_activate;
	cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;

	cgfsng_ops->criu_escape = cgfsng_criu_escape;
	cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
	cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;

	/* Release __do_free ownership; the caller now owns the struct. */
	return move_ptr(cgfsng_ops);
}
3613
/*
 * Attach process @pid to the running container's unified cgroup, obtained via
 * the container's command socket. For id-mapped containers the attach is
 * performed from inside a minimal user namespace via userns_exec_minimal();
 * otherwise cgroup_attach_leaf() is used directly.
 *
 * Returns 0 on success, negative errno-style value on failure
 * (-ENOCGROUP2 when the container exposes no cgroup2 fd).
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	__do_close int unified_fd = -EBADF;
	int ret;

	if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
		return ret_errno(EINVAL);

	unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (unified_fd < 0)
		return ret_errno(ENOCGROUP2);

	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		/* NOTE(review): args.sk_pair appears to be consumed (and
		 * presumably closed) by the parent/child wrappers — confirm
		 * against their implementations. */
		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
3649
3650 /* Connects to command socket therefore isn't callable from command handler. */
3651 int cgroup_get(const char *name, const char *lxcpath,
3652 const char *filename, char *buf, size_t len)
3653 {
3654 __do_close int unified_fd = -EBADF;
3655 ssize_t ret;
3656
3657 if (is_empty_string(filename) || is_empty_string(name) ||
3658 is_empty_string(lxcpath))
3659 return ret_errno(EINVAL);
3660
3661 if ((buf && !len) || (len && !buf))
3662 return ret_errno(EINVAL);
3663
3664 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3665 if (unified_fd < 0)
3666 return ret_errno(ENOCGROUP2);
3667
3668 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3669 if (ret < 0)
3670 SYSERROR("Failed to read cgroup value");
3671
3672 return ret;
3673 }
3674
3675 /* Connects to command socket therefore isn't callable from command handler. */
3676 int cgroup_set(const char *name, const char *lxcpath,
3677 const char *filename, const char *value)
3678 {
3679 __do_close int unified_fd = -EBADF;
3680 ssize_t ret;
3681
3682 if (is_empty_string(filename) || is_empty_string(value) ||
3683 is_empty_string(name) || is_empty_string(lxcpath))
3684 return ret_errno(EINVAL);
3685
3686 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3687 if (unified_fd < 0)
3688 return ret_errno(ENOCGROUP2);
3689
3690 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3691 struct device_item device = {};
3692
3693 ret = device_cgroup_rule_parse(&device, filename, value);
3694 if (ret < 0)
3695 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3696
3697 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3698 } else {
3699 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3700 }
3701
3702 return ret;
3703 }
3704
3705 static int do_cgroup_freeze(int unified_fd,
3706 const char *state_string,
3707 int state_num,
3708 int timeout,
3709 const char *epoll_error,
3710 const char *wait_error)
3711 {
3712 __do_close int events_fd = -EBADF;
3713 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3714 int ret;
3715 struct lxc_epoll_descr descr = {};
3716
3717 if (timeout != 0) {
3718 ret = lxc_mainloop_open(&descr);
3719 if (ret)
3720 return log_error_errno(-1, errno, "%s", epoll_error);
3721
3722 /* automatically cleaned up now */
3723 descr_ptr = &descr;
3724
3725 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3726 if (events_fd < 0)
3727 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3728
3729 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3730 if (ret < 0)
3731 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3732 }
3733
3734 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3735 if (ret < 0)
3736 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3737
3738 if (timeout != 0) {
3739 ret = lxc_mainloop(&descr, timeout);
3740 if (ret)
3741 return log_error_errno(-1, errno, "%s", wait_error);
3742 }
3743
3744 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3745 }
3746
/* Freeze the cgroup behind @unified_fd; "1" requests the frozen state. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "1", 1, timeout,
				"Failed to create epoll instance to wait for container freeze",
				"Failed to wait for container to be frozen");
}
3753
3754 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3755 {
3756 __do_close int unified_fd = -EBADF;
3757 int ret;
3758
3759 if (is_empty_string(name) || is_empty_string(lxcpath))
3760 return ret_errno(EINVAL);
3761
3762 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3763 if (unified_fd < 0)
3764 return ret_errno(ENOCGROUP2);
3765
3766 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3767 ret = __cgroup_freeze(unified_fd, timeout);
3768 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3769 return ret;
3770 }
3771
/* Unfreeze the cgroup behind @unified_fd; "0" requests the thawed state.
 *
 * Fixed copy-pasted log messages: they previously said "freeze"/"frozen"
 * even though this is the unfreeze path.
 */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3778
3779 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3780 {
3781 __do_close int unified_fd = -EBADF;
3782 int ret;
3783
3784 if (is_empty_string(name) || is_empty_string(lxcpath))
3785 return ret_errno(EINVAL);
3786
3787 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3788 if (unified_fd < 0)
3789 return ret_errno(ENOCGROUP2);
3790
3791 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3792 ret = __cgroup_unfreeze(unified_fd, timeout);
3793 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3794 return ret;
3795 }