]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
cgroups: flatten hierarchy
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/types.h>
31 #include <unistd.h>
32
33 #include "caps.h"
34 #include "cgroup.h"
35 #include "cgroup2_devices.h"
36 #include "cgroup_utils.h"
37 #include "commands.h"
38 #include "conf.h"
39 #include "config.h"
40 #include "log.h"
41 #include "macro.h"
42 #include "mainloop.h"
43 #include "memory_utils.h"
44 #include "storage/storage.h"
45 #include "utils.h"
46
47 #ifndef HAVE_STRLCPY
48 #include "include/strlcpy.h"
49 #endif
50
51 #ifndef HAVE_STRLCAT
52 #include "include/strlcat.h"
53 #endif
54
55 lxc_log_define(cgfsng, cgroup);
56
/* Free a NULL-terminated array of heap-allocated strings together with the
 * array itself. A NULL argument is a no-op.
 */
static void free_string_list(char **clist)
{
	if (!clist)
		return;

	for (char **it = clist; *it; it++)
		free(*it);

	free(clist);
}
69
/* Given a pointer to a NULL-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Does not fail. Returns the index of
 * the freshly available slot (the list stays NULL-terminated after it).
 */
static int append_null_to_list(void ***list)
{
	int idx = 0;

	/* Count existing entries, if any. */
	if (*list)
		while ((*list)[idx])
			idx++;

	/* One slot for the caller, one for the NULL terminator. */
	*list = must_realloc(*list, (idx + 2) * sizeof(void **));
	(*list)[idx + 1] = NULL;
	return idx;
}
87
/* Return true if @entry occurs in the NULL-terminated string array @list.
 * A NULL list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++)
		if (strcmp(*it, entry) == 0)
			return true;

	return false;
}
104
/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Does not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t entry_len = strlen(entry);
	char *prefixed;

	/* "name=" plus the entry plus the terminating NUL. */
	prefixed = must_realloc(NULL, STRLITERALLEN("name=") + entry_len + 1);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	/* entry_len + 1 also copies @entry's NUL terminator. */
	memcpy(prefixed + STRLITERALLEN("name="), entry, entry_len + 1);

	return prefixed;
}
122
/* Append controller @entry to @clist, which must be NULL on the first call.
 * Does not fail; the list stays NULL-terminated.
 *
 * Named (non-kernel) subsystems are stored with a "name=" prefix. A controller
 * that is both a kernel and a named subsystem is ambiguous and is refused.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int idx;
	char *dup;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	idx = append_null_to_list((void ***)clist);

	/* Already prefixed, or a genuine kernel subsystem: copy verbatim.
	 * Otherwise it is a named subsystem and gets the "name=" prefix. */
	if (strncmp(entry, "name=", 5) == 0 || string_in_list(klist, entry))
		dup = must_copy_string(entry);
	else
		dup = cg_legacy_must_prefix_named(entry);

	(*clist)[idx] = dup;
}
157
158 static inline bool pure_unified_layout(const struct cgroup_ops *ops)
159 {
160 return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED;
161 }
162
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	int i;

	/* Preset errno so callers that only see a NULL return still get a
	 * meaningful error code. */
	errno = ENOENT;

	if (!ops->hierarchies) {
		TRACE("There are no useable cgroup controllers");
		return NULL;
	}

	for (i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			/* On a pure cgroup2 layout the "devices" controller is
			 * handled via bpf; return the unified hierarchy only if
			 * a bpf device controller is present. */
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return NULL;
}
202
203 #define BATCH_SIZE 50
204 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
205 {
206 int newbatches = (newlen / BATCH_SIZE) + 1;
207 int oldbatches = (oldlen / BATCH_SIZE) + 1;
208
209 if (!*mem || newbatches > oldbatches) {
210 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
211 }
212 }
213
/* Append @newlen bytes of @new (plus its NUL terminator) to the @oldlen bytes
 * already stored in *@dest, growing the buffer in batches as needed.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t total = oldlen + newlen;

	/* +1 so the terminating NUL always fits. */
	batch_realloc(dest, oldlen, total + 1);

	/* newlen + 1 copies the NUL terminator along with the data. */
	memcpy(*dest + oldlen, new, newlen + 1);
}
222
223 /* Slurp in a whole file */
224 static char *read_file(const char *fnam)
225 {
226 __do_free char *line = NULL;
227 __do_fclose FILE *f = NULL;
228 int linelen;
229 char *buf = NULL;
230 size_t len = 0, fulllen = 0;
231
232 f = fopen(fnam, "r");
233 if (!f)
234 return NULL;
235 while ((linelen = getline(&line, &len, f)) != -1) {
236 append_line(&buf, fulllen, line, linelen);
237 fulllen += linelen;
238 }
239 return buf;
240 }
241
242 /* Taken over modified from the kernel sources. */
243 #define NBITS 32 /* bits in uint32_t */
244 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
245 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
246
247 static void set_bit(unsigned bit, uint32_t *bitarr)
248 {
249 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
250 }
251
252 static void clear_bit(unsigned bit, uint32_t *bitarr)
253 {
254 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
255 }
256
257 static bool is_set(unsigned bit, uint32_t *bitarr)
258 {
259 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
260 }
261
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * Returns a calloc()ed array of BITS_TO_LONGS(nbits) words, or NULL on
 * allocation failure, on a descending range, or if the list names a cpu
 * >= @nbits. @buf is mutated by lxc_iterate_parts() during tokenization.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	char *token;
	size_t arrlen;
	uint32_t *bitarr;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return NULL;

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		/* Each token is either a single cpu "N" or a range "N-M". */
		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		/* Reject descending ranges such as "3-1". */
		if (!(start <= end)) {
			free(bitarr);
			return NULL;
		}

		/* Reject cpus outside the mask; this keeps set_bit() below in
		 * bounds. */
		if (end >= nbits) {
			free(bitarr);
			return NULL;
		}

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return bitarr;
}
308
309 /* Turn cpumask into simple, comma-separated cpulist. */
310 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
311 {
312 int ret;
313 size_t i;
314 char *tmp = NULL;
315 char **cpulist = NULL;
316 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
317
318 for (i = 0; i <= nbits; i++) {
319 if (!is_set(i, bitarr))
320 continue;
321
322 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
323 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
324 lxc_free_array((void **)cpulist, free);
325 return NULL;
326 }
327
328 ret = lxc_append_string(&cpulist, numstr);
329 if (ret < 0) {
330 lxc_free_array((void **)cpulist, free);
331 return NULL;
332 }
333 }
334
335 if (!cpulist)
336 return NULL;
337
338 tmp = lxc_string_join(",", (const char **)cpulist, false);
339 lxc_free_array((void **)cpulist, free);
340
341 return tmp;
342 }
343
/* Return the highest cpu number mentioned in a kernel cpulist such as
 * "0,2-7", or -1 on parse failure. Relies on the kernel emitting ordered
 * lists, so the maximum is the last number in the string.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *last = cpulist;
	size_t cpus = 0;

	/* The last number begins right after the last ',' or '-', whichever
	 * occurs later in the string; with neither present the list is a
	 * single number. The old code compared possibly-NULL pointers with
	 * relational operators (undefined behavior per C11 6.5.8) and carried
	 * an unreachable "!c1 && c2" branch; both are gone here.
	 */
	c1 = strrchr(cpulist, ',');
	if (c1 && c1 + 1 > last)
		last = c1 + 1;

	c2 = strrchr(cpulist, '-');
	if (c2 && c2 + 1 > last)
		last = c2 + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
374
375 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
376 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
377 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
378 {
379 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
380 *offlinecpus = NULL, *posscpus = NULL;
381 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
382 *possmask = NULL;
383 int ret;
384 ssize_t i;
385 char *lastslash;
386 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
387 bool bret = false, flipped_bit = false;
388
389 lastslash = strrchr(path, '/');
390 if (!lastslash) {
391 ERROR("Failed to detect \"/\" in \"%s\"", path);
392 return bret;
393 }
394 *lastslash = '\0';
395 fpath = must_make_path(path, "cpuset.cpus", NULL);
396 *lastslash = '/';
397 posscpus = read_file(fpath);
398 if (!posscpus) {
399 SYSERROR("Failed to read file \"%s\"", fpath);
400 return false;
401 }
402
403 /* Get maximum number of cpus found in possible cpuset. */
404 maxposs = get_max_cpus(posscpus);
405 if (maxposs < 0 || maxposs >= INT_MAX - 1)
406 return false;
407
408 if (file_exists(__ISOL_CPUS)) {
409 isolcpus = read_file(__ISOL_CPUS);
410 if (!isolcpus) {
411 SYSERROR("Failed to read file \"%s\"", __ISOL_CPUS);
412 return false;
413 }
414
415 if (isdigit(isolcpus[0])) {
416 /* Get maximum number of cpus found in isolated cpuset. */
417 maxisol = get_max_cpus(isolcpus);
418 if (maxisol < 0 || maxisol >= INT_MAX - 1)
419 return false;
420 }
421
422 if (maxposs < maxisol)
423 maxposs = maxisol;
424 maxposs++;
425 } else {
426 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
427 }
428
429 if (file_exists(__OFFLINE_CPUS)) {
430 offlinecpus = read_file(__OFFLINE_CPUS);
431 if (!offlinecpus) {
432 SYSERROR("Failed to read file \"%s\"", __OFFLINE_CPUS);
433 return false;
434 }
435
436 if (isdigit(offlinecpus[0])) {
437 /* Get maximum number of cpus found in offline cpuset. */
438 maxoffline = get_max_cpus(offlinecpus);
439 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
440 return false;
441 }
442
443 if (maxposs < maxoffline)
444 maxposs = maxoffline;
445 maxposs++;
446 } else {
447 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
448 }
449
450 if ((maxisol == 0) && (maxoffline == 0)) {
451 cpulist = move_ptr(posscpus);
452 goto copy_parent;
453 }
454
455 possmask = lxc_cpumask(posscpus, maxposs);
456 if (!possmask) {
457 ERROR("Failed to create cpumask for possible cpus");
458 return false;
459 }
460
461 if (maxisol > 0) {
462 isolmask = lxc_cpumask(isolcpus, maxposs);
463 if (!isolmask) {
464 ERROR("Failed to create cpumask for isolated cpus");
465 return false;
466 }
467 }
468
469 if (maxoffline > 0) {
470 offlinemask = lxc_cpumask(offlinecpus, maxposs);
471 if (!offlinemask) {
472 ERROR("Failed to create cpumask for offline cpus");
473 return false;
474 }
475 }
476
477 for (i = 0; i <= maxposs; i++) {
478 if ((isolmask && !is_set(i, isolmask)) ||
479 (offlinemask && !is_set(i, offlinemask)) ||
480 !is_set(i, possmask))
481 continue;
482
483 flipped_bit = true;
484 clear_bit(i, possmask);
485 }
486
487 if (!flipped_bit) {
488 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
489 TRACE("No isolated or offline cpus present in cpuset");
490 } else {
491 cpulist = move_ptr(posscpus);
492 TRACE("Removed isolated or offline cpus from cpuset");
493 }
494 if (!cpulist) {
495 ERROR("Failed to create cpu list");
496 return false;
497 }
498
499 copy_parent:
500 if (!am_initialized) {
501 ret = lxc_write_openat(path, "cpuset.cpus", cpulist, strlen(cpulist));
502 if (ret < 0)
503 return log_error_errno(false,
504 errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
505 path);
506
507 TRACE("Copied cpu settings of parent cgroup");
508 }
509
510 return true;
511 }
512
/* Copy contents of parent(@path)/@file to @path/@file */
static bool copy_parent_file(char *path, char *file)
{
	__do_free char *parent_path = NULL, *value = NULL;
	int len = 0;
	char *lastslash = NULL;
	int ret;

	/* The parent cgroup is @path with its last component stripped. */
	lastslash = strrchr(path, '/');
	if (!lastslash)
		return log_error_errno(false, ENOENT,
				       "Failed to detect \"/\" in \"%s\"", path);

	/* Temporarily truncate @path at the last '/' to build the parent's
	 * file path, then restore it. */
	*lastslash = '\0';
	parent_path = must_make_path(path, file, NULL);
	*lastslash = '/';

	/* First read with a NULL buffer to learn the required size. */
	len = lxc_read_from_file(parent_path, NULL, 0);
	if (len <= 0)
		return log_error_errno(false, errno,
				       "Failed to determine buffer size");

	value = must_realloc(NULL, len + 1);
	value[len] = '\0';
	ret = lxc_read_from_file(parent_path, value, len);
	if (ret != len)
		return log_error_errno(false, errno,
				       "Failed to read from parent file \"%s\"",
				       parent_path);

	/* EACCES is tolerated: the file may legitimately not be writable in
	 * the child. */
	ret = lxc_write_openat(path, file, value, len);
	if (ret < 0 && errno != EACCES)
		return log_error_errno(false,
				       errno, "Failed to write \"%s\" to file \"%s/%s\"",
				       value, path, file);
	return true;
}
550
551 static bool is_unified_hierarchy(const struct hierarchy *h)
552 {
553 return h->version == CGROUP2_SUPER_MAGIC;
554 }
555
/* Initialize the cpuset hierarchy in first directory of @gname and set
 * cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * returns -1 on error, 0 when we didn't created a cgroup, 1 if we created a
 * cgroup.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
{
	int fret = -1;
	__do_free char *cgpath = NULL;
	__do_close_prot_errno int cgroup_fd = -EBADF;
	int ret;
	char v;
	char *slash;

	/* cpuset initialization is a legacy (v1) concern only. */
	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	/* Only the first path component of @cgname matters here; temporarily
	 * terminate the string at the first '/' (restored below). NOTE: this
	 * mutates @cgname, so callers must not pass a read-only literal. */
	if (*cgname == '/')
		cgname++;
	slash = strchr(cgname, '/');
	if (slash)
		*slash = '\0';

	cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
	if (slash)
		*slash = '/';

	/* Optimistically assume we create the directory; downgrade to 0 on
	 * EEXIST. */
	fret = 1;
	ret = mkdir(cgpath, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", cgpath);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(cgpath);
	if (cgroup_fd < 0)
		return -1;

	/* v is '1' if clone_children was already enabled by someone else. */
	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", cgpath);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* copy parent's settings */
	if (!copy_parent_file(cgpath, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", cgpath);

	return fret;
}
625
/* Given two NULL-terminated lists of strings, return true if any string
 * occurs in both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (char **it = l1; *it; it++)
		if (string_in_list(l2, *it))
			return true;

	return false;
}
643
644 /* For a null-terminated list of controllers @clist, return true if any of those
645 * controllers is already listed the null-terminated list of hierarchies @hlist.
646 * Realistically, if one is present, all must be present.
647 */
648 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
649 {
650 int i;
651
652 if (!hlist)
653 return false;
654
655 for (i = 0; hlist[i]; i++)
656 if (controller_lists_intersect(hlist[i]->controllers, clist))
657 return true;
658
659 return false;
660 }
661
662 /* Return true if the controller @entry is found in the null-terminated list of
663 * hierarchies @hlist.
664 */
665 static bool controller_found(struct hierarchy **hlist, char *entry)
666 {
667 int i;
668
669 if (!hlist)
670 return false;
671
672 for (i = 0; hlist[i]; i++)
673 if (string_in_list(hlist[i]->controllers, entry))
674 return true;
675
676 return false;
677 }
678
679 /* Return true if all of the controllers which we require have been found. The
680 * required list is freezer and anything in lxc.cgroup.use.
681 */
682 static bool all_controllers_found(struct cgroup_ops *ops)
683 {
684 char **cur;
685 struct hierarchy **hlist = ops->hierarchies;
686
687 if (!ops->cgroup_use)
688 return true;
689
690 for (cur = ops->cgroup_use; cur && *cur; cur++)
691 if (!controller_found(hlist, *cur)) {
692 ERROR("No %s controller mountpoint found", *cur);
693 return false;
694 }
695
696 return true;
697 }
698
/* Get the controllers from a mountinfo line There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";
	char **aret = NULL;

	/* Skip the first four space-separated mountinfo fields. */
	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	/* NOTE(review): 15 is presumably strlen(DEFAULT_CGROUP_MOUNTPOINT "/")
	 * == strlen("/sys/fs/cgroup/") — confirm against the macro definition. */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) {
		ERROR("Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);
		return NULL;
	}

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2) {
		ERROR("Corrupt mountinfo");
		return NULL;
	}
	/* Temporarily terminate the mountpoint field; restored below. */
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts (tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return aret;
}
756
/* Allocate a controller list holding no controllers — just the NULL
 * terminator — used to represent the empty unified hierarchy.
 */
static char **cg_unified_make_empty_controller(void)
{
	char **aret = NULL;
	int idx = append_null_to_list((void ***)&aret);

	aret[idx] = NULL;
	return aret;
}
766
767 static char **cg_unified_get_controllers(const char *file)
768 {
769 __do_free char *buf = NULL;
770 char *sep = " \t\n";
771 char **aret = NULL;
772 char *tok;
773
774 buf = read_file(file);
775 if (!buf)
776 return NULL;
777
778 lxc_iterate_parts(tok, buf, sep) {
779 int newentry;
780 char *copy;
781
782 newentry = append_null_to_list((void ***)&aret);
783 copy = must_copy_string(tok);
784 aret[newentry] = copy;
785 }
786
787 return aret;
788 }
789
790 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
791 char *container_base_path, int type)
792 {
793 struct hierarchy *new;
794 int newentry;
795
796 new = must_realloc(NULL, sizeof(*new));
797 new->controllers = clist;
798 new->mountpoint = mountpoint;
799 new->container_base_path = container_base_path;
800 new->container_full_path = NULL;
801 new->monitor_full_path = NULL;
802 new->version = type;
803 new->cgroup2_chown = NULL;
804
805 newentry = append_null_to_list((void ***)h);
806 (*h)[newentry] = new;
807 return new;
808 }
809
810 /* Get a copy of the mountpoint from @line, which is a line from
811 * /proc/self/mountinfo.
812 */
813 static char *cg_hybrid_get_mountpoint(char *line)
814 {
815 int i;
816 size_t len;
817 char *p2;
818 char *p = line, *sret = NULL;
819
820 for (i = 0; i < 4; i++) {
821 p = strchr(p, ' ');
822 if (!p)
823 return NULL;
824 p++;
825 }
826
827 if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
828 return NULL;
829
830 p2 = strchr(p + 15, ' ');
831 if (!p2)
832 return NULL;
833 *p2 = '\0';
834
835 len = strlen(p);
836 sret = must_realloc(NULL, len + 1);
837 memcpy(sret, p, len);
838 sret[len] = '\0';
839 return sret;
840 }
841
/* Given a multi-line string, return a NUL-terminated copy of the current line,
 * or NULL if the line is not newline-terminated.
 */
static char *copy_to_eol(char *p)
{
	char *nl, *copy;
	size_t len;

	nl = strchr(p, '\n');
	if (!nl)
		return NULL;

	len = nl - p;
	copy = must_realloc(NULL, len + 1);
	memcpy(copy, p, len);
	copy[len] = '\0';
	return copy;
}
857
/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	/* The controller list ends at the next ':' (before the cgroup path). */
	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	/* Work on a copy: lxc_iterate_parts() mutates its input. */
	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	/* Controllers are comma-separated, e.g. "cpu,cpuacct". */
	lxc_iterate_parts(tok, tmp, ",")
		if (strcmp(tok, c) == 0)
			return true;

	return false;
}
882
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	/* Walk the file line by line; each line is "id:controllers:path". */
	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		/* Advance past the hierarchy id to the controller list. */
		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			/* Advance past the controller list to the cgroup path
			 * and return a copy of it. */
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		/* Not this line: skip to the start of the next one. */
		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}
917
/* Append a copy of @entry to the NULL-terminated string array *@list, growing
 * it by one slot. Does not fail.
 */
static void must_append_string(char ***list, char *entry)
{
	int idx = append_null_to_list((void ***)list);

	(*list)[idx] = must_copy_string(entry);
}
927
/* Parse /proc/self/cgroup and sort the mounted subsystems into kernel
 * subsystems (@klist) and named subsystems (@nlist). Returns 0 on success,
 * -1 if /proc/self/cgroup cannot be opened.
 */
static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "r");
	if (!f)
		return -1;

	/* Each line has the form "id:controller-list:path". */
	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		/* v1 lines may list several comma-separated controllers;
		 * "name=" entries are named subsystems, the rest kernel ones. */
		lxc_iterate_parts(tok, p, ",") {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	return 0;
}
971
/* Strip all trailing newlines from @s in place. */
static void trim(char *s)
{
	size_t len;

	len = strlen(s);
	/* len > 0, not len > 1: the old bound left a string consisting of a
	 * single "\n" untrimmed. */
	while ((len > 0) && (s[len - 1] == '\n'))
		s[--len] = '\0';
}
980
981 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
982 {
983 int i;
984 struct hierarchy **it;
985
986 if (!ops->hierarchies) {
987 TRACE(" No hierarchies found");
988 return;
989 }
990
991 TRACE(" Hierarchies:");
992 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
993 int j;
994 char **cit;
995
996 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
997 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
998 TRACE(" controllers:");
999 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1000 TRACE(" %d: %s", j, *cit);
1001 }
1002 }
1003
/* Dump the raw /proc/$$/cgroup contents plus the detected kernel and named
 * subsystem lists at TRACE level.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k = 0;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (char **it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	k = 0;
	for (char **it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}
1019
1020 static int cgroup_rmdir(struct hierarchy **hierarchies,
1021 const char *container_cgroup)
1022 {
1023 int i;
1024
1025 if (!container_cgroup || !hierarchies)
1026 return 0;
1027
1028 for (i = 0; hierarchies[i]; i++) {
1029 int ret;
1030 struct hierarchy *h = hierarchies[i];
1031
1032 if (!h->container_full_path)
1033 continue;
1034
1035 ret = recursive_destroy(h->container_full_path);
1036 if (ret < 0)
1037 WARN("Failed to destroy \"%s\"", h->container_full_path);
1038
1039 free(h->container_full_path);
1040 h->container_full_path = NULL;
1041 }
1042
1043 return 0;
1044 }
1045
/* Context handed to callbacks executed inside the container's user namespace
 * (see cgroup_rmdir_wrapper()).
 */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies;	/* all discovered cgroup hierarchies */
	const char *container_cgroup;	/* name of the container's cgroup */
	struct lxc_conf *conf;		/* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path;			/* scratch path; meaning depends on the callback — TODO confirm */
};
1053
/* Callback run via userns_exec_1(): drop to the container's root (or mapped
 * init) uid/gid inside the user namespace, then remove the container's
 * cgroups. Returns 0 on success, -1 on failure.
 */
static int cgroup_rmdir_wrapper(void *data)
{
	int ret;
	struct generic_userns_exec_data *arg = data;
	/* With an id map present, uid/gid 0 inside the namespace is the
	 * container root; otherwise fall back to the configured init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* Drop gid before uid: once the uid is dropped we may no longer have
	 * the privilege to change groups. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0) {
		SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
			 (int)nsgid, (int)nsgid);
		return -1;
	}

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0) {
		SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
			 (int)nsuid, (int)nsuid);
		return -1;
	}

	/* Clearing supplementary groups can legitimately fail with EPERM in
	 * an unprivileged user namespace; tolerate that. */
	ret = setgroups(0, NULL);
	if (ret < 0 && errno != EPERM) {
		SYSERROR("Failed to setgroups(0, NULL)");
		return -1;
	}

	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
}
1083
/* Tear down the container's (payload) cgroups: detach any cgroup2 bpf device
 * program, then remove the per-hierarchy cgroup directories — inside the
 * container's user namespace when an id map is configured.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;
	struct generic_userns_exec_data wrap;

	/* log_error_errno(return, ...) logs and then executes "return". */
	if (!ops)
		log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");

	if (!ops->hierarchies)
		return;

	if (!handler)
		log_error_errno(return, EINVAL, "Called with uninitialized handler");

	if (!handler->conf)
		log_error_errno(return, EINVAL, "Called with uninitialized conf");

	wrap.origuid = 0;
	wrap.container_cgroup = ops->container_cgroup;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = handler->conf;

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	/* With an id map the cgroup directories belong to mapped ids, so the
	 * removal must run inside the container's user namespace. */
	if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	else
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	if (ret < 0) {
		WARN("Failed to destroy cgroups");
		return;
	}
}
1123
/* Tear down the monitor's cgroups: for every hierarchy, move the monitor
 * process into a pivot cgroup first (a process cannot remove the cgroup it
 * lives in), then recursively destroy its old cgroup directory.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	struct lxc_conf *conf;

	if (!ops)
		log_error_errno(return, ENOENT, "Called with uninitialized cgroup operations");

	if (!ops->hierarchies)
		return;

	if (!handler)
		log_error_errno(return, EINVAL, "Called with uninitialized handler");

	if (!handler->conf)
		log_error_errno(return, EINVAL, "Called with uninitialized conf");

	conf = handler->conf;

	/* Render the monitor pid once; reused for every hierarchy below. */
	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		char pivot_cgroup[] = CGROUP_PIVOT;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		/* Honor a configured lxc.cgroup.dir when placing the pivot. */
		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		/*
		 * Make sure not to pass in the ro string literal CGROUP_PIVOT
		 * here.
		 */
		if (cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup) < 0)
			log_warn_errno(continue, errno, "Failed to handle legacy cpuset controller");

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST)
			log_warn_errno(continue, errno,
				       "Failed to create cgroup \"%s\"\n",
				       pivot_path);

		/*
		 * Move ourselves into the pivot cgroup to delete our own
		 * cgroup.
		 */
		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0)
			log_warn_errno(continue, errno,
				       "Failed to move monitor %s to \"%s\"\n",
				       pidstr, pivot_path);

		ret = recursive_destroy(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}
1196
/* Create @dir and all missing parents, like mkdir -p, except that EEXIST is
 * only tolerated for the intermediate components: if the FINAL component
 * already exists, this fails. Returns 0 on success, -1 on error.
 */
static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		/* Advance over any '/' run, then over the next component;
		 * [orig, dir) is the prefix to create this iteration. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		/* strndup() only fails on OOM; preset errno accordingly. */
		errno = ENOMEM;
		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return -1;

		ret = mkdir(makeme, mode);
		if (ret < 0) {
			/* cur_len == orig_len means this is the last
			 * component, where EEXIST is an error too. */
			if ((errno != EEXIST) || (orig_len == cur_len)) {
				SYSERROR("Failed to create directory \"%s\"", makeme);
				return -1;
			}
		}
	} while (tmp != dir);

	return 0;
}
1229
/* Create the cgroup directory @cgroup_leaf for hierarchy @h and record the
 * resulting path as either the container (payload) or monitor path.
 * Fails if the directory already exists. Returns true on success.
 */
static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
			       char *cgroup_leaf, bool payload)
{
	__do_free char *path = NULL;
	int ret, ret_cpuset;

	path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
	if (dir_exists(path))
		return log_warn_errno(false, errno, "The %s cgroup already existed", path);

	/* Legacy cpuset needs special initialization before use; this may
	 * itself create the first directory component (returns 1 then). */
	ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
	if (ret_cpuset < 0)
		return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");

	ret = mkdir_eexist_on_last(path, 0755);
	if (ret < 0) {
		/*
		 * This is the cpuset controller and
		 * cg_legacy_handle_cpuset_hierarchy() has created our target
		 * directory for us to ensure correct initialization.
		 */
		if (ret_cpuset != 1 || cgroup_tree)
			return log_error_errno(false, errno, "Failed to create %s cgroup", path);
	}

	/* Transfer ownership of @path into the hierarchy struct. */
	if (payload)
		h->container_full_path = move_ptr(path);
	else
		h->monitor_full_path = move_ptr(path);

	return true;
}
1262
1263 static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
1264 {
1265 __do_free char *full_path = NULL;
1266
1267 if (payload)
1268 full_path = h->container_full_path;
1269 else
1270 full_path = h->monitor_full_path;
1271
1272 if (rmdir(full_path))
1273 SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
1274
1275 if (payload)
1276 h->container_full_path = NULL;
1277 else
1278 h->monitor_full_path = NULL;
1279 }
1280
/*
 * Create the monitor cgroup (where the lxc monitor process will live) in
 * all hierarchies. The name is "<prefix><name>" (optionally nested below
 * lxc.cgroup.dir); on collision a "-<idx>" suffix is appended and the
 * creation is retried, up to 1000 times.
 *
 * Returns true on success, false with errno set on failure.
 */
__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;
	cgroup_tree = conf->cgroup_meta.dir;

	/* CGROUP_CREATE_RETRY reserves room for the "-<idx>" retry suffix. */
	if (cgroup_tree)
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	else
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Cut off the placeholder; rewritten below on each retry. */
	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		/* Try to create the cgroup in every hierarchy ... */
		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			/* ... and undo any partial creation on conflict. */
			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}
1344
1345 /*
1346 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1347 * next cgroup_pattern-1, -2, ..., -999.
1348 */
/*
 * Create the payload (container) cgroup in all hierarchies; see the retry
 * scheme described in the comment above. On success, the unified
 * hierarchy's cgroup is additionally opened and cached in ops->unified_fd.
 *
 * Returns true on success, false with errno set on failure.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;
	cgroup_tree = conf->cgroup_meta.dir;

	/* CGROUP_CREATE_RETRY reserves room for the "-<idx>" retry suffix. */
	if (cgroup_tree)
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	else
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	/* Cut off the placeholder; rewritten below on each retry. */
	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			/* Conflict: roll back and retry with the next suffix. */
			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	if (ops->unified && ops->unified->container_full_path) {
		int ret;

		/* Cache a directory fd for later unified-hierarchy operations. */
		ret = open(ops->unified->container_full_path,
			   O_DIRECTORY | O_RDONLY | O_CLOEXEC);
		if (ret < 0)
			return log_error_errno(false,
					       errno, "Failed to open file descriptor for unified hierarchy");
		ops->unified_fd = ret;
	}

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}
1424
1425 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1426 struct lxc_handler *handler)
1427 {
1428 int monitor_len, transient_len;
1429 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1430 transient[INTTYPE_TO_STRLEN(pid_t)];
1431
1432 if (!ops)
1433 return ret_set_errno(false, ENOENT);
1434
1435 if (!ops->hierarchies)
1436 return true;
1437
1438 if (!ops->monitor_cgroup)
1439 return ret_set_errno(false, ENOENT);
1440
1441 if (!handler || !handler->conf)
1442 return ret_set_errno(false, EINVAL);
1443
1444 monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1445 if (handler->transient_pid > 0)
1446 transient_len = snprintf(transient, sizeof(transient), "%d",
1447 handler->transient_pid);
1448
1449 for (int i = 0; ops->hierarchies[i]; i++) {
1450 __do_free char *path = NULL;
1451 int ret;
1452
1453 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1454 "cgroup.procs", NULL);
1455 ret = lxc_writeat(-1, path, monitor, monitor_len);
1456 if (ret != 0)
1457 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1458
1459 if (handler->transient_pid < 0)
1460 return true;
1461
1462 ret = lxc_writeat(-1, path, transient, transient_len);
1463 if (ret != 0)
1464 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1465 }
1466 handler->transient_pid = -1;
1467
1468 return true;
1469 }
1470
1471 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1472 struct lxc_handler *handler)
1473 {
1474 int len;
1475 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1476
1477 if (!ops)
1478 return ret_set_errno(false, ENOENT);
1479
1480 if (!ops->hierarchies)
1481 return true;
1482
1483 if (!ops->container_cgroup)
1484 return ret_set_errno(false, ENOENT);
1485
1486 if (!handler || !handler->conf)
1487 return ret_set_errno(false, EINVAL);
1488
1489 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1490
1491 for (int i = 0; ops->hierarchies[i]; i++) {
1492 __do_free char *path = NULL;
1493 int ret;
1494
1495 path = must_make_path(ops->hierarchies[i]->container_full_path,
1496 "cgroup.procs", NULL);
1497 ret = lxc_writeat(-1, path, pidstr, len);
1498 if (ret != 0)
1499 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", path);
1500 }
1501
1502 return true;
1503 }
1504
/*
 * chown() then chmod() @path; logs a warning and returns -1 on the first
 * failure, 0 when both succeed.
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	/* Change ownership first; a failure aborts before the chmod. */
	if (chown(path, chown_uid, chown_gid) < 0) {
		SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
		return -1;
	}

	return 0;
}
1524
1525 /* chgrp the container cgroups to container group. We leave
1526 * the container owner as cgroup owner. So we must make the
1527 * directories 775 so that the container can create sub-cgroups.
1528 *
1529 * Also chown the tasks and cgroup.procs files. Those may not
1530 * exist depending on kernel version.
1531 */
1532 static int chown_cgroup_wrapper(void *data)
1533 {
1534 int ret;
1535 uid_t destuid;
1536 struct generic_userns_exec_data *arg = data;
1537 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1538 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1539
1540 ret = setresgid(nsgid, nsgid, nsgid);
1541 if (ret < 0)
1542 return log_error_errno(-1, errno,
1543 "Failed to setresgid(%d, %d, %d)",
1544 (int)nsgid, (int)nsgid, (int)nsgid);
1545
1546 ret = setresuid(nsuid, nsuid, nsuid);
1547 if (ret < 0)
1548 return log_error_errno(-1, errno,
1549 "Failed to setresuid(%d, %d, %d)",
1550 (int)nsuid, (int)nsuid, (int)nsuid);
1551
1552 ret = setgroups(0, NULL);
1553 if (ret < 0 && errno != EPERM)
1554 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1555
1556 destuid = get_ns_uid(arg->origuid);
1557 if (destuid == LXC_INVALID_UID)
1558 destuid = 0;
1559
1560 for (int i = 0; arg->hierarchies[i]; i++) {
1561 __do_free char *fullpath = NULL;
1562 char *path = arg->hierarchies[i]->container_full_path;
1563
1564 ret = chowmod(path, destuid, nsgid, 0775);
1565 if (ret < 0)
1566 log_info_errno(continue,
1567 errno, "Failed to change %s to uid %d and gid %d and mode 0755",
1568 path, destuid, nsgid);
1569
1570 /* Failures to chown() these are inconvenient but not
1571 * detrimental We leave these owned by the container launcher,
1572 * so that container root can write to the files to attach. We
1573 * chmod() them 664 so that container systemd can write to the
1574 * files (which systemd in wily insists on doing).
1575 */
1576
1577 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1578 fullpath = must_make_path(path, "tasks", NULL);
1579 ret = chowmod(fullpath, destuid, nsgid, 0664);
1580 if (ret < 0)
1581 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1582 fullpath, destuid, nsgid);
1583 }
1584
1585 fullpath = must_make_path(path, "cgroup.procs", NULL);
1586 ret = chowmod(fullpath, destuid, nsgid, 0664);
1587 if (ret < 0)
1588 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1589 fullpath, destuid, nsgid);
1590
1591 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1592 continue;
1593
1594 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
1595 fullpath = must_make_path(path, *p, NULL);
1596 ret = chowmod(fullpath, destuid, nsgid, 0664);
1597 if (ret < 0)
1598 SYSINFO("Failed to change %s to uid %d and gid %d and mode 0664",
1599 fullpath, destuid, nsgid);
1600 }
1601 }
1602
1603 return 0;
1604 }
1605
1606 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1607 struct lxc_conf *conf)
1608 {
1609 struct generic_userns_exec_data wrap;
1610
1611 if (!ops)
1612 return ret_set_errno(false, ENOENT);
1613
1614 if (!ops->hierarchies)
1615 return true;
1616
1617 if (!ops->container_cgroup)
1618 return ret_set_errno(false, ENOENT);
1619
1620 if (!conf)
1621 return ret_set_errno(false, EINVAL);
1622
1623 if (lxc_list_empty(&conf->id_map))
1624 return true;
1625
1626 wrap.origuid = geteuid();
1627 wrap.path = NULL;
1628 wrap.hierarchies = ops->hierarchies;
1629 wrap.conf = conf;
1630
1631 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1632 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1633
1634 return true;
1635 }
1636
1637 /* cgroup-full:* is done, no need to create subdirs */
1638 static bool cg_mount_needs_subdirs(int type)
1639 {
1640 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1641 return false;
1642
1643 return true;
1644 }
1645
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * Returns 0 on success, -1 on a mount failure (already logged).
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/*
	 * For read-only setups, self-bind-mount the controller directory
	 * first so the subsequent MS_REMOUNT|MS_RDONLY only affects this
	 * mountpoint and not the host's mount.
	 */
	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0) {
			SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
				 controllerpath, controllerpath);
			return -1;
		}

		/* Carry over the flags (nosuid, nodev, ...) the mount needs. */
		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", controllerpath);
			return -1;
		}

		INFO("Remounted %s read-only", controllerpath);
	}

	/* Bind the container's own cgroup directory onto the target path. */
	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0) {
		SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
		return -1;
	}
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* MS_RDONLY on a bind mount requires a second remount pass to stick. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0) {
			SYSERROR("Failed to remount \"%s\" ro", cgpath);
			return -1;
		}
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1706
1707 /* __cg_mount_direct
1708 *
1709 * Mount cgroup hierarchies directly without using bind-mounts. The main
1710 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1711 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1712 */
1713 static int __cg_mount_direct(int type, struct hierarchy *h,
1714 const char *controllerpath)
1715 {
1716 __do_free char *controllers = NULL;
1717 char *fstype = "cgroup2";
1718 unsigned long flags = 0;
1719 int ret;
1720
1721 flags |= MS_NOSUID;
1722 flags |= MS_NOEXEC;
1723 flags |= MS_NODEV;
1724 flags |= MS_RELATIME;
1725
1726 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1727 flags |= MS_RDONLY;
1728
1729 if (h->version != CGROUP2_SUPER_MAGIC) {
1730 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1731 if (!controllers)
1732 return -ENOMEM;
1733 fstype = "cgroup";
1734 }
1735
1736 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1737 if (ret < 0) {
1738 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1739 return -1;
1740 }
1741
1742 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1743 return 0;
1744 }
1745
/*
 * Mount a hierarchy for a container running in its own cgroup namespace;
 * thin wrapper around __cg_mount_direct().
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}
1751
1752 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1753 const char *controllerpath)
1754 {
1755 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1756 return 0;
1757
1758 return __cg_mount_direct(type, h, controllerpath);
1759 }
1760
/*
 * Set up /sys/fs/cgroup inside the container rootfs according to @type
 * (a LXC_AUTO_CGROUP_* mount option). When the kernel supports cgroup
 * namespaces and the container keeps CAP_SYS_ADMIN, it can mount cgroups
 * itself and nothing needs to be done here.
 *
 * Returns true on success, false on failure.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	__do_free char *cgroup_root = NULL;
	int ret;
	bool has_cgns = false, retval = false, wants_force_mount = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	/*
	 * A container that drops CAP_SYS_ADMIN cannot mount cgroups itself,
	 * so we must mount them on its behalf.
	 */
	if (!wants_force_mount){
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	/* Resolve the NOSPEC types to their concrete MIXED equivalents. */
	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			return cg_mount_in_cgroup_namespace(type, ops->unified,
							    cgroup_root) == 0;
		}

		return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
	}

	/* mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		goto on_error;

	/* One subdirectory per hierarchy, named after its mountpoint leaf. */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			log_error_errno(goto on_error, errno,
					"Error creating cgroup path: %s",
					controllerpath);

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			if (ret < 0)
				goto on_error;

			continue;
		}

		/* Only acts for the LXC_AUTO_CGROUP_FULL_* types. */
		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0)
			goto on_error;

		if (!cg_mount_needs_subdirs(type))
			continue;

		/* Create and bind-mount the container's own cgroup path. */
		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0)
			goto on_error;

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		if (ret < 0)
			goto on_error;
	}
	retval = true;

on_error:
	return retval;
}
1877
1878 static int recursive_count_nrtasks(char *dirname)
1879 {
1880 __do_free char *path = NULL;
1881 __do_closedir DIR *dir = NULL;
1882 struct dirent *direntp;
1883 int count = 0, ret;
1884
1885 dir = opendir(dirname);
1886 if (!dir)
1887 return 0;
1888
1889 while ((direntp = readdir(dir))) {
1890 struct stat mystat;
1891
1892 if (!strcmp(direntp->d_name, ".") ||
1893 !strcmp(direntp->d_name, ".."))
1894 continue;
1895
1896 path = must_make_path(dirname, direntp->d_name, NULL);
1897
1898 if (lstat(path, &mystat))
1899 continue;
1900
1901 if (!S_ISDIR(mystat.st_mode))
1902 continue;
1903
1904 count += recursive_count_nrtasks(path);
1905 }
1906
1907 path = must_make_path(dirname, "cgroup.procs", NULL);
1908 ret = lxc_count_file_lines(path);
1909 if (ret != -1)
1910 count += ret;
1911
1912 return count;
1913 }
1914
1915 __cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
1916 {
1917 __do_free char *path = NULL;
1918
1919 if (!ops)
1920 return ret_set_errno(-1, ENOENT);
1921
1922 if (!ops->container_cgroup || !ops->hierarchies)
1923 return ret_set_errno(-1, EINVAL);
1924
1925 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
1926 return recursive_count_nrtasks(path);
1927 }
1928
1929 /* Only root needs to escape to the cgroup of its init. */
1930 __cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
1931 struct lxc_conf *conf)
1932 {
1933 if (!ops)
1934 return ret_set_errno(false, ENOENT);
1935
1936 if (!ops->hierarchies)
1937 return true;
1938
1939 if (!conf)
1940 return ret_set_errno(false, EINVAL);
1941
1942 if (conf->cgroup_meta.relative || geteuid())
1943 return true;
1944
1945 for (int i = 0; ops->hierarchies[i]; i++) {
1946 __do_free char *fullpath = NULL;
1947 int ret;
1948
1949 fullpath =
1950 must_make_path(ops->hierarchies[i]->mountpoint,
1951 ops->hierarchies[i]->container_base_path,
1952 "cgroup.procs", NULL);
1953 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1954 if (ret != 0)
1955 return log_error_errno(false,
1956 errno, "Failed to escape to cgroup \"%s\"",
1957 fullpath);
1958 }
1959
1960 return true;
1961 }
1962
1963 __cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1964 {
1965 int i = 0;
1966
1967 if (!ops)
1968 return ret_set_errno(-1, ENOENT);
1969
1970 if (!ops->hierarchies)
1971 return 0;
1972
1973 for (; ops->hierarchies[i]; i++)
1974 ;
1975
1976 return i;
1977 }
1978
1979 __cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
1980 char ***out)
1981 {
1982 int i;
1983
1984 if (!ops)
1985 return ret_set_errno(false, ENOENT);
1986
1987 if (!ops->hierarchies)
1988 return false;
1989
1990 /* sanity check n */
1991 for (i = 0; i < n; i++)
1992 if (!ops->hierarchies[i])
1993 return ret_set_errno(false, ENOENT);
1994
1995 *out = ops->hierarchies[i]->controllers;
1996
1997 return true;
1998 }
1999
2000 static bool cg_legacy_freeze(struct cgroup_ops *ops)
2001 {
2002 struct hierarchy *h;
2003
2004 h = get_hierarchy(ops, "freezer");
2005 if (!h)
2006 return ret_set_errno(-1, ENOENT);
2007
2008 return lxc_write_openat(h->container_full_path, "freezer.state",
2009 "FROZEN", STRLITERALLEN("FROZEN"));
2010 }
2011
2012 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
2013 struct lxc_epoll_descr *descr)
2014 {
2015 __do_close_prot_errno int duped_fd = -EBADF;
2016 __do_free char *line = NULL;
2017 __do_fclose FILE *f = NULL;
2018 int state = PTR_TO_INT(cbdata);
2019 size_t len;
2020 const char *state_string;
2021
2022 duped_fd = dup(fd);
2023 if (duped_fd < 0)
2024 return LXC_MAINLOOP_ERROR;
2025
2026 if (lseek(duped_fd, 0, SEEK_SET) < (off_t)-1)
2027 return LXC_MAINLOOP_ERROR;
2028
2029 f = fdopen(duped_fd, "re");
2030 if (!f)
2031 return LXC_MAINLOOP_ERROR;
2032 move_fd(duped_fd);
2033
2034 if (state == 1)
2035 state_string = "frozen 1";
2036 else
2037 state_string = "frozen 0";
2038
2039 while (getline(&line, &len, f) != -1)
2040 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
2041 return LXC_MAINLOOP_CLOSE;
2042
2043 return LXC_MAINLOOP_CONTINUE;
2044 }
2045
/*
 * Freeze the container on the unified (cgroup v2) hierarchy by writing
 * "1" to cgroup.freeze. When @timeout is non-zero, wait on cgroup.events
 * via the mainloop until "frozen 1" is reported or the timeout expires.
 *
 * Returns 0 on success, -1 with errno set otherwise.
 */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	__do_close_prot_errno int fd = -EBADF;
	__do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		/* Watch cgroup.events for the "frozen 1" transition. */
		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* INT_TO_PTR(1): callback waits for the frozen state. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be frozen");

	return 0;
}
2090
2091 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2092 {
2093 if (!ops->hierarchies)
2094 return ret_set_errno(-1, ENOENT);
2095
2096 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2097 return cg_legacy_freeze(ops);
2098
2099 return cg_unified_freeze(ops, timeout);
2100 }
2101
2102 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2103 {
2104 struct hierarchy *h;
2105
2106 h = get_hierarchy(ops, "freezer");
2107 if (!h)
2108 return ret_set_errno(-1, ENOENT);
2109
2110 return lxc_write_openat(h->container_full_path, "freezer.state",
2111 "THAWED", STRLITERALLEN("THAWED"));
2112 }
2113
/*
 * Thaw the container on the unified (cgroup v2) hierarchy by writing "0"
 * to cgroup.freeze. When @timeout is non-zero, wait on cgroup.events via
 * the mainloop until "frozen 0" is reported or the timeout expires.
 *
 * Returns 0 on success, -1 with errno set otherwise.
 */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	__do_close_prot_errno int fd = -EBADF;
	__do_lxc_mainloop_close struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->container_full_path)
		return ret_set_errno(-1, EEXIST);

	if (timeout != 0) {
		__do_free char *events_file = NULL;

		/* Watch cgroup.events for the "frozen 0" transition. */
		events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* INT_TO_PTR(0): callback waits for the unfrozen state. */
		ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");

	return 0;
}
2158
2159 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2160 {
2161 if (!ops->hierarchies)
2162 return ret_set_errno(-1, ENOENT);
2163
2164 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2165 return cg_legacy_unfreeze(ops);
2166
2167 return cg_unified_unfreeze(ops, timeout);
2168 }
2169
2170 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2171 const char *controller)
2172 {
2173 struct hierarchy *h;
2174
2175 h = get_hierarchy(ops, controller);
2176 if (!h)
2177 return log_warn_errno(NULL,
2178 ENOENT, "Failed to find hierarchy for controller \"%s\"",
2179 controller ? controller : "(null)");
2180
2181 return h->container_full_path
2182 ? h->container_full_path + strlen(h->mountpoint)
2183 : NULL;
2184 }
2185
/* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
 * which must be freed by the caller.
 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/*
	 * "<mountpoint>/<inpath>/<filename>".
	 * NOTE(review): must_make_path() presumably aborts on allocation
	 * failure (must_* helper convention) — confirm in the utils code.
	 */
	return must_make_path(h->mountpoint, inpath, filename, NULL);
}
2195
2196 static int cgroup_attach_leaf(int unified_fd, int64_t pid)
2197 {
2198 int idx = 1;
2199 int ret;
2200 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2201 char attach_cgroup[STRLITERALLEN("lxc-1000/cgroup.procs") + 1];
2202 size_t pidstr_len;
2203
2204 /* Create leaf cgroup. */
2205 ret = mkdirat(unified_fd, "lxc", 0755);
2206 if (ret < 0 && errno != EEXIST)
2207 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2208
2209 pidstr_len = sprintf(pidstr, INT64_FMT, pid);
2210 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2211 if (ret < 0)
2212 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2213 if (ret == 0)
2214 return 0;
2215
2216 /* this is a non-leaf node */
2217 if (errno != EBUSY)
2218 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2219
2220 do {
2221 char *slash;
2222
2223 sprintf(attach_cgroup, "lxc-%d/cgroup.procs", idx);
2224 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs");
2225 *slash = '\0';
2226
2227 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2228 if (ret < 0 && errno != EEXIST)
2229 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2230
2231 *slash = '/';
2232
2233 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2234 if (ret == 0)
2235 return 0;
2236
2237 /* this is a non-leaf node */
2238 if (errno != EBUSY)
2239 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2240
2241 idx++;
2242 } while (idx < 1000);
2243
2244 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2245 }
2246
/*
 * Attach @pid to the running container's unified cgroup using the cgroup2
 * directory fd handed out by the container's monitor process.
 *
 * Returns 0 on success, -1 on failure (e.g. no unified-hierarchy fd could
 * be obtained from the monitor).
 */
int cgroup_attach(const char *name, const char *lxcpath, int64_t pid)
{
	__do_close_prot_errno int unified_fd = -EBADF;

	unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
	if (unified_fd < 0)
		return -1;

	return cgroup_attach_leaf(unified_fd, pid);
}
2257
2258 /* Technically, we're always at a delegation boundary here (This is especially
2259 * true when cgroup namespaces are available.). The reasoning is that in order
2260 * for us to have been able to start a container in the first place the root
2261 * cgroup must have been a leaf node. Now, either the container's init system
2262 * has populated the cgroup and kept it as a leaf node or it has created
2263 * subtrees. In the former case we will simply attach to the leaf node we
2264 * created when we started the container in the latter case we create our own
2265 * cgroup for the attaching process.
2266 */
2267 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2268 const char *lxcpath, pid_t pid,
2269 const char *controller)
2270 {
2271 __do_close_prot_errno int unified_fd = -EBADF;
2272 int ret;
2273
2274 ret = cgroup_attach(name, lxcpath, pid);
2275 if (ret < 0) {
2276 __do_free char *path = NULL, *cgroup = NULL;
2277
2278 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2279 /* not running */
2280 if (!cgroup)
2281 return 0;
2282
2283 path = must_make_path(h->mountpoint, cgroup, NULL);
2284 unified_fd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
2285 }
2286 if (unified_fd < 0)
2287 return -1;
2288
2289 return cgroup_attach_leaf(unified_fd, pid);
2290 }
2291
2292 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
2293 const char *lxcpath, pid_t pid)
2294 {
2295 int len, ret;
2296 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2297
2298 if (!ops)
2299 return ret_set_errno(false, ENOENT);
2300
2301 if (!ops->hierarchies)
2302 return true;
2303
2304 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2305 if (len < 0 || (size_t)len >= sizeof(pidstr))
2306 return false;
2307
2308 for (int i = 0; ops->hierarchies[i]; i++) {
2309 __do_free char *fullpath = NULL, *path = NULL;
2310 struct hierarchy *h = ops->hierarchies[i];
2311
2312 if (h->version == CGROUP2_SUPER_MAGIC) {
2313 ret = __cg_unified_attach(h, name, lxcpath, pid,
2314 h->controllers[0]);
2315 if (ret < 0)
2316 return false;
2317
2318 continue;
2319 }
2320
2321 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2322 /* not running */
2323 if (!path)
2324 return false;
2325
2326 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2327 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2328 if (ret < 0)
2329 return log_error_errno(false, errno,
2330 "Failed to attach %d to %s",
2331 (int)pid, fullpath);
2332 }
2333
2334 return true;
2335 }
2336
2337 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2338 * don't have a cgroup_data set up, so we ask the running container through the
2339 * commands API for the cgroup path.
2340 */
2341 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2342 char *value, size_t len, const char *name,
2343 const char *lxcpath)
2344 {
2345 __do_free char *path = NULL;
2346 __do_free char *controller = NULL;
2347 char *p;
2348 struct hierarchy *h;
2349 int ret = -1;
2350
2351 if (!ops)
2352 return ret_set_errno(-1, ENOENT);
2353
2354 controller = must_copy_string(filename);
2355 p = strchr(controller, '.');
2356 if (p)
2357 *p = '\0';
2358
2359 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2360 /* not running */
2361 if (!path)
2362 return -1;
2363
2364 h = get_hierarchy(ops, controller);
2365 if (h) {
2366 __do_free char *fullpath = NULL;
2367
2368 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2369 ret = lxc_read_from_file(fullpath, value, len);
2370 }
2371
2372 return ret;
2373 }
2374
2375 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2376 {
2377 for (int count = 0; count < 3; count++, val++) {
2378 switch (*val) {
2379 case 'r':
2380 device->access[count] = *val;
2381 break;
2382 case 'w':
2383 device->access[count] = *val;
2384 break;
2385 case 'm':
2386 device->access[count] = *val;
2387 break;
2388 case '\n':
2389 case '\0':
2390 count = 3;
2391 break;
2392 default:
2393 return ret_errno(EINVAL);
2394 }
2395 }
2396
2397 return 0;
2398 }
2399
/* Parse a cgroup1-style device rule from @val into @device.
 *
 * @val is either the global rule "a" or a line of the form
 * "<type> <major>:<minor> <access>", e.g. "c 1:3 rwm". @key selects whether
 * the rule allows ("devices.allow") or denies (anything else) access.
 * Returns 0 on success, negative on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strcmp("devices.allow", key) == 0)
		device->allow = 1;
	else
		device->allow = 0;

	if (strcmp(val, "a") == 0) {
		/* global rule: translate the wildcard into the bpf program's
		 * default policy rather than a per-device rule
		 */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		device->global_rule = device->allow
					  ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
					  : LXC_BPF_DEVICE_CGROUP_WHITELIST;
		device->allow = -1;
		return 0;
	} else {
		device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
	}

	/* device type: 'a' (all), 'b' (block), or 'c' (character) */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* exactly one whitespace character must separate type and major */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: '*' means "any" */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		/* copy the digit run into @temp, then convert */
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	/* major and minor are separated by ':' */
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	/* a whitespace character must precede the access string */
	if (!isspace(*val))
		return -1;

	/* remainder is the "rwm" access string */
	return device_cgroup_parse_access(device, ++val);
}
2485
2486 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2487 * don't have a cgroup_data set up, so we ask the running container through the
2488 * commands API for the cgroup path.
2489 */
2490 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2491 const char *key, const char *value,
2492 const char *name, const char *lxcpath)
2493 {
2494 __do_free char *path = NULL;
2495 __do_free char *controller = NULL;
2496 char *p;
2497 struct hierarchy *h;
2498 int ret = -1;
2499
2500 if (!ops)
2501 return ret_set_errno(-1, ENOENT);
2502
2503 controller = must_copy_string(key);
2504 p = strchr(controller, '.');
2505 if (p)
2506 *p = '\0';
2507
2508 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2509 struct device_item device = {0};
2510
2511 ret = device_cgroup_rule_parse(&device, key, value);
2512 if (ret < 0)
2513 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2514 key, value);
2515
2516 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2517 if (ret < 0)
2518 return -1;
2519
2520 return 0;
2521 }
2522
2523 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2524 /* not running */
2525 if (!path)
2526 return -1;
2527
2528 h = get_hierarchy(ops, controller);
2529 if (h) {
2530 __do_free char *fullpath = NULL;
2531
2532 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2533 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2534 }
2535
2536 return ret;
2537 }
2538
2539 /* take devices cgroup line
2540 * /dev/foo rwx
2541 * and convert it to a valid
2542 * type major:minor mode
2543 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2544 * the output.
2545 */
2546 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2547 const char *devpath)
2548 {
2549 __do_free char *path = NULL;
2550 char *mode = NULL;
2551 int n_parts, ret;
2552 char *p;
2553 struct stat sb;
2554
2555 path = must_copy_string(devpath);
2556
2557 /*
2558 * Read path followed by mode. Ignore any trailing text.
2559 * A ' # comment' would be legal. Technically other text is not
2560 * legal, we could check for that if we cared to.
2561 */
2562 for (n_parts = 1, p = path; *p; p++) {
2563 if (*p != ' ')
2564 continue;
2565 *p = '\0';
2566
2567 if (n_parts != 1)
2568 break;
2569 p++;
2570 n_parts++;
2571
2572 while (*p == ' ')
2573 p++;
2574
2575 mode = p;
2576
2577 if (*p == '\0')
2578 return ret_set_errno(-1, EINVAL);
2579 }
2580
2581 if (device_cgroup_parse_access(device, mode) < 0)
2582 return -1;
2583
2584 if (n_parts == 1)
2585 return ret_set_errno(-1, EINVAL);
2586
2587 ret = stat(path, &sb);
2588 if (ret < 0)
2589 return ret_set_errno(-1, errno);
2590
2591 mode_t m = sb.st_mode & S_IFMT;
2592 switch (m) {
2593 case S_IFBLK:
2594 device->type = 'b';
2595 break;
2596 case S_IFCHR:
2597 device->type = 'c';
2598 break;
2599 default:
2600 return log_error_errno(-1, EINVAL,
2601 "Unsupported device type %i for \"%s\"",
2602 m, path);
2603 }
2604
2605 device->major = MAJOR(sb.st_rdev);
2606 device->minor = MINOR(sb.st_rdev);
2607 device->allow = 1;
2608 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2609
2610 return 0;
2611 }
2612
2613 static int convert_devpath(const char *invalue, char *dest)
2614 {
2615 struct device_item device = {0};
2616 int ret;
2617
2618 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2619 if (ret < 0)
2620 return -1;
2621
2622 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2623 device.minor, device.access);
2624 if (ret < 0 || ret >= 50)
2625 return log_error_errno(-1,
2626 ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2627 device.type, device.major, device.minor,
2628 device.access);
2629
2630 return 0;
2631 }
2632
2633 /* Called from setup_limits - here we have the container's cgroup_data because
2634 * we created the cgroups.
2635 */
2636 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2637 const char *value)
2638 {
2639 __do_free char *controller = NULL;
2640 char *p;
2641 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2642 char converted_value[50];
2643 struct hierarchy *h;
2644
2645 controller = must_copy_string(filename);
2646 p = strchr(controller, '.');
2647 if (p)
2648 *p = '\0';
2649
2650 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2651 int ret;
2652
2653 ret = convert_devpath(value, converted_value);
2654 if (ret < 0)
2655 return ret;
2656 value = converted_value;
2657 }
2658
2659 h = get_hierarchy(ops, controller);
2660 if (!h) {
2661 ERROR("Failed to setup limits for the \"%s\" controller. "
2662 "The controller seems to be unused by \"cgfsng\" cgroup "
2663 "driver or not enabled on the cgroup hierarchy",
2664 controller);
2665 errno = ENOENT;
2666 return -ENOENT;
2667 }
2668
2669 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
2670 }
2671
/* Apply the legacy (cgroup1) limits from @conf->cgroup. @do_devices selects
 * whether this pass handles only "devices.*" keys (true) or only the other
 * controllers (false); callers run two passes so device restrictions are
 * applied in the right order relative to the rest.
 * Returns true on success or when there is nothing to do.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	/* NOTE(review): &conf->cgroup is computed before the conf NULL check
	 * below; harmless in practice but technically evaluated on a NULL
	 * pointer.
	 */
	__do_free struct lxc_list *sorted_cgroup_settings = NULL;
	struct lxc_list *cgroup_settings = &conf->cgroup;
	struct lxc_list *iterator, *next;
	struct lxc_cgroup *cg;
	bool ret = false;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (lxc_list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Sorting ensures settings are applied in a dependency-safe order;
	 * the returned list owns freshly allocated nodes that we must free
	 * below.
	 */
	sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
	if (!sorted_cgroup_settings)
		return false;

	lxc_list_for_each(iterator, sorted_cgroup_settings) {
		cg = iterator->elem;

		/* Process only the keys matching this pass's @do_devices. */
		if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
				/* Device-rule EACCES/EPERM is tolerated
				 * (e.g. unprivileged containers); the macro
				 * logs and continues. Any other failure
				 * logs and aborts via goto.
				 */
				if (do_devices && (errno == EACCES || errno == EPERM))
					log_warn_errno(continue,
						       errno, "Failed to set \"%s\" to \"%s\"",
						       cg->subsystem, cg->value);
				log_warn_errno(goto out, errno,
					       "Failed to set \"%s\" to \"%s\"",
					       cg->subsystem, cg->value);
			}
			DEBUG("Set controller \"%s\" set to \"%s\"",
			      cg->subsystem, cg->value);
		}
	}

	ret = true;
	INFO("Limits for the legacy cgroup hierarchies have been setup");
out:
	/* Free the sorted list's nodes; the head itself is released by
	 * __do_free.
	 */
	lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
		lxc_list_del(iterator);
		free(iterator);
	}

	return ret;
}
2727
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	struct device_item device_item = {0};
	int ret;

	/* "devices.allow /dev/foo <mode>" takes the path-based parser;
	 * everything else is a classic "type major:minor access" rule.
	 */
	if (strcmp("devices.allow", key) == 0 && *val == '/')
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return log_error_errno(-1, EINVAL,
				       "Failed to parse device string %s=%s",
				       key, val);

	/* Queue the parsed rule on the config's device list for the bpf
	 * program built later in cgfsng_devices_activate().
	 */
	if (bpf_list_add_device(conf, &device_item) < 0)
		return -1;
#endif
	return 0;
}
2755
2756 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2757 struct lxc_handler *handler)
2758 {
2759 struct lxc_list *cgroup_settings, *iterator;
2760 struct hierarchy *h;
2761 struct lxc_conf *conf;
2762
2763 if (!ops)
2764 return ret_set_errno(false, ENOENT);
2765
2766 if (!ops->hierarchies)
2767 return true;
2768
2769 if (!ops->container_cgroup)
2770 return ret_set_errno(false, EINVAL);
2771
2772 if (!handler || !handler->conf)
2773 return ret_set_errno(false, EINVAL);
2774 conf = handler->conf;
2775
2776 if (lxc_list_empty(&conf->cgroup2))
2777 return true;
2778 cgroup_settings = &conf->cgroup2;
2779
2780 if (!ops->unified)
2781 return false;
2782 h = ops->unified;
2783
2784 lxc_list_for_each (iterator, cgroup_settings) {
2785 struct lxc_cgroup *cg = iterator->elem;
2786 int ret;
2787
2788 if (strncmp("devices", cg->subsystem, 7) == 0) {
2789 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
2790 cg->value);
2791 } else {
2792 ret = lxc_write_openat(h->container_full_path,
2793 cg->subsystem, cg->value,
2794 strlen(cg->value));
2795 if (ret < 0)
2796 return log_error_errno(false,
2797 errno, "Failed to set \"%s\" to \"%s\"",
2798 cg->subsystem, cg->value);
2799 }
2800 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2801 }
2802
2803 return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
2804 }
2805
/* Build a bpf device-cgroup program from the rules queued on @conf->devices
 * and attach it to the container's unified cgroup (BPF_F_ALLOW_MULTI so it
 * coexists with other programs). On success the new program replaces any
 * previously stored one in conf->cgroup2_devices and the old one is released
 * via the __do_bpf_program_free cleanup. Returns true on success or when
 * there is nothing to do; no-op when bpf dev-cgroup support is not compiled
 * in.
 */
__cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
					  struct lxc_handler *handler)
{
#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	__do_bpf_program_free struct bpf_program *devices = NULL;
	int ret;
	struct lxc_conf *conf;
	struct hierarchy *unified;
	struct lxc_list *it;
	struct bpf_program *devices_old;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	/* Nothing to do without a unified hierarchy with bpf device support
	 * or without any device rules.
	 */
	unified = ops->unified;
	if (!unified || !unified->bpf_device_controller ||
	    !unified->container_full_path || lxc_list_empty(&conf->devices))
		return true;

	devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
	if (!devices)
		return log_error_errno(false, ENOMEM,
				       "Failed to create new bpf program");

	ret = bpf_program_init(devices);
	if (ret)
		return log_error_errno(false, ENOMEM,
				       "Failed to initialize bpf program");

	/* Append one bpf instruction sequence per queued device rule. */
	lxc_list_for_each(it, &conf->devices) {
		struct device_item *cur = it->elem;

		ret = bpf_program_append_device(devices, cur);
		if (ret)
			return log_error_errno(false,
					       ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
					       cur->type, cur->major,
					       cur->minor, cur->access,
					       cur->allow, cur->global_rule);
		TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
		      cur->type, cur->major, cur->minor, cur->access,
		      cur->allow, cur->global_rule);
	}

	ret = bpf_program_finalize(devices);
	if (ret)
		return log_error_errno(false, ENOMEM,
				       "Failed to finalize bpf program");

	ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
					unified->container_full_path,
					BPF_F_ALLOW_MULTI);
	if (ret)
		return log_error_errno(false, ENOMEM,
				       "Failed to attach bpf program");

	/* Replace old bpf program. The swap hands the old program to the
	 * __do_bpf_program_free cleanup of @devices on function exit.
	 */
	devices_old = move_ptr(conf->cgroup2_devices);
	conf->cgroup2_devices = move_ptr(devices);
	devices = move_ptr(devices_old);
#endif
	return true;
}
2879
/* Enable every detected unified-hierarchy controller in cgroup.subtree_control
 * along each path component of @cgroup below the container base path, so that
 * the leaf cgroup can use them. Returns true on success or when there is
 * nothing to delegate.
 */
bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_free char *add_controllers = NULL, *base_path = NULL;
	struct hierarchy *unified = ops->unified;
	ssize_t parts_len;
	char **it;
	size_t full_len = 0;
	char **parts = NULL;
	bool bret = false;

	if (!ops->hierarchies || !pure_unified_layout(ops) ||
	    !unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		/* "+<name>" plus separator; full_len ends up one larger than
		 * the final string length.
		 */
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* First iteration: make the fresh buffer a valid empty
		 * string before strlcat appends to it.
		 */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	parts = lxc_string_split(cgroup, '/');
	if (!parts)
		goto on_error;

	/* Do not delegate inside the leaf cgroup itself, only on the path
	 * leading to it.
	 */
	parts_len = lxc_array_len((void **)parts);
	if (parts_len > 0)
		parts_len--;

	/* i == -1 handles the base path itself before descending. */
	base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
	for (ssize_t i = -1; i < parts_len; i++) {
		int ret;
		__do_free char *target = NULL;

		if (i >= 0)
			base_path = must_append_path(base_path, parts[i], NULL);
		target = must_make_path(base_path, "cgroup.subtree_control", NULL);
		/* NOTE(review): full_len is strlen(add_controllers) + 1, so
		 * this writes the trailing NUL byte as well — the kernel
		 * appears to tolerate it, but writing strlen() bytes would be
		 * cleaner; confirm before changing.
		 */
		ret = lxc_writeat(-1, target, add_controllers, full_len);
		if (ret < 0)
			log_error_errno(goto on_error,
					errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
					add_controllers, target);
		TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
	}

	bret = true;

on_error:
	lxc_free_array((void **)parts, free);
	return bret;
}
2944
2945 __cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2946 {
2947 if (!ops)
2948 return ret_set_errno(false, ENOENT);
2949
2950 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2951 }
2952
2953 __cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2954 {
2955 if (!ops)
2956 return ret_set_errno(false, ENOENT);
2957
2958 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2959 }
2960
2961 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2962 char **controllers)
2963 {
2964 if (!ops->cgroup_use)
2965 return true;
2966
2967 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2968 bool found = false;
2969
2970 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2971 if (strcmp(*cur_use, *cur_ctrl) != 0)
2972 continue;
2973
2974 found = true;
2975 break;
2976 }
2977
2978 if (found)
2979 continue;
2980
2981 return false;
2982 }
2983
2984 return true;
2985 }
2986
2987 static void cg_unified_delegate(char ***delegate)
2988 {
2989 __do_free char *buf = NULL;
2990 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2991 char *token;
2992 int idx;
2993
2994 buf = read_file("/sys/kernel/cgroup/delegate");
2995 if (!buf) {
2996 for (char **p = standard; p && *p; p++) {
2997 idx = append_null_to_list((void ***)delegate);
2998 (*delegate)[idx] = must_copy_string(*p);
2999 }
3000 log_warn_errno(return, errno, "Failed to read /sys/kernel/cgroup/delegate");
3001 }
3002
3003 lxc_iterate_parts (token, buf, " \t\n") {
3004 /*
3005 * We always need to chown this for both cgroup and
3006 * cgroup2.
3007 */
3008 if (strcmp(token, "cgroup.procs") == 0)
3009 continue;
3010
3011 idx = append_null_to_list((void ***)delegate);
3012 (*delegate)[idx] = must_copy_string(token);
3013 }
3014 }
3015
/* At startup, parse_hierarchies finds all the info we need about cgroup
 * mountpoints and current cgroups, and stores it in @d.
 *
 * Walks /proc/self/mountinfo, registers every writable cgroup (v1 and v2)
 * hierarchy on ops->hierarchies, determines the overall layout
 * (legacy/unified/hybrid), and verifies that all controllers requested via
 * lxc.cgroup.use are present. Returns 0 on success, -1 on error.
 */
static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
{
	__do_free char *basecginfo = NULL;
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int ret;
	size_t len = 0;
	char **klist = NULL, **nlist = NULL;

	/* Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		basecginfo = read_file("/proc/1/cgroup");
	else
		basecginfo = read_file("/proc/self/cgroup");
	if (!basecginfo)
		return ret_set_errno(-1, ENOMEM);

	/* klist: kernel-known controllers; nlist: named (name=...) ones. */
	ret = get_existing_subsystems(&klist, &nlist);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");

	lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);

	while (getline(&line, &len, f) != -1) {
		int type;
		bool writeable;
		struct hierarchy *new;
		char *base_cgroup = NULL, *mountpoint = NULL;
		char **controller_list = NULL;

		/* 0 means "not a cgroup mount" — skip. */
		type = get_cgroup_version(line);
		if (type == 0)
			continue;

		/* Only one unified hierarchy can exist; ignore duplicates. */
		if (type == CGROUP2_SUPER_MAGIC && ops->unified)
			continue;

		/* Track the overall layout: seeing both v1 and v2 mounts
		 * upgrades it to hybrid.
		 */
		if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
			else if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
			if (type == CGROUP_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			if (type == CGROUP2_SUPER_MAGIC)
				ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		}

		controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
		if (!controller_list && type == CGROUP_SUPER_MAGIC)
			continue;

		if (type == CGROUP_SUPER_MAGIC)
			if (controller_list_is_dup(ops->hierarchies, controller_list))
				log_trace_errno(goto next, EEXIST, "Skipping duplicating controller");

		mountpoint = cg_hybrid_get_mountpoint(line);
		if (!mountpoint)
			log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line);

		if (type == CGROUP_SUPER_MAGIC)
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
		else
			base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
		if (!base_cgroup)
			log_error_errno(goto next, EINVAL, "Failed to find current cgroup");

		trim(base_cgroup);
		prune_init_scope(base_cgroup);
		/* Only register hierarchies we can actually create cgroups
		 * in.
		 */
		if (type == CGROUP2_SUPER_MAGIC)
			writeable = test_writeable_v2(mountpoint, base_cgroup);
		else
			writeable = test_writeable_v1(mountpoint, base_cgroup);
		if (!writeable)
			log_trace_errno(goto next, EROFS, "The %s group is not writeable", base_cgroup);

		if (type == CGROUP2_SUPER_MAGIC) {
			char *cgv2_ctrl_path;

			cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
							"cgroup.controllers",
							NULL);

			/* NOTE(review): this overwrites the earlier
			 * controller_list without freeing it — confirm
			 * cg_hybrid_get_controllers() always returns NULL for
			 * v2 entries, otherwise this leaks.
			 */
			controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
			free(cgv2_ctrl_path);
			if (!controller_list) {
				controller_list = cg_unified_make_empty_controller();
				TRACE("No controllers are enabled for "
				      "delegation in the unified hierarchy");
			}
		}

		/* Exclude all controllers that cgroup use does not want. */
		if (!cgroup_use_wants_controllers(ops, controller_list))
			log_trace_errno(goto next, EINVAL, "Skipping controller");

		/* add_hierarchy takes ownership of the strings on success. */
		new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
		if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
			if (unprivileged)
				cg_unified_delegate(&new->cgroup2_chown);
			ops->unified = new;
		}

		continue;

	next:
		free_string_list(controller_list);
		free(mountpoint);
		free(base_cgroup);
	}

	free_string_list(klist);
	free_string_list(nlist);

	TRACE("Writable cgroup hierarchies:");
	lxc_cgfsng_print_hierarchies(ops);

	/* verify that all controllers in cgroup.use and all crucial
	 * controllers are accounted for
	 */
	if (!all_controllers_found(ops))
		return log_error_errno(-1, ENOENT, "Failed to find all required controllers");

	return 0;
}
3152
3153 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
3154 static char *cg_unified_get_current_cgroup(bool relative)
3155 {
3156 __do_free char *basecginfo = NULL;
3157 char *base_cgroup;
3158 char *copy = NULL;
3159
3160 if (!relative && (geteuid() == 0))
3161 basecginfo = read_file("/proc/1/cgroup");
3162 else
3163 basecginfo = read_file("/proc/self/cgroup");
3164 if (!basecginfo)
3165 return NULL;
3166
3167 base_cgroup = strstr(basecginfo, "0::/");
3168 if (!base_cgroup)
3169 goto cleanup_on_err;
3170
3171 base_cgroup = base_cgroup + 3;
3172 copy = copy_to_eol(base_cgroup);
3173 if (!copy)
3174 goto cleanup_on_err;
3175
3176 cleanup_on_err:
3177 if (copy)
3178 trim(copy);
3179
3180 return copy;
3181 }
3182
/* Detect and register a pure cgroup2 (unified) layout.
 *
 * Returns CGROUP2_SUPER_MAGIC when the system is unified-only (and fills in
 * ops->unified and the layout), 0 when the system is not purely unified (the
 * caller then runs the hybrid probe), or a negative errno on failure.
 */
static int cg_unified_init(struct cgroup_ops *ops, bool relative,
			   bool unprivileged)
{
	__do_free char *subtree_path = NULL;
	int ret;
	char *mountpoint;
	char **delegatable;
	struct hierarchy *new;
	char *base_cgroup = NULL;

	ret = unified_cgroup_hierarchy();
	if (ret == -ENOMEDIUM)
		return ret_errno(ENOMEDIUM);

	/* Not a pure unified layout: let the hybrid probe handle it. */
	if (ret != CGROUP2_SUPER_MAGIC)
		return 0;

	base_cgroup = cg_unified_get_current_cgroup(relative);
	if (!base_cgroup)
		return ret_errno(EINVAL);
	if (!relative)
		prune_init_scope(base_cgroup);

	/*
	 * We assume that the cgroup we're currently in has been delegated to
	 * us and we are free to further delegate all of the controllers listed
	 * in cgroup.controllers further down the hierarchy.
	 */
	mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
	subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
	delegatable = cg_unified_get_controllers(subtree_path);
	if (!delegatable)
		delegatable = cg_unified_make_empty_controller();
	if (!delegatable[0])
		TRACE("No controllers are enabled for delegation");

	/* TODO: If the user requested specific controllers via lxc.cgroup.use
	 * we should verify here. The reason I'm not doing it right is that I'm
	 * not convinced that lxc.cgroup.use will be the future since it is a
	 * global property. I much rather have an option that lets you request
	 * controllers per container.
	 */

	/* add_hierarchy takes ownership of delegatable/mountpoint/base_cgroup. */
	new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
	if (unprivileged)
		cg_unified_delegate(&new->cgroup2_chown);

	if (bpf_devices_cgroup_supported())
		new->bpf_device_controller = 1;

	ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
	ops->unified = new;
	return CGROUP2_SUPER_MAGIC;
}
3237
3238 static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
3239 {
3240 int ret;
3241 const char *tmp;
3242 bool relative = conf->cgroup_meta.relative;
3243
3244 tmp = lxc_global_config_value("lxc.cgroup.use");
3245 if (tmp) {
3246 __do_free char *pin = NULL;
3247 char *chop, *cur;
3248
3249 pin = must_copy_string(tmp);
3250 chop = pin;
3251
3252 lxc_iterate_parts(cur, chop, ",")
3253 must_append_string(&ops->cgroup_use, cur);
3254 }
3255
3256 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
3257 if (ret < 0)
3258 return -1;
3259
3260 if (ret == CGROUP2_SUPER_MAGIC)
3261 return 0;
3262
3263 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
3264 }
3265
3266 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3267 {
3268 const char *cgroup_pattern;
3269
3270 if (!ops)
3271 return ret_set_errno(-1, ENOENT);
3272
3273 /* copy system-wide cgroup information */
3274 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3275 if (!cgroup_pattern) {
3276 /* lxc.cgroup.pattern is only NULL on error. */
3277 ERROR("Failed to retrieve cgroup pattern");
3278 return ret_set_errno(-1, ENOMEM);
3279 }
3280 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3281
3282 return 0;
3283 }
3284
3285 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
3286 {
3287 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3288
3289 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3290 if (!cgfsng_ops)
3291 return ret_set_errno(NULL, ENOMEM);
3292
3293 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3294 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3295
3296 if (cg_init(cgfsng_ops, conf))
3297 return NULL;
3298
3299 cgfsng_ops->unified_fd = -EBADF;
3300
3301 cgfsng_ops->data_init = cgfsng_data_init;
3302 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3303 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3304 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3305 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3306 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3307 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3308 cgfsng_ops->payload_create = cgfsng_payload_create;
3309 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3310 cgfsng_ops->escape = cgfsng_escape;
3311 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3312 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3313 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3314 cgfsng_ops->get = cgfsng_get;
3315 cgfsng_ops->set = cgfsng_set;
3316 cgfsng_ops->freeze = cgfsng_freeze;
3317 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3318 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3319 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3320 cgfsng_ops->driver = "cgfsng";
3321 cgfsng_ops->version = "1.0.0";
3322 cgfsng_ops->attach = cgfsng_attach;
3323 cgfsng_ops->chown = cgfsng_chown;
3324 cgfsng_ops->mount = cgfsng_mount;
3325 cgfsng_ops->nrtasks = cgfsng_nrtasks;
3326 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3327
3328 return move_ptr(cgfsng_ops);
3329 }