/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include "caps.h"
#include "cgroup.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "conf.h"
#include "config.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "storage/storage.h"
#include "utils.h"

#ifndef HAVE_STRLCPY
#include "include/strlcpy.h"
#endif

#ifndef HAVE_STRLCAT
#include "include/strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/* Given a pointer to a NULL-terminated array of pointers, realloc to make room
 * for one more entry and keep the array NULL-terminated. Do not fail. Return
 * the index of the slot just before the terminating NULL - that is, the one
 * which is now available for use.
 */
static int append_null_to_list(void ***list)
{
	int newentry = 0;

	if (*list)
		for (; (*list)[newentry]; newentry++)
			;

	*list = must_realloc(*list, (newentry + 2) * sizeof(void **));
	(*list)[newentry + 1] = NULL;
	return newentry;
}

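/*
 * Illustrative sketch (not part of the build): how append_null_to_list() is
 * meant to be used to grow a NULL-terminated string list, using the
 * must_copy_string() helper this file already relies on.
 */
#if 0
static void example_append(void)
{
	char **list = NULL;
	int idx;

	/* list == NULL is fine; the first call allocates a two-slot array. */
	idx = append_null_to_list((void ***)&list);
	list[idx] = must_copy_string("cpu");

	idx = append_null_to_list((void ***)&list);
	list[idx] = must_copy_string("memory");

	/* list is now { "cpu", "memory", NULL }. */
}
#endif
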
/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (int i = 0; list[i]; i++)
		if (strcmp(list[i], entry) == 0)
			return true;

	return false;
}

/* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
 * "name=systemd". Do not fail.
 */
static char *cg_legacy_must_prefix_named(char *entry)
{
	size_t len;
	char *prefixed;

	len = strlen(entry);
	prefixed = must_realloc(NULL, len + 6);

	memcpy(prefixed, "name=", STRLITERALLEN("name="));
	memcpy(prefixed + STRLITERALLEN("name="), entry, len);
	prefixed[len + 5] = '\0';

	return prefixed;
}

/* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 * we are called.
 *
 * We also handle named subsystems here. Any controller which is not a kernel
 * subsystem gets prefixed with "name=". Any controller which is both a kernel
 * and a named subsystem we refuse to use, because we cannot be sure which of
 * the two we have here.
 * (TODO: We could work around this in some cases by just remounting to be
 * unambiguous, or by comparing mountpoint contents with current cgroup.)
 *
 * The last entry will always be NULL.
 */
static void must_append_controller(char **klist, char **nlist, char ***clist,
				   char *entry)
{
	int newentry;
	char *copy;

	if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
		ERROR("Refusing to use ambiguous controller \"%s\"", entry);
		ERROR("It is both a named and kernel subsystem");
		return;
	}

	newentry = append_null_to_list((void ***)clist);

	if (strncmp(entry, "name=", 5) == 0)
		copy = must_copy_string(entry);
	else if (string_in_list(klist, entry))
		copy = must_copy_string(entry);
	else
		copy = cg_legacy_must_prefix_named(entry);

	(*clist)[newentry] = copy;
}

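/*
 * Illustrative sketch (not part of the build): what must_append_controller()
 * produces for a couple of inputs, assuming "cpu" is in the kernel subsystem
 * list and "systemd" is not.
 */
#if 0
static void example_append_controller(char **klist, char **nlist)
{
	char **clist = NULL;

	/* Kernel subsystem: copied verbatim. */
	must_append_controller(klist, nlist, &clist, "cpu");

	/* Not a kernel subsystem: stored as "name=systemd". */
	must_append_controller(klist, nlist, &clist, "systemd");

	/* clist is now { "cpu", "name=systemd", NULL }. */
}
#endif
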
/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers &&
			    !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];
			continue;
		} else if (pure_unified_layout(ops) &&
			   strcmp(controller, "devices") == 0) {
			if (ops->unified->bpf_device_controller)
				return ops->unified;
			break;
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}

#define BATCH_SIZE 50
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}

static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	size_t full = oldlen + newlen;

	batch_realloc(dest, oldlen, full + 1);

	memcpy(*dest + oldlen, new, newlen + 1);
}

/* Slurp in a whole file */
static char *read_file(const char *fnam)
{
	__do_free char *buf = NULL, *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0, fulllen = 0;
	int linelen;

	f = fopen(fnam, "re");
	if (!f)
		return NULL;

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&buf, fulllen, line, linelen);
		fulllen += linelen;
	}

	return move_ptr(buf);
}

/* Adapted, with modifications, from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (1 << (bit % NBITS));
}

static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
}

static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
}

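/*
 * Illustrative sketch (not part of the build): the bit helpers operate on an
 * array of uint32_t words, so bit 34 lives in word 1, position 2.
 */
#if 0
static void example_bitops(void)
{
	uint32_t bitarr[BITS_TO_LONGS(64)] = {0}; /* two 32-bit words */

	set_bit(0, bitarr);   /* bitarr[0] == 0x1 */
	set_bit(34, bitarr);  /* bitarr[1] == 0x4 */
	clear_bit(0, bitarr); /* bitarr[0] == 0x0 */

	/* is_set(34, bitarr) is now true, is_set(0, bitarr) false. */
}
#endif
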
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		start = strtoul(token, NULL, 0);
		end = start;
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}

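/*
 * Illustrative sketch (not part of the build): parsing the cpulist "0,2-3"
 * with lxc_cpumask() sets bits 0, 2 and 3, matching the comment above.
 */
#if 0
static void example_cpumask(void)
{
	__do_free uint32_t *mask = NULL;
	char buf[] = "0,2-3"; /* lxc_iterate_parts() modifies its argument */

	mask = lxc_cpumask(buf, 4);
	if (!mask)
		return;

	/* is_set(0), is_set(2) and is_set(3) are true; is_set(1) is false. */
}
#endif
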
/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	for (size_t i = 0; i <= nbits; i++) {
		if (!is_set(i, bitarr))
			continue;

		ret = snprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0 || (size_t)ret >= sizeof(numstr))
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}

static ssize_t get_max_cpus(char *cpulist)
{
	char *c1, *c2;
	char *maxcpus = cpulist;
	size_t cpus = 0;

	c1 = strrchr(maxcpus, ',');
	if (c1)
		c1++;

	c2 = strrchr(maxcpus, '-');
	if (c2)
		c2++;

	if (!c1 && !c2)
		c1 = maxcpus;
	else if (c1 > c2)
		c2 = c1;
	else if (c1 < c2)
		c1 = c2;
	else if (!c1 && c2)
		c1 = c2;

	errno = 0;
	cpus = strtoul(c1, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}

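/*
 * Illustrative sketch (not part of the build): get_max_cpus() returns the
 * highest cpu number mentioned in a cpulist by parsing whatever follows the
 * last ',' or '-'.
 */
#if 0
static void example_get_max_cpus(void)
{
	char a[] = "0-3,7";
	char b[] = "0-15";

	/* get_max_cpus(a) == 7, get_max_cpus(b) == 15 */
	(void)get_max_cpus(a);
	(void)get_max_cpus(b);
}
#endif
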
#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cg_legacy_filter_and_set_cpus(const char *parent_cgroup,
					  char *child_cgroup, bool am_initialized)
{
	__do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
		       *offlinecpus = NULL, *posscpus = NULL;
	__do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
			   *possmask = NULL;
	int ret;
	ssize_t i;
	ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
	bool flipped_bit = false;

	fpath = must_make_path(parent_cgroup, "cpuset.cpus", NULL);
	posscpus = read_file(fpath);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);

	/* Get maximum number of cpus found in possible cpuset. */
	maxposs = get_max_cpus(posscpus);
	if (maxposs < 0 || maxposs >= INT_MAX - 1)
		return false;

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file(__ISOL_CPUS);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (isdigit(isolcpus[0])) {
			/* Get maximum number of cpus found in isolated cpuset. */
			maxisol = get_max_cpus(isolcpus);
			if (maxisol < 0 || maxisol >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxisol)
			maxposs = maxisol;
		maxposs++;
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file(__OFFLINE_CPUS);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (isdigit(offlinecpus[0])) {
			/* Get maximum number of cpus found in offline cpuset. */
			maxoffline = get_max_cpus(offlinecpus);
			if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
				return false;
		}

		if (maxposs < maxoffline)
			maxposs = maxoffline;
		maxposs++;
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if ((maxisol == 0) && (maxoffline == 0)) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	possmask = lxc_cpumask(posscpus, maxposs);
	if (!possmask)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (maxisol > 0) {
		isolmask = lxc_cpumask(isolcpus, maxposs);
		if (!isolmask)
			return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
	}

	if (maxoffline > 0) {
		offlinemask = lxc_cpumask(offlinecpus, maxposs);
		if (!offlinemask)
			return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
	}

	for (i = 0; i <= maxposs; i++) {
		if ((isolmask && !is_set(i, isolmask)) ||
		    (offlinemask && !is_set(i, offlinemask)) ||
		    !is_set(i, possmask))
			continue;

		flipped_bit = true;
		clear_bit(i, possmask);
	}

	if (!flipped_bit) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_write_openat(child_cgroup, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false,
					       errno, "Failed to write cpu list to \"%s/cpuset.cpus\"",
					       child_cgroup);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}

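/*
 * Illustrative sketch (not part of the build): the filtering above computes
 * final = possible minus (isolated plus offline). With possible = "0-3" and
 * isolated = "2", the child cgroup would get "0,1,3".
 */
#if 0
static void example_filter(void)
{
	__do_free uint32_t *poss = NULL, *isol = NULL;
	__do_free char *result = NULL;
	char posslist[] = "0-3", isollist[] = "2";

	poss = lxc_cpumask(posslist, 5);
	isol = lxc_cpumask(isollist, 5);

	for (size_t i = 0; i < 5; i++)
		if (is_set(i, isol) && is_set(i, poss))
			clear_bit(i, poss);

	result = lxc_cpumask_to_cpulist(poss, 4); /* "0,1,3" */
}
#endif
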
/* Copy contents of parent(@path)/@file to @path/@file */
static bool copy_parent_file(const char *parent_cgroup,
			     const char *child_cgroup, const char *file)
{
	__do_free char *parent_file = NULL, *value = NULL;
	int len = 0;
	int ret;

	parent_file = must_make_path(parent_cgroup, file, NULL);
	len = lxc_read_from_file(parent_file, NULL, 0);
	if (len <= 0)
		return log_error_errno(false, errno, "Failed to determine buffer size");

	value = must_realloc(NULL, len + 1);
	value[len] = '\0';
	ret = lxc_read_from_file(parent_file, value, len);
	if (ret != len)
		return log_error_errno(false, errno, "Failed to read from parent file \"%s\"", parent_file);

	ret = lxc_write_openat(child_cgroup, file, value, len);
	if (ret < 0 && errno != EACCES)
		return log_error_errno(false, errno, "Failed to write \"%s\" to file \"%s/%s\"",
				       value, child_cgroup, file);
	return true;
}

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->version == CGROUP2_SUPER_MAGIC;
}

/*
 * Initialize the cpuset hierarchy in the first directory of @cgroup_leaf and
 * set cgroup.clone_children so that children inherit settings. Since the
 * h->base_path is populated by init or ourselves, we know it is already
 * initialized.
 *
 * Returns -1 on error, 0 if we didn't create a cgroup, and 1 if we did.
 */
static int cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h,
					     const char *cgroup_leaf)
{
	__do_free char *parent_cgroup = NULL, *child_cgroup = NULL, *dup = NULL;
	__do_close int cgroup_fd = -EBADF;
	int fret = -1;
	int ret;
	char v;
	char *leaf, *slash;

	if (is_unified_hierarchy(h))
		return 0;

	if (!string_in_list(h->controllers, "cpuset"))
		return 0;

	if (!cgroup_leaf)
		return ret_set_errno(-1, EINVAL);

	dup = strdup(cgroup_leaf);
	if (!dup)
		return ret_set_errno(-1, ENOMEM);

	parent_cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);

	leaf = dup;
	leaf += strspn(leaf, "/");
	slash = strchr(leaf, '/');
	if (slash)
		*slash = '\0';
	child_cgroup = must_make_path(parent_cgroup, leaf, NULL);
	if (slash)
		*slash = '/';

	fret = 1;
	ret = mkdir(child_cgroup, 0755);
	if (ret < 0) {
		if (errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", child_cgroup);

		fret = 0;
	}

	cgroup_fd = lxc_open_dirfd(child_cgroup);
	if (cgroup_fd < 0)
		return -1;

	ret = lxc_readat(cgroup_fd, "cgroup.clone_children", &v, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to read file \"%s/cgroup.clone_children\"", child_cgroup);

	/* Make sure any isolated cpus are removed from cpuset.cpus. */
	if (!cg_legacy_filter_and_set_cpus(parent_cgroup, child_cgroup, v == '1'))
		return log_error_errno(-1, errno, "Failed to remove isolated cpus");

	/* Already set for us by someone else. */
	if (v == '1')
		TRACE("\"cgroup.clone_children\" was already set to \"1\"");

	/* Copy parent's settings. */
	if (!copy_parent_file(parent_cgroup, child_cgroup, "cpuset.mems"))
		return log_error_errno(-1, errno, "Failed to copy \"cpuset.mems\" settings");

	/* Set clone_children so children inherit our settings. */
	ret = lxc_writeat(cgroup_fd, "cgroup.clone_children", "1", 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to write 1 to \"%s/cgroup.clone_children\"", child_cgroup);

	return fret;
}

/* Given two null-terminated lists of strings, return true if any string is in
 * both.
 */
static bool controller_lists_intersect(char **l1, char **l2)
{
	if (!l1 || !l2)
		return false;

	for (int i = 0; l1[i]; i++)
		if (string_in_list(l2, l1[i]))
			return true;

	return false;
}

/* For a null-terminated list of controllers @clist, return true if any of
 * those controllers is already listed in the null-terminated list of
 * hierarchies @hlist. Realistically, if one is present, all must be present.
 */
static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (controller_lists_intersect(hlist[i]->controllers, clist))
			return true;

	return false;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_found(struct hierarchy **hlist, char *entry)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

/* Return true if all of the controllers which we require have been found. The
 * required list is freezer and anything in lxc.cgroup.use.
 */
static bool all_controllers_found(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_found(hlist, *cur))
			return log_error(false, "No %s controller mountpoint found", *cur);

	return true;
}

/* Get the controllers from a mountinfo line. There are other ways we could get
 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 * could parse the mount options. But we simply assume that the mountpoint must
 * be /sys/fs/cgroup/controller-list.
 */
static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
					int type)
{
	/* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
	 * for legacy hierarchies.
	 */
	__do_free_string_list char **aret = NULL;
	int i;
	char *p2, *tok;
	char *p = line, *sep = ",";

	for (i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	/* Note, if we change how mountinfo works, then our caller will need to
	 * verify /sys/fs/cgroup/ in this field.
	 */
	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return log_error(NULL, "Found hierarchy not under " DEFAULT_CGROUP_MOUNTPOINT ": \"%s\"", p);

	p += 15;
	p2 = strchr(p, ' ');
	if (!p2)
		return log_error(NULL, "Corrupt mountinfo");
	*p2 = '\0';

	if (type == CGROUP_SUPER_MAGIC) {
		__do_free char *dup = NULL;

		/* strdup() here for v1 hierarchies. Otherwise
		 * lxc_iterate_parts() will destroy mountpoints such as
		 * "/sys/fs/cgroup/cpu,cpuacct".
		 */
		dup = must_copy_string(p);
		if (!dup)
			return NULL;

		lxc_iterate_parts(tok, dup, sep)
			must_append_controller(klist, nlist, &aret, tok);
	}
	*p2 = ' ';

	return move_ptr(aret);
}

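/*
 * Illustrative sketch (not part of the build): the mountinfo line below is
 * made up but typical. Its fifth field (after four space-separated fields) is
 * the mountpoint, from which cg_hybrid_get_controllers() extracts
 * "cpu,cpuacct" and appends both controllers to the returned list.
 */
#if 0
static void example_hybrid_controllers(char **klist, char **nlist)
{
	__do_free_string_list char **clist = NULL;
	char line[] = "32 24 0:28 / /sys/fs/cgroup/cpu,cpuacct "
		      "rw,nosuid,nodev,noexec,relatime shared:9 - cgroup "
		      "cgroup rw,cpu,cpuacct";

	clist = cg_hybrid_get_controllers(klist, nlist, line, CGROUP_SUPER_MAGIC);
	/* clist is now { "cpu", "cpuacct", NULL } (given a suitable klist). */
}
#endif
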
static char **cg_unified_make_empty_controller(void)
{
	__do_free_string_list char **aret = NULL;
	int newentry;

	newentry = append_null_to_list((void ***)&aret);
	aret[newentry] = NULL;
	return move_ptr(aret);
}

static char **cg_unified_get_controllers(const char *file)
{
	__do_free char *buf = NULL;
	__do_free_string_list char **aret = NULL;
	char *sep = " \t\n";
	char *tok;

	buf = read_file(file);
	if (!buf)
		return NULL;

	lxc_iterate_parts(tok, buf, sep) {
		int newentry;
		char *copy;

		newentry = append_null_to_list((void ***)&aret);
		copy = must_copy_string(tok);
		aret[newentry] = copy;
	}

	return move_ptr(aret);
}

static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
				       char *container_base_path, int type)
{
	struct hierarchy *new;
	int newentry;

	new = zalloc(sizeof(*new));
	new->controllers = clist;
	new->mountpoint = mountpoint;
	new->container_base_path = container_base_path;
	new->version = type;
	new->cgfd_con = -EBADF;
	new->cgfd_mon = -EBADF;

	newentry = append_null_to_list((void ***)h);
	(*h)[newentry] = new;
	return new;
}

/* Get a copy of the mountpoint from @line, which is a line from
 * /proc/self/mountinfo.
 */
static char *cg_hybrid_get_mountpoint(char *line)
{
	char *p = line, *sret = NULL;
	size_t len;
	char *p2;

	for (int i = 0; i < 4; i++) {
		p = strchr(p, ' ');
		if (!p)
			return NULL;
		p++;
	}

	if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0)
		return NULL;

	p2 = strchr(p + 15, ' ');
	if (!p2)
		return NULL;
	*p2 = '\0';

	len = strlen(p);
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}

/* Given a multi-line string, return a null-terminated copy of the current line. */
static char *copy_to_eol(char *p)
{
	char *p2, *sret;
	size_t len;

	p2 = strchr(p, '\n');
	if (!p2)
		return NULL;

	len = p2 - p;
	sret = must_realloc(NULL, len + 1);
	memcpy(sret, p, len);
	sret[len] = '\0';

	return sret;
}

/* cgline: pointer to character after the first ':' in a line in a \n-terminated
 * /proc/self/cgroup file. Check whether controller c is present.
 */
static bool controller_in_clist(char *cgline, char *c)
{
	__do_free char *tmp = NULL;
	char *tok, *eol;
	size_t len;

	eol = strchr(cgline, ':');
	if (!eol)
		return false;

	len = eol - cgline;
	tmp = must_realloc(NULL, len + 1);
	memcpy(tmp, cgline, len);
	tmp[len] = '\0';

	lxc_iterate_parts(tok, tmp, ",")
		if (strcmp(tok, c) == 0)
			return true;

	return false;
}

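/*
 * Illustrative sketch (not part of the build): for the /proc/self/cgroup line
 * "2:cpu,cpuacct:/lxc/foo\n", @cgline points just past the first ':'.
 */
#if 0
static void example_controller_in_clist(void)
{
	char line[] = "cpu,cpuacct:/lxc/foo\n";

	/* true: */
	(void)controller_in_clist(line, "cpuacct");
	/* false: */
	(void)controller_in_clist(line, "memory");
}
#endif
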
/* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 * @controller.
 */
static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
					  int type)
{
	char *p = basecginfo;

	for (;;) {
		bool is_cgv2_base_cgroup = false;

		/* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
		if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
			is_cgv2_base_cgroup = true;

		p = strchr(p, ':');
		if (!p)
			return NULL;
		p++;

		if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
			p = strchr(p, ':');
			if (!p)
				return NULL;
			p++;
			return copy_to_eol(p);
		}

		p = strchr(p, '\n');
		if (!p)
			return NULL;
		p++;
	}
}

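/*
 * Illustrative sketch (not part of the build): given the two-line
 * /proc/self/cgroup snippet below, the v1 lookup for "memory" returns
 * "/lxc/foo" and the v2 lookup returns "/init.scope".
 */
#if 0
static void example_current_cgroup(void)
{
	char info[] = "4:memory:/lxc/foo\n0::/init.scope\n";
	__do_free char *v1 = NULL, *v2 = NULL;

	v1 = cg_hybrid_get_current_cgroup(info, "memory", CGROUP_SUPER_MAGIC);
	v2 = cg_hybrid_get_current_cgroup(info, NULL, CGROUP2_SUPER_MAGIC);
}
#endif
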
static void must_append_string(char ***list, char *entry)
{
	int newentry;
	char *copy;

	newentry = append_null_to_list((void ***)list);
	copy = must_copy_string(entry);
	(*list)[newentry] = copy;
}

static int get_existing_subsystems(char ***klist, char ***nlist)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen("/proc/self/cgroup", "re");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *p, *p2, *tok;
		p = strchr(line, ':');
		if (!p)
			continue;
		p++;
		p2 = strchr(p, ':');
		if (!p2)
			continue;
		*p2 = '\0';

		/* If the kernel has cgroup v2 support, then /proc/self/cgroup
		 * contains an entry of the form:
		 *
		 *	0::/some/path
		 *
		 * In this case we use "cgroup2" as controller name.
		 */
		if ((p2 - p) == 0) {
			must_append_string(klist, "cgroup2");
			continue;
		}

		lxc_iterate_parts(tok, p, ",") {
			if (strncmp(tok, "name=", 5) == 0)
				must_append_string(nlist, tok);
			else
				must_append_string(klist, tok);
		}
	}

	return 0;
}

static char *trim(char *s)
{
	size_t len;

	len = strlen(s);
	while ((len > 1) && (s[len - 1] == '\n'))
		s[--len] = '\0';

	return s;
}

static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
{
	int i;
	struct hierarchy **it;

	if (!ops->hierarchies) {
		TRACE("  No hierarchies found");
		return;
	}

	TRACE("  Hierarchies:");
	for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
		int j;
		char **cit;

		TRACE("  %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
		TRACE("      mountpoint:  %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
		TRACE("      controllers:");
		for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
			TRACE("      %d: %s", j, *cit);
	}
}

static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int k;
	char **it;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	for (k = 0, it = klist; it && *it; it++, k++)
		TRACE("kernel subsystem %d: %s", k, *it);

	for (k = 0, it = nlist; it && *it; it++, k++)
		TRACE("named subsystem %d: %s", k, *it);
}

static int cgroup_rmdir(struct hierarchy **hierarchies,
			const char *container_cgroup)
{
	if (!container_cgroup || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		if (!h->container_full_path)
			continue;

		ret = recursive_destroy(h->container_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->container_full_path);

		free_disarm(h->container_full_path);
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *container_cgroup;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

static int cgroup_rmdir_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

#ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
	ret = bpf_program_cgroup_detach(handler->conf->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");
#endif

	if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.container_cgroup = ops->container_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
				    "cgroup_rmdir_wrapper");
	} else {
		ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0 || (size_t)len >= sizeof(pidstr))
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (!h->monitor_full_path)
			continue;

		if (conf && conf->cgroup_meta.dir)
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    conf->cgroup_meta.dir,
						    CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(h->mountpoint,
						    h->container_base_path,
						    CGROUP_PIVOT, NULL);

		ret = mkdir_p(pivot_path, 0755);
		if (ret < 0 && errno != EEXIST) {
			ERROR("Failed to create %s", pivot_path);
			goto try_recursive_destroy;
		}

		ret = lxc_write_openat(pivot_path, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

try_recursive_destroy:
		ret = recursive_destroy(h->monitor_full_path);
		if (ret < 0)
			WARN("Failed to destroy \"%s\"", h->monitor_full_path);
	}
}

static int mkdir_eexist_on_last(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	size_t orig_len;

	orig_len = strlen(dir);
	do {
		__do_free char *makeme = NULL;
		int ret;
		size_t cur_len;

		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");

		cur_len = dir - orig;
		makeme = strndup(orig, cur_len);
		if (!makeme)
			return ret_set_errno(-1, ENOMEM);

		ret = mkdir(makeme, mode);
		if (ret < 0 && ((errno != EEXIST) || (orig_len == cur_len)))
			return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme);
	} while (tmp != dir);

	return 0;
}

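/*
 * Illustrative sketch (not part of the build): mkdir_eexist_on_last() creates
 * every component of an absolute path, tolerating EEXIST everywhere except
 * for the final component.
 */
#if 0
static void example_mkdir_eexist_on_last(void)
{
	/* Succeeds, creating intermediate directories as needed. */
	(void)mkdir_eexist_on_last("/sys/fs/cgroup/memory/lxc.payload.c1", 0755);

	/* Fails with EEXIST, because the last component now exists. */
	(void)mkdir_eexist_on_last("/sys/fs/cgroup/memory/lxc.payload.c1", 0755);
}
#endif
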
static bool create_cgroup_tree(struct hierarchy *h, const char *cgroup_tree,
			       const char *cgroup_leaf, bool payload)
{
	__do_free char *path = NULL;
	int ret, ret_cpuset;

	path = must_make_path(h->mountpoint, h->container_base_path, cgroup_leaf, NULL);
	if (dir_exists(path))
		return log_warn_errno(false, errno, "The %s cgroup already existed", path);

	ret_cpuset = cg_legacy_handle_cpuset_hierarchy(h, cgroup_leaf);
	if (ret_cpuset < 0)
		return log_error_errno(false, errno, "Failed to handle legacy cpuset controller");

	ret = mkdir_eexist_on_last(path, 0755);
	if (ret < 0) {
		/*
		 * This is the cpuset controller and
		 * cg_legacy_handle_cpuset_hierarchy() has created our target
		 * directory for us to ensure correct initialization.
		 */
		if (ret_cpuset != 1 || cgroup_tree)
			return log_error_errno(false, errno, "Failed to create %s cgroup", path);
	}

	if (payload) {
		h->cgfd_con = lxc_open_dirfd(path);
		if (h->cgfd_con < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->container_full_path = move_ptr(path);
	} else {
		h->cgfd_mon = lxc_open_dirfd(path);
		if (h->cgfd_mon < 0)
			return log_error_errno(false, errno, "Failed to open %s", path);
		h->monitor_full_path = move_ptr(path);
	}

	return true;
}

static void cgroup_remove_leaf(struct hierarchy *h, bool payload)
{
	__do_free char *full_path = NULL;

	if (payload) {
		__lxc_unused __do_close int fd = move_fd(h->cgfd_con);
		full_path = move_ptr(h->container_full_path);
	} else {
		__lxc_unused __do_close int fd = move_fd(h->cgfd_mon);
		full_path = move_ptr(h->monitor_full_path);
	}

	if (full_path && rmdir(full_path))
		SYSWARN("Failed to rmdir(\"%s\") cgroup", full_path);
}

__cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, monitor_cgroup, false))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
						      struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__cgroup_tree = NULL;
	const char *cgroup_tree;
	int idx = 0;
	int i;
	size_t len;
	char *suffix;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (conf->cgroup_meta.dir) {
		cgroup_tree = conf->cgroup_meta.dir;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!__cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		cgroup_tree = __cgroup_tree;
		container_cgroup = must_concat(&len, cgroup_tree, "/",
					       DEFAULT_PAYLOAD_CGROUP,
					       CGROUP_CREATE_RETRY, NULL);
	} else {
		cgroup_tree = NULL;
		container_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					       handler->name,
					       CGROUP_CREATE_RETRY, NULL);
	}
	if (!container_cgroup)
		return ret_set_errno(false, ENOMEM);

	suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
	*suffix = '\0';
	do {
		if (idx)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (create_cgroup_tree(ops->hierarchies[i], cgroup_tree, container_cgroup, true))
				continue;

			ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path ?: "(null)");
			for (int j = 0; j < i; j++)
				cgroup_remove_leaf(ops->hierarchies[j], true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000);

	if (idx == 1000)
		return ret_set_errno(false, ERANGE);

	ops->container_cgroup = move_ptr(container_cgroup);
	INFO("The container process uses \"%s\" as cgroup", ops->container_cgroup);
	return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = snprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (handler->transient_pid > 0)
		transient_len = snprintf(transient, sizeof(transient), "%d", handler->transient_pid);

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);

		if (handler->transient_pid < 0)
			return true;

		ret = lxc_writeat(h->cgfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->monitor_full_path);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = snprintf(pidstr, sizeof(pidstr), "%d", handler->pid);

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->cgfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->container_full_path);
	}

	return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
		      gid_t chown_gid, mode_t chmod_mode)
{
	int ret;

	ret = fchownat(dirfd, path, chown_uid, chown_gid,
		       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return log_warn_errno(-1,
				      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chown_uid,
				      (int)chown_gid);

	ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
	if (ret < 0)
		return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chmod_mode);

	return 0;
}

/* chgrp the container cgroups to the container group. We leave the container
 * owner as cgroup owner. So we must make the directories 775 so that the
 * container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not exist depending
 * on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_setgroups(0, NULL) && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->cgfd_con;

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
			continue;

		for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (lxc_list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

__cgfsng_ops void cgfsng_payload_finalize(struct cgroup_ops *ops)
{
	if (!ops)
		return;

	if (!ops->hierarchies)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->cgfd_con);
	}
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int type)
{
	return !(type >= LXC_AUTO_CGROUP_FULL_RO);
}

/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
				       char *controllerpath, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
		ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       controllerpath, controllerpath);

		remount_flags = add_required_remount_flags(controllerpath,
							   controllerpath,
							   flags | MS_REMOUNT);
		ret = mount(controllerpath, controllerpath, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", controllerpath);

		INFO("Remounted %s read-only", controllerpath);
	}

	sourcepath = must_make_path(h->mountpoint, h->container_base_path,
				    container_cgroup, NULL);
	if (type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}

/* __cg_mount_direct
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cg_mount_direct(int type, struct hierarchy *h,
			     const char *controllerpath)
{
	__do_free char *controllers = NULL;
	char *fstype = "cgroup2";
	unsigned long flags = 0;
	int ret;

	flags |= MS_NOSUID;
	flags |= MS_NOEXEC;
	flags |= MS_NODEV;
	flags |= MS_RELATIME;

	if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
		flags |= MS_RDONLY;

	if (h->version != CGROUP2_SUPER_MAGIC) {
		controllers = lxc_string_join(",", (const char **)h->controllers, false);
		if (!controllers)
			return -ENOMEM;
		fstype = "cgroup";
	}

	ret = mount("cgroup", controllerpath, fstype, flags, controllers);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" with cgroup filesystem type %s",
				       controllerpath, fstype);

	DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
	return 0;
}

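/*
 * Illustrative sketch (not part of the build): the two kinds of mount(2)
 * calls that __cg_mount_direct() ends up issuing, for the unified hierarchy
 * and for a made-up v1 hierarchy carrying the cpu and cpuacct controllers.
 */
#if 0
static void example_mount_direct(void)
{
	unsigned long flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RELATIME;

	/* Unified hierarchy: no controller list is needed. */
	(void)mount("cgroup", "/sys/fs/cgroup", "cgroup2", flags, NULL);

	/* Legacy hierarchy: controllers are passed as mount options. */
	(void)mount("cgroup", "/sys/fs/cgroup/cpu,cpuacct", "cgroup", flags,
		    "cpu,cpuacct");
}
#endif
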
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	return __cg_mount_direct(type, h, controllerpath);
}

static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
				       const char *controllerpath)
{
	if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
		return 0;

	return __cg_mount_direct(type, h, controllerpath);
}

__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler,
				      const char *root, int type)
{
	__do_free char *cgroup_root = NULL;
	bool has_cgns = false, wants_force_mount = false;
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	if ((type & LXC_AUTO_CGROUP_MASK) == 0)
		return true;

	if (type & LXC_AUTO_CGROUP_FORCE) {
		type &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	if (!wants_force_mount) {
		if (!lxc_list_empty(&handler->conf->keepcaps))
			wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
		else
			wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
	}

	has_cgns = cgns_supported();
	if (has_cgns && !wants_force_mount)
		return true;

	if (type == LXC_AUTO_CGROUP_NOSPEC)
		type = LXC_AUTO_CGROUP_MIXED;
	else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
		type = LXC_AUTO_CGROUP_FULL_MIXED;

	cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL);
	if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
		if (has_cgns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			return cg_mount_in_cgroup_namespace(type, ops->unified, cgroup_root) == 0;
		}

		return cg_mount_cgroup_full(type, ops->unified, cgroup_root) == 0;
	}

	/* Mount tmpfs */
	ret = safe_mount(NULL, cgroup_root, "tmpfs",
			 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
			 "size=10240k,mode=755", root);
	if (ret < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *controllerpath = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		char *controller = strrchr(h->mountpoint, '/');

		if (!controller)
			continue;
		controller++;

		controllerpath = must_make_path(cgroup_root, controller, NULL);
		if (dir_exists(controllerpath))
			continue;

		ret = mkdir(controllerpath, 0755);
		if (ret < 0)
			return log_error_errno(false, errno, "Error creating cgroup path: %s", controllerpath);

		if (has_cgns && wants_force_mount) {
			/* If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
			if (ret < 0)
				return false;

			continue;
		}

		ret = cg_mount_cgroup_full(type, h, controllerpath);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(type))
			continue;

		path2 = must_make_path(controllerpath, h->container_base_path,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0)
			return false;

		ret = cg_legacy_mount_controllers(type, h, controllerpath,
						  path2, ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}

/* Only root needs to escape to the cgroup of its init. */
__cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
				       struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath =
			must_make_path(ops->hierarchies[i]->mountpoint,
				       ops->hierarchies[i]->container_base_path,
				       "cgroup.procs", NULL);
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}

__cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
{
	int i = 0;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	if (!ops->hierarchies)
		return 0;

	for (; ops->hierarchies[i]; i++)
		;

	return i;
}

__cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n,
						char ***out)
{
	int i;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return ret_set_errno(false, ENOENT);

	/* Sanity check n */
	for (i = 0; i < n; i++)
		if (!ops->hierarchies[i])
			return ret_set_errno(false, ENOENT);

	*out = ops->hierarchies[i]->controllers;

	return true;
}

static bool cg_legacy_freeze(struct cgroup_ops *ops)
{
	struct hierarchy *h;

	h = get_hierarchy(ops, "freezer");
	if (!h)
		return ret_set_errno(-1, ENOENT);

	return lxc_write_openat(h->container_full_path, "freezer.state",
				"FROZEN", STRLITERALLEN("FROZEN"));
}

static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_epoll_descr *descr)
{
	__do_close int duped_fd = -EBADF;
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	size_t len;
	const char *state_string;

	duped_fd = dup(fd);
	if (duped_fd < 0)
		return LXC_MAINLOOP_ERROR;

	if (lseek(duped_fd, 0, SEEK_SET) < 0)
		return LXC_MAINLOOP_ERROR;

1894 f = fdopen(duped_fd, "re");
1895 if (!f)
1896 return LXC_MAINLOOP_ERROR;
1897 move_fd(duped_fd);
1898
1899 if (state == 1)
1900 state_string = "frozen 1";
1901 else
1902 state_string = "frozen 0";
1903
1904 while (getline(&line, &len, f) != -1)
1905 if (strncmp(line, state_string, STRLITERALLEN("frozen") + 2) == 0)
1906 return LXC_MAINLOOP_CLOSE;
1907
1908 return LXC_MAINLOOP_CONTINUE;
1909 }
1910
1911 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1912 {
1913 __do_close int fd = -EBADF;
1914 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1915 int ret;
1916 struct lxc_epoll_descr descr;
1917 struct hierarchy *h;
1918
1919 h = ops->unified;
1920 if (!h)
1921 return ret_set_errno(-1, ENOENT);
1922
1923 if (!h->container_full_path)
1924 return ret_set_errno(-1, EEXIST);
1925
1926 if (timeout != 0) {
1927 __do_free char *events_file = NULL;
1928
1929 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
1930 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1931 if (fd < 0)
1932 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1933
1934 ret = lxc_mainloop_open(&descr);
1935 if (ret)
1936 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container freeze");
1937
1938 /* automatically cleaned up now */
1939 descr_ptr = &descr;
1940
1941 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){1}));
1942 if (ret < 0)
1943 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1944 }
1945
1946 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "1", 1);
1947 if (ret < 0)
1948 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
1949
1950 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1951 return log_error_errno(-1, errno, "Failed to wait for container to be frozen");
1952
1953 return 0;
1954 }
1955
1956 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1957 {
1958 if (!ops->hierarchies)
1959 return ret_set_errno(-1, ENOENT);
1960
1961 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1962 return cg_legacy_freeze(ops);
1963
1964 return cg_unified_freeze(ops, timeout);
1965 }
1966
1967 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1968 {
1969 struct hierarchy *h;
1970
1971 h = get_hierarchy(ops, "freezer");
1972 if (!h)
1973 return ret_set_errno(-1, ENOENT);
1974
1975 return lxc_write_openat(h->container_full_path, "freezer.state",
1976 "THAWED", STRLITERALLEN("THAWED"));
1977 }
1978
1979 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
1980 {
1981 __do_close int fd = -EBADF;
1982 call_cleaner(lxc_mainloop_close)struct lxc_epoll_descr *descr_ptr = NULL;
1983 int ret;
1984 struct lxc_epoll_descr descr;
1985 struct hierarchy *h;
1986
1987 h = ops->unified;
1988 if (!h)
1989 return ret_set_errno(-1, ENOENT);
1990
1991 if (!h->container_full_path)
1992 return ret_set_errno(-1, EEXIST);
1993
1994 if (timeout != 0) {
1995 __do_free char *events_file = NULL;
1996
1997 events_file = must_make_path(h->container_full_path, "cgroup.events", NULL);
1998 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1999 if (fd < 0)
2000 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
2001
2002 ret = lxc_mainloop_open(&descr);
2003 if (ret)
2004 return log_error_errno(-1, errno, "Failed to create epoll instance to wait for container unfreeze");
2005
2006 /* automatically cleaned up now */
2007 descr_ptr = &descr;
2008
2009 ret = lxc_mainloop_add_handler(&descr, fd, freezer_cgroup_events_cb, INT_TO_PTR((int){0}));
2010 if (ret < 0)
2011 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
2012 }
2013
2014 ret = lxc_write_openat(h->container_full_path, "cgroup.freeze", "0", 1);
2015 if (ret < 0)
2016 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
2017
2018 if (timeout != 0 && lxc_mainloop(&descr, timeout))
2019 return log_error_errno(-1, errno, "Failed to wait for container to be unfrozen");
2020
2021 return 0;
2022 }
2023
2024 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2025 {
2026 if (!ops->hierarchies)
2027 return ret_set_errno(-1, ENOENT);
2028
2029 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2030 return cg_legacy_unfreeze(ops);
2031
2032 return cg_unified_unfreeze(ops, timeout);
2033 }
2034
2035 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2036 const char *controller)
2037 {
2038 struct hierarchy *h;
2039
2040 h = get_hierarchy(ops, controller);
2041 if (!h)
2042 return log_warn_errno(NULL, ENOENT, "Failed to find hierarchy for controller \"%s\"",
2043 controller ? controller : "(null)");
2044
2045 return h->container_full_path
2046 ? h->container_full_path + strlen(h->mountpoint)
2047 : NULL;
2048 }
2049
2050 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2051 * which must be freed by the caller.
2052 */
2053 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2054 const char *inpath,
2055 const char *filename)
2056 {
2057 return must_make_path(h->mountpoint, inpath, filename, NULL);
2058 }
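/* Example (hypothetical values): with h->mountpoint "/sys/fs/cgroup/memory",
 * inpath "/lxc/c1" and filename "memory.limit_in_bytes" this yields
 * "/sys/fs/cgroup/memory/lxc/c1/memory.limit_in_bytes".
 */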
2059
2060 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2061 {
2062 int idx = 1;
2063 int ret;
2064 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2065 size_t pidstr_len;
2066
2067 /* Create leaf cgroup. */
2068 ret = mkdirat(unified_fd, "lxc", 0755);
2069 if (ret < 0 && errno != EEXIST)
2070 return log_error_errno(-1, errno, "Failed to create leaf cgroup \"lxc\"");
2071
2072 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2073 ret = lxc_writeat(unified_fd, "lxc/cgroup.procs", pidstr, pidstr_len);
2074 if (ret < 0)
2075 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2076 if (ret == 0)
2077 return 0;
2078
2079 /* this is a non-leaf node */
2080 if (errno != EBUSY)
2081 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2082
2083 do {
2084 bool rm = false;
2085 char attach_cgroup[STRLITERALLEN("lxc-1000/cgroup.procs") + 1];
2086 char *slash;
2087
2088 ret = snprintf(attach_cgroup, sizeof(attach_cgroup), "lxc-%d/cgroup.procs", idx);
2089 slash = &attach_cgroup[ret] - STRLITERALLEN("/cgroup.procs"); /* points at the '/' */
2090 *slash = '\0';
2091
2092 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2093 if (ret < 0 && errno != EEXIST)
2094 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2095 if (ret == 0)
2096 rm = true;
2097
2098 *slash = '/';
2099
2100 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2101 if (ret == 0)
2102 return 0;
2103
2104 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2105 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2106
2107 /* this is a non-leaf node */
2108 if (errno != EBUSY)
2109 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2110
2111 idx++;
2112 } while (idx < 1000);
2113
2114 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2115 }
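/* Illustrative sketch (not part of the build): relative to @unified_fd the
 * function above tries, in order,
 *
 *	lxc/cgroup.procs	(preferred leaf cgroup)
 *	cgroup.procs		(fallback)
 *	lxc-1/cgroup.procs, lxc-2/cgroup.procs, ..., lxc-999/cgroup.procs
 *
 * where EBUSY signals that the target cgroup is not a usable leaf, so a
 * fresh "lxc-<n>" cgroup is created on demand and removed again whenever the
 * attach through it still fails.
 */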
2116
2117 struct userns_exec_unified_attach_data {
2118 const struct lxc_conf *conf;
2119 int unified_fd;
2120 pid_t pid;
2121 };
2122
2123 static int cgroup_unified_attach_wrapper(void *data)
2124 {
2125 struct userns_exec_unified_attach_data *args = data;
2126 uid_t nsuid;
2127 gid_t nsgid;
2128 int ret;
2129
2130 if (!args->conf || args->unified_fd < 0 || args->pid <= 0)
2131 return ret_errno(EINVAL);
2132
2133 if (!lxc_setgroups(0, NULL) && errno != EPERM)
2134 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
2135
2136 nsuid = (args->conf->root_nsuid_map != NULL) ? 0 : args->conf->init_uid;
2137 nsgid = (args->conf->root_nsgid_map != NULL) ? 0 : args->conf->init_gid;
2138
2139 ret = setresgid(nsgid, nsgid, nsgid);
2140 if (ret < 0)
2141 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
2142 (int)nsgid, (int)nsgid, (int)nsgid);
2143
2144 ret = setresuid(nsuid, nsuid, nsuid);
2145 if (ret < 0)
2146 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
2147 (int)nsuid, (int)nsuid, (int)nsuid);
2148
2149 return cgroup_attach_leaf(args->conf, args->unified_fd, args->pid);
2150 }
2151
2152 int cgroup_attach(const struct lxc_conf *conf, const char *name,
2153 const char *lxcpath, pid_t pid)
2154 {
2155 __do_close int unified_fd = -EBADF;
2156 int ret;
2157
2158 if (!conf || !name || !lxcpath || pid <= 0)
2159 return ret_errno(EINVAL);
2160
2161 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
2162 if (unified_fd < 0)
2163 return ret_errno(EBADF);
2164
2165 if (!lxc_list_empty(&conf->id_map)) {
2166 struct userns_exec_unified_attach_data args = {
2167 .conf = conf,
2168 .unified_fd = unified_fd,
2169 .pid = pid,
2170 };
2171
2172 ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
2173 } else {
2174 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2175 }
2176
2177 return ret;
2178 }
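/* Usage sketch (hypothetical names): attaching pid 1234 to a running
 * container "c1" under "/var/lib/lxc" boils down to
 *
 *	ret = cgroup_attach(conf, "c1", "/var/lib/lxc", 1234);
 *
 * where a return value of -EBADF means no cgroup2 fd could be retrieved over
 * the command socket; __cg_unified_attach() below treats exactly that value
 * as its cue to fall back to path-based attachment.
 */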
2179
2180 /* Technically, we're always at a delegation boundary here (this is especially
2181 * true when cgroup namespaces are available). The reasoning is that in order
2182 * for us to have been able to start a container in the first place the root
2183 * cgroup must have been a leaf node. Now, either the container's init system
2184 * has populated the cgroup and kept it as a leaf node or it has created
2185 * subtrees. In the former case we simply attach to the leaf node we created
2186 * when we started the container; in the latter case we create our own cgroup
2187 * for the attaching process.
2188 */
2189 static int __cg_unified_attach(const struct hierarchy *h,
2190 const struct lxc_conf *conf, const char *name,
2191 const char *lxcpath, pid_t pid,
2192 const char *controller)
2193 {
2194 __do_close int unified_fd = -EBADF;
2195 __do_free char *path = NULL, *cgroup = NULL;
2196 int ret;
2197
2198 if (!conf || !name || !lxcpath || pid <= 0)
2199 return ret_errno(EINVAL);
2200
2201 ret = cgroup_attach(conf, name, lxcpath, pid);
2202 if (ret == 0)
2203 return log_trace(0, "Attached to unified cgroup via command handler");
2204 if (ret != -EBADF)
2205 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2206
2207 /* Fall back to retrieving the path for the unified cgroup. */
2208 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2209 /* not running */
2210 if (!cgroup)
2211 return 0;
2212
2213 path = must_make_path(h->mountpoint, cgroup, NULL);
2214
2215 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2216 if (unified_fd < 0)
2217 return ret_errno(EBADF);
2218
2219 if (!lxc_list_empty(&conf->id_map)) {
2220 struct userns_exec_unified_attach_data args = {
2221 .conf = conf,
2222 .unified_fd = unified_fd,
2223 .pid = pid,
2224 };
2225
2226 ret = userns_exec_minimal(conf, cgroup_unified_attach_wrapper, &args);
2227 } else {
2228 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2229 }
2230
2231 return ret;
2232 }
2233
2234 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2235 const struct lxc_conf *conf,
2236 const char *name, const char *lxcpath,
2237 pid_t pid)
2238 {
2239 int len, ret;
2240 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2241
2242 if (!ops)
2243 return ret_set_errno(false, ENOENT);
2244
2245 if (!ops->hierarchies)
2246 return true;
2247
2248 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2249 if (len < 0 || (size_t)len >= sizeof(pidstr))
2250 return false;
2251
2252 for (int i = 0; ops->hierarchies[i]; i++) {
2253 __do_free char *fullpath = NULL, *path = NULL;
2254 struct hierarchy *h = ops->hierarchies[i];
2255
2256 if (h->version == CGROUP2_SUPER_MAGIC) {
2257 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2258 h->controllers[0]);
2259 if (ret < 0)
2260 return false;
2261
2262 continue;
2263 }
2264
2265 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2266 /* not running */
2267 if (!path)
2268 return false;
2269
2270 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2271 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2272 if (ret < 0)
2273 return log_error_errno(false, errno, "Failed to attach %d to %s",
2274 (int)pid, fullpath);
2275 }
2276
2277 return true;
2278 }
2279
2280 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2281 * don't have a cgroup_data set up, so we ask the running container through the
2282 * commands API for the cgroup path.
2283 */
2284 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2285 char *value, size_t len, const char *name,
2286 const char *lxcpath)
2287 {
2288 __do_free char *path = NULL;
2289 __do_free char *controller = NULL;
2290 char *p;
2291 struct hierarchy *h;
2292 int ret = -1;
2293
2294 if (!ops)
2295 return ret_set_errno(-1, ENOENT);
2296
2297 controller = must_copy_string(filename);
2298 p = strchr(controller, '.');
2299 if (p)
2300 *p = '\0';
2301
2302 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2303 /* not running */
2304 if (!path)
2305 return -1;
2306
2307 h = get_hierarchy(ops, controller);
2308 if (h) {
2309 __do_free char *fullpath = NULL;
2310
2311 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2312 ret = lxc_read_from_file(fullpath, value, len);
2313 }
2314
2315 return ret;
2316 }
2317
2318 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2319 {
2320 for (int count = 0; count < 3; count++, val++) {
2321 switch (*val) {
2322 case 'r':
2323 device->access[count] = *val;
2324 break;
2325 case 'w':
2326 device->access[count] = *val;
2327 break;
2328 case 'm':
2329 device->access[count] = *val;
2330 break;
2331 case '\n':
2332 case '\0':
2333 count = 3; /* end of input: terminate the loop */
2334 break;
2335 default:
2336 return ret_errno(EINVAL);
2337 }
2338 }
2339
2340 return 0;
2341 }
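/* Example: for val = "rwm" the loop above stores access = {'r','w','m'};
 * for val = "rw\n" it stops at the newline, leaving access = {'r','w'}; any
 * other character, e.g. the 'x' in "rwx", fails with EINVAL.
 */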
2342
2343 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2344 const char *val)
2345 {
2346 int count, ret;
2347 char temp[50];
2348
2349 if (strcmp("devices.allow", key) == 0)
2350 device->allow = 1;
2351 else
2352 device->allow = 0;
2353
2354 if (strcmp(val, "a") == 0) {
2355 /* global rule */
2356 device->type = 'a';
2357 device->major = -1;
2358 device->minor = -1;
2359 device->global_rule = device->allow
2360 ? LXC_BPF_DEVICE_CGROUP_BLACKLIST
2361 : LXC_BPF_DEVICE_CGROUP_WHITELIST;
2362 device->allow = -1;
2363 return 0;
2364 }
2365
2366 /* local rule */
2367 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2368
2369 switch (*val) {
2370 case 'a':
2371 __fallthrough;
2372 case 'b':
2373 __fallthrough;
2374 case 'c':
2375 device->type = *val;
2376 break;
2377 default:
2378 return -1;
2379 }
2380
2381 val++;
2382 if (!isspace(*val))
2383 return -1;
2384 val++;
2385 if (*val == '*') {
2386 device->major = -1;
2387 val++;
2388 } else if (isdigit(*val)) {
2389 memset(temp, 0, sizeof(temp));
2390 for (count = 0; count < sizeof(temp) - 1; count++) {
2391 temp[count] = *val;
2392 val++;
2393 if (!isdigit(*val))
2394 break;
2395 }
2396 ret = lxc_safe_int(temp, &device->major);
2397 if (ret)
2398 return -1;
2399 } else {
2400 return -1;
2401 }
2402 if (*val != ':')
2403 return -1;
2404 val++;
2405
2406 /* read minor */
2407 if (*val == '*') {
2408 device->minor = -1;
2409 val++;
2410 } else if (isdigit(*val)) {
2411 memset(temp, 0, sizeof(temp));
2412 for (count = 0; count < sizeof(temp) - 1; count++) {
2413 temp[count] = *val;
2414 val++;
2415 if (!isdigit(*val))
2416 break;
2417 }
2418 ret = lxc_safe_int(temp, &device->minor);
2419 if (ret)
2420 return -1;
2421 } else {
2422 return -1;
2423 }
2424 if (!isspace(*val))
2425 return -1;
2426
2427 return device_cgroup_parse_access(device, ++val);
2428 }
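/* Worked example (sketch): a rule such as
 *
 *	lxc.cgroup2.devices.allow = c 1:3 rwm
 *
 * reaches this parser as key "devices.allow", val "c 1:3 rwm" and produces
 *
 *	device->allow	    = 1
 *	device->type	    = 'c'
 *	device->major	    = 1
 *	device->minor	    = 3
 *	device->access	    = "rwm"
 *	device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE
 *
 * while the special value "a" instead yields a global rule with type 'a',
 * major/minor set to -1, and allow reset to -1.
 */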
2429
2430 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2431 * don't have a cgroup_data set up, so we ask the running container through the
2432 * commands API for the cgroup path.
2433 */
2434 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2435 const char *key, const char *value,
2436 const char *name, const char *lxcpath)
2437 {
2438 __do_free char *path = NULL;
2439 __do_free char *controller = NULL;
2440 char *p;
2441 struct hierarchy *h;
2442 int ret = -1;
2443
2444 if (!ops)
2445 return ret_set_errno(-1, ENOENT);
2446
2447 controller = must_copy_string(key);
2448 p = strchr(controller, '.');
2449 if (p)
2450 *p = '\0';
2451
2452 if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) {
2453 struct device_item device = {0};
2454
2455 ret = device_cgroup_rule_parse(&device, key, value);
2456 if (ret < 0)
2457 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2458 key, value);
2459
2460 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2461 if (ret < 0)
2462 return -1;
2463
2464 return 0;
2465 }
2466
2467 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2468 /* not running */
2469 if (!path)
2470 return -1;
2471
2472 h = get_hierarchy(ops, controller);
2473 if (h) {
2474 __do_free char *fullpath = NULL;
2475
2476 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2477 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2478 }
2479
2480 return ret;
2481 }
2482
2483 /* Take a devices cgroup line such as
2484 * /dev/foo rwx
2485 * and convert it to a valid
2486 * type major:minor mode
2487 * line. Return <0 on error. The dest buffer handed to convert_devpath()
2488 * below is preallocated and must be long enough to hold the output.
2489 */
2490 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2491 const char *devpath)
2492 {
2493 __do_free char *path = NULL;
2494 char *mode = NULL;
2495 int n_parts, ret;
2496 char *p;
2497 struct stat sb;
2498
2499 path = must_copy_string(devpath);
2500
2501 /*
2502 * Read path followed by mode. Ignore any trailing text.
2503 * A ' # comment' would be legal. Technically other text is not
2504 * legal; we could check for that if we cared to.
2505 */
2506 for (n_parts = 1, p = path; *p; p++) {
2507 if (*p != ' ')
2508 continue;
2509 *p = '\0';
2510
2511 if (n_parts != 1)
2512 break;
2513 p++;
2514 n_parts++;
2515
2516 while (*p == ' ')
2517 p++;
2518
2519 mode = p;
2520
2521 if (*p == '\0')
2522 return ret_set_errno(-1, EINVAL);
2523 }
2524
2525 if (n_parts == 1)
2526 return ret_set_errno(-1, EINVAL);
2527 
2528 if (device_cgroup_parse_access(device, mode) < 0)
2529 return -1;
2530
2531 ret = stat(path, &sb);
2532 if (ret < 0)
2533 return ret_set_errno(-1, errno);
2534
2535 mode_t m = sb.st_mode & S_IFMT;
2536 switch (m) {
2537 case S_IFBLK:
2538 device->type = 'b';
2539 break;
2540 case S_IFCHR:
2541 device->type = 'c';
2542 break;
2543 default:
2544 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2545 }
2546
2547 device->major = MAJOR(sb.st_rdev);
2548 device->minor = MINOR(sb.st_rdev);
2549 device->allow = 1;
2550 device->global_rule = LXC_BPF_DEVICE_CGROUP_LOCAL_RULE;
2551
2552 return 0;
2553 }
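/* Worked example (sketch): for devpath "/dev/null rwm" the loop splits off
 * mode "rwm", stat("/dev/null") reports a character device, and on a typical
 * system (major 1, minor 3) the result is
 *
 *	device->type = 'c', device->major = 1, device->minor = 3,
 *	device->access = "rwm", device->allow = 1
 */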
2554
2555 static int convert_devpath(const char *invalue, char *dest)
2556 {
2557 struct device_item device = {0};
2558 int ret;
2559
2560 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2561 if (ret < 0)
2562 return -1;
2563
2564 ret = snprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2565 device.minor, device.access);
2566 if (ret < 0 || ret >= 50)
2567 return log_error_errno(-1, ENAMETOOLONG, "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2568 device.type, device.major, device.minor, device.access);
2569
2570 return 0;
2571 }
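/* Usage sketch, continuing the example above:
 *
 *	char buf[50];
 *
 *	if (convert_devpath("/dev/null rwm", buf) == 0)
 *		use_rule(buf);	// buf now holds "c 1:3 rwm"
 *
 * with use_rule() standing in for whatever consumes the converted line.
 */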
2572
2573 /* Called from setup_limits - here we have the container's cgroup_data because
2574 * we created the cgroups.
2575 */
2576 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2577 const char *value)
2578 {
2579 __do_free char *controller = NULL;
2580 char *p;
2581 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2582 char converted_value[50];
2583 struct hierarchy *h;
2584
2585 controller = must_copy_string(filename);
2586 p = strchr(controller, '.');
2587 if (p)
2588 *p = '\0';
2589
2590 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2591 int ret;
2592
2593 ret = convert_devpath(value, converted_value);
2594 if (ret < 0)
2595 return ret;
2596 value = converted_value;
2597 }
2598
2599 h = get_hierarchy(ops, controller);
2600 if (!h)
2601 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2602
2603 return lxc_write_openat(h->container_full_path, filename, value, strlen(value));
2604 }
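/* Example (hypothetical values): a legacy config line such as
 *
 *	lxc.cgroup.memory.limit_in_bytes = 256M
 *
 * arrives here with filename "memory.limit_in_bytes" and value "256M"; the
 * controller name "memory" is split off at the first '.' and the value is
 * written into the container's cgroup on the matching hierarchy.
 */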
2605
2606 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2607 struct lxc_conf *conf,
2608 bool do_devices)
2609 {
2610 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2611 struct lxc_list *cgroup_settings;
2612 struct lxc_list *iterator, *next;
2613 struct lxc_cgroup *cg;
2614 bool ret = false;
2615
2616 if (!ops)
2617 return ret_set_errno(false, ENOENT);
2618
2619 if (!conf)
2620 return ret_set_errno(false, EINVAL);
2621
2622 cgroup_settings = &conf->cgroup;
2623 if (lxc_list_empty(cgroup_settings))
2624 return true;
2625
2626 if (!ops->hierarchies)
2627 return ret_set_errno(false, EINVAL);
2628
2629 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2630 if (!sorted_cgroup_settings)
2631 return false;
2632
2633 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2634 cg = iterator->elem;
2635
2636 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2637 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2638 if (do_devices && (errno == EACCES || errno == EPERM)) {
2639 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2640 continue;
2641 }
2642 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2643 goto out;
2644 }
2645 DEBUG("Set controller \"%s\" to \"%s\"", cg->subsystem, cg->value);
2646 }
2647 }
2648
2649 ret = true;
2650 INFO("Limits for the legacy cgroup hierarchies have been set up");
2651 out:
2652 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2653 lxc_list_del(iterator);
2654 free(iterator);
2655 }
2656
2657 return ret;
2658 }
2659
2660 /*
2661 * Some of the parsing logic comes from the original cgroup device v1
2662 * implementation in the kernel.
2663 */
2664 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2665 struct lxc_conf *conf, const char *key,
2666 const char *val)
2667 {
2668 #ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2669 struct device_item device_item = {0};
2670 int ret;
2671
2672 if (strcmp("devices.allow", key) == 0 && *val == '/')
2673 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2674 else
2675 ret = device_cgroup_rule_parse(&device_item, key, val);
2676 if (ret < 0)
2677 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", key, val);
2678
2679 ret = bpf_list_add_device(conf, &device_item);
2680 if (ret < 0)
2681 return -1;
2682 #endif
2683 return 0;
2684 }
2685
2686 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2687 struct lxc_handler *handler)
2688 {
2689 struct lxc_list *cgroup_settings, *iterator;
2690 struct hierarchy *h;
2691 struct lxc_conf *conf;
2692
2693 if (!ops)
2694 return ret_set_errno(false, ENOENT);
2695
2696 if (!ops->hierarchies)
2697 return true;
2698
2699 if (!ops->container_cgroup)
2700 return ret_set_errno(false, EINVAL);
2701
2702 if (!handler || !handler->conf)
2703 return ret_set_errno(false, EINVAL);
2704 conf = handler->conf;
2705
2706 if (lxc_list_empty(&conf->cgroup2))
2707 return true;
2708 cgroup_settings = &conf->cgroup2;
2709
2710 if (!ops->unified)
2711 return false;
2712 h = ops->unified;
2713
2714 lxc_list_for_each(iterator, cgroup_settings) {
2715 struct lxc_cgroup *cg = iterator->elem;
2716 int ret;
2717
2718 if (strncmp("devices", cg->subsystem, 7) == 0) {
2719 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem,
2720 cg->value);
2721 } else {
2722 ret = lxc_write_openat(h->container_full_path,
2723 cg->subsystem, cg->value,
2724 strlen(cg->value));
2725 if (ret < 0)
2726 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"",
2727 cg->subsystem, cg->value);
2728 }
2729 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2730 }
2731
2732 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2733 }
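/* Example (hypothetical values): on a pure cgroup2 host a line such as
 *
 *	lxc.cgroup2.memory.max = 256M
 *
 * ends up in conf->cgroup2 and is written verbatim to
 * <container_full_path>/memory.max above, while anything starting with
 * "devices" is routed into the bpf device program instead.
 */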
2734
2735 __cgfsng_ops bool cgfsng_devices_activate(struct cgroup_ops *ops,
2736 struct lxc_handler *handler)
2737 {
2738 #ifdef HAVE_STRUCT_BPF_CGROUP_DEV_CTX
2739 __do_bpf_program_free struct bpf_program *devices = NULL;
2740 int ret;
2741 struct lxc_conf *conf;
2742 struct hierarchy *unified;
2743 struct lxc_list *it;
2744 struct bpf_program *devices_old;
2745
2746 if (!ops)
2747 return ret_set_errno(false, ENOENT);
2748
2749 if (!ops->hierarchies)
2750 return true;
2751
2752 if (!ops->container_cgroup)
2753 return ret_set_errno(false, EEXIST);
2754
2755 if (!handler || !handler->conf)
2756 return ret_set_errno(false, EINVAL);
2757 conf = handler->conf;
2758
2759 unified = ops->unified;
2760 if (!unified || !unified->bpf_device_controller ||
2761 !unified->container_full_path || lxc_list_empty(&conf->devices))
2762 return true;
2763
2764 devices = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE);
2765 if (!devices)
2766 return log_error_errno(false, ENOMEM, "Failed to create new bpf program");
2767
2768 ret = bpf_program_init(devices);
2769 if (ret)
2770 return log_error_errno(false, ENOMEM, "Failed to initialize bpf program");
2771
2772 lxc_list_for_each(it, &conf->devices) {
2773 struct device_item *cur = it->elem;
2774
2775 ret = bpf_program_append_device(devices, cur);
2776 if (ret)
2777 return log_error_errno(false, ENOMEM, "Failed to add new rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2778 cur->type,
2779 cur->major,
2780 cur->minor,
2781 cur->access,
2782 cur->allow,
2783 cur->global_rule);
2784 TRACE("Added rule to bpf device program: type %c, major %d, minor %d, access %s, allow %d, global_rule %d",
2785 cur->type,
2786 cur->major,
2787 cur->minor,
2788 cur->access,
2789 cur->allow,
2790 cur->global_rule);
2791 }
2792
2793 ret = bpf_program_finalize(devices);
2794 if (ret)
2795 return log_error_errno(false, ENOMEM, "Failed to finalize bpf program");
2796
2797 ret = bpf_program_cgroup_attach(devices, BPF_CGROUP_DEVICE,
2798 unified->container_full_path,
2799 BPF_F_ALLOW_MULTI);
2800 if (ret)
2801 return log_error_errno(false, ENOMEM, "Failed to attach bpf program");
2802
2803 /* Replace old bpf program. */
2804 devices_old = move_ptr(conf->cgroup2_devices);
2805 conf->cgroup2_devices = move_ptr(devices);
2806 devices = move_ptr(devices_old);
2807 #endif
2808 return true;
2809 }
2810
2811 bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2812 {
2813 __do_free char *add_controllers = NULL, *base_path = NULL;
2814 __do_free_string_list char **parts = NULL;
2815 struct hierarchy *unified = ops->unified;
2816 ssize_t parts_len;
2817 char **it;
2818 size_t full_len = 0;
2819
2820 if (!ops->hierarchies || !pure_unified_layout(ops) ||
2821 !unified->controllers[0])
2822 return true;
2823
2824 /* For now we simply enable all controllers that we have detected by
2825 * creating a string like "+memory +pids +cpu +io".
2826 * TODO: In the near future we might want to support "-<controller>"
2827 * etc. but whether supporting semantics like this makes sense will need
2828 * some thinking.
2829 */
2830 for (it = unified->controllers; it && *it; it++) {
2831 full_len += strlen(*it) + 2;
2832 add_controllers = must_realloc(add_controllers, full_len + 1);
2833
2834 if (unified->controllers[0] == *it)
2835 add_controllers[0] = '\0';
2836
2837 (void)strlcat(add_controllers, "+", full_len + 1);
2838 (void)strlcat(add_controllers, *it, full_len + 1);
2839
2840 if ((it + 1) && *(it + 1))
2841 (void)strlcat(add_controllers, " ", full_len + 1);
2842 }
2843
2844 parts = lxc_string_split(cgroup, '/');
2845 if (!parts)
2846 return false;
2847
2848 parts_len = lxc_array_len((void **)parts);
2849 if (parts_len > 0)
2850 parts_len--;
2851
2852 base_path = must_make_path(unified->mountpoint, unified->container_base_path, NULL);
2853 for (ssize_t i = -1; i < parts_len; i++) {
2854 int ret;
2855 __do_free char *target = NULL;
2856
2857 if (i >= 0)
2858 base_path = must_append_path(base_path, parts[i], NULL);
2859 target = must_make_path(base_path, "cgroup.subtree_control", NULL);
2860 ret = lxc_writeat(-1, target, add_controllers, full_len);
2861 if (ret < 0)
2862 return log_error_errno(false, errno, "Could not enable \"%s\" controllers in the unified cgroup \"%s\"",
2863 add_controllers, target);
2864 TRACE("Enable \"%s\" controllers in the unified cgroup \"%s\"", add_controllers, target);
2865 }
2866
2867 return true;
2868 }
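/* Illustrative sketch: with detected controllers "memory", "pids" and "cpu"
 * the string built above is
 *
 *	+memory +pids +cpu
 *
 * and it is written to cgroup.subtree_control at every level from the
 * container base path down to (but excluding) the final cgroup, e.g.
 * (hypothetical layout)
 *
 *	<mountpoint>/<base>/cgroup.subtree_control
 *	<mountpoint>/<base>/nested/cgroup.subtree_control
 */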
2869
2870 __cgfsng_ops bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2871 {
2872 if (!ops)
2873 return ret_set_errno(false, ENOENT);
2874
2875 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2876 }
2877
2878 __cgfsng_ops bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2879 {
2880 if (!ops)
2881 return ret_set_errno(false, ENOENT);
2882
2883 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2884 }
2885
2886 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2887 char **controllers)
2888 {
2889 if (!ops->cgroup_use)
2890 return true;
2891
2892 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2893 bool found = false;
2894
2895 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2896 if (strcmp(*cur_use, *cur_ctrl) != 0)
2897 continue;
2898
2899 found = true;
2900 break;
2901 }
2902
2903 if (found)
2904 continue;
2905
2906 return false;
2907 }
2908
2909 return true;
2910 }
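/* Example: with lxc.cgroup.use = memory,pids a hierarchy offering only
 * "memory" is kept, while one offering "cpuset" is skipped because "cpuset"
 * never matches an entry in ops->cgroup_use.
 */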
2911
2912 static void cg_unified_delegate(char ***delegate)
2913 {
2914 __do_free char *buf = NULL;
2915 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2916 char *token;
2917 int idx;
2918
2919 buf = read_file("/sys/kernel/cgroup/delegate");
2920 if (!buf) {
2921 for (char **p = standard; p && *p; p++) {
2922 idx = append_null_to_list((void ***)delegate);
2923 (*delegate)[idx] = must_copy_string(*p);
2924 }
2925 SYSWARN("Failed to read /sys/kernel/cgroup/delegate");
2926 return;
2927 }
2928
2929 lxc_iterate_parts (token, buf, " \t\n") {
2930 /*
2931 * We always need to chown this for both cgroup and
2932 * cgroup2.
2933 */
2934 if (strcmp(token, "cgroup.procs") == 0)
2935 continue;
2936
2937 idx = append_null_to_list((void ***)delegate);
2938 (*delegate)[idx] = must_copy_string(token);
2939 }
2940 }
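/* Example: on current kernels /sys/kernel/cgroup/delegate typically lists
 *
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *
 * (newer kernels may add further entries such as memory.oom.group);
 * cgroup.procs is filtered out above because it is chowned unconditionally
 * elsewhere, and the hardcoded fallback mirrors this set minus cgroup.procs.
 */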
2941
2942 /* At startup, cg_hybrid_init() finds all the info we need about cgroup
2943 * mountpoints and current cgroups, and stores it in @ops.
2944 */
2945 static int cg_hybrid_init(struct cgroup_ops *ops, bool relative, bool unprivileged)
2946 {
2947 __do_free char *basecginfo = NULL, *line = NULL;
2948 __do_free_string_list char **klist = NULL, **nlist = NULL;
2949 __do_fclose FILE *f = NULL;
2950 int ret;
2951 size_t len = 0;
2952
2953 /* Root spawned containers escape the current cgroup, so use init's
2954 * cgroups as our base in that case.
2955 */
2956 if (!relative && (geteuid() == 0))
2957 basecginfo = read_file("/proc/1/cgroup");
2958 else
2959 basecginfo = read_file("/proc/self/cgroup");
2960 if (!basecginfo)
2961 return ret_set_errno(-1, ENOMEM);
2962
2963 ret = get_existing_subsystems(&klist, &nlist);
2964 if (ret < 0)
2965 return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers");
2966
2967 f = fopen("/proc/self/mountinfo", "re");
2968 if (!f)
2969 return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\"");
2970
2971 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2972
2973 while (getline(&line, &len, f) != -1) {
2974 __do_free char *base_cgroup = NULL, *mountpoint = NULL;
2975 __do_free_string_list char **controller_list = NULL;
2976 int type;
2977 bool writeable;
2978 struct hierarchy *new;
2979
2980 type = get_cgroup_version(line);
2981 if (type == 0)
2982 continue;
2983
2984 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2985 continue;
2986
2987 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2988 if (type == CGROUP2_SUPER_MAGIC)
2989 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2990 else if (type == CGROUP_SUPER_MAGIC)
2991 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2992 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2993 if (type == CGROUP_SUPER_MAGIC)
2994 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2995 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2996 if (type == CGROUP2_SUPER_MAGIC)
2997 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2998 }
2999
3000 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
3001 if (!controller_list && type == CGROUP_SUPER_MAGIC)
3002 continue;
3003
3004 if (type == CGROUP_SUPER_MAGIC)
3005 if (controller_list_is_dup(ops->hierarchies, controller_list)) {
3006 TRACE("Skipping duplicating controller");
3007 continue;
3008 }
3009
3010 mountpoint = cg_hybrid_get_mountpoint(line);
3011 if (!mountpoint) {
3012 ERROR("Failed parsing mountpoint from \"%s\"", line);
3013 continue;
3014 }
3015
3016 if (type == CGROUP_SUPER_MAGIC)
3017 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
3018 else
3019 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
3020 if (!base_cgroup) {
3021 ERROR("Failed to find current cgroup");
3022 continue;
3023 }
3024
3025 trim(base_cgroup);
3026 prune_init_scope(base_cgroup);
3027 if (type == CGROUP2_SUPER_MAGIC)
3028 writeable = test_writeable_v2(mountpoint, base_cgroup);
3029 else
3030 writeable = test_writeable_v1(mountpoint, base_cgroup);
3031 if (!writeable) {
3032 TRACE("The %s group is not writeable", base_cgroup);
3033 continue;
3034 }
3035
3036 if (type == CGROUP2_SUPER_MAGIC) {
3037 char *cgv2_ctrl_path;
3038
3039 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
3040 "cgroup.controllers",
3041 NULL);
3042
3043 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
3044 free(cgv2_ctrl_path);
3045 if (!controller_list) {
3046 controller_list = cg_unified_make_empty_controller();
3047 TRACE("No controllers are enabled for "
3048 "delegation in the unified hierarchy");
3049 }
3050 }
3051
3052 /* Exclude all controllers that cgroup use does not want. */
3053 if (!cgroup_use_wants_controllers(ops, controller_list)) {
3054 TRACE("Skipping controller");
3055 continue;
3056 }
3057
3058 new = add_hierarchy(&ops->hierarchies, move_ptr(controller_list), move_ptr(mountpoint), move_ptr(base_cgroup), type);
3059 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
3060 if (unprivileged)
3061 cg_unified_delegate(&new->cgroup2_chown);
3062 ops->unified = new;
3063 }
3064 }
3065
3066 TRACE("Writable cgroup hierarchies:");
3067 lxc_cgfsng_print_hierarchies(ops);
3068
3069 /* verify that all controllers in cgroup.use and all crucial
3070 * controllers are accounted for
3071 */
3072 if (!all_controllers_found(ops))
3073 return log_error_errno(-1, ENOENT, "Failed to find all required controllers");
3074
3075 return 0;
3076 }
3077
3078 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
3079 static char *cg_unified_get_current_cgroup(bool relative)
3080 {
3081 __do_free char *basecginfo = NULL;
3082 char *copy;
3083 char *base_cgroup;
3084
3085 if (!relative && (geteuid() == 0))
3086 basecginfo = read_file("/proc/1/cgroup");
3087 else
3088 basecginfo = read_file("/proc/self/cgroup");
3089 if (!basecginfo)
3090 return NULL;
3091
3092 base_cgroup = strstr(basecginfo, "0::/");
3093 if (!base_cgroup)
3094 return NULL;
3095
3096 base_cgroup = base_cgroup + 3;
3097 copy = copy_to_eol(base_cgroup);
3098 if (!copy)
3099 return NULL;
3100
3101 return trim(copy);
3102 }
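/* Example: on a pure cgroup2 host /proc/self/cgroup contains a single line
 * like
 *
 *	0::/user.slice/user-1000.slice/session-1.scope
 *
 * strstr() locates "0::/", the "+ 3" skips past "0::", and copy_to_eol()
 * hands back "/user.slice/user-1000.slice/session-1.scope" for trimming.
 */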
3103
3104 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
3105 bool unprivileged)
3106 {
3107 __do_free char *subtree_path = NULL;
3108 int ret;
3109 char *mountpoint;
3110 char **delegatable;
3111 struct hierarchy *new;
3112 char *base_cgroup = NULL;
3113
3114 ret = unified_cgroup_hierarchy();
3115 if (ret == -ENOMEDIUM)
3116 return ret_errno(ENOMEDIUM);
3117
3118 if (ret != CGROUP2_SUPER_MAGIC)
3119 return 0;
3120
3121 base_cgroup = cg_unified_get_current_cgroup(relative);
3122 if (!base_cgroup)
3123 return ret_errno(EINVAL);
3124 if (!relative)
3125 prune_init_scope(base_cgroup);
3126
3127 /*
3128 * We assume that the cgroup we're currently in has been delegated to
3129 * us and we are free to delegate all of the controllers listed
3130 * in cgroup.controllers further down the hierarchy.
3131 */
3132 mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT);
3133 subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL);
3134 delegatable = cg_unified_get_controllers(subtree_path);
3135 if (!delegatable)
3136 delegatable = cg_unified_make_empty_controller();
3137 if (!delegatable[0])
3138 TRACE("No controllers are enabled for delegation");
3139
3140 /* TODO: If the user requested specific controllers via lxc.cgroup.use
3141 * we should verify that here. The reason I'm not doing it right now is
3142 * that I'm not convinced that lxc.cgroup.use will be the future since it
3143 * is a global property. I'd much rather have an option that lets you
3144 * request controllers per container.
3145 */
3146
3147 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
3148 if (unprivileged)
3149 cg_unified_delegate(&new->cgroup2_chown);
3150
3151 if (bpf_devices_cgroup_supported())
3152 new->bpf_device_controller = 1;
3153
3154 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3155 ops->unified = new;
3156
3157 return CGROUP2_SUPER_MAGIC;
3158 }
3159
3160 static int cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
3161 {
3162 int ret;
3163 const char *tmp;
3164 bool relative = conf->cgroup_meta.relative;
3165
3166 tmp = lxc_global_config_value("lxc.cgroup.use");
3167 if (tmp) {
3168 __do_free char *pin = NULL;
3169 char *chop, *cur;
3170
3171 pin = must_copy_string(tmp);
3172 chop = pin;
3173
3174 lxc_iterate_parts(cur, chop, ",")
3175 must_append_string(&ops->cgroup_use, cur);
3176 }
3177
3178 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
3179 if (ret < 0)
3180 return -1;
3181
3182 if (ret == CGROUP2_SUPER_MAGIC)
3183 return 0;
3184
3185 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
3186 }
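/* Example: lxc.cgroup.use = memory,pids is split on ',' above, leaving
 * ops->cgroup_use = {"memory", "pids", NULL}; cg_unified_init() then decides
 * whether the host is pure cgroup2 (CGROUP2_SUPER_MAGIC) or whether
 * cg_hybrid_init() needs to walk the legacy hierarchies as well.
 */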
3187
3188 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3189 {
3190 const char *cgroup_pattern;
3191
3192 if (!ops)
3193 return ret_set_errno(-1, ENOENT);
3194
3195 /* copy system-wide cgroup information */
3196 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3197 if (cgroup_pattern && strcmp(cgroup_pattern, "") != 0)
3198 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
3199
3200 return 0;
3201 }
3202
3203 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
3204 {
3205 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3206
3207 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
3208 if (!cgfsng_ops)
3209 return ret_set_errno(NULL, ENOMEM);
3210
3211 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
3212 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3213
3214 if (cg_init(cgfsng_ops, conf))
3215 return NULL;
3216
3217 cgfsng_ops->data_init = cgfsng_data_init;
3218 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3219 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3220 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3221 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3222 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3223 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3224 cgfsng_ops->payload_create = cgfsng_payload_create;
3225 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3226 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3227 cgfsng_ops->escape = cgfsng_escape;
3228 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
3229 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
3230 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3231 cgfsng_ops->get = cgfsng_get;
3232 cgfsng_ops->set = cgfsng_set;
3233 cgfsng_ops->freeze = cgfsng_freeze;
3234 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3235 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3236 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3237 cgfsng_ops->driver = "cgfsng";
3238 cgfsng_ops->version = "1.0.0";
3239 cgfsng_ops->attach = cgfsng_attach;
3240 cgfsng_ops->chown = cgfsng_chown;
3241 cgfsng_ops->mount = cgfsng_mount;
3242 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3243
3244 return move_ptr(cgfsng_ops);
3245 }
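/* Usage sketch (assuming a populated struct lxc_conf *conf):
 *
 *	struct cgroup_ops *ops = cgfsng_ops_init(conf);
 *	if (!ops)
 *		return log_error_errno(NULL, errno, "Failed to initialize cgroup driver");
 *	ret = ops->data_init(ops);
 *
 * mirroring how the generic cgroup layer is expected to bootstrap the driver
 * before calling into the ops table filled in above.
 */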