]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
Merge pull request #3891 from brauner/2021-07-01.fixes
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "error_utils.h"
44 #include "log.h"
45 #include "macro.h"
46 #include "mainloop.h"
47 #include "memory_utils.h"
48 #include "mount_utils.h"
49 #include "storage/storage.h"
50 #include "string_utils.h"
51 #include "syscall_wrappers.h"
52 #include "utils.h"
53
54 #ifndef HAVE_STRLCPY
55 #include "include/strlcpy.h"
56 #endif
57
58 #ifndef HAVE_STRLCAT
59 #include "include/strlcat.h"
60 #endif
61
62 lxc_log_define(cgfsng, cgroup);
63
/*
 * Grow a NULL-terminated pointer array by one usable slot.
 *
 * Reallocates the array behind @list, appends a NULL terminator, and
 * returns the index of the freshly opened slot (which the caller must
 * fill in). Returns -ENOMEM (with errno set) if the reallocation fails;
 * the original list is left intact in that case.
 */
static int list_add(void ***list)
{
	void **grown;
	int entries = 0;

	/* Count existing entries, if any. */
	if (*list)
		while ((*list)[entries])
			entries++;

	/* One new usable slot plus the trailing NULL terminator. */
	grown = realloc(*list, (entries + 2) * sizeof(void *));
	if (!grown) {
		errno = ENOMEM;
		return -ENOMEM;
	}

	grown[entries + 1] = NULL;
	*list = grown;

	return entries;
}
88
/*
 * Return true if @entry matches one of the strings in the NULL-terminated
 * array @list. A NULL @list matches nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **cur = list; *cur; cur++)
		if (strcmp(*cur, entry) == 0)
			return true;

	return false;
}
103
/*
 * Given a handler's cgroup data, return the struct hierarchy for the
 * controller @controller, or NULL (with errno set to ENOENT) if there is
 * none. Passing a NULL @controller requests the empty unified hierarchy.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2: "devices" and "freezer" have no
		 * legacy-style controller on cgroup2 and are served by
		 * utility controllers on the unified hierarchy instead.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				/* No replacement available: fall through to the warning. */
				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
150
151 int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
152 {
153 int dfd;
154 const struct hierarchy *h;
155
156 h = get_hierarchy(ops, fd->controller);
157 if (!h)
158 return ret_errno(ENOENT);
159
160 /*
161 * The client requested that the controller must be in a specific
162 * cgroup version.
163 */
164 if (fd->type != 0 && fd->type != h->fs_type)
165 return ret_errno(EINVAL);
166
167 if (limit)
168 dfd = h->dfd_con;
169 else
170 dfd = h->dfd_lim;
171 if (dfd < 0)
172 return ret_errno(EBADF);
173
174 fd->layout = ops->cgroup_layout;
175 fd->type = h->fs_type;
176 if (fd->type == UNIFIED_HIERARCHY)
177 fd->utilities = h->utilities;
178 fd->fd = dfd;
179
180 return 0;
181 }
182
/* Taken over modified from the kernel sources. */
#define NBITS 32 /* bits in uint32_t */
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)

/*
 * Set @bit in the bit array @bitarr.
 * The shifted constant must be unsigned: "1 << 31" left-shifts into the
 * sign bit of a 32-bit int, which is undefined behaviour in C.
 */
static void set_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] |= (UINT32_C(1) << (bit % NBITS));
}

/* Clear @bit in the bit array @bitarr. */
static void clear_bit(unsigned bit, uint32_t *bitarr)
{
	bitarr[bit / NBITS] &= ~(UINT32_C(1) << (bit % NBITS));
}

/* Return whether @bit is set in the bit array @bitarr. */
static bool is_set(unsigned bit, uint32_t *bitarr)
{
	return (bitarr[bit / NBITS] & (UINT32_C(1) << (bit % NBITS))) != 0;
}
202
/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 *
 * Parses @buf in place (lxc_iterate_parts mutates the buffer). Returns a
 * heap-allocated array of BITS_TO_LONGS(@nbits) uint32_t words, or NULL
 * with errno set on failure. Cpu numbers >= @nbits are rejected with
 * EINVAL, so no bit at or above @nbits is ever set.
 */
static uint32_t *lxc_cpumask(char *buf, size_t nbits)
{
	__do_free uint32_t *bitarr = NULL;
	char *token;
	size_t arrlen;

	arrlen = BITS_TO_LONGS(nbits);
	bitarr = calloc(arrlen, sizeof(uint32_t));
	if (!bitarr)
		return ret_set_errno(NULL, ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		errno = 0;
		unsigned end, start;
		char *range;

		/*
		 * NOTE(review): errno is zeroed but the strtoul() results
		 * are never validated (no endptr/errno check); a malformed
		 * token silently parses as 0. Presumably acceptable because
		 * callers pass kernel-generated cpulists — confirm.
		 */
		start = strtoul(token, NULL, 0);
		end = start;
		/* A token may be a single cpu ("0") or a range ("2-3"). */
		range = strchr(token, '-');
		if (range)
			end = strtoul(range + 1, NULL, 0);

		if (!(start <= end))
			return ret_set_errno(NULL, EINVAL);

		if (end >= nbits)
			return ret_set_errno(NULL, EINVAL);

		/* Mark every cpu in the inclusive [start, end] range. */
		while (start <= end)
			set_bit(start++, bitarr);
	}

	return move_ptr(bitarr);
}
245
246 /* Turn cpumask into simple, comma-separated cpulist. */
247 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
248 {
249 __do_free_string_list char **cpulist = NULL;
250 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
251 int ret;
252
253 for (size_t i = 0; i <= nbits; i++) {
254 if (!is_set(i, bitarr))
255 continue;
256
257 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
258 if (ret < 0)
259 return NULL;
260
261 ret = lxc_append_string(&cpulist, numstr);
262 if (ret < 0)
263 return ret_set_errno(NULL, ENOMEM);
264 }
265
266 if (!cpulist)
267 return ret_set_errno(NULL, ENOMEM);
268
269 return lxc_string_join(",", (const char **)cpulist, false);
270 }
271
/*
 * Return the highest cpu number mentioned in the cpulist @cpulist
 * (e.g. "0,2-3" -> 3), or -1 on parse failure.
 *
 * The last cpu number in the list is the text following whichever of the
 * last ',' and the last '-' occurs later in the string (or the whole
 * string if neither occurs). The previous implementation compared the two
 * pointers even when one was NULL — relational comparison of unrelated or
 * null pointers is undefined behaviour — and carried an unreachable
 * "else if (!c1 && c2)" branch; the explicit case analysis below is
 * behaviorally identical and well-defined.
 */
static ssize_t get_max_cpus(char *cpulist)
{
	char *after_comma, *after_dash, *last;
	size_t cpus = 0;

	after_comma = strrchr(cpulist, ',');
	if (after_comma)
		after_comma++;

	after_dash = strrchr(cpulist, '-');
	if (after_dash)
		after_dash++;

	if (!after_comma && !after_dash)
		last = cpulist;		/* single cpu, e.g. "4" */
	else if (!after_comma)
		last = after_dash;	/* pure range, e.g. "0-7" */
	else if (!after_dash)
		last = after_comma;	/* pure list, e.g. "0,4" */
	else
		last = (after_comma > after_dash) ? after_comma : after_dash;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return cpus;
}
302
/* Return true if hierarchy @h is the cgroup2 (unified) hierarchy. */
static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->fs_type == UNIFIED_HIERARCHY;
}
307
308 /* Return true if the controller @entry is found in the null-terminated list of
309 * hierarchies @hlist.
310 */
311 static bool controller_available(struct hierarchy **hlist, char *entry)
312 {
313 if (!hlist)
314 return false;
315
316 for (int i = 0; hlist[i]; i++)
317 if (string_in_list(hlist[i]->controllers, entry))
318 return true;
319
320 return false;
321 }
322
323 static bool controllers_available(struct cgroup_ops *ops)
324 {
325 struct hierarchy **hlist;
326
327 if (!ops->cgroup_use)
328 return true;
329
330 hlist = ops->hierarchies;
331 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
332 if (!controller_available(hlist, *cur))
333 return log_error(false, "The %s controller found", *cur);
334
335 return true;
336 }
337
338 static char **list_new(void)
339 {
340 __do_free_string_list char **list = NULL;
341 int idx;
342
343 idx = list_add((void ***)&list);
344 if (idx < 0)
345 return NULL;
346
347 list[idx] = NULL;
348 return move_ptr(list);
349 }
350
351 static int list_add_string(char ***list, char *entry)
352 {
353 __do_free char *dup = NULL;
354 int idx;
355
356 dup = strdup(entry);
357 if (!dup)
358 return ret_errno(ENOMEM);
359
360 idx = list_add((void ***)list);
361 if (idx < 0)
362 return idx;
363
364 (*list)[idx] = move_ptr(dup);
365 return 0;
366 }
367
368 static char **list_add_controllers(char *controllers)
369 {
370 __do_free_string_list char **list = NULL;
371 char *it;
372
373 lxc_iterate_parts(it, controllers, ", \t\n") {
374 int ret;
375
376 ret = list_add_string(&list, it);
377 if (ret < 0)
378 return NULL;
379 }
380
381 return move_ptr(list);
382 }
383
384 static char **unified_controllers(int dfd, const char *file)
385 {
386 __do_free char *buf = NULL;
387
388 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
389 if (!buf)
390 return NULL;
391
392 return list_add_controllers(buf);
393 }
394
395 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
396 {
397 if (!ops->cgroup_use)
398 return false;
399
400 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
401 bool found = false;
402
403 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
404 if (!strequal(*cur_use, *cur_ctrl))
405 continue;
406
407 found = true;
408 break;
409 }
410
411 if (found)
412 continue;
413
414 return true;
415 }
416
417 return false;
418 }
419
420 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
421 int dfd_base, char *base_cgroup,
422 char **controllers, cgroupfs_type_magic_t fs_type)
423 {
424 __do_free struct hierarchy *new = NULL;
425 int idx;
426
427 if (abspath(base_cgroup))
428 return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
429
430 new = zalloc(sizeof(*new));
431 if (!new)
432 return ret_errno(ENOMEM);
433
434 new->dfd_con = -EBADF;
435 new->dfd_lim = -EBADF;
436 new->dfd_mon = -EBADF;
437
438 new->fs_type = fs_type;
439 new->controllers = controllers;
440 new->at_mnt = mnt;
441 new->at_base = base_cgroup;
442
443 new->dfd_mnt = dfd_mnt;
444 new->dfd_base = dfd_base;
445
446 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
447 mnt, maybe_empty(base_cgroup));
448 for (char *const *it = new->controllers; it && *it; it++)
449 TRACE("The hierarchy contains the %s controller", *it);
450
451 idx = list_add((void ***)&ops->hierarchies);
452 if (idx < 0)
453 return ret_errno(idx);
454
455 if (fs_type == UNIFIED_HIERARCHY)
456 ops->unified = new;
457 (ops->hierarchies)[idx] = move_ptr(new);
458
459 return 0;
460 }
461
462 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
463 {
464 if (!path_prune || !hierarchies)
465 return 0;
466
467 for (int i = 0; hierarchies[i]; i++) {
468 struct hierarchy *h = hierarchies[i];
469 int ret;
470
471 ret = cgroup_tree_prune(h->dfd_base, path_prune);
472 if (ret < 0)
473 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
474 else
475 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
476
477 free_equal(h->path_lim, h->path_con);
478 }
479
480 return 0;
481 }
482
/* Argument bundle for helpers executed via userns_exec_1(). */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune;         /* cgroup path to prune in each hierarchy */
	struct lxc_conf *conf;          /* container configuration (id maps etc.) */
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};
490
/*
 * Run inside the container's user namespace (via userns_exec_1) to remove
 * the container's cgroup trees with the credentials of the mapped root
 * (or the configured init ids when no root mapping exists).
 */
static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	/* EPERM is tolerated: dropping supplementary groups is best-effort. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Switch gid before uid; after setresuid() we may lack the
	 * privilege to change groups. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}
513
/*
 * Destroy the container's (payload) cgroup trees in every hierarchy. Any
 * attached cgroup2 device bpf program is detached first. When an id
 * mapping is configured the removal runs inside the user namespace so the
 * mapped root performs it.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy if no hierarchy was ever discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	/* Detach the device bpf program before pruning the cgroup tree. */
	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!lxc_list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
561
562 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
563 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
564 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
565 bool am_initialized)
566 {
567 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
568 *offlinecpus = NULL, *posscpus = NULL;
569 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
570 *possmask = NULL;
571 int ret;
572 ssize_t i;
573 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
574 bool flipped_bit = false;
575
576 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
577 if (!posscpus)
578 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
579
580 /* Get maximum number of cpus found in possible cpuset. */
581 maxposs = get_max_cpus(posscpus);
582 if (maxposs < 0 || maxposs >= INT_MAX - 1)
583 return false;
584
585 if (file_exists(__ISOL_CPUS)) {
586 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
587 if (!isolcpus)
588 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
589
590 if (isdigit(isolcpus[0])) {
591 /* Get maximum number of cpus found in isolated cpuset. */
592 maxisol = get_max_cpus(isolcpus);
593 if (maxisol < 0 || maxisol >= INT_MAX - 1)
594 return false;
595 }
596
597 if (maxposs < maxisol)
598 maxposs = maxisol;
599 maxposs++;
600 } else {
601 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
602 }
603
604 if (file_exists(__OFFLINE_CPUS)) {
605 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
606 if (!offlinecpus)
607 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
608
609 if (isdigit(offlinecpus[0])) {
610 /* Get maximum number of cpus found in offline cpuset. */
611 maxoffline = get_max_cpus(offlinecpus);
612 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
613 return false;
614 }
615
616 if (maxposs < maxoffline)
617 maxposs = maxoffline;
618 maxposs++;
619 } else {
620 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
621 }
622
623 if ((maxisol == 0) && (maxoffline == 0)) {
624 cpulist = move_ptr(posscpus);
625 goto copy_parent;
626 }
627
628 possmask = lxc_cpumask(posscpus, maxposs);
629 if (!possmask)
630 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
631
632 if (maxisol > 0) {
633 isolmask = lxc_cpumask(isolcpus, maxposs);
634 if (!isolmask)
635 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
636 }
637
638 if (maxoffline > 0) {
639 offlinemask = lxc_cpumask(offlinecpus, maxposs);
640 if (!offlinemask)
641 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
642 }
643
644 for (i = 0; i <= maxposs; i++) {
645 if ((isolmask && !is_set(i, isolmask)) ||
646 (offlinemask && !is_set(i, offlinemask)) ||
647 !is_set(i, possmask))
648 continue;
649
650 flipped_bit = true;
651 clear_bit(i, possmask);
652 }
653
654 if (!flipped_bit) {
655 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
656 TRACE("No isolated or offline cpus present in cpuset");
657 } else {
658 cpulist = move_ptr(posscpus);
659 TRACE("Removed isolated or offline cpus from cpuset");
660 }
661 if (!cpulist)
662 return log_error_errno(false, errno, "Failed to create cpu list");
663
664 copy_parent:
665 if (!am_initialized) {
666 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
667 if (ret < 0)
668 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
669
670 TRACE("Copied cpu settings of parent cgroup");
671 }
672
673 return true;
674 }
675
/*
 * Set up the legacy cpuset controller for a new child cgroup @dfd_next
 * below the base cgroup @dfd_base: seed cpuset.cpus and cpuset.mems from
 * the parent and turn on clone_children inheritance.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/*
	 * Determine whether the base cgroup has cpuset
	 * inheritance turned on.
	 */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/*
	 * Initialize cpuset.cpus and remove any isolated
	 * and offline cpus.
	 */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* ... and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* ... and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
714
/*
 * Create the cgroup directory chain @path below @dfd_base, one component
 * at a time, and return an fd for the final directory. Intermediate
 * components may already exist; whether the FINAL component may exist is
 * controlled by @eexist_ignore. When @cpuset_v1 is true the first created
 * level is initialized for the legacy cpuset controller.
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	/* Work on a mutable copy; lxc_iterate_parts tokenizes in place. */
	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component already existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syserror_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
776
/*
 * Create the limit (and, with isolation, the separate leaf) cgroup for
 * @h and record the resulting fds/paths on the hierarchy. For @payload
 * the container cgroups are created, otherwise the monitor cgroup.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	__do_free char *path = NULL, *limit_path = NULL;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syserror_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		TRACE("Created limit cgroup %d->%d(%s)",
		      fd_limit, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_error(false, "Failed to setup legacy device limits");

		limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		path = must_make_path(limit_path, cgroup_leaf, NULL);

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
		}
	} else {
		path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
	}
	if (fd_final < 0)
		return syserror_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

	if (payload) {
		h->dfd_con = move_fd(fd_final);
		h->path_con = move_ptr(path);

		/* Without a separate limit cgroup both fds/paths coincide. */
		if (fd_limit < 0)
			h->dfd_lim = h->dfd_con;
		else
			h->dfd_lim = move_fd(fd_limit);

		if (limit_path)
			h->path_lim = move_ptr(limit_path);
		else
			h->path_lim = h->path_con;
	} else {
		h->dfd_mon = move_fd(fd_final);
	}

	return true;
}
852
853 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
854 bool payload)
855 {
856 bool prune = true;
857
858 if (payload) {
859 /* Check whether we actually created the cgroup to prune. */
860 if (h->dfd_lim < 0)
861 prune = false;
862
863 free_equal(h->path_con, h->path_lim);
864 close_equal(h->dfd_con, h->dfd_lim);
865 } else {
866 /* Check whether we actually created the cgroup to prune. */
867 if (h->dfd_mon < 0)
868 prune = false;
869
870 close_prot_errno_disarm(h->dfd_mon);
871 }
872
873 /* We didn't create this cgroup. */
874 if (!prune)
875 return;
876
877 if (cgroup_tree_prune(h->dfd_base, path_prune))
878 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
879 else
880 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
881 }
882
/*
 * Destroy the monitor cgroup in every hierarchy. Because the monitor
 * process may still sit inside that cgroup it is first moved into a pivot
 * cgroup; only then is the monitor cgroup tree pruned.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy if no hierarchy was ever discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the most specific configured pivot location. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		/* The legacy cpuset controller needs initialization on creation. */
		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor out of the cgroup we are about to prune. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
960
/*
 * Validate the lxc.cgroup.dir.* configuration: the new split options
 * (monitor/payload/namespace dirs) must not be mixed with the plain
 * lxc.cgroup.dir, and monitor and payload must be set together.
 *
 * Returns true if the configuration is consistent, false (with errno set
 * to EINVAL) otherwise. (The old comment claiming a "prefix length"
 * return value was stale — the function returns bool.)
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* none of the new options are set, all is fine */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* some are set, make sure lxc.cgroup.dir is not also set*/
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* make sure both monitor and payload are set */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
990
991 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
992 {
993 __do_free char *monitor_cgroup = NULL;
994 int idx = 0;
995 int i;
996 size_t len;
997 char *suffix = NULL;
998 struct lxc_conf *conf;
999
1000 if (!ops)
1001 return ret_set_errno(false, ENOENT);
1002
1003 if (!ops->hierarchies)
1004 return true;
1005
1006 if (ops->monitor_cgroup)
1007 return ret_set_errno(false, EEXIST);
1008
1009 if (!handler || !handler->conf)
1010 return ret_set_errno(false, EINVAL);
1011
1012 conf = handler->conf;
1013
1014 if (!check_cgroup_dir_config(conf))
1015 return false;
1016
1017 if (conf->cgroup_meta.monitor_dir) {
1018 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
1019 } else if (conf->cgroup_meta.dir) {
1020 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1021 DEFAULT_MONITOR_CGROUP_PREFIX,
1022 handler->name,
1023 CGROUP_CREATE_RETRY, NULL);
1024 } else if (ops->cgroup_pattern) {
1025 __do_free char *cgroup_tree = NULL;
1026
1027 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1028 if (!cgroup_tree)
1029 return ret_set_errno(false, ENOMEM);
1030
1031 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1032 DEFAULT_MONITOR_CGROUP,
1033 CGROUP_CREATE_RETRY, NULL);
1034 } else {
1035 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1036 handler->name,
1037 CGROUP_CREATE_RETRY, NULL);
1038 }
1039 if (!monitor_cgroup)
1040 return ret_set_errno(false, ENOMEM);
1041
1042 if (!conf->cgroup_meta.monitor_dir) {
1043 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1044 *suffix = '\0';
1045 }
1046 do {
1047 if (idx && suffix)
1048 sprintf(suffix, "-%d", idx);
1049
1050 for (i = 0; ops->hierarchies[i]; i++) {
1051 if (cgroup_tree_create(ops, handler->conf,
1052 ops->hierarchies[i],
1053 monitor_cgroup, NULL, false))
1054 continue;
1055
1056 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1057 for (int j = 0; j <= i; j++)
1058 cgroup_tree_prune_leaf(ops->hierarchies[j],
1059 monitor_cgroup, false);
1060
1061 idx++;
1062 break;
1063 }
1064 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1065
1066 if (idx == 1000 || (!suffix && idx != 0))
1067 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1068
1069 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1070 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1071 }
1072
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * When lxc.cgroup.dir.container is set together with a namespace_dir the
 * payload gets a separate limit cgroup above the namespace leaf; in all
 * other cases the limit and payload cgroup are the same.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Without hierarchies there is nothing to create. */
	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	/* Choose the cgroup name by decreasing specificity of configuration. */
	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Separate limit cgroup with the leaf below it. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/*
	 * Cut off the CGROUP_CREATE_RETRY placeholder and remember where
	 * to write the "-<idx>" retry suffix. An explicitly configured
	 * container_dir is used verbatim, no retries.
	 */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			/* Undo the hierarchies already created this round. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1182
/*
 * Move the monitor process (and, if present, the transient process) into the
 * previously created monitor cgroup of every hierarchy.
 */
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	/* Buffers large enough for a decimal pid_t. */
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	/* A transient_pid <= 0 means there is no transient process to move. */
	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		/* Attach the monitor pid to this hierarchy's monitor cgroup. */
		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	/* Mark the transient process as handled. */
	handler->transient_pid = -1;

	return true;
}
1244
1245 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1246 struct lxc_handler *handler)
1247 {
1248 int len;
1249 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1250
1251 if (!ops)
1252 return ret_set_errno(false, ENOENT);
1253
1254 if (!ops->hierarchies)
1255 return true;
1256
1257 if (!ops->container_cgroup)
1258 return ret_set_errno(false, ENOENT);
1259
1260 if (!handler || !handler->conf)
1261 return ret_set_errno(false, EINVAL);
1262
1263 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1264 if (len < 0)
1265 return false;
1266
1267 for (int i = 0; ops->hierarchies[i]; i++) {
1268 struct hierarchy *h = ops->hierarchies[i];
1269 int ret;
1270
1271 if (is_unified_hierarchy(h) &&
1272 (handler->clone_flags & CLONE_INTO_CGROUP))
1273 continue;
1274
1275 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1276 if (ret != 0)
1277 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1278
1279 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1280 }
1281
1282 return true;
1283 }
1284
1285 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1286 gid_t chown_gid, mode_t chmod_mode)
1287 {
1288 int ret;
1289
1290 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1291 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1292 if (ret < 0)
1293 return log_warn_errno(-1,
1294 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1295 dirfd, path, (int)chown_uid,
1296 (int)chown_gid);
1297
1298 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1299 if (ret < 0)
1300 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1301 dirfd, path, (int)chmod_mode);
1302
1303 return 0;
1304 }
1305
1306 /* chgrp the container cgroups to container group. We leave
1307 * the container owner as cgroup owner. So we must make the
1308 * directories 775 so that the container can create sub-cgroups.
1309 *
1310 * Also chown the tasks and cgroup.procs files. Those may not
1311 * exist depending on kernel version.
1312 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root id-map we chown to ns-root (0); otherwise to init ids. */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	/* Drop supplementary groups before switching ids. */
	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Switch gid first: changing uid first could drop the right to do so. */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original euid into this user namespace; fall back to 0. */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		/* 0775 so the container can create sub-cgroups (see comment above). */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		/* cgroup2: also fix up the configured delegated files. */
		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1368
1369 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1370 struct lxc_conf *conf)
1371 {
1372 struct generic_userns_exec_data wrap;
1373
1374 if (!ops)
1375 return ret_set_errno(false, ENOENT);
1376
1377 if (!ops->hierarchies)
1378 return true;
1379
1380 if (!ops->container_cgroup)
1381 return ret_set_errno(false, ENOENT);
1382
1383 if (!conf)
1384 return ret_set_errno(false, EINVAL);
1385
1386 if (lxc_list_empty(&conf->id_map))
1387 return true;
1388
1389 wrap.origuid = geteuid();
1390 wrap.path = NULL;
1391 wrap.hierarchies = ops->hierarchies;
1392 wrap.conf = conf;
1393
1394 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1395 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1396
1397 return true;
1398 }
1399
1400 __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
1401 {
1402 if (!ops)
1403 return;
1404
1405 if (!ops->hierarchies)
1406 return;
1407
1408 for (int i = 0; ops->hierarchies[i]; i++) {
1409 struct hierarchy *h = ops->hierarchies[i];
1410
1411 /* Close all monitor cgroup file descriptors. */
1412 close_prot_errno_disarm(h->dfd_mon);
1413 }
1414 /* Close the cgroup root file descriptor. */
1415 close_prot_errno_disarm(ops->dfd_mnt);
1416
1417 /*
1418 * The checking for freezer support should obviously be done at cgroup
1419 * initialization time but that doesn't work reliable. The freezer
1420 * controller has been demoted (rightly so) to a simple file located in
1421 * each non-root cgroup. At the time when the container is created we
1422 * might still be located in /sys/fs/cgroup and so checking for
1423 * cgroup.freeze won't tell us anything because this file doesn't exist
1424 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1425 * find an already existing cgroup and then check within that cgroup
1426 * for the existence of cgroup.freeze but that will only work on
1427 * systemd based hosts. Other init systems might not manage cgroups and
1428 * so no cgroup will exist. So we defer until we have created cgroups
1429 * for our container which means we check here.
1430 */
1431 if (pure_unified_layout(ops) &&
1432 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1433 AT_SYMLINK_NOFOLLOW)) {
1434 TRACE("Unified hierarchy supports freezer");
1435 ops->unified->utilities |= FREEZER_CONTROLLER;
1436 }
1437 }
1438
1439 /* cgroup-full:* is done, no need to create subdirs */
1440 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1441 {
1442 switch (cgroup_automount_type) {
1443 case LXC_AUTO_CGROUP_RO:
1444 return true;
1445 case LXC_AUTO_CGROUP_RW:
1446 return true;
1447 case LXC_AUTO_CGROUP_MIXED:
1448 return true;
1449 }
1450
1451 return false;
1452 }
1453
/* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
 * remount the controller read-only if needed and bind-mount the cgroupfs
 * onto controller/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* For ro/mixed, first make the hierarchy mountpoint itself read-only. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		/* A bind mount can only be made read-only via a remount. */
		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	/* Bind-mount the container's own cgroup over the in-container path. */
	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	/* In the :ro case, make the container cgroup itself read-only too. */
	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1507
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	/* Standard hardening flags for cgroup mounts. */
	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* The :ro automount variants request a read-only mount. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		/* New mount API: build a filesystem context, then attach it. */
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			/* Request each controller of this legacy hierarchy. */
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		/* Legacy mount(2) fallback. */
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* flags to their MS_* equivalents. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1586
/* Thin wrapper: mount hierarchy h without any automount-type filtering. */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1594
1595 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1596 struct lxc_rootfs *rootfs,
1597 int dfd_mnt_cgroupfs,
1598 const char *hierarchy_mnt)
1599 {
1600 switch (cgroup_automount_type) {
1601 case LXC_AUTO_CGROUP_FULL_RO:
1602 break;
1603 case LXC_AUTO_CGROUP_FULL_RW:
1604 break;
1605 case LXC_AUTO_CGROUP_FULL_MIXED:
1606 break;
1607 default:
1608 return 0;
1609 }
1610
1611 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1612 dfd_mnt_cgroupfs, hierarchy_mnt);
1613 }
1614
/*
 * Handle the cgroup automount options: mount a tmpfs over the in-container
 * cgroup mountpoint and mount (or bind-mount) the cgroup hierarchies into it
 * as requested by cg_flags.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	/* Strip the force flag; remember it separately. */
	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		/* Without CAP_SYS_ADMIN the container cannot mount cgroups itself. */
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently have init system that
		 * do support cgroup2 but do not mount it by default unless
		 * explicitly told so even if the host is cgroup2 only. That
		 * means they often will fail to boot. Fix this by pre-mounting
		 * cgroup2 by default. We will likely need to be doing this a
		 * few years until all distros have switched over to cgroup2 at
		 * which point we can safely assume that their init systems
		 * will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace supported (highly
			 * unlikely unless we're dealing with a Frankenkernel),
			 * or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
				 *                          and make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force    ->
				 * 11. cgroup-full:ro:force    ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		/* Legacy mount(2) fallback when the new mount API is unavailable. */
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	/* Per-hierarchy mounts under the freshly mounted tmpfs. */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		/* Create the container's cgroup path inside the hierarchy mount. */
		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1854
1855 /* Only root needs to escape to the cgroup of its init. */
1856 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1857 struct lxc_conf *conf)
1858 {
1859 if (!ops)
1860 return ret_set_errno(false, ENOENT);
1861
1862 if (!ops->hierarchies)
1863 return true;
1864
1865 if (!conf)
1866 return ret_set_errno(false, EINVAL);
1867
1868 if (conf->cgroup_meta.relative || geteuid())
1869 return true;
1870
1871 for (int i = 0; ops->hierarchies[i]; i++) {
1872 __do_free char *fullpath = NULL;
1873 int ret;
1874
1875 fullpath = make_cgroup_path(ops->hierarchies[i],
1876 ops->hierarchies[i]->at_base,
1877 "cgroup.procs", NULL);
1878 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1879 if (ret != 0)
1880 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1881 }
1882
1883 return true;
1884 }
1885
1886 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1887 {
1888 int i = 0;
1889
1890 if (!ops)
1891 return ret_set_errno(-1, ENOENT);
1892
1893 if (!ops->hierarchies)
1894 return 0;
1895
1896 for (; ops->hierarchies[i]; i++)
1897 ;
1898
1899 return i;
1900 }
1901
1902 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1903 int n, char ***out)
1904 {
1905 int i;
1906
1907 if (!ops)
1908 return ret_set_errno(false, ENOENT);
1909
1910 if (!ops->hierarchies)
1911 return ret_set_errno(false, ENOENT);
1912
1913 /* consistency check n */
1914 for (i = 0; i < n; i++)
1915 if (!ops->hierarchies[i])
1916 return ret_set_errno(false, ENOENT);
1917
1918 *out = ops->hierarchies[i]->controllers;
1919
1920 return true;
1921 }
1922
1923 static int cg_legacy_freeze(struct cgroup_ops *ops)
1924 {
1925 struct hierarchy *h;
1926
1927 h = get_hierarchy(ops, "freezer");
1928 if (!h)
1929 return ret_set_errno(-1, ENOENT);
1930
1931 return lxc_write_openat(h->path_con, "freezer.state",
1932 "FROZEN", STRLITERALLEN("FROZEN"));
1933 }
1934
/*
 * Mainloop callback watching a cgroup.events file: close the mainloop once
 * the file reports the desired "frozen" state (cbdata: 1 = frozen, 0 = thawed).
 */
static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
				    struct lxc_epoll_descr *descr)
{
	__do_free char *line = NULL;
	__do_fclose FILE *f = NULL;
	int state = PTR_TO_INT(cbdata);
	size_t len;
	const char *state_string;

	/* Reopen the cgroup.events fd as a stream for line-wise reading. */
	f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
	if (!f)
		return LXC_MAINLOOP_ERROR;

	/* Wait for "frozen 1" when freezing, "frozen 0" when thawing. */
	if (state == 1)
		state_string = "frozen 1";
	else
		state_string = "frozen 0";

	while (getline(&line, &len, f) != -1)
		if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
			return LXC_MAINLOOP_CLOSE;

	/* Desired state not reached yet; rewind and keep waiting. */
	rewind(f);

	return LXC_MAINLOOP_CONTINUE;
}
1961
/*
 * Common implementation for freezing/unfreezing on cgroup2: write
 * state_string to cgroup.freeze and, if timeout is non-zero, wait on
 * cgroup.events until the state change is reported.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
	int ret;
	struct lxc_epoll_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->path_con)
		return ret_set_errno(-1, EEXIST);

	/* With a timeout, set up the cgroup.events watcher first. */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->path_con, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		/* cgroup.events state changes are signalled via EPOLLPRI. */
		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	/* Request the state change ("1" = freeze, "0" = unfreeze). */
	ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
2010
/* Freeze a cgroup2 container: write "1" to cgroup.freeze and wait. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
2017
2018 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2019 {
2020 if (!ops->hierarchies)
2021 return ret_set_errno(-1, ENOENT);
2022
2023 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2024 return cg_legacy_freeze(ops);
2025
2026 return cg_unified_freeze(ops, timeout);
2027 }
2028
2029 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2030 {
2031 struct hierarchy *h;
2032
2033 h = get_hierarchy(ops, "freezer");
2034 if (!h)
2035 return ret_set_errno(-1, ENOENT);
2036
2037 return lxc_write_openat(h->path_con, "freezer.state",
2038 "THAWED", STRLITERALLEN("THAWED"));
2039 }
2040
/* Thaw a cgroup2 container: write "0" to cgroup.freeze and wait. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2047
2048 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2049 {
2050 if (!ops->hierarchies)
2051 return ret_set_errno(-1, ENOENT);
2052
2053 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2054 return cg_legacy_unfreeze(ops);
2055
2056 return cg_unified_unfreeze(ops, timeout);
2057 }
2058
/*
 * Look up the container's cgroup path for a controller and return it with
 * the mountpoint prefix stripped. With limiting set, the limit cgroup path
 * is returned instead of the inner (container) one.
 */
static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
					const char *controller, bool limiting)
{
	struct hierarchy *h;
	size_t len;
	const char *path;

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_warn_errno(NULL, ENOENT,
				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));

	if (limiting)
		path = h->path_lim;
	else
		path = h->path_con;
	if (!path)
		return NULL;

	/*
	 * Strip the mount prefix from path. NOTE(review): when at_mnt does
	 * NOT start with DEFAULT_CGROUP_MOUNTPOINT, both the default
	 * mountpoint length and at_mnt's length are skipped — this assumes
	 * path then carries both prefixes; verify against how
	 * path_con/path_lim are constructed.
	 */
	len = strlen(h->at_mnt);
	if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
		path += strspn(path, "/");
	}
	return path += len;
}
2086
/* Return the controller's inner (container) cgroup path, mount-relative. */
__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
						  const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, false);
}
2092
/* Return the controller's limit cgroup path, mount-relative. */
__cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
							const char *controller)
{
	return cgfsng_get_cgroup_do(ops, controller, true);
}
2098
2099 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2100 * which must be freed by the caller.
2101 */
static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
						       const char *inpath,
						       const char *filename)
{
	/* The returned allocated path is owned (and freed) by the caller. */
	return make_cgroup_path(h, inpath, filename, NULL);
}
2108
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First try a ".lxc" leaf cgroup (creating it if needed), then the cgroup
 * itself. If both writes fail with EBUSY the existing comments indicate the
 * target is a non-leaf cgroup, so probe ".lxc-1", ".lxc-2", ... up to 1000
 * attempts until a usable leaf is found. Returns 0 on success, a negative
 * errno-style value on failure. Directories we created but could not use
 * are removed again.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" leaf; fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily terminate at the directory part (".lxc-<idx>"). */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			/* We created it, so we clean it up on failure. */
			rm = true;

		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2179
2180 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2181 int unified_fd, int *sk_fd)
2182 {
2183 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2184 int target_fds[2];
2185 ssize_t ret;
2186
2187 /* Create leaf cgroup. */
2188 ret = mkdirat(unified_fd, ".lxc", 0755);
2189 if (ret < 0 && errno != EEXIST)
2190 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2191
2192 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2193 if (target_fd0 < 0)
2194 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2195 target_fds[0] = target_fd0;
2196
2197 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2198 if (target_fd1 < 0)
2199 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2200 target_fds[1] = target_fd1;
2201
2202 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2203 if (ret <= 0)
2204 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2205 target_fd0, target_fd1);
2206
2207 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2208 }
2209
2210 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2211 int *sk_fd, pid_t pid)
2212 {
2213 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2214 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2215 size_t pidstr_len;
2216 ssize_t ret;
2217
2218 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2219 if (ret < 0)
2220 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2221
2222 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2223
2224 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2225 if (ret > 0 && ret == pidstr_len)
2226 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2227
2228 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2229 if (ret > 0 && ret == pidstr_len)
2230 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2231
2232 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2233 target_fd0, target_fd1);
2234 }
2235
/*
 * Arguments shared by the parent and child wrappers passed to
 * userns_exec_minimal() when attaching a process to the unified hierarchy
 * from inside the container's user namespace.
 */
struct userns_exec_unified_attach_data {
	/* container configuration (read-only) */
	const struct lxc_conf *conf;
	/* fd of the container's unified cgroup directory */
	int unified_fd;
	/* socketpair used to pass cgroup.procs fds from child to parent */
	int sk_pair[2];
	/* pid of the process to attach */
	pid_t pid;
};
2242
2243 static int cgroup_unified_attach_child_wrapper(void *data)
2244 {
2245 struct userns_exec_unified_attach_data *args = data;
2246
2247 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2248 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2249 return ret_errno(EINVAL);
2250
2251 close_prot_errno_disarm(args->sk_pair[0]);
2252 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2253 &args->sk_pair[1]);
2254 }
2255
2256 static int cgroup_unified_attach_parent_wrapper(void *data)
2257 {
2258 struct userns_exec_unified_attach_data *args = data;
2259
2260 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2261 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2262 return ret_errno(EINVAL);
2263
2264 close_prot_errno_disarm(args->sk_pair[1]);
2265 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2266 args->pid);
2267 }
2268
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* Preferred path: ask the container's command server to attach. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	/* Only fall back for "not supported"/"no cgroup2" style failures. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = make_cgroup_path(h, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/*
	 * For id-mapped containers perform the attach from inside the
	 * container's user namespace via the parent/child wrappers;
	 * otherwise attach directly.
	 */
	if (!lxc_list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2330
/*
 * Attach @pid to every hierarchy of the running container @name.
 *
 * The unified hierarchy is handled by __cg_unified_attach(); for each legacy
 * hierarchy the container's cgroup path is queried from the command server
 * and @pid is written into that cgroup's cgroup.procs file. Returns true on
 * success (including the no-hierarchies case), false on failure.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		if (h->fs_type == UNIFIED_HIERARCHY) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		if (!path) {
			/*
			 * Someone might have created a name=<controller>
			 * controller after the container has started and so
			 * the container doesn't make use of this controller.
			 *
			 * Link: https://github.com/lxc/lxd/issues/8577
			 */
			TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
			continue;
		}

		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2384
/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
				   char *value, size_t len, const char *name,
				   const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops)
		return ret_set_errno(-1, ENOENT);

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	/* Read the value of @filename from the container's limit cgroup. */
	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
		ret = lxc_read_from_file(fullpath, value, len);
	}

	return ret;
}
2425
2426 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2427 {
2428 for (int count = 0; count < 3; count++, val++) {
2429 switch (*val) {
2430 case 'r':
2431 device->access[count] = *val;
2432 break;
2433 case 'w':
2434 device->access[count] = *val;
2435 break;
2436 case 'm':
2437 device->access[count] = *val;
2438 break;
2439 case '\n':
2440 case '\0':
2441 count = 3;
2442 break;
2443 default:
2444 return ret_errno(EINVAL);
2445 }
2446 }
2447
2448 return 0;
2449 }
2450
/*
 * Parse a devices-cgroup rule of the form "<type> <major>:<minor> <access>"
 * (e.g. "c 1:3 rwm", with '*' allowed for major/minor and the single value
 * "a" meaning a global rule) from @val into @device. @key selects whether
 * this is an allow ("devices.allow") or deny rule. Returns 0 on success,
 * -1 or a negative errno-style value on malformed input.
 */
static int device_cgroup_rule_parse(struct device_item *device, const char *key,
				    const char *val)
{
	int count, ret;
	char temp[50];

	if (strequal("devices.allow", key))
		device->allow = 1; /* allow the device */
	else
		device->allow = 0; /* deny the device */

	if (strequal(val, "a")) {
		/* global rule */
		device->type = 'a';
		device->major = -1;
		device->minor = -1;
		return 0;
	}

	/* Device type: all ('a'), block ('b') or character ('c'). */
	switch (*val) {
	case 'a':
		__fallthrough;
	case 'b':
		__fallthrough;
	case 'c':
		device->type = *val;
		break;
	default:
		return -1;
	}

	/* Exactly one whitespace character must follow the type. */
	val++;
	if (!isspace(*val))
		return -1;
	val++;
	/* read major: either a wildcard or a decimal number */
	if (*val == '*') {
		device->major = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->major);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	if (*val != ':')
		return -1;
	val++;

	/* read minor */
	if (*val == '*') {
		device->minor = -1;
		val++;
	} else if (isdigit(*val)) {
		memset(temp, 0, sizeof(temp));
		for (count = 0; count < sizeof(temp) - 1; count++) {
			temp[count] = *val;
			val++;
			if (!isdigit(*val))
				break;
		}
		ret = lxc_safe_int(temp, &device->minor);
		if (ret)
			return -1;
	} else {
		return -1;
	}
	/* A whitespace separator must precede the access flags. */
	if (!isspace(*val))
		return -1;

	return device_cgroup_parse_access(device, ++val);
}
2530
/* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 */
__cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
				   const char *key, const char *value,
				   const char *name, const char *lxcpath)
{
	__do_free char *path = NULL;
	__do_free char *controller = NULL;
	char *p;
	struct hierarchy *h;
	int ret = -1;

	if (!ops || is_empty_string(key) || is_empty_string(value) ||
	    is_empty_string(name) || is_empty_string(lxcpath))
		return ret_errno(EINVAL);

	/* The controller name is the part of @key before the first '.'. */
	controller = strdup(key);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/*
	 * On a pure cgroup2 layout there are no devices cgroup files; device
	 * rules are translated into bpf programs via the command server.
	 */
	if (pure_unified_layout(ops) && strequal(controller, "devices")) {
		struct device_item device = {};

		ret = device_cgroup_rule_parse(&device, key, value);
		if (ret < 0)
			return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
					       key, value);

		ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
		if (ret < 0)
			return -1;

		return 0;
	}

	path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!path)
		return -1;

	/* Write @value to @key in the container's limit cgroup. */
	h = get_hierarchy(ops, controller);
	if (h) {
		__do_free char *fullpath = NULL;

		fullpath = build_full_cgpath_from_monitorpath(h, path, key);
		ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
	}

	return ret;
}
2587
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
 * the output.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	/* Work on a copy since the split below mutates the string. */
	path = strdup(devpath);
	if (!path)
		return ret_errno(ENOMEM);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		/* Skip any run of spaces between path and mode. */
		while (*p == ' ')
			p++;

		mode = p;

		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	/* A mode part is mandatory. */
	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Derive type and major:minor from the actual device node. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;

	return 0;
}
2660
2661 static int convert_devpath(const char *invalue, char *dest)
2662 {
2663 struct device_item device = {};
2664 int ret;
2665
2666 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2667 if (ret < 0)
2668 return -1;
2669
2670 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2671 device.minor, device.access);
2672 if (ret < 0)
2673 return log_error_errno(ret, -ret,
2674 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2675 device.type, device.major, device.minor,
2676 device.access);
2677
2678 return 0;
2679 }
2680
/* Called from setup_limits - here we have the container's cgroup_data because
 * we created the cgroups.
 */
static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
			      const char *value, bool is_cpuset)
{
	__do_free char *controller = NULL;
	char *p;
	/* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
	char converted_value[50];
	struct hierarchy *h;

	/* The controller name is the part of @filename before the first '.'. */
	controller = strdup(filename);
	if (!controller)
		return ret_errno(ENOMEM);

	p = strchr(controller, '.');
	if (p)
		*p = '\0';

	/* Path-style device rules ("/dev/foo rwm") need canonicalizing first. */
	if (strequal("devices.allow", filename) && value[0] == '/') {
		int ret;

		ret = convert_devpath(value, converted_value);
		if (ret < 0)
			return ret;
		value = converted_value;
	}

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);

	/* cpuset values are additionally written into the payload cgroup. */
	if (is_cpuset) {
		int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
		if (ret)
			return ret;
	}
	return lxc_write_openat(h->path_lim, filename, value, strlen(value));
}
2721
2722 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2723 struct lxc_conf *conf,
2724 bool do_devices)
2725 {
2726 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2727 struct lxc_list *cgroup_settings = &conf->cgroup;
2728 struct lxc_list *iterator, *next;
2729 struct lxc_cgroup *cg;
2730 bool ret = false;
2731
2732 if (!ops)
2733 return ret_set_errno(false, ENOENT);
2734
2735 if (!conf)
2736 return ret_set_errno(false, EINVAL);
2737
2738 cgroup_settings = &conf->cgroup;
2739 if (lxc_list_empty(cgroup_settings))
2740 return true;
2741
2742 if (!ops->hierarchies)
2743 return ret_set_errno(false, EINVAL);
2744
2745 if (pure_unified_layout(ops))
2746 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2747
2748 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2749 if (!sorted_cgroup_settings)
2750 return false;
2751
2752 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2753 cg = iterator->elem;
2754
2755 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2756 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2757 if (do_devices && (errno == EACCES || errno == EPERM)) {
2758 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2759 continue;
2760 }
2761 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2762 goto out;
2763 }
2764 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
2765 }
2766 }
2767
2768 ret = true;
2769 INFO("Limits for the legacy cgroup hierarchies have been setup");
2770 out:
2771 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2772 lxc_list_del(iterator);
2773 free(iterator);
2774 }
2775
2776 return ret;
2777 }
2778
/*
 * Some of the parsing logic comes from the original cgroup device v1
 * implementation in the kernel.
 */
static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
				     struct lxc_conf *conf, const char *key,
				     const char *val)
{
	struct device_item device_item = {};
	int ret;

	/*
	 * Parse @val into a device rule: path form ("/dev/foo rwm") for
	 * absolute devices.allow values, canonical "type major:minor access"
	 * form otherwise. Note: @ops is currently unused here.
	 */
	if (strequal("devices.allow", key) && abspath(val))
		ret = device_cgroup_rule_parse_devpath(&device_item, val);
	else
		ret = device_cgroup_rule_parse(&device_item, key, val);
	if (ret < 0)
		return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);

	/*
	 * Note that bpf_list_add_device() returns 1 if it altered the device
	 * list and 0 if it didn't; both return values indicate success.
	 * Only a negative return value indicates an error.
	 */
	ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
	if (ret < 0)
		return -1;

	return 0;
}
2808
/*
 * Apply the cgroup2 limits from the container's lxc.cgroup2.* entries to
 * the unified hierarchy. Device keys are collected into the bpf device
 * rule list instead of being written to files. Returns true on success
 * (including when there is nothing to do), false on failure.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct lxc_list *cgroup_settings, *iterator;
	struct hierarchy *h;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (lxc_list_empty(cgroup_settings))
		return true;

	/* cgroup2 keys are ignored (with a warning) on legacy layouts. */
	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	lxc_list_for_each (iterator, cgroup_settings) {
		struct lxc_cgroup *cg = iterator->elem;
		int ret;

		if (strnequal("devices", cg->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
		else
			ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);

		TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
2856
/*
 * Attach the accumulated bpf device rules to the container's unified
 * cgroup. A no-op returning true when there is no unified hierarchy, no
 * device utility controller, or no device rules were configured.
 */
__cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	struct lxc_conf *conf;
	struct hierarchy *unified;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	/* NOTE(review): EEXIST looks odd for a missing cgroup (ENOENT would seem more natural) — confirm before changing. */
	if (!ops->container_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	unified = ops->unified;
	if (!unified || !device_utility_controller(unified) ||
	    !unified->path_con ||
	    lxc_list_empty(&(conf->bpf_devices).device_item))
		return true;

	return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
}
2883
/*
 * Enable all detected cgroup2 controllers for delegation along the path
 * from the unified hierarchy's base down to (but not including) @cgroup,
 * by writing "+ctrl1 +ctrl2 ..." into each level's cgroup.subtree_control.
 * Returns true on success or when there is nothing to do, false on failure.
 */
static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_close int dfd_final = -EBADF;
	__do_free char *add_controllers = NULL, *copy = NULL;
	size_t full_len = 0;
	struct hierarchy *unified;
	int dfd_cur, ret;
	char *cur;
	char **it;

	if (!ops->hierarchies || !pure_unified_layout(ops))
		return true;

	unified = ops->unified;
	if (!unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		/* Room for "+<name> " (or trailing NUL). */
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* First iteration: start from an empty string. */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	/* lxc_iterate_parts() mutates its argument, so work on a copy. */
	copy = strdup(cgroup);
	if (!copy)
		return false;

	/*
	 * Placing the write to cgroup.subtree_control before the open() is
	 * intentional because of the cgroup2 delegation model. It enforces
	 * that leaf cgroups don't have any controllers enabled for delegation.
	 */
	dfd_cur = unified->dfd_base;
	lxc_iterate_parts(cur, copy, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
		if (ret < 0)
			return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
		/* Never close the caller-owned base fd. */
		if (dfd_cur != unified->dfd_base)
			close(dfd_cur);
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	return true;
}
2963
2964 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2965 {
2966 if (!ops)
2967 return ret_set_errno(false, ENOENT);
2968
2969 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2970 }
2971
2972 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2973 {
2974 if (!ops)
2975 return ret_set_errno(false, ENOENT);
2976
2977 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2978 }
2979
/* A /proc/<pid>/cgroup entry for cgroup2 always uses hierarchy id 0 ("0::..."). */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2984
/*
 * Extract the cgroup2 path from a "0::/..." /proc/<pid>/cgroup line and
 * return it as a newly-allocated path relative to the cgroup2 mount.
 * Returns ERR_PTR(-EINVAL) for malformed lines and ERR_PTR(-ENOMEM) on
 * allocation failure; callers must check the result with IS_ERR().
 * Mutates @line in place while trimming it.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *current_cgroup;

	line += STRLITERALLEN("0::");

	/* The cgroup path must be absolute. */
	if (!abspath(line))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		line = prune_init_scope(line);

	/* create a relative path */
	line = deabs(line);

	current_cgroup = strdup(line);
	if (!current_cgroup)
		return ERR_PTR(-ENOMEM);

	return current_cgroup;
}
3007
/* Strip an optional "name=" prefix from a named-hierarchy controller spec. */
static inline const char *unprefix(const char *controllers)
{
	static const char prefix[] = "name=";

	if (strncmp(controllers, prefix, sizeof(prefix) - 1) == 0)
		return controllers + sizeof(prefix) - 1;

	return controllers;
}
3014
/*
 * Build the list of cgroup files whose ownership must be delegated to the
 * container. Prefer the kernel-provided list in /sys/kernel/cgroup/delegate
 * and fall back to a hard-coded default set when that file cannot be read.
 * On success *@delegate owns the list (caller frees); returns 0, or a
 * negative value on allocation failure.
 */
static int __list_cgroup_delegate(char ***delegate)
{
	__do_free char **list = NULL;
	__do_free char *buf = NULL;
	/* Fallback set used when the kernel does not expose a delegate list. */
	char *standard[] = {
		"cgroup.procs",
		"cgroup.threads",
		"cgroup.subtree_control",
		"memory.oom.group",
		NULL,
	};
	char *token;
	int ret;

	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			ret = list_add_string(&list, *p);
			if (ret < 0)
				return ret;
		}

		/* Still report success, but leave a warning in the logs. */
		*delegate = move_ptr(list);
		return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
	}

	lxc_iterate_parts(token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strequal(token, "cgroup.procs"))
			continue;

		ret = list_add_string(&list, token);
		if (ret < 0)
			return ret;
	}

	*delegate = move_ptr(list);
	return 0;
}
3057
/*
 * Check whether the unified hierarchy rooted at @dfd_base is delegated to
 * us, i.e. every required delegation file is writable (or absent). On
 * success the list of delegation files is returned through @ret_files
 * (ownership transfers to the caller); returns false when any file is not
 * writable or the delegation list cannot be determined.
 */
static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
{
	__do_free_string_list char **list = NULL;
	int ret;

	ret = __list_cgroup_delegate(&list);
	if (ret < 0)
		return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");

	for (char *const *s = list; s && *s; s++) {
		/* Missing files are fine; only unwritable ones disqualify. */
		if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
			continue;

		return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
	}

	*ret_files = move_ptr(list);
	return true;
}
3077
/*
 * Check whether the legacy hierarchy rooted at @dfd_base appears delegated
 * to us, i.e. its directory is writable. A missing directory (ENOENT) is
 * not treated as a failure here.
 */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	if (faccessat(dfd_base, ".", W_OK, 0) < 0 && errno != ENOENT)
		return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");

	return true;
}
3088
/**
 * systemd guarantees that the order of co-mounted controllers is stable. On
 * some systems the order of the controllers might be reversed though.
 *
 * For example, this is how the order is mismatched on CentOS 7:
 *
 *          [root@localhost ~]# cat /proc/self/cgroup
 *          11:perf_event:/
 *          10:pids:/
 *          9:freezer:/
 * >>>>     8:cpuacct,cpu:/
 *          7:memory:/
 *          6:blkio:/
 *          5:devices:/
 *          4:hugetlb:/
 * >>>>     3:net_prio,net_cls:/
 *          2:cpuset:/
 *          1:name=systemd:/user.slice/user-0.slice/session-c1.scope
 *
 * whereas the mountpoint:
 *
 *          | |-/sys/fs/cgroup                    tmpfs  tmpfs  ro,nosuid,nodev,noexec,mode=755
 *          | | |-/sys/fs/cgroup/systemd          cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
 *          | | |-/sys/fs/cgroup/cpuset           cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
 * >>>>     | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
 *          | | |-/sys/fs/cgroup/hugetlb          cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
 *          | | |-/sys/fs/cgroup/devices          cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
 *          | | |-/sys/fs/cgroup/blkio            cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
 *          | | |-/sys/fs/cgroup/memory           cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
 * >>>>     | | |-/sys/fs/cgroup/cpu,cpuacct      cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
 *          | | |-/sys/fs/cgroup/freezer          cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
 *          | | |-/sys/fs/cgroup/pids             cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
 *          | | `-/sys/fs/cgroup/perf_event       cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
 *
 * Ensure that we always use the systemd-guaranteed stable order when checking
 * for the mountpoint.
 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	/* Normalize the known co-mounted pairs to systemd's stable order. */
	if (strequal(controllers, "cpuacct,cpu"))
		return "cpu,cpuacct";

	if (strequal(controllers, "net_prio,net_cls"))
		return "net_cls,net_prio";

	/* Otherwise just drop any "name=" prefix. */
	return unprefix(controllers);
}
3137
/*
 * Parse /proc/<pid>/cgroup line by line and register every delegated cgroup
 * hierarchy (legacy and unified) with @ops, then derive the overall cgroup
 * layout (legacy, hybrid, or unified).
 *
 * @ops:          cgroup operations context; receives the hierarchies, the
 *                delegate list for the unified hierarchy, and the layout.
 * @relative:     when set, stay relative to our own cgroup instead of
 *                escaping to PID 1's cgroup.
 * @unprivileged: whether the container uses an id mapping.
 *                NOTE(review): currently unused in this body — confirm
 *                whether a delegation check was meant to consult it.
 *
 * Returns 0 on success, a negative errno-style value on failure. Hierarchies
 * that are not mounted, not delegated, or whose cgroup path does not exist
 * are skipped, not treated as errors.
 */
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
				bool unprivileged)
{
	__do_free char *cgroup_info = NULL;
	char *it;

	/*
	 * Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!cgroup_info)
		return ret_errno(ENOMEM);

	/* One line per hierarchy: "<id>:<controllers>:<cgroup-path>". */
	lxc_iterate_parts(it, cgroup_info, "\n") {
		/* Per-hierarchy resources, auto-cleaned unless ownership is
		 * transferred at the end of the iteration. */
		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
		__do_free char *controllers = NULL, *current_cgroup = NULL;
		__do_free_string_list char **controller_list = NULL,
					   **delegate = NULL;
		char *line;
		int dfd, ret, type;

		/* Handle the unified cgroup hierarchy. */
		line = it;
		if (unified_cgroup(line)) {
			char *unified_mnt;

			type = UNIFIED_HIERARCHY;

			current_cgroup = current_unified_cgroup(relative, line);
			if (IS_ERR(current_cgroup))
				return PTR_ERR(current_cgroup);

			/* cgroup2-only host: the mount fd itself is the
			 * unified hierarchy; otherwise look for the "unified"
			 * subdirectory of a hybrid layout. */
			if (unified_cgroup_fd(ops->dfd_mnt)) {
				dfd_mnt = dup_cloexec(ops->dfd_mnt);
				unified_mnt = "";
			} else {
				dfd_mnt = open_at(ops->dfd_mnt,
						  "unified",
						  PROTECT_OPATH_DIRECTORY,
						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
				unified_mnt = "unified";
			}
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/unified", ops->dfd_mnt);

				SYSTRACE("Unified cgroup not mounted");
				continue;
			}
			dfd = dfd_mnt;

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip the hierarchy entirely if it isn't delegated
			 * to us. */
			if (!unified_hierarchy_delegated(dfd, &delegate))
				continue;

			controller_list = unified_controllers(dfd, "cgroup.controllers");
			if (!controller_list) {
				TRACE("No controllers are enabled for delegation in the unified hierarchy");
				controller_list = list_new();
				if (!controller_list)
					return syserror_set(-ENOMEM, "Failed to create empty controller list");
			}

			controllers = strdup(unified_mnt);
			if (!controllers)
				return ret_errno(ENOMEM);
		} else {
			char *__controllers, *__current_cgroup;

			type = LEGACY_HIERARCHY;

			/* Skip past the hierarchy id to the controller list. */
			__controllers = strchr(line, ':');
			if (!__controllers)
				return ret_errno(EINVAL);
			__controllers++;

			/* Split controller list from the cgroup path. */
			__current_cgroup = strchr(__controllers, ':');
			if (!__current_cgroup)
				return ret_errno(EINVAL);
			*__current_cgroup = '\0';
			__current_cgroup++;

			/* Use the systemd-stable mountpoint name. */
			controllers = strdup(stable_order(__controllers));
			if (!controllers)
				return ret_errno(ENOMEM);

			dfd_mnt = open_at(ops->dfd_mnt,
					  controllers,
					  PROTECT_OPATH_DIRECTORY,
					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/%s",
							ops->dfd_mnt, controllers);

				SYSTRACE("%s not mounted", controllers);
				continue;
			}
			dfd = dfd_mnt;

			/* /proc/<pid>/cgroup paths are always absolute. */
			if (!abspath(__current_cgroup))
				return ret_errno(EINVAL);

			/* remove init.scope */
			if (!relative)
				__current_cgroup = prune_init_scope(__current_cgroup);

			/* create a relative path */
			__current_cgroup = deabs(__current_cgroup);

			current_cgroup = strdup(__current_cgroup);
			if (!current_cgroup)
				return ret_errno(ENOMEM);

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			if (!legacy_hierarchy_delegated(dfd))
				continue;

			/*
			 * We intentionally pass the raw __controllers string
			 * here and not the remapped `controllers`, since the
			 * latter was rewritten (e.g. "name=" stripped) to
			 * match the mountpoint name and would otherwise chop
			 * the controller list.
			 */
			controller_list = list_add_controllers(__controllers);
			if (!controller_list)
				return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);

			if (skip_hierarchy(ops, controller_list))
				continue;

			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		}

		ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
					   current_cgroup, controller_list, type);
		if (ret < 0)
			return syserror_ret(ret, "Failed to add %s hierarchy", controllers);

		/* Transfer ownership. */
		move_fd(dfd_mnt);
		move_fd(dfd_base);
		move_ptr(current_cgroup);
		move_ptr(controllers);
		move_ptr(controller_list);
		if (type == UNIFIED_HIERARCHY)
			ops->unified->delegate = move_ptr(delegate);
	}

	/* determine cgroup layout */
	if (ops->unified) {
		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			/* Both legacy and unified hierarchies found. */
			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else {
			if (bpf_devices_cgroup_supported())
				ops->unified->utilities |= DEVICES_CONTROLLER;
			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
		}
	}

	if (!controllers_available(ops))
		return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");

	return 0;
}
3336
3337 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3338 {
3339 __do_close int dfd = -EBADF;
3340 int ret;
3341 const char *controllers_use;
3342
3343 if (ops->dfd_mnt >= 0)
3344 return ret_errno(EBUSY);
3345
3346 /*
3347 * I don't see the need for allowing symlinks here. If users want to
3348 * have their hierarchy available in different locations I strongly
3349 * suggest bind-mounts.
3350 */
3351 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3352 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3353 if (dfd < 0)
3354 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3355
3356 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3357 if (controllers_use) {
3358 __do_free char *dup = NULL;
3359 char *it;
3360
3361 dup = strdup(controllers_use);
3362 if (!dup)
3363 return -errno;
3364
3365 lxc_iterate_parts(it, dup, ",") {
3366 ret = list_add_string(&ops->cgroup_use, it);
3367 if (ret < 0)
3368 return ret;
3369 }
3370 }
3371
3372 /*
3373 * Keep dfd referenced by the cleanup function and actually move the fd
3374 * once we know the initialization succeeded. So if we fail we clean up
3375 * the dfd.
3376 */
3377 ops->dfd_mnt = dfd;
3378
3379 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3380 if (ret < 0)
3381 return syserror_ret(ret, "Failed to initialize cgroups");
3382
3383 /* Transfer ownership to cgroup_ops. */
3384 move_fd(dfd);
3385 return 0;
3386 }
3387
3388 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3389 {
3390 const char *cgroup_pattern;
3391
3392 if (!ops)
3393 return ret_set_errno(-1, ENOENT);
3394
3395 /* copy system-wide cgroup information */
3396 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3397 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3398 ops->cgroup_pattern = strdup(cgroup_pattern);
3399 if (!ops->cgroup_pattern)
3400 return ret_errno(ENOMEM);
3401 }
3402
3403 return 0;
3404 }
3405
3406 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3407 {
3408 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3409
3410 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3411 if (!cgfsng_ops)
3412 return ret_set_errno(NULL, ENOMEM);
3413
3414 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3415 cgfsng_ops->dfd_mnt = -EBADF;
3416
3417 if (initialize_cgroups(cgfsng_ops, conf))
3418 return NULL;
3419
3420 cgfsng_ops->data_init = cgfsng_data_init;
3421 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3422 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3423 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3424 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3425 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3426 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3427 cgfsng_ops->payload_create = cgfsng_payload_create;
3428 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3429 cgfsng_ops->finalize = cgfsng_finalize;
3430 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3431 cgfsng_ops->get = cgfsng_get;
3432 cgfsng_ops->set = cgfsng_set;
3433 cgfsng_ops->freeze = cgfsng_freeze;
3434 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3435 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3436 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3437 cgfsng_ops->driver = "cgfsng";
3438 cgfsng_ops->version = "1.0.0";
3439 cgfsng_ops->attach = cgfsng_attach;
3440 cgfsng_ops->chown = cgfsng_chown;
3441 cgfsng_ops->mount = cgfsng_mount;
3442 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3443 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3444
3445 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3446 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3447 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3448
3449 return move_ptr(cgfsng_ops);
3450 }
3451
3452 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3453 {
3454 int ret;
3455
3456 if (!lxc_list_empty(&conf->id_map)) {
3457 struct userns_exec_unified_attach_data args = {
3458 .conf = conf,
3459 .unified_fd = fd_unified,
3460 .pid = pid,
3461 };
3462
3463 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3464 if (ret < 0)
3465 return -errno;
3466
3467 ret = userns_exec_minimal(conf,
3468 cgroup_unified_attach_parent_wrapper,
3469 &args,
3470 cgroup_unified_attach_child_wrapper,
3471 &args);
3472 } else {
3473 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3474 }
3475
3476 return ret;
3477 }
3478
3479 static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
3480 const char *lxcpath, pid_t pid)
3481 {
3482 call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
3483 int ret;
3484 size_t idx;
3485 ssize_t pidstr_len;
3486 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
3487
3488 ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
3489 if (ret < 0)
3490 return ret_errno(ENOSYS);
3491
3492 pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
3493 if (pidstr_len < 0)
3494 return pidstr_len;
3495
3496 for (idx = 0; idx < ctx->fd_len; idx++) {
3497 int dfd_con = ctx->fd[idx];
3498
3499 if (unified_cgroup_fd(dfd_con))
3500 ret = __unified_attach_fd(conf, dfd_con, pid);
3501 else
3502 ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
3503 if (ret)
3504 return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
3505 else
3506 TRACE("Attached to cgroup fd %d", dfd_con);
3507 }
3508
3509 if (idx == 0)
3510 return syserror_set(-ENOENT, "Failed to attach to cgroups");
3511
3512 TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
3513 return 0;
3514 }
3515
3516 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3517 const char *lxcpath, pid_t pid)
3518 {
3519 __do_close int dfd_unified = -EBADF;
3520
3521 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3522 return ret_errno(EINVAL);
3523
3524 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3525 if (dfd_unified < 0)
3526 return ret_errno(ENOSYS);
3527
3528 return __unified_attach_fd(conf, dfd_unified, pid);
3529 }
3530
/*
 * Attach @pid to the container's cgroups, preferring the multi-hierarchy
 * path and falling back to a pure unified-hierarchy attach.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret >= 0)
		return ret;

	/* Only "not supported" triggers the fallback. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret))
		return ret;

	ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
		return ret_errno(ENOSYS);

	return ret;
}
3548
3549 /* Connects to command socket therefore isn't callable from command handler. */
3550 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3551 {
3552 __do_close int dfd = -EBADF;
3553 struct cgroup_fd fd = {
3554 .fd = -EBADF,
3555 };
3556 size_t len_controller;
3557 int ret;
3558
3559 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3560 is_empty_string(key))
3561 return ret_errno(EINVAL);
3562
3563 if ((buf && !len) || (len && !buf))
3564 return ret_errno(EINVAL);
3565
3566 len_controller = strcspn(key, ".");
3567 len_controller++; /* Don't forget the \0 byte. */
3568 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3569 return ret_errno(EINVAL);
3570 (void)strlcpy(fd.controller, key, len_controller);
3571
3572 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3573 if (ret < 0) {
3574 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3575 return ret;
3576
3577 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3578 if (dfd < 0) {
3579 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3580 return ret;
3581
3582 return ret_errno(ENOSYS);
3583 }
3584 fd.type = UNIFIED_HIERARCHY;
3585 fd.fd = move_fd(dfd);
3586 }
3587 dfd = move_fd(fd.fd);
3588
3589 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3590
3591 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3592 return ret_errno(EOPNOTSUPP);
3593 else
3594 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3595
3596 return ret;
3597 }
3598
3599 /* Connects to command socket therefore isn't callable from command handler. */
3600 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3601 {
3602 __do_close int dfd = -EBADF;
3603 struct cgroup_fd fd = {
3604 .fd = -EBADF,
3605 };
3606 size_t len_controller;
3607 int ret;
3608
3609 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3610 is_empty_string(key) || is_empty_string(value))
3611 return ret_errno(EINVAL);
3612
3613 len_controller = strcspn(key, ".");
3614 len_controller++; /* Don't forget the \0 byte. */
3615 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3616 return ret_errno(EINVAL);
3617 (void)strlcpy(fd.controller, key, len_controller);
3618
3619 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3620 if (ret < 0) {
3621 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3622 return ret;
3623
3624 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3625 if (dfd < 0) {
3626 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3627 return ret;
3628
3629 return ret_errno(ENOSYS);
3630 }
3631 fd.type = UNIFIED_HIERARCHY;
3632 fd.fd = move_fd(dfd);
3633 }
3634 dfd = move_fd(fd.fd);
3635
3636 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3637
3638 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3639 struct device_item device = {};
3640
3641 ret = device_cgroup_rule_parse(&device, key, value);
3642 if (ret < 0)
3643 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3644 key, value);
3645
3646 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3647 } else {
3648 ret = lxc_writeat(dfd, key, value, strlen(value));
3649 }
3650
3651 return ret;
3652 }
3653
3654 static int do_cgroup_freeze(int unified_fd,
3655 const char *state_string,
3656 int state_num,
3657 int timeout,
3658 const char *epoll_error,
3659 const char *wait_error)
3660 {
3661 __do_close int events_fd = -EBADF;
3662 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3663 int ret;
3664 struct lxc_epoll_descr descr = {};
3665
3666 if (timeout != 0) {
3667 ret = lxc_mainloop_open(&descr);
3668 if (ret)
3669 return log_error_errno(-1, errno, "%s", epoll_error);
3670
3671 /* automatically cleaned up now */
3672 descr_ptr = &descr;
3673
3674 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3675 if (events_fd < 0)
3676 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3677
3678 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3679 if (ret < 0)
3680 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3681 }
3682
3683 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3684 if (ret < 0)
3685 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3686
3687 if (timeout != 0) {
3688 ret = lxc_mainloop(&descr, timeout);
3689 if (ret)
3690 return log_error_errno(-1, errno, "%s", wait_error);
3691 }
3692
3693 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3694 }
3695
/* Freeze the cgroup behind @unified_fd, waiting up to @timeout when set. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	static const char *const epoll_err =
		"Failed to create epoll instance to wait for container freeze";
	static const char *const wait_err =
		"Failed to wait for container to be frozen";

	return do_cgroup_freeze(unified_fd, "1", 1, timeout, epoll_err, wait_err);
}
3702
3703 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3704 {
3705 __do_close int unified_fd = -EBADF;
3706 int ret;
3707
3708 if (is_empty_string(name) || is_empty_string(lxcpath))
3709 return ret_errno(EINVAL);
3710
3711 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3712 if (unified_fd < 0)
3713 return ret_errno(ENOCGROUP2);
3714
3715 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3716 ret = __cgroup_freeze(unified_fd, timeout);
3717 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3718 return ret;
3719 }
3720
/* Thaw the cgroup behind @unified_fd, waiting up to @timeout when set. */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	/*
	 * Bugfix: the error strings were copy-pasted from the freeze path;
	 * this is the thaw path, so talk about unfreezing.
	 */
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3727
3728 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3729 {
3730 __do_close int unified_fd = -EBADF;
3731 int ret;
3732
3733 if (is_empty_string(name) || is_empty_string(lxcpath))
3734 return ret_errno(EINVAL);
3735
3736 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3737 if (unified_fd < 0)
3738 return ret_errno(ENOCGROUP2);
3739
3740 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3741 ret = __cgroup_unfreeze(unified_fd, timeout);
3742 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3743 return ret;
3744 }