src/lxc/cgroups/cgfsng.c (cgroups: use __u32 for cpumasks)
/* SPDX-License-Identifier: LGPL-2.1+ */

/*
 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
 * cgroup backend. The original cgfs.c was designed to be as flexible
 * as possible. It would try to find cgroup filesystems no matter where
 * or how you had them mounted, and deduce the most usable mount for
 * each controller.
 *
 * This new implementation assumes that cgroup filesystems are mounted
 * under /sys/fs/cgroup/clist where clist is either the controller, or
 * a comma-separated list of controllers.
 */

#include "config.h"

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <grp.h>
#include <linux/kdev_t.h>
#include <linux/types.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/types.h>
#include <unistd.h>

#include "cgroup.h"
#include "af_unix.h"
#include "caps.h"
#include "cgroup2_devices.h"
#include "cgroup_utils.h"
#include "commands.h"
#include "commands_utils.h"
#include "conf.h"
#include "error_utils.h"
#include "log.h"
#include "macro.h"
#include "mainloop.h"
#include "memory_utils.h"
#include "mount_utils.h"
#include "storage/storage.h"
#include "string_utils.h"
#include "syscall_wrappers.h"
#include "utils.h"

#if !HAVE_STRLCPY
#include "strlcpy.h"
#endif

#if !HAVE_STRLCAT
#include "strlcat.h"
#endif

lxc_log_define(cgfsng, cgroup);

/*
 * Given a pointer to a null-terminated array of pointers, realloc to add one
 * entry, and point the new entry to NULL. Return the index of the
 * second-to-last entry - that is, the one which is now available for use
 * (keeping the list null-terminated) - or a negative errno value on
 * allocation failure.
 */
static int cg_list_add(void ***list)
{
	int idx = 0;
	void **p;

	if (*list)
		for (; (*list)[idx]; idx++)
			;

	p = realloc(*list, (idx + 2) * sizeof(void **));
	if (!p)
		return ret_errno(ENOMEM);

	p[idx + 1] = NULL;
	*list = p;

	return idx;
}
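
/*
 * A minimal usage sketch (illustrative only, not part of the build),
 * appending one entry to a NULL-terminated string list:
 *
 *	char **list = NULL;
 *	int idx;
 *
 *	idx = cg_list_add((void ***)&list);
 *	if (idx < 0)
 *		return idx;
 *	list[idx] = strdup("cpuset");	(list stays NULL-terminated)
 */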

/* Given a null-terminated array of strings, check whether @entry is one of the
 * strings.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (int i = 0; list[i]; i++)
		if (strequal(list[i], entry))
			return true;

	return false;
}

/* Given a handler's cgroup data, return the struct hierarchy for the controller
 * @c, or NULL if there is none.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}

int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
{
	int dfd;
	const struct hierarchy *h;

	h = get_hierarchy(ops, fd->controller);
	if (!h)
		return ret_errno(ENOENT);

	/*
	 * The client requested that the controller must be in a specific
	 * cgroup version.
	 */
	if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type)
		return ret_errno(EINVAL);

	if (limit)
		dfd = h->dfd_lim;
	else
		dfd = h->dfd_con;
	if (dfd < 0)
		return ret_errno(EBADF);

	fd->layout = ops->cgroup_layout;
	fd->type = h->fs_type;
	if (fd->type == UNIFIED_HIERARCHY)
		fd->utilities = h->utilities;
	fd->fd = dfd;

	return 0;
}

/* Create cpumask from cpulist aka turn:
 *
 *	0,2-3
 *
 * into bit array
 *
 *	1 0 1 1
 */
static int lxc_cpumask(char *buf, __u32 **bitarr, __u32 *last_set_bit)
{
	__do_free __u32 *arr_u32 = NULL;
	__u32 cur_last_set_bit = 0, nbits = 256;
	__u32 nr_u32;
	char *token;

	/* Size the array in 32-bit words, not longs, since we store __u32s. */
	nr_u32 = (nbits + 31) / 32;
	arr_u32 = zalloc(nr_u32 * sizeof(__u32));
	if (!arr_u32)
		return ret_errno(ENOMEM);

	lxc_iterate_parts(token, buf, ",") {
		__u32 last_bit, first_bit;
		char *range;

		errno = 0;
		first_bit = strtoul(token, NULL, 0);
		last_bit = first_bit;
		range = strchr(token, '-');
		if (range)
			last_bit = strtoul(range + 1, NULL, 0);

		if (!(first_bit <= last_bit))
			return ret_errno(EINVAL);

		if (last_bit >= nbits) {
			__u32 add_bits = last_bit - nbits + 32;
			__u32 new_nr_u32;
			__u32 *p;

			new_nr_u32 = (nbits + add_bits + 31) / 32;
			p = realloc(arr_u32, new_nr_u32 * sizeof(__u32));
			if (!p)
				return ret_errno(ENOMEM);
			arr_u32 = move_ptr(p);

			memset(arr_u32 + nr_u32, 0,
			       (new_nr_u32 - nr_u32) * sizeof(__u32));
			nbits += add_bits;
			nr_u32 = new_nr_u32;
		}

		while (first_bit <= last_bit)
			set_bit(first_bit++, arr_u32);

		if (last_bit > cur_last_set_bit)
			cur_last_set_bit = last_bit;
	}

	*last_set_bit = cur_last_set_bit;
	*bitarr = move_ptr(arr_u32);
	return 0;
}
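
/*
 * Worked example for lxc_cpumask() (values illustrative): parsing "0,2-3"
 * sets bits 0, 2, and 3 in the first 32-bit word, i.e. arr_u32[0] == 0xd
 * (binary 1101, with bit 0 the least significant bit), and *last_set_bit
 * ends up as 3. A cpulist such as "0-300" exceeds the initial 256 bits and
 * grows the array through the realloc path above before any bit is set.
 */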

static int lxc_cpumask_update(char *buf, __u32 *bitarr, __u32 last_set_bit,
			      bool clear)
{
	bool flipped = false;
	char *token;

	lxc_iterate_parts(token, buf, ",") {
		__u32 last_bit, first_bit;
		char *range;

		errno = 0;
		first_bit = strtoul(token, NULL, 0);
		last_bit = first_bit;
		range = strchr(token, '-');
		if (range)
			last_bit = strtoul(range + 1, NULL, 0);

		if (!(first_bit <= last_bit)) {
			WARN("The cpu range seems to be inverted: %u-%u", first_bit, last_bit);
			continue;
		}

		if (last_bit > last_set_bit)
			continue;

		while (first_bit <= last_bit) {
			if (clear && is_set(first_bit, bitarr)) {
				flipped = true;
				clear_bit(first_bit, bitarr);
			} else if (!clear && !is_set(first_bit, bitarr)) {
				flipped = true;
				set_bit(first_bit, bitarr);
			}

			first_bit++;
		}
	}

	if (flipped)
		return 1;

	return 0;
}

/* Turn cpumask into simple, comma-separated cpulist. */
static char *lxc_cpumask_to_cpulist(__u32 *bitarr, __u32 last_set_bit)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(__u32)] = {0};
	int ret;

	for (__u32 bit = 0; bit <= last_set_bit; bit++) {
		if (!is_set(bit, bitarr))
			continue;

		ret = strnprintf(numstr, sizeof(numstr), "%u", bit);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
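
/*
 * A round-trip sketch tying the two helpers above together (illustrative
 * only; lxc_iterate_parts() tokenizes its buffer in place, hence the
 * writable array):
 *
 *	__do_free __u32 *mask = NULL;
 *	__do_free char *list = NULL;
 *	__u32 last = 0;
 *	char buf[] = "0,2-3";
 *
 *	if (!lxc_cpumask(buf, &mask, &last))
 *		list = lxc_cpumask_to_cpulist(mask, last);
 *	(list is now "0,2,3")
 */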

static inline bool is_unified_hierarchy(const struct hierarchy *h)
{
	return h->fs_type == UNIFIED_HIERARCHY;
}

/* Return true if the controller @entry is found in the null-terminated list of
 * hierarchies @hlist.
 */
static bool controller_available(struct hierarchy **hlist, char *entry)
{
	if (!hlist)
		return false;

	for (int i = 0; hlist[i]; i++)
		if (string_in_list(hlist[i]->controllers, entry))
			return true;

	return false;
}

static bool controllers_available(struct cgroup_ops *ops)
{
	struct hierarchy **hlist;

	if (!ops->cgroup_use)
		return true;

	hlist = ops->hierarchies;
	for (char **cur = ops->cgroup_use; cur && *cur; cur++)
		if (!controller_available(hlist, *cur))
			return log_error(false, "The %s controller was not found", *cur);

	return true;
}

static char **list_new(void)
{
	__do_free_string_list char **list = NULL;
	int idx;

	idx = cg_list_add((void ***)&list);
	if (idx < 0)
		return NULL;

	list[idx] = NULL;
	return move_ptr(list);
}

static int list_add_string(char ***list, char *entry)
{
	__do_free char *dup = NULL;
	int idx;

	dup = strdup(entry);
	if (!dup)
		return ret_errno(ENOMEM);

	idx = cg_list_add((void ***)list);
	if (idx < 0)
		return idx;

	(*list)[idx] = move_ptr(dup);
	return 0;
}

static char **list_add_controllers(char *controllers)
{
	__do_free_string_list char **list = NULL;
	char *it;

	lxc_iterate_parts(it, controllers, ", \t\n") {
		int ret;

		ret = list_add_string(&list, it);
		if (ret < 0)
			return NULL;
	}

	return move_ptr(list);
}

static char **unified_controllers(int dfd, const char *file)
{
	__do_free char *buf = NULL;

	buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
	if (!buf)
		return NULL;

	return list_add_controllers(buf);
}

static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
{
	if (!ops->cgroup_use)
		return false;

	for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
		bool found = false;

		for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
			if (!strequal(*cur_use, *cur_ctrl))
				continue;

			found = true;
			break;
		}

		if (found)
			continue;

		return true;
	}

	return false;
}

static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
				int dfd_base, char *base_cgroup,
				char **controllers, cgroupfs_type_magic_t fs_type)
{
	__do_free struct hierarchy *new = NULL;
	int idx;

	if (abspath(base_cgroup))
		return syserror_set(-EINVAL, "Container base path must be relative to controller mount");

	new = zalloc(sizeof(*new));
	if (!new)
		return ret_errno(ENOMEM);

	new->dfd_con = -EBADF;
	new->dfd_lim = -EBADF;
	new->dfd_mon = -EBADF;

	new->fs_type = fs_type;
	new->controllers = controllers;
	new->at_mnt = mnt;
	new->at_base = base_cgroup;

	new->dfd_mnt = dfd_mnt;
	new->dfd_base = dfd_base;

	TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
	      mnt, maybe_empty(base_cgroup));
	for (char *const *it = new->controllers; it && *it; it++)
		TRACE("The hierarchy contains the %s controller", *it);

	idx = cg_list_add((void ***)&ops->hierarchies);
	if (idx < 0)
		return idx;

	if (fs_type == UNIFIED_HIERARCHY)
		ops->unified = new;
	(ops->hierarchies)[idx] = move_ptr(new);

	return 0;
}

static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
{
	if (!path_prune || !hierarchies)
		return 0;

	for (int i = 0; hierarchies[i]; i++) {
		struct hierarchy *h = hierarchies[i];
		int ret;

		ret = cgroup_tree_prune(h->dfd_base, path_prune);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);

		free_equal(h->path_lim, h->path_con);
	}

	return 0;
}

struct generic_userns_exec_data {
	struct hierarchy **hierarchies;
	const char *path_prune;
	struct lxc_conf *conf;
	uid_t origuid; /* target uid in parent namespace */
	char *path;
};

static int cgroup_tree_remove_wrapper(void *data)
{
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
	int ret;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
}

__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!list_empty(&handler->conf->id_map)) {
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}

#define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
#define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
				    bool am_initialized)
{
	__do_free char *cpulist = NULL, *isolcpus = NULL, *offlinecpus = NULL,
		       *posscpus = NULL;
	__do_free __u32 *possmask = NULL;
	int ret;
	__u32 poss_last_set_bit = 0;

	posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
	if (!posscpus)
		return log_error_errno(false, errno, "Failed to read file \"%d/cpuset.cpus\"", dfd_parent);

	if (file_exists(__ISOL_CPUS)) {
		isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
		if (!isolcpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);

		if (!isdigit(isolcpus[0]))
			free_disarm(isolcpus);
	} else {
		TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
	}

	if (file_exists(__OFFLINE_CPUS)) {
		offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
		if (!offlinecpus)
			return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);

		if (!isdigit(offlinecpus[0]))
			free_disarm(offlinecpus);
	} else {
		TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
	}

	if (!isolcpus && !offlinecpus) {
		cpulist = move_ptr(posscpus);
		goto copy_parent;
	}

	ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
	if (ret)
		return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");

	if (isolcpus)
		ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);

	if (offlinecpus)
		ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);

	if (!ret) {
		cpulist = move_ptr(posscpus);
		TRACE("No isolated or offline cpus present in cpuset");
	} else {
		cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
		TRACE("Removed isolated or offline cpus from cpuset");
	}
	if (!cpulist)
		return log_error_errno(false, errno, "Failed to create cpu list");

copy_parent:
	if (!am_initialized) {
		ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);

		TRACE("Copied cpu settings of parent cgroup");
	}

	return true;
}
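
/*
 * Example of the flow above (hypothetical host): with the parent's
 * cpuset.cpus being "0-7", __ISOL_CPUS containing "3-4", and no offline
 * cpus, the child's cpuset.cpus is initialized to "0,1,2,5,6,7". If
 * neither isolated nor offline cpus exist, the parent's value is copied
 * verbatim.
 */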

static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/* Determine whether the base cgroup has cpuset inheritance turned on. */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/* Initialize cpuset.cpus removing any isolated and offline cpus. */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}

static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Failed to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
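
/*
 * Example (path illustrative): __cgroup_tree_create(dfd, "lxc.payload.c1/cg",
 * 0755, false, false) walks the path one component at a time, mkdirat()ing
 * "lxc.payload.c1" (reusing it if it already exists) and then "cg" beneath
 * it, and returns an O_PATH fd for the final component. Only the final
 * component is required to be newly created, unless @eexist_ignore is set.
 */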

static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * initialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}

static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
				   bool payload)
{
	bool prune = true;

	if (payload) {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_lim < 0)
			prune = false;

		free_equal(h->path_con, h->path_lim);
		close_equal(h->dfd_con, h->dfd_lim);
	} else {
		/* Check whether we actually created the cgroup to prune. */
		if (h->dfd_mon < 0)
			prune = false;

		close_prot_errno_disarm(h->dfd_mon);
	}

	/* We didn't create this cgroup. */
	if (!prune)
		return;

	if (cgroup_tree_prune(h->dfd_base, path_prune))
		SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
	else
		TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
}

__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}

/*
 * Check that no conflicting cgroup directories are configured: either the
 * legacy lxc.cgroup.dir is set, or the newer monitor/payload directory
 * options are, but not both, and a payload directory must always come with
 * a monitor directory (and vice versa).
 *
 * Returns true if the configuration is consistent, false otherwise.
 */
static bool check_cgroup_dir_config(struct lxc_conf *conf)
{
	const char *monitor_dir = conf->cgroup_meta.monitor_dir,
		   *container_dir = conf->cgroup_meta.container_dir,
		   *namespace_dir = conf->cgroup_meta.namespace_dir;

	/* none of the new options are set, all is fine */
	if (!monitor_dir && !container_dir && !namespace_dir)
		return true;

	/* some are set, make sure lxc.cgroup.dir is not also set */
	if (conf->cgroup_meta.dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");

	/* make sure both monitor and payload are set */
	if (!monitor_dir || !container_dir)
		return log_error_errno(false, EINVAL,
				       "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");

	/* namespace_dir may be empty */
	return true;
}
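
/*
 * Example configuration exercising the checks above. The exact key
 * spellings are an assumption here; see lxc.container.conf(5) for the
 * authoritative names:
 *
 *	lxc.cgroup.dir.monitor = mymonitor
 *	lxc.cgroup.dir.container = mypayload
 *	lxc.cgroup.dir.container.inner = inner
 *
 * Setting lxc.cgroup.dir together with any of these is rejected.
 */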

__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *monitor_cgroup = NULL;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->monitor_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.monitor_dir) {
		monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
	} else if (conf->cgroup_meta.dir) {
		monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					     DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		monitor_cgroup = must_concat(&len, cgroup_tree, "/",
					     DEFAULT_MONITOR_CGROUP,
					     CGROUP_CREATE_RETRY, NULL);
	} else {
		monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
					     handler->name,
					     CGROUP_CREATE_RETRY, NULL);
	}
	if (!monitor_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.monitor_dir) {
		suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i],
					       monitor_cgroup, NULL, false))
				continue;

			DEBUG("Failed to create cgroup %s", monitor_cgroup);
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       monitor_cgroup, false);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");

	ops->monitor_cgroup = move_ptr(monitor_cgroup);
	return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
}
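
/*
 * Example of the resulting names (assuming the default "lxc.monitor."
 * prefix): for a container named "c1" the monitor cgroup is
 * "lxc.monitor.c1"; if that already exists the CGROUP_CREATE_RETRY suffix
 * is rewritten in place, yielding "lxc.monitor.c1-1", "lxc.monitor.c1-2",
 * and so on up to -999.
 */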

/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}

__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int monitor_len, transient_len = 0;
	char monitor[INTTYPE_TO_STRLEN(pid_t)],
	     transient[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->monitor_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
	if (monitor_len < 0)
		return false;

	if (handler->transient_pid > 0) {
		transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
		if (transient_len < 0)
			return false;
	}

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved monitor into cgroup %d", h->dfd_mon);

		if (handler->transient_pid <= 0)
			continue;

		ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
		if (ret)
			return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);

		TRACE("Moved transient process into cgroup %d", h->dfd_mon);

		/*
		 * We don't keep the fds for non-unified hierarchies around
		 * mainly because we don't make use of them anymore after the
		 * core cgroup setup is done but also because there are quite a
		 * lot of them.
		 */
		if (!is_unified_hierarchy(h))
			close_prot_errno_disarm(h->dfd_mon);
	}
	handler->transient_pid = -1;

	return true;
}

__cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
					      struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];
		int ret;

		if (is_unified_hierarchy(h) &&
		    (handler->clone_flags & CLONE_INTO_CGROUP))
			continue;

		ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);

		TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
	}

	return true;
}

static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
		      gid_t chown_gid, mode_t chmod_mode)
{
	int ret;

	ret = fchownat(dirfd, path, chown_uid, chown_gid,
		       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
	if (ret < 0)
		return log_warn_errno(-1,
				      errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW)",
				      dirfd, path, (int)chown_uid,
				      (int)chown_gid);

	ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
	if (ret < 0)
		return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, 0)",
				      dirfd, path, (int)chmod_mode);

	return 0;
}

/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}

__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
				      struct lxc_conf *conf)
{
	struct generic_userns_exec_data wrap;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (list_empty(&conf->id_map))
		return true;

	wrap.origuid = geteuid();
	wrap.path = NULL;
	wrap.hierarchies = ops->hierarchies;
	wrap.conf = conf;

	if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
		return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");

	return true;
}

__cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
{
	if (!ops)
		return;

	if (!ops->hierarchies)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		struct hierarchy *h = ops->hierarchies[i];

		/* Close all monitor cgroup file descriptors. */
		close_prot_errno_disarm(h->dfd_mon);
	}
	/* Close the cgroup root file descriptor. */
	close_prot_errno_disarm(ops->dfd_mnt);

	/*
	 * The checking for freezer support should obviously be done at cgroup
	 * initialization time but that doesn't work reliably. The freezer
	 * controller has been demoted (rightly so) to a simple file located in
	 * each non-root cgroup. At the time when the container is created we
	 * might still be located in /sys/fs/cgroup and so checking for
	 * cgroup.freeze won't tell us anything because this file doesn't exist
	 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
	 * find an already existing cgroup and then check within that cgroup
	 * for the existence of cgroup.freeze but that will only work on
	 * systemd based hosts. Other init systems might not manage cgroups and
	 * so no cgroup will exist. So we defer until we have created cgroups
	 * for our container which means we check here.
	 */
	if (pure_unified_layout(ops) &&
	    !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
		       AT_SYMLINK_NOFOLLOW)) {
		TRACE("Unified hierarchy supports freezer");
		ops->unified->utilities |= FREEZER_CONTROLLER;
	}
}

/* cgroup-full:* is done, no need to create subdirs */
static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_RO:
		return true;
	case LXC_AUTO_CGROUP_RW:
		return true;
	case LXC_AUTO_CGROUP_MIXED:
		return true;
	}

	return false;
}

/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
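
/*
 * Example of the resulting layout (paths illustrative) for the cpuset
 * hierarchy with cgroup:mixed: the tmpfs under the container's
 * /sys/fs/cgroup holds a "cpuset" directory that is bind-mounted over
 * itself and remounted read-only, and the container's own cgroup path
 * inside it is then bind-mounted writable on top, so only the container's
 * subtree can be modified from inside.
 */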

/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
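
/*
 * A sketch of what the fs_prepare()/fs_set_property()/fs_attach() helpers
 * boil down to in terms of the raw new-mount-API syscalls (Linux 5.2+;
 * simplified, with error handling and the beneath-lookup protection
 * omitted):
 *
 *	int fd_fs = fsopen("cgroup2", FSOPEN_CLOEXEC);
 *	fsconfig(fd_fs, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int fd_mnt = fsmount(fd_fs, FSMOUNT_CLOEXEC, MOUNT_ATTR_NOSUID |
 *			     MOUNT_ATTR_NODEV | MOUNT_ATTR_NOEXEC);
 *	move_mount(fd_mnt, "", dfd_target, "cgroup",
 *		   MOVE_MOUNT_F_EMPTY_PATH);
 */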

static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
				      struct lxc_rootfs *rootfs,
				      int dfd_mnt_cgroupfs,
				      const char *hierarchy_mnt)
{
	switch (cgroup_automount_type) {
	case LXC_AUTO_CGROUP_FULL_RO:
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		break;
	default:
		return 0;
	}

	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}

__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently ship init systems
		 * that do support cgroup2 but do not mount it by default
		 * unless explicitly told to, even if the host is cgroup2
		 * only. That means they often will fail to boot. Fix this by
		 * pre-mounting cgroup2 by default. We will likely need to be
		 * doing this a few years until all distros have switched over
		 * to cgroup2 at which point we can safely assume that their
		 * init systems will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace is supported (highly
			 * unlikely unless we're dealing with a Frankenkernel)
			 * or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem and
				 *                          make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force    ->
				 * 11. cgroup-full:ro:force    ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(false, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set mode property on tmpfs context %d", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set size property on tmpfs context %d", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}

/* Only root needs to escape to the cgroup of its init. */
__cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
					    struct lxc_conf *conf)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if (conf->cgroup_meta.relative || geteuid())
		return true;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL;
		int ret;

		fullpath = make_cgroup_path(ops->hierarchies[i],
					    ops->hierarchies[i]->at_base,
					    "cgroup.procs", NULL);
		ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
		if (ret != 0)
			return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
	}

	return true;
}
1843
1844 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1845 {
1846 int i = 0;
1847
1848 if (!ops)
1849 return ret_set_errno(-1, ENOENT);
1850
1851 if (!ops->hierarchies)
1852 return 0;
1853
1854 for (; ops->hierarchies[i]; i++)
1855 ;
1856
1857 return i;
1858 }
1859
1860 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1861 int n, char ***out)
1862 {
1863 int i;
1864
1865 if (!ops)
1866 return ret_set_errno(false, ENOENT);
1867
1868 if (!ops->hierarchies)
1869 return ret_set_errno(false, ENOENT);
1870
1871 /* consistency check n: entries 0 through n must all exist */
1872 for (i = 0; i <= n; i++)
1873 if (!ops->hierarchies[i])
1874 return ret_set_errno(false, ENOENT);
1875
1876 *out = ops->hierarchies[n]->controllers;
1877
1878 return true;
1879 }
1880
1881 static int cg_legacy_freeze(struct cgroup_ops *ops)
1882 {
1883 struct hierarchy *h;
1884
1885 h = get_hierarchy(ops, "freezer");
1886 if (!h)
1887 return ret_set_errno(-1, ENOENT);
1888
1889 return lxc_write_openat(h->path_con, "freezer.state",
1890 "FROZEN", STRLITERALLEN("FROZEN"));
1891 }
1892
1893 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1894 struct lxc_async_descr *descr)
1895 {
1896 __do_free char *line = NULL;
1897 __do_fclose FILE *f = NULL;
1898 int state = PTR_TO_INT(cbdata);
1899 size_t len = 0;
1900 const char *state_string;
1901
1902 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1903 if (!f)
1904 return LXC_MAINLOOP_ERROR;
1905
1906 if (state == 1)
1907 state_string = "frozen 1";
1908 else
1909 state_string = "frozen 0";
1910
1911 while (getline(&line, &len, f) != -1)
1912 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1913 return LXC_MAINLOOP_CLOSE;
1914
1917 return LXC_MAINLOOP_CONTINUE;
1918 }
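
/*
 * Background for the callback above: on cgroup2 the kernel exposes a
 * read-only cgroup.events file containing flat "key value" lines such as
 * "populated 1" and "frozen 0", and it signals changes via (E)POLLPRI.
 * A minimal standalone sketch of the same wait, using plain poll(2)
 * instead of the lxc mainloop (path and function name are hypothetical):
 *
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static int wait_frozen(const char *events_path)
 *	{
 *		char buf[128];
 *		struct pollfd pfd;
 *		FILE *f = fopen(events_path, "re");
 *
 *		if (!f)
 *			return -1;
 *		pfd.fd = fileno(f);
 *		pfd.events = POLLPRI;
 *
 *		for (;;) {
 *			rewind(f);
 *			while (fgets(buf, sizeof(buf), f)) {
 *				if (strncmp(buf, "frozen 1", 8) == 0) {
 *					fclose(f);
 *					return 0;
 *				}
 *			}
 *			if (poll(&pfd, 1, -1) < 0) {
 *				fclose(f);
 *				return -1;
 *			}
 *		}
 *	}
 */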
1919
1920 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1921 const char *state_string,
1922 int state_num,
1923 const char *epoll_error,
1924 const char *wait_error)
1925 {
1926 __do_close int fd = -EBADF;
1927 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
1928 int ret;
1929 struct lxc_async_descr descr;
1930 struct hierarchy *h;
1931
1932 h = ops->unified;
1933 if (!h)
1934 return ret_set_errno(-1, ENOENT);
1935
1936 if (!h->path_con)
1937 return ret_set_errno(-1, EEXIST);
1938
1939 if (timeout != 0) {
1940 __do_free char *events_file = NULL;
1941
1942 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1943 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1944 if (fd < 0)
1945 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1946
1947 ret = lxc_mainloop_open(&descr);
1948 if (ret)
1949 return log_error_errno(-1, errno, "%s", epoll_error);
1950
1951 /* automatically cleaned up now */
1952 descr_ptr = &descr;
1953
1954 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
1955 freezer_cgroup_events_cb,
1956 default_cleanup_handler,
1957 INT_TO_PTR(state_num),
1958 "freezer_cgroup_events_cb");
1959 if (ret < 0)
1960 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1961 }
1962
1963 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1964 if (ret < 0)
1965 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
1966
1967 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1968 return log_error_errno(-1, errno, "%s", wait_error);
1969
1970 return 0;
1971 }
1972
1973 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1974 {
1975 return cg_unified_freeze_do(ops, timeout, "1", 1,
1976 "Failed to create epoll instance to wait for container freeze",
1977 "Failed to wait for container to be frozen");
1978 }
1979
1980 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1981 {
1982 if (!ops->hierarchies)
1983 return ret_set_errno(-1, ENOENT);
1984
1985 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1986 return cg_legacy_freeze(ops);
1987
1988 return cg_unified_freeze(ops, timeout);
1989 }
1990
1991 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1992 {
1993 struct hierarchy *h;
1994
1995 h = get_hierarchy(ops, "freezer");
1996 if (!h)
1997 return ret_set_errno(-1, ENOENT);
1998
1999 return lxc_write_openat(h->path_con, "freezer.state",
2000 "THAWED", STRLITERALLEN("THAWED"));
2001 }
2002
2003 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2004 {
2005 return cg_unified_freeze_do(ops, timeout, "0", 0,
2006 "Failed to create epoll instance to wait for container unfreeze",
2007 "Failed to wait for container to be unfrozen");
2008 }
2009
2010 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2011 {
2012 if (!ops->hierarchies)
2013 return ret_set_errno(-1, ENOENT);
2014
2015 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2016 return cg_legacy_unfreeze(ops);
2017
2018 return cg_unified_unfreeze(ops, timeout);
2019 }
2020
2021 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2022 const char *controller, bool limiting)
2023 {
2024 struct hierarchy *h;
2025 size_t len;
2026 const char *path;
2027
2028 h = get_hierarchy(ops, controller);
2029 if (!h)
2030 return log_warn_errno(NULL, ENOENT,
2031 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2032
2033 if (limiting)
2034 path = h->path_lim;
2035 else
2036 path = h->path_con;
2037 if (!path)
2038 return NULL;
2039
2040 len = strlen(h->at_mnt);
2041 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2042 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2043 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2044 path += strspn(path, "/");
2045 }
2046 return path + len;
2047 }
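
/*
 * Example of the pointer arithmetic above (hypothetical values): with
 * h->at_mnt = "memory" and h->path_con =
 * "/sys/fs/cgroup/memory/lxc.payload.c1", at_mnt does not start with
 * DEFAULT_CGROUP_MOUNTPOINT, so the branch strips "/sys/fs/cgroup" plus
 * the following slashes from path, leaving "memory/lxc.payload.c1";
 * "path + len" then skips the mount directory and returns
 * "/lxc.payload.c1", i.e. the cgroup relative to the hierarchy mount.
 */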
2048
2049 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2050 const char *controller)
2051 {
2052 return cgfsng_get_cgroup_do(ops, controller, false);
2053 }
2054
2055 __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
2056 const char *controller)
2057 {
2058 return cgfsng_get_cgroup_do(ops, controller, true);
2059 }
2060
2061 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2062 * which must be freed by the caller.
2063 */
2064 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2065 const char *inpath,
2066 const char *filename)
2067 {
2068 return make_cgroup_path(h, inpath, filename, NULL);
2069 }
2070
2071 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2072 {
2073 int idx = 1;
2074 int ret;
2075 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2076 ssize_t pidstr_len;
2077
2078 /* Create leaf cgroup. */
2079 ret = mkdirat(unified_fd, ".lxc", 0755);
2080 if (ret < 0 && errno != EEXIST)
2081 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2082
2083 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2084 if (pidstr_len < 0)
2085 return pidstr_len;
2086
2087 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2088 if (ret < 0)
2089 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2090 if (ret == 0)
2091 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2092
2093 /* this is a non-leaf node */
2094 if (errno != EBUSY)
2095 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2096
2097 do {
2098 bool rm = false;
2099 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2100 char *slash = attach_cgroup;
2101
2102 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2103 if (ret < 0)
2104 return ret;
2105
2106 /*
2107 * This shouldn't really happen but the compiler might complain
2108 * that a short write would cause a buffer overrun. So be on
2109 * the safe side.
2110 */
2111 if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2112 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2113
2114 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2115 *slash = '\0';
2116
2117 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2118 if (ret < 0 && errno != EEXIST)
2119 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2120 if (ret == 0)
2121 rm = true;
2122
2123 *slash = '/';
2124
2125 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2126 if (ret == 0)
2127 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2128
2128 /* lxc_writeat() failed; save errno since unlinkat() below may clobber it. */
2128 int saved_errno = errno;
2128
2129 /* Truncate ".lxc-N/cgroup.procs" back to the directory itself. */
2129 *slash = '\0';
2129 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2130 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2131
2132 /* this is a non-leaf node */
2133 if (saved_errno != EBUSY)
2134 return log_error_errno(-1, saved_errno, "Failed to attach to unified cgroup");
2135
2136 idx++;
2137 } while (idx < 1000);
2138
2139 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2140 }
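
/*
 * Why the EBUSY dance above is needed: cgroup2 enforces the "no internal
 * processes" rule, i.e. a cgroup that has controllers enabled in
 * cgroup.subtree_control may not contain processes itself; only its
 * leaves may. A sketch of the layout this function produces:
 *
 *	lxc.payload.c1/		container cgroup; attaching here fails
 *				with EBUSY once controllers are delegated
 *	lxc.payload.c1/.lxc	preferred leaf for attached processes
 *	lxc.payload.c1/.lxc-1	fallback leaves, tried in order up to
 *	lxc.payload.c1/.lxc-2	.lxc-999
 */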
2141
2142 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2143 int unified_fd, int *sk_fd)
2144 {
2145 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2146 int target_fds[2];
2147 ssize_t ret;
2148
2149 /* Create leaf cgroup. */
2150 ret = mkdirat(unified_fd, ".lxc", 0755);
2151 if (ret < 0 && errno != EEXIST)
2152 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2153
2154 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2155 if (target_fd0 < 0)
2156 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2157 target_fds[0] = target_fd0;
2158
2159 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2160 if (target_fd1 < 0)
2161 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2162 target_fds[1] = target_fd1;
2163
2164 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2165 if (ret <= 0)
2166 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2167 target_fd0, target_fd1);
2168
2169 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2170 }
2171
2172 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2173 int *sk_fd, pid_t pid)
2174 {
2175 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2176 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2177 size_t pidstr_len;
2178 ssize_t ret;
2179
2180 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2181 if (ret < 0)
2182 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2183
2184 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2185
2186 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2187 if (ret > 0 && (size_t)ret == pidstr_len)
2188 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2189
2190 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2191 if (ret > 0 && (size_t)ret == pidstr_len)
2192 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2193
2194 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2195 target_fd0, target_fd1);
2196 }
2197
2198 struct userns_exec_unified_attach_data {
2199 const struct lxc_conf *conf;
2200 int unified_fd;
2201 int sk_pair[2];
2202 pid_t pid;
2203 };
2204
2205 static int cgroup_unified_attach_child_wrapper(void *data)
2206 {
2207 struct userns_exec_unified_attach_data *args = data;
2208
2209 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2210 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2211 return ret_errno(EINVAL);
2212
2213 close_prot_errno_disarm(args->sk_pair[0]);
2214 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2215 &args->sk_pair[1]);
2216 }
2217
2218 static int cgroup_unified_attach_parent_wrapper(void *data)
2219 {
2220 struct userns_exec_unified_attach_data *args = data;
2221
2222 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2223 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2224 return ret_errno(EINVAL);
2225
2226 close_prot_errno_disarm(args->sk_pair[1]);
2227 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2228 args->pid);
2229 }
2230
2231 /* Technically, we're always at a delegation boundary here (this is especially
2232 * true when cgroup namespaces are available). The reasoning is that in order
2233 * for us to have been able to start a container in the first place, the root
2234 * cgroup must have been a leaf node. Now, either the container's init system
2235 * has populated the cgroup and kept it as a leaf node, or it has created
2236 * subtrees. In the former case we simply attach to the leaf node we created
2237 * when we started the container; in the latter case we create our own cgroup
2238 * for the attaching process.
2239 */
2240 static int __cg_unified_attach(const struct hierarchy *h,
2241 const struct lxc_conf *conf, const char *name,
2242 const char *lxcpath, pid_t pid,
2243 const char *controller)
2244 {
2245 __do_close int unified_fd = -EBADF;
2246 __do_free char *path = NULL, *cgroup = NULL;
2247 int ret;
2248
2249 if (!conf || !name || !lxcpath || pid <= 0)
2250 return ret_errno(EINVAL);
2251
2252 ret = cgroup_attach(conf, name, lxcpath, pid);
2253 if (ret == 0)
2254 return log_trace(0, "Attached to unified cgroup via command handler");
2255 if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
2256 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2257
2258 /* Fall back to retrieving the path for the unified cgroup. */
2259 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2260 /* not running */
2261 if (!cgroup)
2262 return 0;
2263
2264 path = make_cgroup_path(h, cgroup, NULL);
2265
2266 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2267 if (unified_fd < 0)
2268 return ret_errno(EBADF);
2269
2270 if (!list_empty(&conf->id_map)) {
2271 struct userns_exec_unified_attach_data args = {
2272 .conf = conf,
2273 .unified_fd = unified_fd,
2274 .pid = pid,
2275 };
2276
2277 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2278 if (ret < 0)
2279 return -errno;
2280
2281 ret = userns_exec_minimal(conf,
2282 cgroup_unified_attach_parent_wrapper,
2283 &args,
2284 cgroup_unified_attach_child_wrapper,
2285 &args);
2286 } else {
2287 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2288 }
2289
2290 return ret;
2291 }
2292
2293 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2294 const struct lxc_conf *conf,
2295 const char *name, const char *lxcpath,
2296 pid_t pid)
2297 {
2298 int len, ret;
2299 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2300
2301 if (!ops)
2302 return ret_set_errno(false, ENOENT);
2303
2304 if (!ops->hierarchies)
2305 return true;
2306
2307 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2308 if (len < 0)
2309 return false;
2310
2311 for (int i = 0; ops->hierarchies[i]; i++) {
2312 __do_free char *fullpath = NULL, *path = NULL;
2313 struct hierarchy *h = ops->hierarchies[i];
2314
2315 if (h->fs_type == UNIFIED_HIERARCHY) {
2316 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2317 h->controllers[0]);
2318 if (ret < 0)
2319 return false;
2320
2321 continue;
2322 }
2323
2324 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2325 if (!path) {
2326 /*
2327 * Someone might have created a name=<controller>
2328 * controller after the container has started and so
2329 * the container doesn't make use of this controller.
2330 *
2331 * Link: https://github.com/lxc/lxd/issues/8577
2332 */
2333 TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
2334 continue;
2335 }
2336
2337 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2338 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2339 if (ret < 0)
2340 return log_error_errno(false, errno, "Failed to attach %d to %s",
2341 (int)pid, fullpath);
2342 }
2343
2344 return true;
2345 }
2346
2347 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2348 * don't have a cgroup_data set up, so we ask the running container through the
2349 * commands API for the cgroup path.
2350 */
2351 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2352 char *value, size_t len, const char *name,
2353 const char *lxcpath)
2354 {
2355 __do_free char *path = NULL;
2356 __do_free char *controller = NULL;
2357 char *p;
2358 struct hierarchy *h;
2359 int ret = -1;
2360
2361 if (!ops)
2362 return ret_set_errno(-1, ENOENT);
2363
2364 controller = strdup(filename);
2365 if (!controller)
2366 return ret_errno(ENOMEM);
2367
2368 p = strchr(controller, '.');
2369 if (p)
2370 *p = '\0';
2371
2372 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2373 /* not running */
2374 if (!path)
2375 return -1;
2376
2377 h = get_hierarchy(ops, controller);
2378 if (h) {
2379 __do_free char *fullpath = NULL;
2380
2381 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2382 ret = lxc_read_from_file(fullpath, value, len);
2383 }
2384
2385 return ret;
2386 }
2387
2388 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2389 {
2390 for (int count = 0; count < 3; count++, val++) {
2391 switch (*val) {
2392 case 'r':
2393 case 'w':
2394 case 'm':
2395 device->access[count] = *val;
2396 break;
2401 case '\n':
2402 case '\0':
2403 count = 3;
2404 break;
2405 default:
2406 return ret_errno(EINVAL);
2407 }
2408 }
2409
2410 return 0;
2411 }
2412
2413 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2414 const char *val)
2415 {
2416 size_t count;
2417 int ret;
2418 char temp[50];
2419
2420 if (strequal("devices.allow", key))
2421 device->allow = 1; /* allow the device */
2422 else
2423 device->allow = 0; /* deny the device */
2424
2425 if (strequal(val, "a")) {
2426 /* global rule */
2427 device->type = 'a';
2428 device->major = -1;
2429 device->minor = -1;
2430 return 0;
2431 }
2432
2433 switch (*val) {
2434 case 'a':
2435 __fallthrough;
2436 case 'b':
2437 __fallthrough;
2438 case 'c':
2439 device->type = *val;
2440 break;
2441 default:
2442 return -1;
2443 }
2444
2445 val++;
2446 if (!isspace(*val))
2447 return -1;
2448 val++;
2449 if (*val == '*') {
2450 device->major = -1;
2451 val++;
2452 } else if (isdigit(*val)) {
2453 memset(temp, 0, sizeof(temp));
2454 for (count = 0; count < sizeof(temp) - 1; count++) {
2455 temp[count] = *val;
2456 val++;
2457 if (!isdigit(*val))
2458 break;
2459 }
2460 ret = lxc_safe_int(temp, &device->major);
2461 if (ret)
2462 return -1;
2463 } else {
2464 return -1;
2465 }
2466 if (*val != ':')
2467 return -1;
2468 val++;
2469
2470 /* read minor */
2471 if (*val == '*') {
2472 device->minor = -1;
2473 val++;
2474 } else if (isdigit(*val)) {
2475 memset(temp, 0, sizeof(temp));
2476 for (count = 0; count < sizeof(temp) - 1; count++) {
2477 temp[count] = *val;
2478 val++;
2479 if (!isdigit(*val))
2480 break;
2481 }
2482 ret = lxc_safe_int(temp, &device->minor);
2483 if (ret)
2484 return -1;
2485 } else {
2486 return -1;
2487 }
2488 if (!isspace(*val))
2489 return -1;
2490
2491 return device_cgroup_parse_access(device, ++val);
2492 }
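
/*
 * Example (sketch): parsing native cgroup1-style rules. For these inputs
 * the struct device_item fields end up as:
 *
 *	struct device_item dev = {};
 *
 *	device_cgroup_rule_parse(&dev, "devices.allow", "c 1:3 rwm");
 *	// dev.allow = 1, dev.type = 'c', dev.major = 1, dev.minor = 3,
 *	// dev.access = "rwm"
 *
 *	device_cgroup_rule_parse(&dev, "devices.deny", "b *:* rw");
 *	// dev.allow = 0, dev.type = 'b', dev.major = -1, dev.minor = -1,
 *	// dev.access = "rw"
 */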
2493
2494 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2495 * don't have a cgroup_data set up, so we ask the running container through the
2496 * commands API for the cgroup path.
2497 */
2498 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2499 const char *key, const char *value,
2500 const char *name, const char *lxcpath)
2501 {
2502 __do_free char *path = NULL;
2503 __do_free char *controller = NULL;
2504 char *p;
2505 struct hierarchy *h;
2506 int ret = -1;
2507
2508 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2509 is_empty_string(name) || is_empty_string(lxcpath))
2510 return ret_errno(EINVAL);
2511
2512 controller = strdup(key);
2513 if (!controller)
2514 return ret_errno(ENOMEM);
2515
2516 p = strchr(controller, '.');
2517 if (p)
2518 *p = '\0';
2519
2520 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2521 struct device_item device = {};
2522
2523 ret = device_cgroup_rule_parse(&device, key, value);
2524 if (ret < 0)
2525 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2526 key, value);
2527
2528 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2529 if (ret < 0)
2530 return -1;
2531
2532 return 0;
2533 }
2534
2535 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2536 /* not running */
2537 if (!path)
2538 return -1;
2539
2540 h = get_hierarchy(ops, controller);
2541 if (h) {
2542 __do_free char *fullpath = NULL;
2543
2544 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2545 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2546 }
2547
2548 return ret;
2549 }
2550
2551 /* Take a devices cgroup line of the form
2552 * /dev/foo rwx
2553 * and convert it to a valid
2554 * type major:minor mode
2555 * line. Return <0 on error. The preallocated destination buffer that the
2556 * result is written to belongs to the convert_devpath() wrapper below.
2557 */
2558 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2559 const char *devpath)
2560 {
2561 __do_free char *path = NULL;
2562 char *mode = NULL;
2563 int n_parts, ret;
2564 char *p;
2565 struct stat sb;
2566
2567 path = strdup(devpath);
2568 if (!path)
2569 return ret_errno(ENOMEM);
2570
2571 /*
2572 * Read path followed by mode. Ignore any trailing text.
2573 * A ' # comment' would be legal. Technically other text is not
2574 * legal, we could check for that if we cared to.
2575 */
2576 for (n_parts = 1, p = path; *p; p++) {
2577 if (*p != ' ')
2578 continue;
2579 *p = '\0';
2580
2581 if (n_parts != 1)
2582 break;
2583 p++;
2584 n_parts++;
2585
2586 while (*p == ' ')
2587 p++;
2588
2589 mode = p;
2590
2591 if (*p == '\0')
2592 return ret_set_errno(-1, EINVAL);
2593 }
2594
2595 if (!mode)
2596 return ret_errno(EINVAL);
2597
2598 if (device_cgroup_parse_access(device, mode) < 0)
2599 return -1;
2600
2601 ret = stat(path, &sb);
2602 if (ret < 0)
2603 return ret_set_errno(-1, errno);
2604
2605 mode_t m = sb.st_mode & S_IFMT;
2606 switch (m) {
2607 case S_IFBLK:
2608 device->type = 'b';
2609 break;
2610 case S_IFCHR:
2611 device->type = 'c';
2612 break;
2613 default:
2614 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2615 }
2616
2617 device->major = MAJOR(sb.st_rdev);
2618 device->minor = MINOR(sb.st_rdev);
2619 device->allow = 1;
2620
2621 return 0;
2622 }
2623
2624 static int convert_devpath(const char *invalue, char *dest)
2625 {
2626 struct device_item device = {};
2627 int ret;
2628
2629 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2630 if (ret < 0)
2631 return -1;
2632
2633 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2634 device.minor, device.access);
2635 if (ret < 0)
2636 return log_error_errno(ret, -ret,
2637 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2638 device.type, device.major, device.minor,
2639 device.access);
2640
2641 return 0;
2642 }
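
/*
 * Example (sketch): /dev/null is the character device 1:3, so a
 * configuration value "/dev/null rwm" is converted as follows:
 *
 *	char buf[50];
 *	int ret = convert_devpath("/dev/null rwm", buf);
 *	// on success buf = "c 1:3 rwm", the "type major:minor mode"
 *	// form the devices controller expects
 */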
2643
2644 /* Called from setup_limits - here we have the container's cgroup_data because
2645 * we created the cgroups.
2646 */
2647 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2648 const char *value, bool is_cpuset)
2649 {
2650 __do_free char *controller = NULL;
2651 char *p;
2652 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2653 char converted_value[50];
2654 struct hierarchy *h;
2655
2656 controller = strdup(filename);
2657 if (!controller)
2658 return ret_errno(ENOMEM);
2659
2660 p = strchr(controller, '.');
2661 if (p)
2662 *p = '\0';
2663
2664 if (strequal("devices.allow", filename) && value[0] == '/') {
2665 int ret;
2666
2667 ret = convert_devpath(value, converted_value);
2668 if (ret < 0)
2669 return ret;
2670 value = converted_value;
2671 }
2672
2673 h = get_hierarchy(ops, controller);
2674 if (!h)
2675 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2676
2677 if (is_cpuset) {
2678 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2679 if (ret)
2680 return ret;
2681 }
2682 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2683 }
2684
2685 /*
2686 * Sort the list of cgroup settings in place according to the following rule:
2687 * 1. Apply memory.limit_in_bytes before memory.memsw.limit_in_bytes.
2688 */
2689 static void sort_cgroup_settings(struct lxc_conf *conf)
2690 {
2691 LIST_HEAD(memsw_list);
2692 struct lxc_cgroup *cgroup, *ncgroup;
2693
2694 /* Iterate over the cgroup settings and copy them to the output list. */
2695 list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
2696 if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
2697 continue;
2698
2699 /* Move the memsw entry from the cgroup settings list. */
2700 list_move_tail(&cgroup->head, &memsw_list);
2701 }
2702
2703 /*
2704 * Append all the memsw entries to the end of the cgroup settings list
2705 * to make sure they are applied after all memory limit settings.
2706 */
2707 list_splice_tail(&memsw_list, &conf->cgroup);
2708 }
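
/*
 * Why the ordering matters: the kernel requires
 * memory.memsw.limit_in_bytes (memory + swap) to be >=
 * memory.limit_in_bytes at all times, and both start out unlimited in a
 * fresh cgroup, so the plain memory limit must be lowered first. For
 * example, the configuration
 *
 *	lxc.cgroup.memory.memsw.limit_in_bytes = 2G
 *	lxc.cgroup.memory.limit_in_bytes = 1G
 *
 * is applied as memory.limit_in_bytes=1G followed by
 * memory.memsw.limit_in_bytes=2G; in the written order the memsw write
 * would be rejected because 2G is below the still-unlimited memory limit.
 */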
2710
2711 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2712 struct lxc_conf *conf,
2713 bool do_devices)
2714 {
2715 struct list_head *cgroup_settings;
2716 struct lxc_cgroup *cgroup;
2717
2718 if (!ops)
2719 return ret_set_errno(false, ENOENT);
2720
2721 if (!conf)
2722 return ret_set_errno(false, EINVAL);
2723
2724 cgroup_settings = &conf->cgroup;
2725 if (list_empty(cgroup_settings))
2726 return true;
2727
2728 if (!ops->hierarchies)
2729 return ret_set_errno(false, EINVAL);
2730
2731 if (pure_unified_layout(ops))
2732 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2733
2734 sort_cgroup_settings(conf);
2735 list_for_each_entry(cgroup, cgroup_settings, head) {
2736 if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
2737 if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
2738 if (do_devices && (errno == EACCES || errno == EPERM)) {
2739 SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2740 continue;
2741 }
2742 SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2743 return false;
2744 }
2745 DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
2746 }
2747 }
2748
2749 INFO("Limits for the legacy cgroup hierarchies have been set up");
2750 return true;
2751 }
2752
2753 /*
2754 * Some of the parsing logic comes from the original cgroup device v1
2755 * implementation in the kernel.
2756 */
2757 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2758 struct lxc_conf *conf, const char *key,
2759 const char *val)
2760 {
2761 struct device_item device_item = {};
2762 int ret;
2763
2764 if (strequal("devices.allow", key) && abspath(val))
2765 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2766 else
2767 ret = device_cgroup_rule_parse(&device_item, key, val);
2768 if (ret < 0)
2769 return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2770
2771 /*
2772 * Note that bpf_list_add_device() returns 1 if it altered the device
2773 * list and 0 if it didn't; both return values indicate success.
2774 * Only a negative return value indicates an error.
2775 */
2776 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2777 if (ret < 0)
2778 return -1;
2779
2780 return 0;
2781 }
2782
2783 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2784 struct lxc_handler *handler)
2785 {
2786 struct list_head *cgroup_settings;
2787 struct hierarchy *h;
2788 struct lxc_conf *conf;
2789 struct lxc_cgroup *cgroup;
2790
2791 if (!ops)
2792 return ret_set_errno(false, ENOENT);
2793
2794 if (!ops->hierarchies)
2795 return true;
2796
2797 if (!ops->container_cgroup)
2798 return ret_set_errno(false, EINVAL);
2799
2800 if (!handler || !handler->conf)
2801 return ret_set_errno(false, EINVAL);
2802 conf = handler->conf;
2803
2804 cgroup_settings = &conf->cgroup2;
2805 if (list_empty(cgroup_settings))
2806 return true;
2807
2808 if (!pure_unified_layout(ops))
2809 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2810
2811 if (!ops->unified)
2812 return false;
2813 h = ops->unified;
2814
2815 list_for_each_entry(cgroup, cgroup_settings, head) {
2816 int ret;
2817
2818 if (strnequal("devices", cgroup->subsystem, 7))
2819 ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value);
2820 else
2821 ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value));
2822 if (ret < 0)
2823 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2824
2825 TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
2826 }
2827
2828 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2829 }
2830
2831 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2832 {
2833 struct lxc_conf *conf;
2834 struct hierarchy *unified;
2835
2836 if (!ops)
2837 return ret_set_errno(false, ENOENT);
2838
2839 if (!ops->hierarchies)
2840 return true;
2841
2842 if (!ops->container_cgroup)
2843 return ret_set_errno(false, EEXIST);
2844
2845 if (!handler || !handler->conf)
2846 return ret_set_errno(false, EINVAL);
2847 conf = handler->conf;
2848
2849 unified = ops->unified;
2850 if (!unified || !device_utility_controller(unified) ||
2851 !unified->path_con || list_empty(&(conf->bpf_devices).devices))
2852 return true;
2853
2854 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2855 }
2856
2857 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2858 {
2859 __do_close int dfd_final = -EBADF;
2860 __do_free char *add_controllers = NULL, *copy = NULL;
2861 size_t full_len = 0;
2862 struct hierarchy *unified;
2863 int dfd_cur, ret;
2864 char *cur;
2865 char **it;
2866
2867 if (!ops->hierarchies || !pure_unified_layout(ops))
2868 return true;
2869
2870 unified = ops->unified;
2871 if (!unified->controllers[0])
2872 return true;
2873
2874 /* For now we simply enable all controllers that we have detected by
2875 * creating a string like "+memory +pids +cpu +io".
2876 * TODO: In the near future we might want to support "-<controller>"
2877 * etc. but whether supporting semantics like this make sense will need
2878 * some thinking.
2879 */
2880 for (it = unified->controllers; it && *it; it++) {
2881 full_len += strlen(*it) + 2;
2882 add_controllers = must_realloc(add_controllers, full_len + 1);
2883
2884 if (unified->controllers[0] == *it)
2885 add_controllers[0] = '\0';
2886
2887 (void)strlcat(add_controllers, "+", full_len + 1);
2888 (void)strlcat(add_controllers, *it, full_len + 1);
2889
2890 if (*(it + 1))
2891 (void)strlcat(add_controllers, " ", full_len + 1);
2892 }
2893
2894 copy = strdup(cgroup);
2895 if (!copy)
2896 return false;
2897
2898 /*
2899 * Placing the write to cgroup.subtree_control before the open() is
2900 * intentional because of the cgroup2 delegation model. It enforces
2901 * that leaf cgroups don't have any controllers enabled for delegation.
2902 */
2903 dfd_cur = unified->dfd_base;
2904 lxc_iterate_parts(cur, copy, "/") {
2905 /*
2906 * Even though we vetted the paths when we parsed the config
2907 * we're paranoid here and check that the path is neither
2908 * absolute nor walks upwards.
2909 */
2910 if (abspath(cur))
2911 return syserror_set(-EINVAL, "No absolute paths allowed");
2912
2913 if (strnequal(cur, "..", STRLITERALLEN("..")))
2914 return syserror_set(-EINVAL, "No upward walking paths allowed");
2915
2916 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2917 if (ret < 0)
2918 return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2919
2920 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2921
2922 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2923 if (dfd_final < 0)
2924 return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
2925 if (dfd_cur != unified->dfd_base)
2926 close(dfd_cur);
2927 /*
2928 * Leave dfd_final pointing to the last fd we opened so
2929 * it will be automatically zapped if we return early.
2930 */
2931 dfd_cur = dfd_final;
2932 }
2933
2934 return true;
2935 }
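
/*
 * Example (sketch, hypothetical path): delegating cpu, memory and pids
 * for the cgroup "a/b" relative to unified->dfd_base results in
 *
 *	write(<dfd_base>/cgroup.subtree_control, "+cpu +memory +pids")
 *	write(<dfd_base>/a/cgroup.subtree_control, "+cpu +memory +pids")
 *
 * i.e. the controllers are enabled in every ancestor of "a/b" but not in
 * "a/b" itself, keeping the final cgroup a valid leaf.
 */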
2936
2937 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2938 {
2939 if (!ops)
2940 return ret_set_errno(false, ENOENT);
2941
2942 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2943 }
2944
2945 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2946 {
2947 if (!ops)
2948 return ret_set_errno(false, ENOENT);
2949
2950 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2951 }
2952
2953 static inline bool unified_cgroup(const char *line)
2954 {
2955 return *line == '0';
2956 }
2957
2958 static inline char *current_unified_cgroup(bool relative, char *line)
2959 {
2960 char *current_cgroup;
2961
2962 line += STRLITERALLEN("0::");
2963
2964 if (!abspath(line))
2965 return ERR_PTR(-EINVAL);
2966
2967 /* remove init.scope */
2968 if (!relative)
2969 line = prune_init_scope(line);
2970
2971 /* create a relative path */
2972 line = deabs(line);
2973
2974 current_cgroup = strdup(line);
2975 if (!current_cgroup)
2976 return ERR_PTR(-ENOMEM);
2977
2978 return current_cgroup;
2979 }
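
/*
 * Example (sketch): the unified entry in /proc/self/cgroup looks like
 *
 *	0::/user.slice/user-1000.slice/session-1.scope
 *
 * current_unified_cgroup(false, line) skips the "0::" prefix, would
 * prune a trailing "/init.scope", and strips the leading slash,
 * returning "user.slice/user-1000.slice/session-1.scope", i.e. a path
 * relative to the cgroup2 mount.
 */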
2980
2981 static inline const char *unprefix(const char *controllers)
2982 {
2983 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2984 return controllers + STRLITERALLEN("name=");
2985 return controllers;
2986 }
2987
2988 static int __list_cgroup_delegate(char ***delegate)
2989 {
2990 __do_free char **list = NULL;
2991 __do_free char *buf = NULL;
2992 char *standard[] = {
2993 "cgroup.procs",
2994 "cgroup.threads",
2995 "cgroup.subtree_control",
2996 "memory.oom.group",
2997 NULL,
2998 };
2999 char *token;
3000 int ret;
3001
3002 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
3003 if (!buf) {
3004 for (char **p = standard; p && *p; p++) {
3005 ret = list_add_string(&list, *p);
3006 if (ret < 0)
3007 return ret;
3008 }
3009
3010 *delegate = move_ptr(list);
3011 return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
3012 }
3013
3014 lxc_iterate_parts(token, buf, " \t\n") {
3015 /*
3016 * We always need to chown this for both cgroup and
3017 * cgroup2.
3018 */
3019 if (strequal(token, "cgroup.procs"))
3020 continue;
3021
3022 ret = list_add_string(&list, token);
3023 if (ret < 0)
3024 return ret;
3025 }
3026
3027 *delegate = move_ptr(list);
3028 return 0;
3029 }
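
/*
 * For reference, /sys/kernel/cgroup/delegate is a whitespace-separated
 * list of the cgroup2 files whose ownership must be handed over when
 * delegating a subtree. On one recent kernel it reads
 *
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *	memory.oom.group
 *
 * which matches the standard[] fallback above; newer kernels may list
 * additional entries.
 */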
3030
3031 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3032 {
3033 __do_free_string_list char **list = NULL;
3034 int ret;
3035
3036 ret = __list_cgroup_delegate(&list);
3037 if (ret < 0)
3038 return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");
3039
3040 for (char *const *s = list; s && *s; s++) {
3041 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3042 continue;
3043
3044 return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
3045 }
3046
3047 *ret_files = move_ptr(list);
3048 return true;
3049 }
3050
3051 static bool legacy_hierarchy_delegated(int dfd_base)
3052 {
3053 int ret;
3054
3055 ret = faccessat(dfd_base, ".", W_OK, 0);
3056 if (ret < 0 && errno != ENOENT)
3057 return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");
3058
3059 return true;
3060 }
3061
3062 /**
3063 * systemd guarantees that the order of co-mounted controllers is stable. On
3064 * some systems the order of the controllers might be reversed though.
3065 *
3066 * For example, this is how the order is mismatched on CentOS 7:
3067 *
3068 * [root@localhost ~]# cat /proc/self/cgroup
3069 * 11:perf_event:/
3070 * 10:pids:/
3071 * 9:freezer:/
3072 * >>>> 8:cpuacct,cpu:/
3073 * 7:memory:/
3074 * 6:blkio:/
3075 * 5:devices:/
3076 * 4:hugetlb:/
3077 * >>>> 3:net_prio,net_cls:/
3078 * 2:cpuset:/
3079 * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope
3080 *
3081 * whereas the mountpoint:
3082 *
3083 * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755
3084 * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
3085 * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
3086 * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
3087 * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
3088 * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
3089 * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
3090 * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
3091 * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
3092 * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
3093 * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
3094 * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
3095 *
3096 * Ensure that we always use the systemd-guaranteed stable order when checking
3097 * for the mountpoint.
3098 */
3099 #if HAVE_COMPILER_ATTR_NONNULL
3100 __attribute__((nonnull))
3101 #endif
3102 #if HAVE_COMPILER_ATTR_RETURNS_NONNULL
3103 __attribute__((returns_nonnull))
3104 #endif
3105 static const char *stable_order(const char *controllers)
3106 {
3107 if (strequal(controllers, "cpuacct,cpu"))
3108 return "cpu,cpuacct";
3109
3110 if (strequal(controllers, "net_prio,net_cls"))
3111 return "net_cls,net_prio";
3112
3113 return unprefix(controllers);
3114 }
3115
3116 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3117 bool unprivileged)
3118 {
3119 __do_free char *cgroup_info = NULL;
3120 char *it;
3121
3122 /*
3123 * Root spawned containers escape the current cgroup, so use init's
3124 * cgroups as our base in that case.
3125 */
3126 if (!relative && (geteuid() == 0))
3127 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3128 else
3129 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3130 if (!cgroup_info)
3131 return ret_errno(ENOMEM);
3132
3133 lxc_iterate_parts(it, cgroup_info, "\n") {
3134 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3135 __do_free char *controllers = NULL, *current_cgroup = NULL;
3136 __do_free_string_list char **controller_list = NULL,
3137 **delegate = NULL;
3138 char *line;
3139 int dfd, ret, type;
3140
3141 /* Handle the unified cgroup hierarchy. */
3142 line = it;
3143 if (unified_cgroup(line)) {
3144 char *unified_mnt;
3145
3146 type = UNIFIED_HIERARCHY;
3147
3148 current_cgroup = current_unified_cgroup(relative, line);
3149 if (IS_ERR(current_cgroup))
3150 return PTR_ERR(current_cgroup);
3151
3152 if (unified_cgroup_fd(ops->dfd_mnt)) {
3153 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3154 unified_mnt = "";
3155 } else {
3156 dfd_mnt = open_at(ops->dfd_mnt,
3157 "unified",
3158 PROTECT_OPATH_DIRECTORY,
3159 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3160 unified_mnt = "unified";
3161 }
3162 if (dfd_mnt < 0) {
3163 if (errno != ENOENT)
3164 return syserror("Failed to open %d/unified", ops->dfd_mnt);
3165
3166 SYSTRACE("Unified cgroup not mounted");
3167 continue;
3168 }
3169 dfd = dfd_mnt;
3170
3171 if (!is_empty_string(current_cgroup)) {
3172 dfd_base = open_at(dfd_mnt, current_cgroup,
3173 PROTECT_OPATH_DIRECTORY,
3174 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3175 if (dfd_base < 0) {
3176 if (errno != ENOENT)
3177 return syserror("Failed to open %d/%s",
3178 dfd_mnt, current_cgroup);
3179
3180 SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
3181 dfd_mnt, current_cgroup);
3182 continue;
3183 }
3184 dfd = dfd_base;
3185 }
3186
3187 if (!unified_hierarchy_delegated(dfd, &delegate))
3188 continue;
3189
3190 controller_list = unified_controllers(dfd, "cgroup.controllers");
3191 if (!controller_list) {
3192 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3193 controller_list = list_new();
3194 if (!controller_list)
3195 return syserror_set(-ENOMEM, "Failed to create empty controller list");
3196 }
3197
3198 controllers = strdup(unified_mnt);
3199 if (!controllers)
3200 return ret_errno(ENOMEM);
3201 } else {
3202 char *__controllers, *__current_cgroup;
3203
3204 type = LEGACY_HIERARCHY;
3205
3206 __controllers = strchr(line, ':');
3207 if (!__controllers)
3208 return ret_errno(EINVAL);
3209 __controllers++;
3210
3211 __current_cgroup = strchr(__controllers, ':');
3212 if (!__current_cgroup)
3213 return ret_errno(EINVAL);
3214 *__current_cgroup = '\0';
3215 __current_cgroup++;
3216
3217 controllers = strdup(stable_order(__controllers));
3218 if (!controllers)
3219 return ret_errno(ENOMEM);
3220
3221 dfd_mnt = open_at(ops->dfd_mnt,
3222 controllers,
3223 PROTECT_OPATH_DIRECTORY,
3224 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3225 if (dfd_mnt < 0) {
3226 if (errno != ENOENT)
3227 return syserror("Failed to open %d/%s",
3228 ops->dfd_mnt, controllers);
3229
3230 SYSTRACE("%s not mounted", controllers);
3231 continue;
3232 }
3233 dfd = dfd_mnt;
3234
3235 if (!abspath(__current_cgroup))
3236 return ret_errno(EINVAL);
3237
3238 /* remove init.scope */
3239 if (!relative)
3240 __current_cgroup = prune_init_scope(__current_cgroup);
3241
3242 /* create a relative path */
3243 __current_cgroup = deabs(__current_cgroup);
3244
3245 current_cgroup = strdup(__current_cgroup);
3246 if (!current_cgroup)
3247 return ret_errno(ENOMEM);
3248
3249 if (!is_empty_string(current_cgroup)) {
3250 dfd_base = open_at(dfd_mnt, current_cgroup,
3251 PROTECT_OPATH_DIRECTORY,
3252 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3253 if (dfd_base < 0) {
3254 if (errno != ENOENT)
3255 return syserror("Failed to open %d/%s",
3256 dfd_mnt, current_cgroup);
3257
3258 SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
3259 dfd_mnt, current_cgroup);
3260 continue;
3261 }
3262 dfd = dfd_base;
3263 }
3264
3265 if (!legacy_hierarchy_delegated(dfd))
3266 continue;
3267
3268 /*
3269 * We intentionally pass the raw __controllers string here and not the
3270 * massaged controllers variable: the latter went through stable_order()
3271 * to match the mountpoint and would yield the wrong controller names.
3272 */
3273 controller_list = list_add_controllers(__controllers);
3274 if (!controller_list)
3275 return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);
3276
3277 if (skip_hierarchy(ops, controller_list))
3278 continue;
3279
3280 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3281 }
3282
3283 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3284 current_cgroup, controller_list, type);
3285 if (ret < 0)
3286 return syserror_ret(ret, "Failed to add %s hierarchy", controllers);
3287
3288 /* Transfer ownership. */
3289 move_fd(dfd_mnt);
3290 move_fd(dfd_base);
3291 move_ptr(current_cgroup);
3292 move_ptr(controllers);
3293 move_ptr(controller_list);
3294 if (type == UNIFIED_HIERARCHY)
3295 ops->unified->delegate = move_ptr(delegate);
3296 }
3297
3298 /* determine cgroup layout */
3299 if (ops->unified) {
3300 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3301 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3302 } else {
3303 if (bpf_devices_cgroup_supported())
3304 ops->unified->utilities |= DEVICES_CONTROLLER;
3305 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3306 }
3307 }
3308
3309 if (!controllers_available(ops))
3310 return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3311
3312 return 0;
3313 }
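
/*
 * Example (sketch): on a hybrid host /proc/1/cgroup might contain
 *
 *	12:pids:/
 *	...
 *	2:cpu,cpuacct:/
 *	1:name=systemd:/init.scope
 *	0::/init.scope
 *
 * Every legacy "N:controllers:path" line adds a LEGACY_HIERARCHY rooted
 * at /sys/fs/cgroup/<controllers>, the "0::" line adds the
 * UNIFIED_HIERARCHY, and the combination yields CGROUP_LAYOUT_HYBRID.
 */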
3314
3315 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3316 {
3317 __do_close int dfd = -EBADF;
3318 int ret;
3319 const char *controllers_use;
3320
3321 if (ops->dfd_mnt >= 0)
3322 return ret_errno(EBUSY);
3323
3324 /*
3325 * I don't see the need for allowing symlinks here. If users want to
3326 * have their hierarchy available in different locations I strongly
3327 * suggest bind-mounts.
3328 */
3329 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3330 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3331 if (dfd < 0)
3332 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3333
3334 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3335 if (controllers_use) {
3336 __do_free char *dup = NULL;
3337 char *it;
3338
3339 dup = strdup(controllers_use);
3340 if (!dup)
3341 return -errno;
3342
3343 lxc_iterate_parts(it, dup, ",") {
3344 ret = list_add_string(&ops->cgroup_use, it);
3345 if (ret < 0)
3346 return ret;
3347 }
3348 }
3349
3350 /*
3351 * Keep dfd referenced by the cleanup function and actually move the fd
3352 * once we know the initialization succeeded. So if we fail we clean up
3353 * the dfd.
3354 */
3355 ops->dfd_mnt = dfd;
3356
3357 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
3358 if (ret < 0)
3359 return syserror_ret(ret, "Failed to initialize cgroups");
3360
3361 /* Transfer ownership to cgroup_ops. */
3362 move_fd(dfd);
3363 return 0;
3364 }
3365
3366 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3367 {
3368 const char *cgroup_pattern;
3369
3370 if (!ops)
3371 return ret_set_errno(-1, ENOENT);
3372
3373 /* copy system-wide cgroup information */
3374 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3375 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3376 ops->cgroup_pattern = strdup(cgroup_pattern);
3377 if (!ops->cgroup_pattern)
3378 return ret_errno(ENOMEM);
3379 }
3380
3381 return 0;
3382 }
3383
3384 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3385 {
3386 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3387
3388 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3389 if (!cgfsng_ops)
3390 return ret_set_errno(NULL, ENOMEM);
3391
3392 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3393 cgfsng_ops->dfd_mnt = -EBADF;
3394
3395 if (initialize_cgroups(cgfsng_ops, conf))
3396 return NULL;
3397
3398 cgfsng_ops->data_init = cgfsng_data_init;
3399 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3400 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3401 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3402 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3403 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3404 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3405 cgfsng_ops->payload_create = cgfsng_payload_create;
3406 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3407 cgfsng_ops->finalize = cgfsng_finalize;
3408 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3409 cgfsng_ops->get = cgfsng_get;
3410 cgfsng_ops->set = cgfsng_set;
3411 cgfsng_ops->freeze = cgfsng_freeze;
3412 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3413 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3414 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3415 cgfsng_ops->driver = "cgfsng";
3416 cgfsng_ops->version = "1.0.0";
3417 cgfsng_ops->attach = cgfsng_attach;
3418 cgfsng_ops->chown = cgfsng_chown;
3419 cgfsng_ops->mount = cgfsng_mount;
3420 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3421 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3422
3423 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3424 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3425 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3426
3427 return move_ptr(cgfsng_ops);
3428 }
3429
3430 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3431 {
3432 int ret;
3433
3434 if (!list_empty(&conf->id_map)) {
3435 struct userns_exec_unified_attach_data args = {
3436 .conf = conf,
3437 .unified_fd = fd_unified,
3438 .pid = pid,
3439 };
3440
3441 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3442 if (ret < 0)
3443 return -errno;
3444
3445 ret = userns_exec_minimal(conf,
3446 cgroup_unified_attach_parent_wrapper,
3447 &args,
3448 cgroup_unified_attach_child_wrapper,
3449 &args);
3450 } else {
3451 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3452 }
3453
3454 return ret;
3455 }
3456
3457 static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
3458 const char *lxcpath, pid_t pid)
3459 {
3460 call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
3461 int ret;
3462 size_t idx;
3463 ssize_t pidstr_len;
3464 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
3465
3466 ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
3467 if (ret < 0)
3468 return ret_errno(ENOSYS);
3469
3470 pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
3471 if (pidstr_len < 0)
3472 return pidstr_len;
3473
3474 for (idx = 0; idx < ctx->fd_len; idx++) {
3475 int dfd_con = ctx->fd[idx];
3476
3477 if (unified_cgroup_fd(dfd_con))
3478 ret = __unified_attach_fd(conf, dfd_con, pid);
3479 else
3480 ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
3481 if (ret)
3482 return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
3483 else
3484 TRACE("Attached to cgroup fd %d", dfd_con);
3485 }
3486
3487 if (idx == 0)
3488 return syserror_set(-ENOENT, "Failed to attach to cgroups");
3489
3490 TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
3491 return 0;
3492 }
3493
3494 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3495 const char *lxcpath, pid_t pid)
3496 {
3497 __do_close int dfd_unified = -EBADF;
3498
3499 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3500 return ret_errno(EINVAL);
3501
3502 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3503 if (dfd_unified < 0)
3504 return ret_errno(ENOSYS);
3505
3506 return __unified_attach_fd(conf, dfd_unified, pid);
3507 }
3508
3509 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3510 const char *lxcpath, pid_t pid)
3511 {
3512 int ret;
3513
3514 ret = __cgroup_attach_many(conf, name, lxcpath, pid);
3515 if (ret < 0) {
3516 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3517 return ret;
3518
3519 ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
3520 if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
3521 return ret_errno(ENOSYS);
3522 }
3523
3524 return ret;
3525 }
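
/*
 * Usage (sketch, hypothetical container name, default lxcpath): attach
 * an existing process to a running container's cgroups:
 *
 *	int ret = cgroup_attach(conf, "c1", "/var/lib/lxc", pid);
 *	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
 *		; // the container's command server supports neither
 *		  // request; callers such as __cg_unified_attach() then
 *		  // fall back to attaching by cgroup path
 */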
3526
3527 /* Connects to command socket therefore isn't callable from command handler. */
3528 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3529 {
3530 __do_close int dfd = -EBADF;
3531 struct cgroup_fd fd = {
3532 .fd = -EBADF,
3533 };
3534 size_t len_controller;
3535 int ret;
3536
3537 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3538 is_empty_string(key))
3539 return ret_errno(EINVAL);
3540
3541 if ((buf && !len) || (len && !buf))
3542 return ret_errno(EINVAL);
3543
3544 len_controller = strcspn(key, ".");
3545 len_controller++; /* Don't forget the \0 byte. */
3546 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3547 return ret_errno(EINVAL);
3548 (void)strlcpy(fd.controller, key, len_controller);
3549
3550 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3551 if (ret < 0) {
3552 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3553 return ret;
3554
3555 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3556 if (dfd < 0) {
3557 if (!ERRNO_IS_NOT_SUPPORTED(dfd))
3558 return dfd;
3559
3560 return ret_errno(ENOSYS);
3561 }
3562 fd.type = UNIFIED_HIERARCHY;
3563 fd.fd = move_fd(dfd);
3564 }
3565 dfd = move_fd(fd.fd);
3566
3567 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3568
3569 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3570 return ret_errno(EOPNOTSUPP);
3571
3572 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3573
3574 return ret;
3575 }
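
/*
 * Usage (sketch, hypothetical names): the controller is taken from the
 * part of the key before the first '.', so
 *
 *	char buf[4096];
 *	int ret = cgroup_get("c1", "/var/lib/lxc", "memory.max", buf, sizeof(buf));
 *
 * asks the running container's command server for a "memory" hierarchy
 * fd (falling back to the cgroup2 fd on older command servers) and reads
 * "memory.max" relative to it into buf.
 */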
3576
3577 /* Connects to command socket therefore isn't callable from command handler. */
3578 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3579 {
3580 __do_close int dfd = -EBADF;
3581 struct cgroup_fd fd = {
3582 .fd = -EBADF,
3583 };
3584 size_t len_controller;
3585 int ret;
3586
3587 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3588 is_empty_string(key) || is_empty_string(value))
3589 return ret_errno(EINVAL);
3590
3591 len_controller = strcspn(key, ".");
3592 len_controller++; /* Don't forget the \0 byte. */
3593 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3594 return ret_errno(EINVAL);
3595 (void)strlcpy(fd.controller, key, len_controller);
3596
3597 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3598 if (ret < 0) {
3599 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3600 return ret;
3601
3602 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3603 if (dfd < 0) {
3604 if (!ERRNO_IS_NOT_SUPPORTED(dfd))
3605 return dfd;
3606
3607 return ret_errno(ENOSYS);
3608 }
3609 fd.type = UNIFIED_HIERARCHY;
3610 fd.fd = move_fd(dfd);
3611 }
3612 dfd = move_fd(fd.fd);
3613
3614 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3615
3616 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3617 struct device_item device = {};
3618
3619 ret = device_cgroup_rule_parse(&device, key, value);
3620 if (ret < 0)
3621 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3622 key, value);
3623
3624 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3625 } else {
3626 ret = lxc_writeat(dfd, key, value, strlen(value));
3627 }
3628
3629 return ret;
3630 }
3631
3632 static int do_cgroup_freeze(int unified_fd,
3633 const char *state_string,
3634 int state_num,
3635 int timeout,
3636 const char *epoll_error,
3637 const char *wait_error)
3638 {
3639 __do_close int events_fd = -EBADF;
3640 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
3641 int ret;
3642 struct lxc_async_descr descr = {};
3643
3644 if (timeout != 0) {
3645 ret = lxc_mainloop_open(&descr);
3646 if (ret)
3647 return log_error_errno(-1, errno, "%s", epoll_error);
3648
3649 /* automatically cleaned up now */
3650 descr_ptr = &descr;
3651
3652 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3653 if (events_fd < 0)
3654 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3655
3656 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
3657 freezer_cgroup_events_cb,
3658 default_cleanup_handler,
3659 INT_TO_PTR(state_num),
3660 "freezer_cgroup_events_cb");
3661 if (ret < 0)
3662 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3663 }
3664
3665 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3666 if (ret < 0)
3667 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3668
3669 if (timeout != 0) {
3670 ret = lxc_mainloop(&descr, timeout);
3671 if (ret)
3672 return log_error_errno(-1, errno, "%s", wait_error);
3673 }
3674
3675 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3676 }
3677
3678 static inline int __cgroup_freeze(int unified_fd, int timeout)
3679 {
3680 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3681 "Failed to create epoll instance to wait for container freeze",
3682 "Failed to wait for container to be frozen");
3683 }
3684
3685 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3686 {
3687 __do_close int unified_fd = -EBADF;
3688 int ret;
3689
3690 if (is_empty_string(name) || is_empty_string(lxcpath))
3691 return ret_errno(EINVAL);
3692
3693 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3694 if (unified_fd < 0)
3695 return ret_errno(ENOCGROUP2);
3696
3697 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3698 ret = __cgroup_freeze(unified_fd, timeout);
3699 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3700 return ret;
3701 }
3702
3703 int __cgroup_unfreeze(int unified_fd, int timeout)
3704 {
3705 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3706 "Failed to create epoll instance to wait for container freeze",
3707 "Failed to wait for container to be frozen");
3708 }
3709
3710 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3711 {
3712 __do_close int unified_fd = -EBADF;
3713 int ret;
3714
3715 if (is_empty_string(name) || is_empty_string(lxcpath))
3716 return ret_errno(EINVAL);
3717
3718 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3719 if (unified_fd < 0)
3720 return ret_errno(ENOCGROUP2);
3721
3722 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3723 ret = __cgroup_unfreeze(unified_fd, timeout);
3724 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3725 return ret;
3726 }