]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfsng.c
cgroups: fix declarations and headers
[mirror_lxc.git] / src / lxc / cgroups / cgfsng.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #include "config.h"
16
17 #include <ctype.h>
18 #include <dirent.h>
19 #include <errno.h>
20 #include <grp.h>
21 #include <linux/kdev_t.h>
22 #include <linux/types.h>
23 #include <poll.h>
24 #include <signal.h>
25 #include <stdint.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/epoll.h>
30 #include <sys/types.h>
31 #include <unistd.h>
32
33 #include "cgroup.h"
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup2_devices.h"
37 #include "cgroup_utils.h"
38 #include "commands.h"
39 #include "commands_utils.h"
40 #include "conf.h"
41 #include "error_utils.h"
42 #include "log.h"
43 #include "macro.h"
44 #include "mainloop.h"
45 #include "memory_utils.h"
46 #include "mount_utils.h"
47 #include "storage/storage.h"
48 #include "string_utils.h"
49 #include "syscall_wrappers.h"
50 #include "utils.h"
51
52 #ifndef HAVE_STRLCPY
53 #include "strlcpy.h"
54 #endif
55
56 #ifndef HAVE_STRLCAT
57 #include "strlcat.h"
58 #endif
59
60 lxc_log_define(cgfsng, cgroup);
61
/*
 * Grow a NULL-terminated pointer array by one slot. Cannot shrink. On
 * success the list stays NULL-terminated and the index of the freshly
 * usable slot (the second-to-last entry) is returned; the caller is
 * expected to fill it in. Returns negative errno on allocation failure.
 */
static int cg_list_add(void ***list)
{
	void **grown;
	int idx = 0;

	/* Count the current entries (the array is NULL-terminated). */
	if (*list) {
		while ((*list)[idx])
			idx++;
	}

	/* One new usable slot plus the terminating NULL entry. */
	grown = realloc(*list, (idx + 2) * sizeof(void *));
	if (!grown)
		return ret_errno(ENOMEM);

	grown[idx + 1] = NULL;
	*list = grown;

	return idx;
}
86
/*
 * Check whether @entry occurs in the NULL-terminated string array @list.
 * A NULL @list contains nothing.
 */
static bool string_in_list(char **list, const char *entry)
{
	if (!list)
		return false;

	for (char **it = list; *it; it++) {
		if (strequal(*it, entry))
			return true;
	}

	return false;
}
101
/* Given a handler's cgroup data, return the struct hierarchy for the
 * controller @controller, or NULL (with errno set to ENOENT) if there is
 * none. A NULL @controller requests the empty unified hierarchy.
 */
static struct hierarchy *get_hierarchy(const struct cgroup_ops *ops, const char *controller)
{
	if (!ops->hierarchies)
		return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");

	for (int i = 0; ops->hierarchies[i]; i++) {
		if (!controller) {
			/* This is the empty unified hierarchy. */
			if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
				return ops->hierarchies[i];

			continue;
		}

		/*
		 * Handle controllers with significant implementation changes
		 * from cgroup to cgroup2: "devices" and "freezer" only exist
		 * as utility features of the unified hierarchy there.
		 */
		if (pure_unified_layout(ops)) {
			if (strequal(controller, "devices")) {
				if (device_utility_controller(ops->unified))
					return ops->unified;

				break;
			} else if (strequal(controller, "freezer")) {
				if (freezer_utility_controller(ops->unified))
					return ops->unified;

				break;
			}
		}

		if (string_in_list(ops->hierarchies[i]->controllers, controller))
			return ops->hierarchies[i];
	}

	if (controller)
		WARN("There is no useable %s controller", controller);
	else
		WARN("There is no empty unified cgroup hierarchy");

	return ret_set_errno(NULL, ENOENT);
}
148
149 int prepare_cgroup_fd(const struct cgroup_ops *ops, struct cgroup_fd *fd, bool limit)
150 {
151 int dfd;
152 const struct hierarchy *h;
153
154 h = get_hierarchy(ops, fd->controller);
155 if (!h)
156 return ret_errno(ENOENT);
157
158 /*
159 * The client requested that the controller must be in a specific
160 * cgroup version.
161 */
162 if (fd->type != 0 && (cgroupfs_type_magic_t)fd->type != h->fs_type)
163 return ret_errno(EINVAL);
164
165 if (limit)
166 dfd = h->dfd_con;
167 else
168 dfd = h->dfd_lim;
169 if (dfd < 0)
170 return ret_errno(EBADF);
171
172 fd->layout = ops->cgroup_layout;
173 fd->type = h->fs_type;
174 if (fd->type == UNIFIED_HIERARCHY)
175 fd->utilities = h->utilities;
176 fd->fd = dfd;
177
178 return 0;
179 }
180
181 /* Taken over modified from the kernel sources. */
182 #define NBITS 32 /* bits in uint32_t */
183 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
184 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
185
186 static void set_bit(unsigned bit, uint32_t *bitarr)
187 {
188 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
189 }
190
191 static void clear_bit(unsigned bit, uint32_t *bitarr)
192 {
193 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
194 }
195
196 static bool is_set(unsigned bit, uint32_t *bitarr)
197 {
198 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
199 }
200
201 /* Create cpumask from cpulist aka turn:
202 *
203 * 0,2-3
204 *
205 * into bit array
206 *
207 * 1 0 1 1
208 */
209 static int lxc_cpumask(char *buf, uint32_t **bitarr, size_t *last_set_bit)
210 {
211 __do_free uint32_t *arr_u32 = NULL;
212 size_t cur_last_set_bit = 0, nbits = 256;
213 size_t nr_u32;
214 char *token;
215
216 nr_u32 = BITS_TO_LONGS(nbits);
217 arr_u32 = zalloc(nr_u32 * sizeof(uint32_t));
218 if (!arr_u32)
219 return ret_errno(ENOMEM);
220
221 lxc_iterate_parts(token, buf, ",") {
222 unsigned last_bit, first_bit;
223 char *range;
224
225 errno = 0;
226 first_bit = strtoul(token, NULL, 0);
227 last_bit = first_bit;
228 range = strchr(token, '-');
229 if (range)
230 last_bit = strtoul(range + 1, NULL, 0);
231
232 if (!(first_bit <= last_bit))
233 return ret_errno(EINVAL);
234
235 if (last_bit >= nbits) {
236 size_t add_bits = last_bit - nbits + 32;
237 size_t new_nr_u32;
238 uint32_t *p;
239
240 new_nr_u32 = BITS_TO_LONGS(nbits + add_bits);
241 p = realloc(arr_u32, new_nr_u32 * sizeof(uint32_t));
242 if (!p)
243 return ret_errno(ENOMEM);
244 arr_u32 = move_ptr(p);
245
246 memset(arr_u32 + nr_u32, 0,
247 (new_nr_u32 - nr_u32) * sizeof(uint32_t));
248 nbits += add_bits;
249 }
250
251 while (first_bit <= last_bit)
252 set_bit(first_bit++, arr_u32);
253
254 if (last_bit > cur_last_set_bit)
255 cur_last_set_bit = last_bit;
256 }
257
258 *last_set_bit = cur_last_set_bit;
259 *bitarr = move_ptr(arr_u32);
260 return 0;
261 }
262
263 static int lxc_cpumask_update(char *buf, uint32_t *bitarr, size_t last_set_bit,
264 bool clear)
265 {
266 bool flipped = false;
267 char *token;
268
269 lxc_iterate_parts(token, buf, ",") {
270 unsigned last_bit, first_bit;
271 char *range;
272
273 errno = 0;
274 first_bit = strtoul(token, NULL, 0);
275 last_bit = first_bit;
276 range = strchr(token, '-');
277 if (range)
278 last_bit = strtoul(range + 1, NULL, 0);
279
280 if (!(first_bit <= last_bit)) {
281 WARN("The cup range seems to be inverted: %u-%u", first_bit, last_bit);
282 continue;
283 }
284
285 if (last_bit > last_set_bit)
286 continue;
287
288 while (first_bit <= last_bit) {
289 if (clear && is_set(first_bit, bitarr)) {
290 flipped = true;
291 clear_bit(first_bit, bitarr);
292 } else if (!clear && !is_set(first_bit, bitarr)) {
293 flipped = true;
294 set_bit(first_bit, bitarr);
295 }
296
297 first_bit++;
298 }
299 }
300
301 if (flipped)
302 return 1;
303
304 return 0;
305 }
306
/* Turn cpumask into simple, comma-separated cpulist, e.g. "0,2,3".
 * Scans @bitarr up to and including @last_set_bit. Returns a heap-allocated
 * string on success; returns NULL on error or when no bit in the scanned
 * range is set (errno is set to ENOMEM in that case).
 */
static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t last_set_bit)
{
	__do_free_string_list char **cpulist = NULL;
	char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
	int ret;

	for (size_t i = 0; i <= last_set_bit; i++) {
		if (!is_set(i, bitarr))
			continue;

		/* Render the cpu number and append it to the list. */
		ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
		if (ret < 0)
			return NULL;

		ret = lxc_append_string(&cpulist, numstr);
		if (ret < 0)
			return ret_set_errno(NULL, ENOMEM);
	}

	/* No bit was set at all. */
	if (!cpulist)
		return ret_set_errno(NULL, ENOMEM);

	return lxc_string_join(",", (const char **)cpulist, false);
}
332
333 static inline bool is_unified_hierarchy(const struct hierarchy *h)
334 {
335 return h->fs_type == UNIFIED_HIERARCHY;
336 }
337
338 /* Return true if the controller @entry is found in the null-terminated list of
339 * hierarchies @hlist.
340 */
341 static bool controller_available(struct hierarchy **hlist, char *entry)
342 {
343 if (!hlist)
344 return false;
345
346 for (int i = 0; hlist[i]; i++)
347 if (string_in_list(hlist[i]->controllers, entry))
348 return true;
349
350 return false;
351 }
352
353 static bool controllers_available(struct cgroup_ops *ops)
354 {
355 struct hierarchy **hlist;
356
357 if (!ops->cgroup_use)
358 return true;
359
360 hlist = ops->hierarchies;
361 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
362 if (!controller_available(hlist, *cur))
363 return log_error(false, "The %s controller found", *cur);
364
365 return true;
366 }
367
368 static char **list_new(void)
369 {
370 __do_free_string_list char **list = NULL;
371 int idx;
372
373 idx = cg_list_add((void ***)&list);
374 if (idx < 0)
375 return NULL;
376
377 list[idx] = NULL;
378 return move_ptr(list);
379 }
380
381 static int list_add_string(char ***list, char *entry)
382 {
383 __do_free char *dup = NULL;
384 int idx;
385
386 dup = strdup(entry);
387 if (!dup)
388 return ret_errno(ENOMEM);
389
390 idx = cg_list_add((void ***)list);
391 if (idx < 0)
392 return idx;
393
394 (*list)[idx] = move_ptr(dup);
395 return 0;
396 }
397
398 static char **list_add_controllers(char *controllers)
399 {
400 __do_free_string_list char **list = NULL;
401 char *it;
402
403 lxc_iterate_parts(it, controllers, ", \t\n") {
404 int ret;
405
406 ret = list_add_string(&list, it);
407 if (ret < 0)
408 return NULL;
409 }
410
411 return move_ptr(list);
412 }
413
414 static char **unified_controllers(int dfd, const char *file)
415 {
416 __do_free char *buf = NULL;
417
418 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
419 if (!buf)
420 return NULL;
421
422 return list_add_controllers(buf);
423 }
424
425 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
426 {
427 if (!ops->cgroup_use)
428 return false;
429
430 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
431 bool found = false;
432
433 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
434 if (!strequal(*cur_use, *cur_ctrl))
435 continue;
436
437 found = true;
438 break;
439 }
440
441 if (found)
442 continue;
443
444 return true;
445 }
446
447 return false;
448 }
449
450 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
451 int dfd_base, char *base_cgroup,
452 char **controllers, cgroupfs_type_magic_t fs_type)
453 {
454 __do_free struct hierarchy *new = NULL;
455 int idx;
456
457 if (abspath(base_cgroup))
458 return syserror_set(-EINVAL, "Container base path must be relative to controller mount");
459
460 new = zalloc(sizeof(*new));
461 if (!new)
462 return ret_errno(ENOMEM);
463
464 new->dfd_con = -EBADF;
465 new->dfd_lim = -EBADF;
466 new->dfd_mon = -EBADF;
467
468 new->fs_type = fs_type;
469 new->controllers = controllers;
470 new->at_mnt = mnt;
471 new->at_base = base_cgroup;
472
473 new->dfd_mnt = dfd_mnt;
474 new->dfd_base = dfd_base;
475
476 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
477 mnt, maybe_empty(base_cgroup));
478 for (char *const *it = new->controllers; it && *it; it++)
479 TRACE("The hierarchy contains the %s controller", *it);
480
481 idx = cg_list_add((void ***)&ops->hierarchies);
482 if (idx < 0)
483 return ret_errno(idx);
484
485 if (fs_type == UNIFIED_HIERARCHY)
486 ops->unified = new;
487 (ops->hierarchies)[idx] = move_ptr(new);
488
489 return 0;
490 }
491
492 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
493 {
494 if (!path_prune || !hierarchies)
495 return 0;
496
497 for (int i = 0; hierarchies[i]; i++) {
498 struct hierarchy *h = hierarchies[i];
499 int ret;
500
501 ret = cgroup_tree_prune(h->dfd_base, path_prune);
502 if (ret < 0)
503 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
504 else
505 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
506
507 free_equal(h->path_lim, h->path_con);
508 }
509
510 return 0;
511 }
512
/* Argument bundle for helpers executed via userns_exec_1(). */
struct generic_userns_exec_data {
	struct hierarchy **hierarchies; /* hierarchies to operate on */
	const char *path_prune; /* cgroup path to remove */
	struct lxc_conf *conf; /* container configuration */
	uid_t origuid; /* target uid in parent namespace */
	char *path; /* NOTE(review): not referenced by the wrappers visible here — confirm use */
};
520
521 static int cgroup_tree_remove_wrapper(void *data)
522 {
523 struct generic_userns_exec_data *arg = data;
524 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
525 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
526 int ret;
527
528 if (!lxc_drop_groups() && errno != EPERM)
529 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
530
531 ret = setresgid(nsgid, nsgid, nsgid);
532 if (ret < 0)
533 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
534 (int)nsgid, (int)nsgid, (int)nsgid);
535
536 ret = setresuid(nsuid, nsuid, nsuid);
537 if (ret < 0)
538 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
539 (int)nsuid, (int)nsuid, (int)nsuid);
540
541 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
542 }
543
/*
 * Destroy the container's payload cgroups in all hierarchies. Any attached
 * cgroup2 device bpf program is detached first. When the container uses an
 * id mapping the removal runs from inside the user namespace so it happens
 * with the mapped privileges.
 */
__cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int ret;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}

	if (!ops->container_limit_cgroup) {
		WARN("Uninitialized limit cgroup");
		return;
	}

	/* Detach any cgroup2 device program before pruning the tree. */
	ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
	if (ret < 0)
		WARN("Failed to detach bpf program from cgroup");

	if (!list_empty(&handler->conf->id_map)) {
		/* Remove from within the user namespace. */
		struct generic_userns_exec_data wrap = {
			.conf = handler->conf,
			.path_prune = ops->container_limit_cgroup,
			.hierarchies = ops->hierarchies,
			.origuid = 0,
		};
		ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
				    &wrap, "cgroup_tree_remove_wrapper");
	} else {
		ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
	}
	if (ret < 0)
		SYSWARN("Failed to destroy cgroups");
}
591
592 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
593 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
594 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
595 bool am_initialized)
596 {
597 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
598 *offlinecpus = NULL, *posscpus = NULL;
599 __do_free uint32_t *possmask = NULL;
600 int ret;
601 size_t poss_last_set_bit = 0;
602
603 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
604 if (!posscpus)
605 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
606
607 if (file_exists(__ISOL_CPUS)) {
608 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
609 if (!isolcpus)
610 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
611
612 if (!isdigit(isolcpus[0]))
613 free_disarm(isolcpus);
614 } else {
615 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
616 }
617
618 if (file_exists(__OFFLINE_CPUS)) {
619 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
620 if (!offlinecpus)
621 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
622
623 if (!isdigit(offlinecpus[0]))
624 free_disarm(offlinecpus);
625 } else {
626 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
627 }
628
629 if (!isolcpus && !offlinecpus) {
630 cpulist = move_ptr(posscpus);
631 goto copy_parent;
632 }
633
634 ret = lxc_cpumask(posscpus, &possmask, &poss_last_set_bit);
635 if (ret)
636 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
637
638 if (isolcpus)
639 ret = lxc_cpumask_update(isolcpus, possmask, poss_last_set_bit, true);
640
641 if (offlinecpus)
642 ret |= lxc_cpumask_update(offlinecpus, possmask, poss_last_set_bit, true);
643
644 if (!ret) {
645 cpulist = lxc_cpumask_to_cpulist(possmask, poss_last_set_bit);
646 TRACE("No isolated or offline cpus present in cpuset");
647 } else {
648 cpulist = move_ptr(posscpus);
649 TRACE("Removed isolated or offline cpus from cpuset");
650 }
651 if (!cpulist)
652 return log_error_errno(false, errno, "Failed to create cpu list");
653
654 copy_parent:
655 if (!am_initialized) {
656 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
657 if (ret < 0)
658 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
659
660 TRACE("Copied cpu settings of parent cgroup");
661 }
662
663 return true;
664 }
665
/*
 * Prepare a new child cgroup in the legacy cpuset hierarchy: copy
 * cpuset.cpus (minus isolated/offline cpus) and cpuset.mems from the base
 * cgroup @dfd_base into @dfd_next and turn on clone_children inheritance.
 * Returns true on success.
 */
static bool cpuset1_initialize(int dfd_base, int dfd_next)
{
	char mems[PATH_MAX];
	ssize_t bytes;
	char v;

	/* Determine whether the base cgroup has cpuset inheritance turned on. */
	bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);

	/* Initialize cpuset.cpus removing any isolated and offline cpus. */
	if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
		return syserror_ret(false, "Failed to initialize cpuset.cpus");

	/* Read cpuset.mems from parent... */
	bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
	if (bytes < 0)
		return syserror_ret(false, "Failed to read file %d(cpuset.mems)", dfd_base);

	/* and copy to first cgroup in the tree... */
	bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cpuset.mems)", dfd_next);

	/* and finally turn on cpuset inheritance. */
	bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
	if (bytes < 0)
		return syserror_ret(false, "Failed to write %d(cgroup.clone_children)", dfd_next);

	return log_trace(true, "Initialized cpuset in the legacy hierarchy");
}
698
/*
 * Create the (relative) cgroup path @path component by component below
 * @dfd_base with mode @mode. Intermediate components may already exist;
 * whether the final one may is governed by @eexist_ignore. For the legacy
 * cpuset hierarchy (@cpuset_v1) the first created component is initialized
 * via cpuset1_initialize(). Returns an O_PATH fd to the final cgroup
 * directory on success, negative errno on failure.
 */
static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
				bool cpuset_v1, bool eexist_ignore)
{
	__do_close int dfd_final = -EBADF;
	int dfd_cur = dfd_base;
	int ret = 0;
	size_t len;
	char *cur;
	char buf[PATH_MAX];

	if (is_empty_string(path))
		return ret_errno(EINVAL);

	/* Work on a copy since lxc_iterate_parts() mutates the buffer. */
	len = strlcpy(buf, path, sizeof(buf));
	if (len >= sizeof(buf))
		return ret_errno(E2BIG);

	lxc_iterate_parts(cur, buf, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = mkdirat(dfd_cur, cur, mode);
		if (ret < 0) {
			if (errno != EEXIST)
				return syserror("Failed to create %d(%s)", dfd_cur, cur);

			/* Remember that this component pre-existed. */
			ret = -EEXIST;
		}
		TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);

		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open%s directory %d(%s)",
					!ret ? " newly created" : "", dfd_base, cur);
		if (dfd_cur != dfd_base)
			close(dfd_cur);
		else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
			return syserror_set(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	/* The final cgroup must be successfully created by us. */
	if (ret) {
		if (ret != -EEXIST || !eexist_ignore)
			return syswarn_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
	}

	return move_fd(dfd_final);
}
760
/*
 * Create the container's (payload) or monitor's cgroup in hierarchy @h and
 * record the resulting paths and fds in @h. With @payload and a separate
 * @cgroup_leaf (isolation), a limit cgroup is created at @cgroup_limit_dir
 * and the container's leaf cgroup below it; otherwise a single cgroup
 * serves both purposes. Returns true on success.
 */
static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
			       struct hierarchy *h, const char *cgroup_limit_dir,
			       const char *cgroup_leaf, bool payload)
{
	__do_close int fd_limit = -EBADF, fd_final = -EBADF;
	bool cpuset_v1 = false;

	/*
	 * The legacy cpuset controller needs massaging in case inheriting
	 * settings from its immediate ancestor cgroup hasn't been turned on.
	 */
	cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

	if (payload && cgroup_leaf) {
		/* With isolation both parts need to not already exist. */
		fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_limit < 0)
			return syswarn_ret(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);

		h->path_lim = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
		h->dfd_lim = move_fd(fd_limit);

		TRACE("Created limit cgroup %d->%d(%s)",
		      h->dfd_lim, h->dfd_base, cgroup_limit_dir);

		/*
		 * With isolation the devices legacy cgroup needs to be
		 * iinitialized early, as it typically contains an 'a' (all)
		 * line, which is not possible once a subdirectory has been
		 * created.
		 */
		if (string_in_list(h->controllers, "devices") &&
		    !ops->setup_limits_legacy(ops, conf, true))
			return log_warn(false, "Failed to setup legacy device limits");

		/*
		 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
		 * cgroup the container actually resides in, is below fd_limit.
		 */
		fd_final = __cgroup_tree_create(h->dfd_lim, cgroup_leaf, 0755, cpuset_v1, false);
		if (fd_final < 0) {
			/* Ensure we don't leave any garbage behind. */
			if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
				SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
			else
				TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
		}
		h->dfd_con = move_fd(fd_final);
		h->path_con = must_make_path(h->path_lim, cgroup_leaf, NULL);

	} else {
		/* No isolation: one cgroup for limits and residence. */
		fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
		if (fd_final < 0)
			return syswarn_ret(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);

		if (payload) {
			/* Limit and container cgroup alias each other. */
			h->dfd_con = move_fd(fd_final);
			h->dfd_lim = h->dfd_con;
			h->path_con = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);

			h->path_lim = h->path_con;
		} else {
			h->dfd_mon = move_fd(fd_final);
		}
	}

	return true;
}
830
831 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
832 bool payload)
833 {
834 bool prune = true;
835
836 if (payload) {
837 /* Check whether we actually created the cgroup to prune. */
838 if (h->dfd_lim < 0)
839 prune = false;
840
841 free_equal(h->path_con, h->path_lim);
842 close_equal(h->dfd_con, h->dfd_lim);
843 } else {
844 /* Check whether we actually created the cgroup to prune. */
845 if (h->dfd_mon < 0)
846 prune = false;
847
848 close_prot_errno_disarm(h->dfd_mon);
849 }
850
851 /* We didn't create this cgroup. */
852 if (!prune)
853 return;
854
855 if (cgroup_tree_prune(h->dfd_base, path_prune))
856 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
857 else
858 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
859 }
860
/*
 * Destroy the monitor cgroup in every hierarchy. Since a cgroup cannot be
 * removed while it still has member processes, the monitor process is first
 * moved into a transient "pivot" cgroup before its old cgroup is pruned.
 */
__cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
						struct lxc_handler *handler)
{
	int len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];
	const struct lxc_conf *conf;

	if (!ops) {
		ERROR("Called with uninitialized cgroup operations");
		return;
	}

	/* Nothing to destroy when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return;

	if (!handler) {
		ERROR("Called with uninitialized handler");
		return;
	}

	if (!handler->conf) {
		ERROR("Called with uninitialized conf");
		return;
	}
	conf = handler->conf;

	if (!ops->monitor_cgroup) {
		WARN("Uninitialized monitor cgroup");
		return;
	}

	len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
	if (len < 0)
		return;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_close int fd_pivot = -EBADF;
		__do_free char *pivot_path = NULL;
		struct hierarchy *h = ops->hierarchies[i];
		bool cpuset_v1 = false;
		int ret;

		/* Monitor might have died before we entered the cgroup. */
		if (handler->monitor_pid <= 0) {
			WARN("No valid monitor process found while destroying cgroups");
			goto cgroup_prune_tree;
		}

		/* Pick the configured pivot location, most specific first. */
		if (conf->cgroup_meta.monitor_pivot_dir)
			pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
		else if (conf->cgroup_meta.dir)
			pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
		else
			pivot_path = must_make_path(CGROUP_PIVOT, NULL);

		cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");

		/* The pivot cgroup may already exist; that's fine. */
		fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
		if (fd_pivot < 0) {
			SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
			continue;
		}

		/* Move the monitor out of the cgroup we're about to prune. */
		ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
		if (ret != 0) {
			SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
			continue;
		}

cgroup_prune_tree:
		ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
		if (ret < 0)
			SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
		else
			TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
	}
}
938
939 /*
940 * Check we have no lxc.cgroup.dir, and that lxc.cgroup.dir.limit_prefix is a
941 * proper prefix directory of lxc.cgroup.dir.payload.
942 *
 * Returns true if the configuration is consistent, false otherwise.
944 */
945 static bool check_cgroup_dir_config(struct lxc_conf *conf)
946 {
947 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
948 *container_dir = conf->cgroup_meta.container_dir,
949 *namespace_dir = conf->cgroup_meta.namespace_dir;
950
951 /* none of the new options are set, all is fine */
952 if (!monitor_dir && !container_dir && !namespace_dir)
953 return true;
954
955 /* some are set, make sure lxc.cgroup.dir is not also set*/
956 if (conf->cgroup_meta.dir)
957 return log_error_errno(false, EINVAL,
958 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
959
960 /* make sure both monitor and payload are set */
961 if (!monitor_dir || !container_dir)
962 return log_error_errno(false, EINVAL,
963 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
964
965 /* namespace_dir may be empty */
966 return true;
967 }
968
969 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
970 {
971 __do_free char *monitor_cgroup = NULL;
972 int idx = 0;
973 int i;
974 size_t len;
975 char *suffix = NULL;
976 struct lxc_conf *conf;
977
978 if (!ops)
979 return ret_set_errno(false, ENOENT);
980
981 if (!ops->hierarchies)
982 return true;
983
984 if (ops->monitor_cgroup)
985 return ret_set_errno(false, EEXIST);
986
987 if (!handler || !handler->conf)
988 return ret_set_errno(false, EINVAL);
989
990 conf = handler->conf;
991
992 if (!check_cgroup_dir_config(conf))
993 return false;
994
995 if (conf->cgroup_meta.monitor_dir) {
996 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
997 } else if (conf->cgroup_meta.dir) {
998 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
999 DEFAULT_MONITOR_CGROUP_PREFIX,
1000 handler->name,
1001 CGROUP_CREATE_RETRY, NULL);
1002 } else if (ops->cgroup_pattern) {
1003 __do_free char *cgroup_tree = NULL;
1004
1005 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1006 if (!cgroup_tree)
1007 return ret_set_errno(false, ENOMEM);
1008
1009 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
1010 DEFAULT_MONITOR_CGROUP,
1011 CGROUP_CREATE_RETRY, NULL);
1012 } else {
1013 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1014 handler->name,
1015 CGROUP_CREATE_RETRY, NULL);
1016 }
1017 if (!monitor_cgroup)
1018 return ret_set_errno(false, ENOMEM);
1019
1020 if (!conf->cgroup_meta.monitor_dir) {
1021 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1022 *suffix = '\0';
1023 }
1024 do {
1025 if (idx && suffix)
1026 sprintf(suffix, "-%d", idx);
1027
1028 for (i = 0; ops->hierarchies[i]; i++) {
1029 if (cgroup_tree_create(ops, handler->conf,
1030 ops->hierarchies[i],
1031 monitor_cgroup, NULL, false))
1032 continue;
1033
1034 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1035 for (int j = 0; j <= i; j++)
1036 cgroup_tree_prune_leaf(ops->hierarchies[j],
1037 monitor_cgroup, false);
1038
1039 idx++;
1040 break;
1041 }
1042 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1043
1044 if (idx == 1000 || (!suffix && idx != 0))
1045 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1046
1047 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1048 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1049 }
1050
/*
 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
 * next cgroup_pattern-1, -2, ..., -999.
 *
 * On success ops->container_cgroup holds the cgroup the container resides
 * in and ops->container_limit_cgroup the limiting cgroup; the two alias
 * each other unless lxc.cgroup.dir.container together with
 * lxc.cgroup.dir.container.inner requested isolation.
 */
__cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
{
	__do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
	char *limit_cgroup;
	int idx = 0;
	int i;
	size_t len;
	char *suffix = NULL;
	struct lxc_conf *conf;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Nothing to create when no hierarchies were discovered. */
	if (!ops->hierarchies)
		return true;

	/* The payload cgroups must not have been created already. */
	if (ops->container_cgroup || ops->container_limit_cgroup)
		return ret_set_errno(false, EEXIST);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);

	conf = handler->conf;

	if (!check_cgroup_dir_config(conf))
		return false;

	if (conf->cgroup_meta.container_dir) {
		__limit_cgroup = strdup(conf->cgroup_meta.container_dir);
		if (!__limit_cgroup)
			return ret_set_errno(false, ENOMEM);

		if (conf->cgroup_meta.namespace_dir) {
			/* Isolation: the container lives below the limit cgroup. */
			container_cgroup = must_make_path(__limit_cgroup,
							  conf->cgroup_meta.namespace_dir,
							  NULL);
			limit_cgroup = __limit_cgroup;
		} else {
			/* explicit paths but without isolation */
			limit_cgroup = move_ptr(__limit_cgroup);
			container_cgroup = limit_cgroup;
		}
	} else if (conf->cgroup_meta.dir) {
		limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
					   DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else if (ops->cgroup_pattern) {
		__do_free char *cgroup_tree = NULL;

		cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
		if (!cgroup_tree)
			return ret_set_errno(false, ENOMEM);

		limit_cgroup = must_concat(&len, cgroup_tree, "/",
					   DEFAULT_PAYLOAD_CGROUP,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	} else {
		limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
					   handler->name,
					   CGROUP_CREATE_RETRY, NULL);
		container_cgroup = limit_cgroup;
	}
	if (!limit_cgroup)
		return ret_set_errno(false, ENOMEM);

	/*
	 * Cut off the retry suffix placeholder so it can be rewritten as
	 * "-<idx>" below; explicitly configured dirs are never retried.
	 */
	if (!conf->cgroup_meta.container_dir) {
		suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
		*suffix = '\0';
	}
	do {
		if (idx && suffix)
			sprintf(suffix, "-%d", idx);

		for (i = 0; ops->hierarchies[i]; i++) {
			if (cgroup_tree_create(ops, handler->conf,
					       ops->hierarchies[i], limit_cgroup,
					       conf->cgroup_meta.namespace_dir,
					       true))
				continue;

			DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
			/* Undo the cgroups already created in prior hierarchies. */
			for (int j = 0; j <= i; j++)
				cgroup_tree_prune_leaf(ops->hierarchies[j],
						       limit_cgroup, true);

			idx++;
			break;
		}
	} while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);

	if (idx == 1000 || (!suffix && idx != 0))
		return log_error_errno(false, ERANGE, "Failed to create container cgroup");

	ops->container_cgroup = move_ptr(container_cgroup);
	if (__limit_cgroup)
		ops->container_limit_cgroup = move_ptr(__limit_cgroup);
	else
		ops->container_limit_cgroup = ops->container_cgroup;
	INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
	     ops->container_cgroup, ops->container_limit_cgroup);
	return true;
}
1160
1161 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1162 struct lxc_handler *handler)
1163 {
1164 int monitor_len, transient_len = 0;
1165 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1166 transient[INTTYPE_TO_STRLEN(pid_t)];
1167
1168 if (!ops)
1169 return ret_set_errno(false, ENOENT);
1170
1171 if (!ops->hierarchies)
1172 return true;
1173
1174 if (!ops->monitor_cgroup)
1175 return ret_set_errno(false, ENOENT);
1176
1177 if (!handler || !handler->conf)
1178 return ret_set_errno(false, EINVAL);
1179
1180 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1181 if (monitor_len < 0)
1182 return false;
1183
1184 if (handler->transient_pid > 0) {
1185 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1186 if (transient_len < 0)
1187 return false;
1188 }
1189
1190 for (int i = 0; ops->hierarchies[i]; i++) {
1191 struct hierarchy *h = ops->hierarchies[i];
1192 int ret;
1193
1194 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1195 if (ret)
1196 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1197
1198 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
1199
1200 if (handler->transient_pid <= 0)
1201 continue;
1202
1203 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1204 if (ret)
1205 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1206
1207 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
1208
1209 /*
1210 * we don't keep the fds for non-unified hierarchies around
1211 * mainly because we don't make use of them anymore after the
1212 * core cgroup setup is done but also because there are quite a
1213 * lot of them.
1214 */
1215 if (!is_unified_hierarchy(h))
1216 close_prot_errno_disarm(h->dfd_mon);
1217 }
1218 handler->transient_pid = -1;
1219
1220 return true;
1221 }
1222
1223 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1224 struct lxc_handler *handler)
1225 {
1226 int len;
1227 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1228
1229 if (!ops)
1230 return ret_set_errno(false, ENOENT);
1231
1232 if (!ops->hierarchies)
1233 return true;
1234
1235 if (!ops->container_cgroup)
1236 return ret_set_errno(false, ENOENT);
1237
1238 if (!handler || !handler->conf)
1239 return ret_set_errno(false, EINVAL);
1240
1241 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1242 if (len < 0)
1243 return false;
1244
1245 for (int i = 0; ops->hierarchies[i]; i++) {
1246 struct hierarchy *h = ops->hierarchies[i];
1247 int ret;
1248
1249 if (is_unified_hierarchy(h) &&
1250 (handler->clone_flags & CLONE_INTO_CGROUP))
1251 continue;
1252
1253 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1254 if (ret != 0)
1255 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1256
1257 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1258 }
1259
1260 return true;
1261 }
1262
1263 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1264 gid_t chown_gid, mode_t chmod_mode)
1265 {
1266 int ret;
1267
1268 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1269 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1270 if (ret < 0)
1271 return log_warn_errno(-1,
1272 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1273 dirfd, path, (int)chown_uid,
1274 (int)chown_gid);
1275
1276 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1277 if (ret < 0)
1278 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1279 dirfd, path, (int)chmod_mode);
1280
1281 return 0;
1282 }
1283
/* chgrp the container cgroups to container group. We leave
 * the container owner as cgroup owner. So we must make the
 * directories 775 so that the container can create sub-cgroups.
 *
 * Also chown the tasks and cgroup.procs files. Those may not
 * exist depending on kernel version.
 *
 * Runs as a userns_exec_1() callback inside the container's user
 * namespace. Returns 0 on success, negative on failure.
 */
static int chown_cgroup_wrapper(void *data)
{
	int ret;
	uid_t destuid;
	struct generic_userns_exec_data *arg = data;
	/* With a root idmap the container's root maps to in-namespace id 0;
	 * otherwise fall back to the configured init uid/gid.
	 */
	uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
	gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;

	if (!lxc_drop_groups() && errno != EPERM)
		return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");

	/* Order matters: switch gid first while we still hold the
	 * privilege to do so, then drop uid.
	 */
	ret = setresgid(nsgid, nsgid, nsgid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
				       (int)nsgid, (int)nsgid, (int)nsgid);

	ret = setresuid(nsuid, nsuid, nsuid);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
				       (int)nsuid, (int)nsuid, (int)nsuid);

	/* Map the original host euid into this user namespace; fall back to
	 * in-namespace root if it is unmapped.
	 */
	destuid = get_ns_uid(arg->origuid);
	if (destuid == LXC_INVALID_UID)
		destuid = 0;

	for (int i = 0; arg->hierarchies[i]; i++) {
		int dirfd = arg->hierarchies[i]->dfd_con;

		if (dirfd < 0)
			return syserror_set(-EBADF, "Invalid cgroup file descriptor");

		/* Empty path: chown/chmod the cgroup directory itself. */
		(void)fchowmodat(dirfd, "", destuid, nsgid, 0775);

		/*
		 * Failures to chown() these are inconvenient but not
		 * detrimental. We leave these owned by the container launcher,
		 * so that container root can write to the files to attach. We
		 * chmod() them 664 so that container systemd can write to the
		 * files (which systemd in wily insists on doing).
		 */

		if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
			(void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);

		(void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);

		if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
			continue;

		/* cgroup2 only: also hand over the delegatable files. */
		for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
			(void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
	}

	return 0;
}
1346
1347 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1348 struct lxc_conf *conf)
1349 {
1350 struct generic_userns_exec_data wrap;
1351
1352 if (!ops)
1353 return ret_set_errno(false, ENOENT);
1354
1355 if (!ops->hierarchies)
1356 return true;
1357
1358 if (!ops->container_cgroup)
1359 return ret_set_errno(false, ENOENT);
1360
1361 if (!conf)
1362 return ret_set_errno(false, EINVAL);
1363
1364 if (list_empty(&conf->id_map))
1365 return true;
1366
1367 wrap.origuid = geteuid();
1368 wrap.path = NULL;
1369 wrap.hierarchies = ops->hierarchies;
1370 wrap.conf = conf;
1371
1372 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1373 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1374
1375 return true;
1376 }
1377
1378 __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
1379 {
1380 if (!ops)
1381 return;
1382
1383 if (!ops->hierarchies)
1384 return;
1385
1386 for (int i = 0; ops->hierarchies[i]; i++) {
1387 struct hierarchy *h = ops->hierarchies[i];
1388
1389 /* Close all monitor cgroup file descriptors. */
1390 close_prot_errno_disarm(h->dfd_mon);
1391 }
1392 /* Close the cgroup root file descriptor. */
1393 close_prot_errno_disarm(ops->dfd_mnt);
1394
1395 /*
1396 * The checking for freezer support should obviously be done at cgroup
1397 * initialization time but that doesn't work reliable. The freezer
1398 * controller has been demoted (rightly so) to a simple file located in
1399 * each non-root cgroup. At the time when the container is created we
1400 * might still be located in /sys/fs/cgroup and so checking for
1401 * cgroup.freeze won't tell us anything because this file doesn't exist
1402 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1403 * find an already existing cgroup and then check within that cgroup
1404 * for the existence of cgroup.freeze but that will only work on
1405 * systemd based hosts. Other init systems might not manage cgroups and
1406 * so no cgroup will exist. So we defer until we have created cgroups
1407 * for our container which means we check here.
1408 */
1409 if (pure_unified_layout(ops) &&
1410 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1411 AT_SYMLINK_NOFOLLOW)) {
1412 TRACE("Unified hierarchy supports freezer");
1413 ops->unified->utilities |= FREEZER_CONTROLLER;
1414 }
1415 }
1416
1417 /* cgroup-full:* is done, no need to create subdirs */
1418 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1419 {
1420 switch (cgroup_automount_type) {
1421 case LXC_AUTO_CGROUP_RO:
1422 return true;
1423 case LXC_AUTO_CGROUP_RW:
1424 return true;
1425 case LXC_AUTO_CGROUP_MIXED:
1426 return true;
1427 }
1428
1429 return false;
1430 }
1431
/* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
 * remount controller ro if needed and bindmount the cgroupfs onto
 * control/the/cg/path.
 *
 * @cgroup_automount_type selects the read-only treatment:
 *  - RO: hierarchy mountpoint and the container's cgroup are both ro.
 *  - MIXED: hierarchy is ro but the container's own cgroup stays rw.
 *  - RW: everything stays writable.
 * Returns 0 on success, -1 on error.
 */
static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
				       char *hierarchy_mnt, char *cgpath,
				       const char *container_cgroup)
{
	__do_free char *sourcepath = NULL;
	int ret, remount_flags;
	int flags = MS_BIND;

	/* Read-only mountpoints need the bind + remount-ro dance: a bind
	 * mount cannot be created read-only in one step.
	 */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
					       hierarchy_mnt, hierarchy_mnt);

		/* Preserve mount flags (nosuid etc.) the kernel would
		 * otherwise refuse to drop on remount.
		 */
		remount_flags = add_required_remount_flags(hierarchy_mnt,
							   hierarchy_mnt,
							   flags | MS_REMOUNT);
		ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
			    remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
			    NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);

		INFO("Remounted %s read-only", hierarchy_mnt);
	}

	/* Bind mount the container's own cgroup over the full path; keep it
	 * writable for the MIXED type so the container can manage itself.
	 */
	sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
	if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
		flags |= MS_RDONLY;

	ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
				       h->controllers[0], cgpath);
	INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);

	if (flags & MS_RDONLY) {
		remount_flags = add_required_remount_flags(sourcepath, cgpath,
							   flags | MS_REMOUNT);
		ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
		INFO("Remounted %s read-only", cgpath);
	}

	INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
	return 0;
}
1485
/* __cgroupfs_mount
 *
 * Mount cgroup hierarchies directly without using bind-mounts. The main
 * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
 * cgroups for the LXC_AUTO_CGROUP_FULL option.
 *
 * Uses the new mount API (fsopen/fsconfig/move_mount) when available and
 * falls back to classic mount(2) otherwise. @hierarchy_mnt is the mount
 * target relative to @dfd_mnt_cgroupfs. Returns 0 on success, negative
 * errno-style value on failure.
 */
static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
			    struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
			    const char *hierarchy_mnt)
{
	__do_close int fd_fs = -EBADF;
	unsigned int flags = 0;
	char *fstype;
	int ret;

	if (dfd_mnt_cgroupfs < 0)
		return ret_errno(EINVAL);

	flags |= MOUNT_ATTR_NOSUID;
	flags |= MOUNT_ATTR_NOEXEC;
	flags |= MOUNT_ATTR_NODEV;
	flags |= MOUNT_ATTR_RELATIME;

	/* The *_RO automount variants mount the hierarchy read-only. */
	if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
	    (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
		flags |= MOUNT_ATTR_RDONLY;

	if (is_unified_hierarchy(h))
		fstype = "cgroup2";
	else
		fstype = "cgroup";

	if (can_use_mount_api()) {
		fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);

		/* Legacy hierarchies need each controller (or the named
		 * hierarchy) configured on the filesystem context.
		 */
		if (!is_unified_hierarchy(h)) {
			for (const char **it = (const char **)h->controllers; it && *it; it++) {
				if (strnequal(*it, "name=", STRLITERALLEN("name=")))
					ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
				else
					ret = fs_set_property(fd_fs, *it, "");
				if (ret < 0)
					return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
			}
		}

		ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
				flags);
	} else {
		__do_free char *controllers = NULL, *target = NULL;
		unsigned int old_flags = 0;
		const char *rootfs_mnt;

		/* Classic mount(2): controllers are passed as a comma list
		 * in the data argument.
		 */
		if (!is_unified_hierarchy(h)) {
			controllers = lxc_string_join(",", (const char **)h->controllers, false);
			if (!controllers)
				return ret_errno(ENOMEM);
		}

		rootfs_mnt = get_rootfs_mnt(rootfs);
		/* Translate MOUNT_ATTR_* into legacy MS_* flags. */
		ret = mnt_attributes_old(flags, &old_flags);
		if (ret)
			return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");

		target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
		ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
				       fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));

	DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
	      fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
	return 0;
}
1564
/* Unconditional convenience wrapper around __cgroupfs_mount(). */
static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
				 struct lxc_rootfs *rootfs,
				 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
{
	return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
				dfd_mnt_cgroupfs, hierarchy_mnt);
}
1572
1573 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1574 struct lxc_rootfs *rootfs,
1575 int dfd_mnt_cgroupfs,
1576 const char *hierarchy_mnt)
1577 {
1578 switch (cgroup_automount_type) {
1579 case LXC_AUTO_CGROUP_FULL_RO:
1580 break;
1581 case LXC_AUTO_CGROUP_FULL_RW:
1582 break;
1583 case LXC_AUTO_CGROUP_FULL_MIXED:
1584 break;
1585 default:
1586 return 0;
1587 }
1588
1589 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1590 dfd_mnt_cgroupfs, hierarchy_mnt);
1591 }
1592
/* Set up cgroup mounts inside the container's rootfs according to the
 * lxc.mount.auto cgroup flags in @cg_flags.
 *
 * Pure cgroup2 hosts get the unified hierarchy force-mounted (when a
 * cgroup namespace is in use); legacy/hybrid layouts get a tmpfs at
 * /sys/fs/cgroup with one mount per hierarchy. Returns true on success or
 * when no mounting is required.
 */
__cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
				      struct lxc_handler *handler, int cg_flags)
{
	__do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
	__do_free char *cgroup_root = NULL;
	int cgroup_automount_type;
	bool in_cgroup_ns = false, wants_force_mount = false;
	struct lxc_conf *conf = handler->conf;
	struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *rootfs_mnt = get_rootfs_mnt(rootfs);
	int ret;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!conf)
		return ret_set_errno(false, EINVAL);

	if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
		return log_trace(true, "No cgroup mounts requested");

	/* Consume the force flag; what remains selects the automount type. */
	if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
		cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
		wants_force_mount = true;
	}

	switch (cg_flags) {
	case LXC_AUTO_CGROUP_RO:
		TRACE("Read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_RW:
		TRACE("Read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_MIXED:
		TRACE("Mixed cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RO:
		TRACE("Full read-only cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_RW:
		TRACE("Full read-write cgroup mounts requested");
		break;
	case LXC_AUTO_CGROUP_FULL_MIXED:
		TRACE("Full mixed cgroup mounts requested");
		break;
	default:
		return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
	}
	cgroup_automount_type = cg_flags;

	if (!wants_force_mount) {
		/* A container without CAP_SYS_ADMIN cannot mount cgroups
		 * itself, so we must do it for it.
		 */
		wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);

		/*
		 * Most recent distro versions currently have init system that
		 * do support cgroup2 but do not mount it by default unless
		 * explicitly told so even if the host is cgroup2 only. That
		 * means they often will fail to boot. Fix this by pre-mounting
		 * cgroup2 by default. We will likely need to be doing this a
		 * few years until all distros have switched over to cgroup2 at
		 * which point we can safely assume that their init systems
		 * will mount it themselves.
		 */
		if (pure_unified_layout(ops))
			wants_force_mount = true;
	}

	if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
		in_cgroup_ns = true;

	if (in_cgroup_ns && !wants_force_mount)
		return log_trace(true, "Mounting cgroups not requested or needed");

	/* This is really the codepath that we want. */
	if (pure_unified_layout(ops)) {
		__do_close int dfd_mnt_unified = -EBADF;

		dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
					  PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
		if (dfd_mnt_unified < 0)
			return syserror_ret(false, "Failed to open %d(%s)",
					    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
		/*
		 * If cgroup namespaces are supported but the container will
		 * not have CAP_SYS_ADMIN after it has started we need to mount
		 * the cgroups manually.
		 *
		 * Note that here we know that wants_force_mount is true.
		 * Otherwise we would've returned early above.
		 */
		if (in_cgroup_ns) {
			/*
			 * 1. cgroup:rw:force    -> Mount the cgroup2 filesystem.
			 * 2. cgroup:ro:force    -> Mount the cgroup2 filesystem read-only.
			 * 3. cgroup:mixed:force -> See comment above how this
			 *                          does not apply so
			 *                          cgroup:mixed is equal to
			 *                          cgroup:rw when cgroup
			 *                          namespaces are supported.
			 *
			 * 4. cgroup:rw    -> No-op; init system responsible for mounting.
			 * 5. cgroup:ro    -> No-op; init system responsible for mounting.
			 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
			 *
			 * 7. cgroup-full:rw    -> Not supported.
			 * 8. cgroup-full:ro    -> Not supported.
			 * 9. cgroup-full:mixed -> Not supported.
			 *
			 * 10. cgroup-full:rw:force    -> Not supported.
			 * 11. cgroup-full:ro:force    -> Not supported.
			 * 12. cgroup-full:mixed:force -> Not supported.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
			if (ret < 0)
				return syserror_ret(false, "Failed to force mount cgroup filesystem in cgroup namespace");

			return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
		} else {
			/*
			 * Either no cgroup namespace supported (highly
			 * unlikely unless we're dealing with a Frankenkernel.
			 * Or the user requested to keep the cgroup namespace
			 * of the host or another container.
			 */
			if (wants_force_mount) {
				/*
				 * 1. cgroup:rw:force    -> Bind-mount the cgroup2 filesystem writable.
				 * 2. cgroup:ro:force    -> Bind-mount the cgroup2 filesystem read-only.
				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem and
				 *                          and make the parent directory of the
				 *                          container's cgroup read-only but the
				 *                          container's cgroup writable.
				 *
				 * 10. cgroup-full:rw:force ->
				 * 11. cgroup-full:ro:force ->
				 * 12. cgroup-full:mixed:force ->
				 */
				errno = EOPNOTSUPP;
				SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			} else {
				errno = EOPNOTSUPP;
				SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
			}
		}

		return syserror_ret(false, "Failed to mount cgroups");
	}

	/*
	 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
	 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
	 * DEFAULT_CGROUP_MOUNTPOINT define.
	 */
	if (can_use_mount_api()) {
		fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
		if (fd_fs < 0)
			return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");

		ret = fs_set_property(fd_fs, "mode", "0755");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_set_property(fd_fs, "size", "10240k");
		if (ret < 0)
			return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);

		ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
				MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
				MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
	} else {
		cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
		ret = safe_mount(NULL, cgroup_root, "tmpfs",
				 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
				 "size=10240k,mode=755", rootfs_mnt);
	}
	if (ret < 0)
		return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
				       DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
				PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
	if (dfd_mnt_tmpfs < 0)
		return syserror_ret(false, "Failed to open %d(%s)",
				    rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);

	/* One mountpoint per hierarchy beneath the fresh tmpfs. */
	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *hierarchy_mnt = NULL, *path2 = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
		if (ret < 0)
			return syserror_ret(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);

		if (in_cgroup_ns && wants_force_mount) {
			/*
			 * If cgroup namespaces are supported but the container
			 * will not have CAP_SYS_ADMIN after it has started we
			 * need to mount the cgroups manually.
			 */
			ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
					     dfd_mnt_tmpfs, h->at_mnt);
			if (ret < 0)
				return false;

			continue;
		}

		/* Here is where the ancient kernel section begins. */
		ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
					  dfd_mnt_tmpfs, h->at_mnt);
		if (ret < 0)
			return false;

		if (!cg_mount_needs_subdirs(cgroup_automount_type))
			continue;

		if (!cgroup_root)
			cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);

		/* Pre-create the container's cgroup path beneath the mount
		 * so it can be bind-mounted in the second stage.
		 */
		hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
		path2 = must_make_path(hierarchy_mnt, h->at_base,
				       ops->container_cgroup, NULL);
		ret = mkdir_p(path2, 0755);
		if (ret < 0 && (errno != EEXIST))
			return false;

		ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
						  hierarchy_mnt, path2,
						  ops->container_cgroup);
		if (ret < 0)
			return false;
	}

	return true;
}
1832
1833 /* Only root needs to escape to the cgroup of its init. */
1834 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1835 struct lxc_conf *conf)
1836 {
1837 if (!ops)
1838 return ret_set_errno(false, ENOENT);
1839
1840 if (!ops->hierarchies)
1841 return true;
1842
1843 if (!conf)
1844 return ret_set_errno(false, EINVAL);
1845
1846 if (conf->cgroup_meta.relative || geteuid())
1847 return true;
1848
1849 for (int i = 0; ops->hierarchies[i]; i++) {
1850 __do_free char *fullpath = NULL;
1851 int ret;
1852
1853 fullpath = make_cgroup_path(ops->hierarchies[i],
1854 ops->hierarchies[i]->at_base,
1855 "cgroup.procs", NULL);
1856 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1857 if (ret != 0)
1858 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1859 }
1860
1861 return true;
1862 }
1863
1864 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1865 {
1866 int i = 0;
1867
1868 if (!ops)
1869 return ret_set_errno(-1, ENOENT);
1870
1871 if (!ops->hierarchies)
1872 return 0;
1873
1874 for (; ops->hierarchies[i]; i++)
1875 ;
1876
1877 return i;
1878 }
1879
1880 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1881 int n, char ***out)
1882 {
1883 int i;
1884
1885 if (!ops)
1886 return ret_set_errno(false, ENOENT);
1887
1888 if (!ops->hierarchies)
1889 return ret_set_errno(false, ENOENT);
1890
1891 /* consistency check n */
1892 for (i = 0; i < n; i++)
1893 if (!ops->hierarchies[i])
1894 return ret_set_errno(false, ENOENT);
1895
1896 *out = ops->hierarchies[i]->controllers;
1897
1898 return true;
1899 }
1900
1901 static int cg_legacy_freeze(struct cgroup_ops *ops)
1902 {
1903 struct hierarchy *h;
1904
1905 h = get_hierarchy(ops, "freezer");
1906 if (!h)
1907 return ret_set_errno(-1, ENOENT);
1908
1909 return lxc_write_openat(h->path_con, "freezer.state",
1910 "FROZEN", STRLITERALLEN("FROZEN"));
1911 }
1912
1913 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1914 struct lxc_async_descr *descr)
1915 {
1916 __do_free char *line = NULL;
1917 __do_fclose FILE *f = NULL;
1918 int state = PTR_TO_INT(cbdata);
1919 size_t len;
1920 const char *state_string;
1921
1922 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1923 if (!f)
1924 return LXC_MAINLOOP_ERROR;
1925
1926 if (state == 1)
1927 state_string = "frozen 1";
1928 else
1929 state_string = "frozen 0";
1930
1931 while (getline(&line, &len, f) != -1)
1932 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1933 return LXC_MAINLOOP_CLOSE;
1934
1935 rewind(f);
1936
1937 return LXC_MAINLOOP_CONTINUE;
1938 }
1939
/* Write @state_string into cgroup.freeze and, when @timeout is non-zero,
 * wait for cgroup.events to report "frozen @state_num" via an
 * EPOLLPRI-driven mainloop. @epoll_error and @wait_error customize the
 * log messages for the freeze vs. unfreeze callers.
 * Returns 0 on success, -1 on error.
 */
static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
				const char *state_string,
				int state_num,
				const char *epoll_error,
				const char *wait_error)
{
	__do_close int fd = -EBADF;
	call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
	int ret;
	struct lxc_async_descr descr;
	struct hierarchy *h;

	h = ops->unified;
	if (!h)
		return ret_set_errno(-1, ENOENT);

	if (!h->path_con)
		return ret_set_errno(-1, EEXIST);

	/* Arm the waiter before flipping the switch so the state change
	 * cannot be missed.
	 */
	if (timeout != 0) {
		__do_free char *events_file = NULL;

		events_file = must_make_path(h->path_con, "cgroup.events", NULL);
		fd = open(events_file, O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return log_error_errno(-1, errno, "Failed to open cgroup.events file");

		ret = lxc_mainloop_open(&descr);
		if (ret)
			return log_error_errno(-1, errno, "%s", epoll_error);

		/* automatically cleaned up now */
		descr_ptr = &descr;

		ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI,
						      freezer_cgroup_events_cb,
						      default_cleanup_handler,
						      INT_TO_PTR(state_num),
						      "freezer_cgroup_events_cb");
		if (ret < 0)
			return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
	}

	ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
	if (ret < 0)
		return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");

	/* Block until the callback observes the target state or @timeout
	 * expires.
	 */
	if (timeout != 0 && lxc_mainloop(&descr, timeout))
		return log_error_errno(-1, errno, "%s", wait_error);

	return 0;
}
1992
/* Freeze a unified-layout container through cgroup.freeze. */
static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "1", 1,
				    "Failed to create epoll instance to wait for container freeze",
				    "Failed to wait for container to be frozen");
}
1999
2000 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
2001 {
2002 if (!ops->hierarchies)
2003 return ret_set_errno(-1, ENOENT);
2004
2005 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2006 return cg_legacy_freeze(ops);
2007
2008 return cg_unified_freeze(ops, timeout);
2009 }
2010
2011 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2012 {
2013 struct hierarchy *h;
2014
2015 h = get_hierarchy(ops, "freezer");
2016 if (!h)
2017 return ret_set_errno(-1, ENOENT);
2018
2019 return lxc_write_openat(h->path_con, "freezer.state",
2020 "THAWED", STRLITERALLEN("THAWED"));
2021 }
2022
/* Thaw a unified-layout container through cgroup.freeze. */
static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
{
	return cg_unified_freeze_do(ops, timeout, "0", 0,
				    "Failed to create epoll instance to wait for container unfreeze",
				    "Failed to wait for container to be unfrozen");
}
2029
2030 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2031 {
2032 if (!ops->hierarchies)
2033 return ret_set_errno(-1, ENOENT);
2034
2035 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2036 return cg_legacy_unfreeze(ops);
2037
2038 return cg_unified_unfreeze(ops, timeout);
2039 }
2040
/* Return the cgroup path for @controller relative to the hierarchy's
 * mountpoint: the limiting path when @limiting is true, otherwise the
 * inner (container) path. Returns NULL when the hierarchy exists but no
 * path is set; logs and returns NULL when the controller is unknown.
 */
static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
					const char *controller, bool limiting)
{
	struct hierarchy *h;
	size_t len;
	const char *path;

	h = get_hierarchy(ops, controller);
	if (!h)
		return log_warn_errno(NULL, ENOENT,
				      "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));

	if (limiting)
		path = h->path_lim;
	else
		path = h->path_con;
	if (!path)
		return NULL;

	/* Strip the mount prefix from the stored absolute path. When at_mnt
	 * is not rooted at DEFAULT_CGROUP_MOUNTPOINT, the default mountpoint
	 * component is skipped first, then at_mnt's own length.
	 * NOTE(review): this assumes path_{lim,con} was built as
	 * "<mountpoint>/<at_mnt>/..." - confirm against make_cgroup_path()
	 * before changing.
	 */
	len = strlen(h->at_mnt);
	if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
		       STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
		path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
		path += strspn(path, "/");
	}
	return path += len;
}
2068
2069 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2070 const char *controller)
2071 {
2072 return cgfsng_get_cgroup_do(ops, controller, false);
2073 }
2074
2075 __cgfsng_ops static const char *cgfsng_get_limit_cgroup(struct cgroup_ops *ops,
2076 const char *controller)
2077 {
2078 return cgfsng_get_cgroup_do(ops, controller, true);
2079 }
2080
2081 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2082 * which must be freed by the caller.
2083 */
2084 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2085 const char *inpath,
2086 const char *filename)
2087 {
2088 return make_cgroup_path(h, inpath, filename, NULL);
2089 }
2090
/*
 * Attach @pid to the unified cgroup referred to by @unified_fd.
 *
 * First try to create and join a ".lxc" leaf cgroup (falling back to the
 * cgroup itself). If the write to cgroup.procs fails with EBUSY the target
 * is a non-leaf cgroup with controllers enabled, so retry with numbered
 * ".lxc-<n>" leaf cgroups instead.
 *
 * Returns 0 on success, a negative value on error.
 */
static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
{
	int idx = 1;
	int ret;
	char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
	ssize_t pidstr_len;

	/* Create leaf cgroup. */
	ret = mkdirat(unified_fd, ".lxc", 0755);
	if (ret < 0 && errno != EEXIST)
		return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Prefer the ".lxc" leaf; fall back to the cgroup itself. */
	ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
	if (ret < 0)
		ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
	if (ret == 0)
		return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);

	/* this is a non-leaf node */
	if (errno != EBUSY)
		return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");

	do {
		bool rm = false;
		char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
		char *slash = attach_cgroup;

		ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
		if (ret < 0)
			return ret;

		/*
		 * This shouldn't really happen but the compiler might complain
		 * that a short write would cause a buffer overrun. So be on
		 * the safe side.
		 */
		if ((size_t)ret < STRLITERALLEN(".lxc-/cgroup.procs"))
			return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");

		/* Temporarily terminate the string after ".lxc-<idx>". */
		slash += (ret - STRLITERALLEN("/cgroup.procs"));
		*slash = '\0';

		ret = mkdirat(unified_fd, attach_cgroup, 0755);
		if (ret < 0 && errno != EEXIST)
			return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
		if (ret == 0)
			rm = true;

		/* Restore the full ".lxc-<idx>/cgroup.procs" path. */
		*slash = '/';

		ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
		if (ret == 0)
			return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);

		/* Clean up a cgroup we created but failed to join. */
		if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
			SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);

		/* this is a non-leaf node */
		if (errno != EBUSY)
			return log_error_errno(-1, errno, "Failed to attach to unified cgroup");

		idx++;
	} while (idx < 1000);

	return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
}
2161
2162 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2163 int unified_fd, int *sk_fd)
2164 {
2165 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2166 int target_fds[2];
2167 ssize_t ret;
2168
2169 /* Create leaf cgroup. */
2170 ret = mkdirat(unified_fd, ".lxc", 0755);
2171 if (ret < 0 && errno != EEXIST)
2172 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2173
2174 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2175 if (target_fd0 < 0)
2176 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2177 target_fds[0] = target_fd0;
2178
2179 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2180 if (target_fd1 < 0)
2181 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2182 target_fds[1] = target_fd1;
2183
2184 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2185 if (ret <= 0)
2186 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2187 target_fd0, target_fd1);
2188
2189 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2190 }
2191
2192 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2193 int *sk_fd, pid_t pid)
2194 {
2195 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2196 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2197 size_t pidstr_len;
2198 ssize_t ret;
2199
2200 ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
2201 if (ret < 0)
2202 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2203
2204 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2205
2206 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2207 if (ret > 0 && (size_t)ret == pidstr_len)
2208 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2209
2210 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2211 if (ret > 0 && (size_t)ret == pidstr_len)
2212 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2213
2214 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2215 target_fd0, target_fd1);
2216 }
2217
/* Arguments shared by the unified cgroup attach wrappers below. */
struct userns_exec_unified_attach_data {
	const struct lxc_conf *conf;	/* container configuration */
	int unified_fd;			/* fd of the container's unified cgroup */
	int sk_pair[2];			/* socketpair: [0] parent end, [1] child end */
	pid_t pid;			/* process to attach */
};
2224
2225 static int cgroup_unified_attach_child_wrapper(void *data)
2226 {
2227 struct userns_exec_unified_attach_data *args = data;
2228
2229 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2230 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2231 return ret_errno(EINVAL);
2232
2233 close_prot_errno_disarm(args->sk_pair[0]);
2234 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2235 &args->sk_pair[1]);
2236 }
2237
2238 static int cgroup_unified_attach_parent_wrapper(void *data)
2239 {
2240 struct userns_exec_unified_attach_data *args = data;
2241
2242 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2243 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2244 return ret_errno(EINVAL);
2245
2246 close_prot_errno_disarm(args->sk_pair[1]);
2247 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2248 args->pid);
2249 }
2250
/* Technically, we're always at a delegation boundary here (This is especially
 * true when cgroup namespaces are available.). The reasoning is that in order
 * for us to have been able to start a container in the first place the root
 * cgroup must have been a leaf node. Now, either the container's init system
 * has populated the cgroup and kept it as a leaf node or it has created
 * subtrees. In the former case we will simply attach to the leaf node we
 * created when we started the container in the latter case we create our own
 * cgroup for the attaching process.
 *
 * Returns 0 on success (including "container not running"), a negative
 * value on error.
 */
static int __cg_unified_attach(const struct hierarchy *h,
			       const struct lxc_conf *conf, const char *name,
			       const char *lxcpath, pid_t pid,
			       const char *controller)
{
	__do_close int unified_fd = -EBADF;
	__do_free char *path = NULL, *cgroup = NULL;
	int ret;

	if (!conf || !name || !lxcpath || pid <= 0)
		return ret_errno(EINVAL);

	/* First try letting the running container's command server attach us. */
	ret = cgroup_attach(conf, name, lxcpath, pid);
	if (ret == 0)
		return log_trace(0, "Attached to unified cgroup via command handler");
	if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
		return log_error_errno(ret, errno, "Failed to attach to unified cgroup");

	/* Fall back to retrieving the path for the unified cgroup. */
	cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
	/* not running */
	if (!cgroup)
		return 0;

	path = make_cgroup_path(h, cgroup, NULL);

	unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
	if (unified_fd < 0)
		return ret_errno(EBADF);

	/* With an id mapping the attach must run in the container's userns. */
	if (!list_empty(&conf->id_map)) {
		struct userns_exec_unified_attach_data args = {
			.conf = conf,
			.unified_fd = unified_fd,
			.pid = pid,
		};

		ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
		if (ret < 0)
			return -errno;

		ret = userns_exec_minimal(conf,
					  cgroup_unified_attach_parent_wrapper,
					  &args,
					  cgroup_unified_attach_child_wrapper,
					  &args);
	} else {
		ret = cgroup_attach_leaf(conf, unified_fd, pid);
	}

	return ret;
}
2312
/*
 * Attach @pid to the container's cgroup in every hierarchy this driver
 * manages. Returns true on success, false on error.
 */
__cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
				       const struct lxc_conf *conf,
				       const char *name, const char *lxcpath,
				       pid_t pid)
{
	int len, ret;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	if (!ops)
		return ret_set_errno(false, ENOENT);

	/* Nothing to do when no hierarchies were detected. */
	if (!ops->hierarchies)
		return true;

	len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (len < 0)
		return false;

	for (int i = 0; ops->hierarchies[i]; i++) {
		__do_free char *fullpath = NULL, *path = NULL;
		struct hierarchy *h = ops->hierarchies[i];

		/* cgroup2 needs special leaf-cgroup handling. */
		if (h->fs_type == UNIFIED_HIERARCHY) {
			ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
						  h->controllers[0]);
			if (ret < 0)
				return false;

			continue;
		}

		path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
		if (!path) {
			/*
			 * Someone might have created a name=<controller>
			 * controller after the container has started and so
			 * the container doesn't make use of this controller.
			 *
			 * Link: https://github.com/lxc/lxd/issues/8577
			 */
			TRACE("Skipping unused %s controller", maybe_empty(h->controllers[0]));
			continue;
		}

		/* Legacy hierarchies: write the pid into cgroup.procs. */
		fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
		ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to attach %d to %s",
					       (int)pid, fullpath);
	}

	return true;
}
2366
2367 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2368 * don't have a cgroup_data set up, so we ask the running container through the
2369 * commands API for the cgroup path.
2370 */
2371 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2372 char *value, size_t len, const char *name,
2373 const char *lxcpath)
2374 {
2375 __do_free char *path = NULL;
2376 __do_free char *controller = NULL;
2377 char *p;
2378 struct hierarchy *h;
2379 int ret = -1;
2380
2381 if (!ops)
2382 return ret_set_errno(-1, ENOENT);
2383
2384 controller = strdup(filename);
2385 if (!controller)
2386 return ret_errno(ENOMEM);
2387
2388 p = strchr(controller, '.');
2389 if (p)
2390 *p = '\0';
2391
2392 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2393 /* not running */
2394 if (!path)
2395 return -1;
2396
2397 h = get_hierarchy(ops, controller);
2398 if (h) {
2399 __do_free char *fullpath = NULL;
2400
2401 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2402 ret = lxc_read_from_file(fullpath, value, len);
2403 }
2404
2405 return ret;
2406 }
2407
2408 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2409 {
2410 for (int count = 0; count < 3; count++, val++) {
2411 switch (*val) {
2412 case 'r':
2413 device->access[count] = *val;
2414 break;
2415 case 'w':
2416 device->access[count] = *val;
2417 break;
2418 case 'm':
2419 device->access[count] = *val;
2420 break;
2421 case '\n':
2422 case '\0':
2423 count = 3;
2424 break;
2425 default:
2426 return ret_errno(EINVAL);
2427 }
2428 }
2429
2430 return 0;
2431 }
2432
2433 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2434 const char *val)
2435 {
2436 size_t count;
2437 int ret;
2438 char temp[50];
2439
2440 if (strequal("devices.allow", key))
2441 device->allow = 1; /* allow the device */
2442 else
2443 device->allow = 0; /* deny the device */
2444
2445 if (strequal(val, "a")) {
2446 /* global rule */
2447 device->type = 'a';
2448 device->major = -1;
2449 device->minor = -1;
2450 return 0;
2451 }
2452
2453 switch (*val) {
2454 case 'a':
2455 __fallthrough;
2456 case 'b':
2457 __fallthrough;
2458 case 'c':
2459 device->type = *val;
2460 break;
2461 default:
2462 return -1;
2463 }
2464
2465 val++;
2466 if (!isspace(*val))
2467 return -1;
2468 val++;
2469 if (*val == '*') {
2470 device->major = -1;
2471 val++;
2472 } else if (isdigit(*val)) {
2473 memset(temp, 0, sizeof(temp));
2474 for (count = 0; count < sizeof(temp) - 1; count++) {
2475 temp[count] = *val;
2476 val++;
2477 if (!isdigit(*val))
2478 break;
2479 }
2480 ret = lxc_safe_int(temp, &device->major);
2481 if (ret)
2482 return -1;
2483 } else {
2484 return -1;
2485 }
2486 if (*val != ':')
2487 return -1;
2488 val++;
2489
2490 /* read minor */
2491 if (*val == '*') {
2492 device->minor = -1;
2493 val++;
2494 } else if (isdigit(*val)) {
2495 memset(temp, 0, sizeof(temp));
2496 for (count = 0; count < sizeof(temp) - 1; count++) {
2497 temp[count] = *val;
2498 val++;
2499 if (!isdigit(*val))
2500 break;
2501 }
2502 ret = lxc_safe_int(temp, &device->minor);
2503 if (ret)
2504 return -1;
2505 } else {
2506 return -1;
2507 }
2508 if (!isspace(*val))
2509 return -1;
2510
2511 return device_cgroup_parse_access(device, ++val);
2512 }
2513
2514 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2515 * don't have a cgroup_data set up, so we ask the running container through the
2516 * commands API for the cgroup path.
2517 */
2518 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2519 const char *key, const char *value,
2520 const char *name, const char *lxcpath)
2521 {
2522 __do_free char *path = NULL;
2523 __do_free char *controller = NULL;
2524 char *p;
2525 struct hierarchy *h;
2526 int ret = -1;
2527
2528 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2529 is_empty_string(name) || is_empty_string(lxcpath))
2530 return ret_errno(EINVAL);
2531
2532 controller = strdup(key);
2533 if (!controller)
2534 return ret_errno(ENOMEM);
2535
2536 p = strchr(controller, '.');
2537 if (p)
2538 *p = '\0';
2539
2540 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2541 struct device_item device = {};
2542
2543 ret = device_cgroup_rule_parse(&device, key, value);
2544 if (ret < 0)
2545 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2546 key, value);
2547
2548 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2549 if (ret < 0)
2550 return -1;
2551
2552 return 0;
2553 }
2554
2555 path = lxc_cmd_get_limit_cgroup_path(name, lxcpath, controller);
2556 /* not running */
2557 if (!path)
2558 return -1;
2559
2560 h = get_hierarchy(ops, controller);
2561 if (h) {
2562 __do_free char *fullpath = NULL;
2563
2564 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2565 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2566 }
2567
2568 return ret;
2569 }
2570
/* take devices cgroup line
 * /dev/foo rwx
 * and convert it to a valid
 * type major:minor mode
 * line by stat(2)-ing the path, filling in @device. Return <0 on error.
 */
static int device_cgroup_rule_parse_devpath(struct device_item *device,
					    const char *devpath)
{
	__do_free char *path = NULL;
	char *mode = NULL;
	int n_parts, ret;
	char *p;
	struct stat sb;

	path = strdup(devpath);
	if (!path)
		return ret_errno(ENOMEM);

	/*
	 * Read path followed by mode. Ignore any trailing text.
	 * A ' # comment' would be legal. Technically other text is not
	 * legal, we could check for that if we cared to.
	 */
	for (n_parts = 1, p = path; *p; p++) {
		if (*p != ' ')
			continue;
		/* Terminate the current token in place. */
		*p = '\0';

		if (n_parts != 1)
			break;
		p++;
		n_parts++;

		/* Skip any additional separating spaces. */
		while (*p == ' ')
			p++;

		mode = p;

		/* A trailing space with no mode after it is invalid. */
		if (*p == '\0')
			return ret_set_errno(-1, EINVAL);
	}

	if (!mode)
		return ret_errno(EINVAL);

	if (device_cgroup_parse_access(device, mode) < 0)
		return -1;

	/* Determine type and device numbers from the filesystem node. */
	ret = stat(path, &sb);
	if (ret < 0)
		return ret_set_errno(-1, errno);

	mode_t m = sb.st_mode & S_IFMT;
	switch (m) {
	case S_IFBLK:
		device->type = 'b';
		break;
	case S_IFCHR:
		device->type = 'c';
		break;
	default:
		return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
	}

	device->major = MAJOR(sb.st_rdev);
	device->minor = MINOR(sb.st_rdev);
	device->allow = 1;

	return 0;
}
2643
2644 static int convert_devpath(const char *invalue, char *dest)
2645 {
2646 struct device_item device = {};
2647 int ret;
2648
2649 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2650 if (ret < 0)
2651 return -1;
2652
2653 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2654 device.minor, device.access);
2655 if (ret < 0)
2656 return log_error_errno(ret, -ret,
2657 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2658 device.type, device.major, device.minor,
2659 device.access);
2660
2661 return 0;
2662 }
2663
2664 /* Called from setup_limits - here we have the container's cgroup_data because
2665 * we created the cgroups.
2666 */
2667 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2668 const char *value, bool is_cpuset)
2669 {
2670 __do_free char *controller = NULL;
2671 char *p;
2672 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2673 char converted_value[50];
2674 struct hierarchy *h;
2675
2676 controller = strdup(filename);
2677 if (!controller)
2678 return ret_errno(ENOMEM);
2679
2680 p = strchr(controller, '.');
2681 if (p)
2682 *p = '\0';
2683
2684 if (strequal("devices.allow", filename) && value[0] == '/') {
2685 int ret;
2686
2687 ret = convert_devpath(value, converted_value);
2688 if (ret < 0)
2689 return ret;
2690 value = converted_value;
2691 }
2692
2693 h = get_hierarchy(ops, controller);
2694 if (!h)
2695 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2696
2697 if (is_cpuset) {
2698 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2699 if (ret)
2700 return ret;
2701 }
2702 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2703 }
2704
2705 /*
2706 * Return the list of cgroup_settings sorted according to the following rules
2707 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
2708 */
2709 static void sort_cgroup_settings(struct lxc_conf *conf)
2710 {
2711 LIST_HEAD(memsw_list);
2712 struct lxc_cgroup *cgroup, *ncgroup;
2713
2714 /* Iterate over the cgroup settings and copy them to the output list. */
2715 list_for_each_entry_safe(cgroup, ncgroup, &conf->cgroup, head) {
2716 if (!strequal(cgroup->subsystem, "memory.memsw.limit_in_bytes"))
2717 continue;
2718
2719 /* Move the memsw entry from the cgroup settings list. */
2720 list_move_tail(&cgroup->head, &memsw_list);
2721 }
2722
2723 /*
2724 * Append all the memsw entries to the end of the cgroup settings list
2725 * to make sure they are applied after all memory limit settings.
2726 */
2727 list_splice_tail(&memsw_list, &conf->cgroup);
2728
2729 }
2730
/*
 * Apply the legacy (cgroup v1) limits from @conf to the container's
 * cgroups. When @do_devices is true only "devices" settings are applied,
 * otherwise only non-devices settings are. Returns true on success.
 */
__cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
						    struct lxc_conf *conf,
						    bool do_devices)
{
	struct list_head *cgroup_settings;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!conf)
		return ret_set_errno(false, EINVAL);

	cgroup_settings = &conf->cgroup;
	if (list_empty(cgroup_settings))
		return true;

	if (!ops->hierarchies)
		return ret_set_errno(false, EINVAL);

	/* Legacy settings are meaningless on a cgroup2-only host. */
	if (pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");

	/* Ensure memory.memsw.* is applied after memory.limit_in_bytes. */
	sort_cgroup_settings(conf);
	list_for_each_entry(cgroup, cgroup_settings, head) {
		/* Select only devices or only non-devices entries per pass. */
		if (do_devices == strnequal("devices", cgroup->subsystem, 7)) {
			if (cg_legacy_set_data(ops, cgroup->subsystem, cgroup->value, strnequal("cpuset", cgroup->subsystem, 6))) {
				/* Refused device rules are warnings, not errors. */
				if (do_devices && (errno == EACCES || errno == EPERM)) {
					SYSWARN("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
					continue;
				}
				SYSERROR("Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
				return false;
			}
			DEBUG("Set controller \"%s\" set to \"%s\"", cgroup->subsystem, cgroup->value);
		}
	}

	INFO("Limits for the legacy cgroup hierarchies have been setup");
	return true;
}
2772
2773 /*
2774 * Some of the parsing logic comes from the original cgroup device v1
2775 * implementation in the kernel.
2776 */
2777 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2778 struct lxc_conf *conf, const char *key,
2779 const char *val)
2780 {
2781 struct device_item device_item = {};
2782 int ret;
2783
2784 if (strequal("devices.allow", key) && abspath(val))
2785 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2786 else
2787 ret = device_cgroup_rule_parse(&device_item, key, val);
2788 if (ret < 0)
2789 return syserror_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2790
2791 /*
2792 * Note that bpf_list_add_device() returns 1 if it altered the device
2793 * list and 0 if it didn't; both return values indicate success.
2794 * Only a negative return value indicates an error.
2795 */
2796 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2797 if (ret < 0)
2798 return -1;
2799
2800 return 0;
2801 }
2802
/*
 * Apply the cgroup2 limits from the handler's config to the container's
 * unified cgroup. Device rules are prepared for bpf enforcement; all other
 * keys are written directly into the limiting cgroup. Returns true on
 * success.
 */
__cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
					     struct lxc_handler *handler)
{
	struct list_head *cgroup_settings;
	struct hierarchy *h;
	struct lxc_conf *conf;
	struct lxc_cgroup *cgroup;

	if (!ops)
		return ret_set_errno(false, ENOENT);

	if (!ops->hierarchies)
		return true;

	if (!ops->container_cgroup)
		return ret_set_errno(false, EINVAL);

	if (!handler || !handler->conf)
		return ret_set_errno(false, EINVAL);
	conf = handler->conf;

	cgroup_settings = &conf->cgroup2;
	if (list_empty(cgroup_settings))
		return true;

	/* cgroup2 limits only make sense on a pure cgroup2 host. */
	if (!pure_unified_layout(ops))
		return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");

	if (!ops->unified)
		return false;
	h = ops->unified;

	list_for_each_entry(cgroup, cgroup_settings, head) {
		int ret;

		/* "devices" keys are handled via bpf, not cgroup files. */
		if (strnequal("devices", cgroup->subsystem, 7))
			ret = bpf_device_cgroup_prepare(ops, conf, cgroup->subsystem, cgroup->value);
		else
			ret = lxc_write_openat(h->path_lim, cgroup->subsystem, cgroup->value, strlen(cgroup->value));
		if (ret < 0)
			return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);

		TRACE("Set \"%s\" to \"%s\"", cgroup->subsystem, cgroup->value);
	}

	return log_info(true, "Limits for the unified cgroup hierarchy have been setup");
}
2850
2851 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2852 {
2853 struct lxc_conf *conf;
2854 struct hierarchy *unified;
2855
2856 if (!ops)
2857 return ret_set_errno(false, ENOENT);
2858
2859 if (!ops->hierarchies)
2860 return true;
2861
2862 if (!ops->container_cgroup)
2863 return ret_set_errno(false, EEXIST);
2864
2865 if (!handler || !handler->conf)
2866 return ret_set_errno(false, EINVAL);
2867 conf = handler->conf;
2868
2869 unified = ops->unified;
2870 if (!unified || !device_utility_controller(unified) ||
2871 !unified->path_con || list_empty(&(conf->bpf_devices).devices))
2872 return true;
2873
2874 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2875 }
2876
/*
 * Enable all detected controllers for delegation in every
 * "cgroup.subtree_control" file on the path from the cgroup2 mount down to
 * @cgroup, writing a string like "+memory +pids +cpu +io" at each level.
 * Returns true on success (or when there is nothing to do).
 */
static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
{
	__do_close int dfd_final = -EBADF;
	__do_free char *add_controllers = NULL, *copy = NULL;
	size_t full_len = 0;
	struct hierarchy *unified;
	int dfd_cur, ret;
	char *cur;
	char **it;

	if (!ops->hierarchies || !pure_unified_layout(ops))
		return true;

	/* No controllers to delegate. */
	unified = ops->unified;
	if (!unified->controllers[0])
		return true;

	/* For now we simply enable all controllers that we have detected by
	 * creating a string like "+memory +pids +cpu +io".
	 * TODO: In the near future we might want to support "-<controller>"
	 * etc. but whether supporting semantics like this make sense will need
	 * some thinking.
	 */
	for (it = unified->controllers; it && *it; it++) {
		full_len += strlen(*it) + 2;
		add_controllers = must_realloc(add_controllers, full_len + 1);

		/* First iteration: start from an empty string. */
		if (unified->controllers[0] == *it)
			add_controllers[0] = '\0';

		(void)strlcat(add_controllers, "+", full_len + 1);
		(void)strlcat(add_controllers, *it, full_len + 1);

		/* Separate entries with a space except after the last one. */
		if ((it + 1) && *(it + 1))
			(void)strlcat(add_controllers, " ", full_len + 1);
	}

	/* lxc_iterate_parts() mutates its argument, so work on a copy. */
	copy = strdup(cgroup);
	if (!copy)
		return false;

	/*
	 * Placing the write to cgroup.subtree_control before the open() is
	 * intentional because of the cgroup2 delegation model. It enforces
	 * that leaf cgroups don't have any controllers enabled for delegation.
	 */
	dfd_cur = unified->dfd_base;
	lxc_iterate_parts(cur, copy, "/") {
		/*
		 * Even though we vetted the paths when we parsed the config
		 * we're paranoid here and check that the path is neither
		 * absolute nor walks upwards.
		 */
		if (abspath(cur))
			return syserror_set(-EINVAL, "No absolute paths allowed");

		if (strnequal(cur, "..", STRLITERALLEN("..")))
			return syserror_set(-EINVAL, "No upward walking paths allowed");

		ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
		if (ret < 0)
			return syserror("Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);

		/* Descend one level; keep the base fd open for the caller. */
		dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
		if (dfd_final < 0)
			return syserror("Fail to open directory %d(%s)", dfd_cur, cur);
		if (dfd_cur != unified->dfd_base)
			close(dfd_cur);
		/*
		 * Leave dfd_final pointing to the last fd we opened so
		 * it will be automatically zapped if we return early.
		 */
		dfd_cur = dfd_final;
	}

	return true;
}
2956
/* Enable delegated controllers along the monitor cgroup's path. */
__cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
}
2964
/* Enable delegated controllers along the container cgroup's path. */
__cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
{
	if (!ops)
		return ret_set_errno(false, ENOENT);

	return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
}
2972
/* A /proc/<pid>/cgroup entry for cgroup2 carries hierarchy ID 0. */
static inline bool unified_cgroup(const char *line)
{
	return line[0] == '0';
}
2977
/*
 * Extract the current cgroup from a "0::<path>" line of /proc/<pid>/cgroup
 * and return it as a newly-allocated path relative to the cgroup2 mount.
 * Returns an ERR_PTR()-encoded negative errno on failure; caller frees.
 */
static inline char *current_unified_cgroup(bool relative, char *line)
{
	char *current_cgroup;

	line += STRLITERALLEN("0::");

	/* The path reported after "0::" must be absolute. */
	if (!abspath(line))
		return ERR_PTR(-EINVAL);

	/* remove init.scope */
	if (!relative)
		line = prune_init_scope(line);

	/* create a relative path */
	line = deabs(line);

	current_cgroup = strdup(line);
	if (!current_cgroup)
		return ERR_PTR(-ENOMEM);

	return current_cgroup;
}
3000
/* Strip a leading "name=" prefix from a controller spec, if present. */
static inline const char *unprefix(const char *controllers)
{
	if (!strnequal(controllers, "name=", STRLITERALLEN("name=")))
		return controllers;

	return controllers + STRLITERALLEN("name=");
}
3007
/*
 * Build the list of cgroup files whose ownership must be delegated,
 * reading /sys/kernel/cgroup/delegate when available and falling back to a
 * hard-coded standard set otherwise. Returns 0 on success (the list is
 * stored in *@delegate), <0 on error.
 */
static int __list_cgroup_delegate(char ***delegate)
{
	__do_free char **list = NULL;
	__do_free char *buf = NULL;
	/* Fallback when the kernel doesn't expose the delegate file. */
	char *standard[] = {
		"cgroup.procs",
		"cgroup.threads",
		"cgroup.subtree_control",
		"memory.oom.group",
		NULL,
	};
	char *token;
	int ret;

	buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
	if (!buf) {
		for (char **p = standard; p && *p; p++) {
			ret = list_add_string(&list, *p);
			if (ret < 0)
				return ret;
		}

		*delegate = move_ptr(list);
		return syswarn_ret(0, "Failed to read /sys/kernel/cgroup/delegate");
	}

	lxc_iterate_parts(token, buf, " \t\n") {
		/*
		 * We always need to chown this for both cgroup and
		 * cgroup2.
		 */
		if (strequal(token, "cgroup.procs"))
			continue;

		ret = list_add_string(&list, token);
		if (ret < 0)
			return ret;
	}

	*delegate = move_ptr(list);
	return 0;
}
3050
3051 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3052 {
3053 __do_free_string_list char **list = NULL;
3054 int ret;
3055
3056 ret = __list_cgroup_delegate(&list);
3057 if (ret < 0)
3058 return syserror_ret(ret, "Failed to determine unified cgroup delegation requirements");
3059
3060 for (char *const *s = list; s && *s; s++) {
3061 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3062 continue;
3063
3064 return sysinfo_ret(false, "The %s file is not writable, skipping unified hierarchy", *s);
3065 }
3066
3067 *ret_files = move_ptr(list);
3068 return true;
3069 }
3070
/* A legacy hierarchy counts as delegated when its directory is writable. */
static bool legacy_hierarchy_delegated(int dfd_base)
{
	/* A missing directory (ENOENT) is tolerated. */
	if (faccessat(dfd_base, ".", W_OK, 0) < 0 && errno != ENOENT)
		return sysinfo_ret(false, "Legacy hierarchy not writable, skipping");

	return true;
}
3081
3082 /**
3083 * systemd guarantees that the order of co-mounted controllers is stable. On
3084 * some systems the order of the controllers might be reversed though.
3085 *
3086 * For example, this is how the order is mismatched on CentOS 7:
3087 *
3088 * [root@localhost ~]# cat /proc/self/cgroup
3089 * 11:perf_event:/
3090 * 10:pids:/
3091 * 9:freezer:/
3092 * >>>> 8:cpuacct,cpu:/
3093 * 7:memory:/
3094 * 6:blkio:/
3095 * 5:devices:/
3096 * 4:hugetlb:/
3097 * >>>> 3:net_prio,net_cls:/
3098 * 2:cpuset:/
3099 * 1:name=systemd:/user.slice/user-0.slice/session-c1.scope
3100 *
3101 * whereas the mountpoint:
3102 *
3103 * | |-/sys/fs/cgroup tmpfs tmpfs ro,nosuid,nodev,noexec,mode=755
3104 * | | |-/sys/fs/cgroup/systemd cgroup cgroup rw,nosuid,nodev,noexec,relatime,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
3105 * | | |-/sys/fs/cgroup/cpuset cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuset
3106 * >>>> | | |-/sys/fs/cgroup/net_cls,net_prio cgroup cgroup rw,nosuid,nodev,noexec,relatime,net_prio,net_cls
3107 * | | |-/sys/fs/cgroup/hugetlb cgroup cgroup rw,nosuid,nodev,noexec,relatime,hugetlb
3108 * | | |-/sys/fs/cgroup/devices cgroup cgroup rw,nosuid,nodev,noexec,relatime,devices
3109 * | | |-/sys/fs/cgroup/blkio cgroup cgroup rw,nosuid,nodev,noexec,relatime,blkio
3110 * | | |-/sys/fs/cgroup/memory cgroup cgroup rw,nosuid,nodev,noexec,relatime,memory
3111 * >>>> | | |-/sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpuacct,cpu
3112 * | | |-/sys/fs/cgroup/freezer cgroup cgroup rw,nosuid,nodev,noexec,relatime,freezer
3113 * | | |-/sys/fs/cgroup/pids cgroup cgroup rw,nosuid,nodev,noexec,relatime,pids
3114 * | | `-/sys/fs/cgroup/perf_event cgroup cgroup rw,nosuid,nodev,noexec,relatime,perf_event
3115 *
3116 * Ensure that we always use the systemd-guaranteed stable order when checking
3117 * for the mountpoint.
3118 */
__attribute__((returns_nonnull)) __attribute__((nonnull))
static const char *stable_order(const char *controllers)
{
	/* Map known-reversed co-mount orders back to the systemd order. */
	if (strequal(controllers, "cpuacct,cpu"))
		return "cpu,cpuacct";
	if (strequal(controllers, "net_prio,net_cls"))
		return "net_cls,net_prio";

	/* Everything else only needs a possible "name=" prefix stripped. */
	return unprefix(controllers);
}
3130
/*
 * Parse /proc/<pid>/cgroup and register every delegated hierarchy with @ops.
 *
 * @ops:          cgroup operations context; ops->dfd_mnt must already refer
 *                to the cgroup mount root (e.g. /sys/fs/cgroup).
 * @relative:     when true, use our own cgroup as base instead of escaping
 *                to PID 1's cgroup.
 * @unprivileged: whether the container uses an id mapping.
 *                NOTE(review): this parameter appears unused in the visible
 *                body — confirm against callers/history whether it can go.
 *
 * Returns 0 on success, a negative errno-style value on error. Hierarchies
 * that are unmounted, not delegated, or not requested are silently skipped.
 */
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
				bool unprivileged)
{
	__do_free char *cgroup_info = NULL;
	char *it;

	/*
	 * Root spawned containers escape the current cgroup, so use init's
	 * cgroups as our base in that case.
	 */
	if (!relative && (geteuid() == 0))
		cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
	else
		cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
	if (!cgroup_info)
		return ret_errno(ENOMEM);

	/* One line per hierarchy: "<id>:<controllers>:<cgroup-path>". */
	lxc_iterate_parts(it, cgroup_info, "\n") {
		__do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
		__do_free char *controllers = NULL, *current_cgroup = NULL;
		__do_free_string_list char **controller_list = NULL,
					   **delegate = NULL;
		char *line;
		int dfd, ret, type;

		/* Handle the unified cgroup hierarchy. */
		line = it;
		if (unified_cgroup(line)) {
			char *unified_mnt;

			type = UNIFIED_HIERARCHY;

			current_cgroup = current_unified_cgroup(relative, line);
			if (IS_ERR(current_cgroup))
				return PTR_ERR(current_cgroup);

			/*
			 * Pure cgroup2 system: ops->dfd_mnt is the unified
			 * mount itself. Hybrid system: cgroup2 lives in a
			 * "unified" subdirectory of the cgroup1 tmpfs.
			 */
			if (unified_cgroup_fd(ops->dfd_mnt)) {
				dfd_mnt = dup_cloexec(ops->dfd_mnt);
				unified_mnt = "";
			} else {
				dfd_mnt = open_at(ops->dfd_mnt,
						  "unified",
						  PROTECT_OPATH_DIRECTORY,
						  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
				unified_mnt = "unified";
			}
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/unified", ops->dfd_mnt);

				SYSTRACE("Unified cgroup not mounted");
				continue;
			}
			dfd = dfd_mnt;

			/* Open our current cgroup beneath the mount, if any. */
			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip the unified hierarchy unless it was delegated to us. */
			if (!unified_hierarchy_delegated(dfd, &delegate))
				continue;

			controller_list = unified_controllers(dfd, "cgroup.controllers");
			if (!controller_list) {
				TRACE("No controllers are enabled for delegation in the unified hierarchy");
				controller_list = list_new();
				if (!controller_list)
					return syserror_set(-ENOMEM, "Failed to create empty controller list");
			}

			controllers = strdup(unified_mnt);
			if (!controllers)
				return ret_errno(ENOMEM);
		} else {
			char *__controllers, *__current_cgroup;

			type = LEGACY_HIERARCHY;

			/* Split "<id>:<controllers>:<path>" in place. */
			__controllers = strchr(line, ':');
			if (!__controllers)
				return ret_errno(EINVAL);
			__controllers++;

			__current_cgroup = strchr(__controllers, ':');
			if (!__current_cgroup)
				return ret_errno(EINVAL);
			*__current_cgroup = '\0';
			__current_cgroup++;

			/* Normalize to the systemd-stable mountpoint name. */
			controllers = strdup(stable_order(__controllers));
			if (!controllers)
				return ret_errno(ENOMEM);

			dfd_mnt = open_at(ops->dfd_mnt,
					  controllers,
					  PROTECT_OPATH_DIRECTORY,
					  PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
			if (dfd_mnt < 0) {
				if (errno != ENOENT)
					return syserror("Failed to open %d/%s",
							ops->dfd_mnt, controllers);

				SYSTRACE("%s not mounted", controllers);
				continue;
			}
			dfd = dfd_mnt;

			/* /proc/<pid>/cgroup paths must be absolute. */
			if (!abspath(__current_cgroup))
				return ret_errno(EINVAL);

			/* remove init.scope */
			if (!relative)
				__current_cgroup = prune_init_scope(__current_cgroup);

			/* create a relative path */
			__current_cgroup = deabs(__current_cgroup);

			current_cgroup = strdup(__current_cgroup);
			if (!current_cgroup)
				return ret_errno(ENOMEM);

			if (!is_empty_string(current_cgroup)) {
				dfd_base = open_at(dfd_mnt, current_cgroup,
						   PROTECT_OPATH_DIRECTORY,
						   PROTECT_LOOKUP_BENEATH_XDEV, 0);
				if (dfd_base < 0) {
					if (errno != ENOENT)
						return syserror("Failed to open %d/%s",
								dfd_mnt, current_cgroup);

					SYSTRACE("Current cgroup %d/%s does not exist (funky cgroup layout?)",
						 dfd_mnt, current_cgroup);
					continue;
				}
				dfd = dfd_base;
			}

			/* Skip hierarchies that were not delegated to us. */
			if (!legacy_hierarchy_delegated(dfd))
				continue;

			/*
			 * We intentionally pass __current_cgroup here and not
			 * controllers because we would otherwise chop the
			 * mountpoint.
			 */
			controller_list = list_add_controllers(__controllers);
			if (!controller_list)
				return syserror_set(-ENOMEM, "Failed to create controller list from %s", __controllers);

			/* Honor lxc.cgroup.use restrictions. */
			if (skip_hierarchy(ops, controller_list))
				continue;

			ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
		}

		ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
					   current_cgroup, controller_list, type);
		if (ret < 0)
			return syserror_ret(ret, "Failed to add %s hierarchy", controllers);

		/* Transfer ownership. */
		move_fd(dfd_mnt);
		move_fd(dfd_base);
		move_ptr(current_cgroup);
		move_ptr(controllers);
		move_ptr(controller_list);
		if (type == UNIFIED_HIERARCHY)
			ops->unified->delegate = move_ptr(delegate);
	}

	/* determine cgroup layout */
	if (ops->unified) {
		if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
			/* Both cgroup1 and cgroup2 hierarchies were found. */
			ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
		} else {
			if (bpf_devices_cgroup_supported())
				ops->unified->utilities |= DEVICES_CONTROLLER;
			ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
		}
	}

	if (!controllers_available(ops))
		return syserror_set(-ENOENT, "One or more requested controllers unavailable or not delegated");

	return 0;
}
3329
3330 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3331 {
3332 __do_close int dfd = -EBADF;
3333 int ret;
3334 const char *controllers_use;
3335
3336 if (ops->dfd_mnt >= 0)
3337 return ret_errno(EBUSY);
3338
3339 /*
3340 * I don't see the need for allowing symlinks here. If users want to
3341 * have their hierarchy available in different locations I strongly
3342 * suggest bind-mounts.
3343 */
3344 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3345 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3346 if (dfd < 0)
3347 return syserror("Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3348
3349 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3350 if (controllers_use) {
3351 __do_free char *dup = NULL;
3352 char *it;
3353
3354 dup = strdup(controllers_use);
3355 if (!dup)
3356 return -errno;
3357
3358 lxc_iterate_parts(it, dup, ",") {
3359 ret = list_add_string(&ops->cgroup_use, it);
3360 if (ret < 0)
3361 return ret;
3362 }
3363 }
3364
3365 /*
3366 * Keep dfd referenced by the cleanup function and actually move the fd
3367 * once we know the initialization succeeded. So if we fail we clean up
3368 * the dfd.
3369 */
3370 ops->dfd_mnt = dfd;
3371
3372 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
3373 if (ret < 0)
3374 return syserror_ret(ret, "Failed to initialize cgroups");
3375
3376 /* Transfer ownership to cgroup_ops. */
3377 move_fd(dfd);
3378 return 0;
3379 }
3380
3381 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3382 {
3383 const char *cgroup_pattern;
3384
3385 if (!ops)
3386 return ret_set_errno(-1, ENOENT);
3387
3388 /* copy system-wide cgroup information */
3389 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3390 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3391 ops->cgroup_pattern = strdup(cgroup_pattern);
3392 if (!ops->cgroup_pattern)
3393 return ret_errno(ENOMEM);
3394 }
3395
3396 return 0;
3397 }
3398
3399 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3400 {
3401 __cleanup_cgroup_ops struct cgroup_ops *cgfsng_ops = NULL;
3402
3403 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3404 if (!cgfsng_ops)
3405 return ret_set_errno(NULL, ENOMEM);
3406
3407 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3408 cgfsng_ops->dfd_mnt = -EBADF;
3409
3410 if (initialize_cgroups(cgfsng_ops, conf))
3411 return NULL;
3412
3413 cgfsng_ops->data_init = cgfsng_data_init;
3414 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3415 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3416 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3417 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3418 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3419 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3420 cgfsng_ops->payload_create = cgfsng_payload_create;
3421 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3422 cgfsng_ops->finalize = cgfsng_finalize;
3423 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3424 cgfsng_ops->get = cgfsng_get;
3425 cgfsng_ops->set = cgfsng_set;
3426 cgfsng_ops->freeze = cgfsng_freeze;
3427 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3428 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3429 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3430 cgfsng_ops->driver = "cgfsng";
3431 cgfsng_ops->version = "1.0.0";
3432 cgfsng_ops->attach = cgfsng_attach;
3433 cgfsng_ops->chown = cgfsng_chown;
3434 cgfsng_ops->mount = cgfsng_mount;
3435 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3436 cgfsng_ops->get_limit_cgroup = cgfsng_get_limit_cgroup;
3437
3438 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3439 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3440 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3441
3442 return move_ptr(cgfsng_ops);
3443 }
3444
3445 static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
3446 {
3447 int ret;
3448
3449 if (!list_empty(&conf->id_map)) {
3450 struct userns_exec_unified_attach_data args = {
3451 .conf = conf,
3452 .unified_fd = fd_unified,
3453 .pid = pid,
3454 };
3455
3456 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3457 if (ret < 0)
3458 return -errno;
3459
3460 ret = userns_exec_minimal(conf,
3461 cgroup_unified_attach_parent_wrapper,
3462 &args,
3463 cgroup_unified_attach_child_wrapper,
3464 &args);
3465 } else {
3466 ret = cgroup_attach_leaf(conf, fd_unified, pid);
3467 }
3468
3469 return ret;
3470 }
3471
/*
 * Attach @pid to every cgroup hierarchy of the running container @name by
 * fetching the container's cgroup fds over the command socket.
 *
 * Returns 0 on success; -ENOSYS when the command is unsupported (callers
 * fall back to __cgroup_attach_unified()); other negative values on error.
 */
static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
				const char *lxcpath, pid_t pid)
{
	/* ctx points at stack storage; put_cgroup_ctx() only releases its fds. */
	call_cleaner(put_cgroup_ctx) struct cgroup_ctx *ctx = &(struct cgroup_ctx){};
	int ret;
	size_t idx;
	ssize_t pidstr_len;
	char pidstr[INTTYPE_TO_STRLEN(pid_t)];

	ret = lxc_cmd_get_cgroup_ctx(name, lxcpath, sizeof(struct cgroup_ctx), ctx);
	if (ret < 0)
		return ret_errno(ENOSYS);

	pidstr_len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
	if (pidstr_len < 0)
		return pidstr_len;

	/* Attach to each hierarchy fd the container handed us. */
	for (idx = 0; idx < ctx->fd_len; idx++) {
		int dfd_con = ctx->fd[idx];

		/* cgroup2 may need a userns helper; cgroup1 is a plain write. */
		if (unified_cgroup_fd(dfd_con))
			ret = __unified_attach_fd(conf, dfd_con, pid);
		else
			ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
		if (ret)
			return syserror_ret(ret, "Failed to attach to cgroup fd %d", dfd_con);
		else
			TRACE("Attached to cgroup fd %d", dfd_con);
	}

	/* An empty fd list means there was nothing to attach to. */
	if (idx == 0)
		return syserror_set(-ENOENT, "Failed to attach to cgroups");

	TRACE("Attached to %s cgroup layout", cgroup_layout_name(ctx->layout));
	return 0;
}
3508
3509 static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name,
3510 const char *lxcpath, pid_t pid)
3511 {
3512 __do_close int dfd_unified = -EBADF;
3513
3514 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3515 return ret_errno(EINVAL);
3516
3517 dfd_unified = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3518 if (dfd_unified < 0)
3519 return ret_errno(ENOSYS);
3520
3521 return __unified_attach_fd(conf, dfd_unified, pid);
3522 }
3523
/*
 * Attach @pid to the cgroups of the running container @name. Tries all
 * hierarchies first and falls back to the unified hierarchy when the
 * many-hierarchy command is unsupported.
 */
int cgroup_attach(const struct lxc_conf *conf, const char *name,
		  const char *lxcpath, pid_t pid)
{
	int ret;

	ret = __cgroup_attach_many(conf, name, lxcpath, pid);
	if (ret >= 0)
		return ret;

	/* Propagate real errors; only "not supported" triggers the fallback. */
	if (!ERRNO_IS_NOT_SUPPORTED(ret))
		return ret;

	ret = __cgroup_attach_unified(conf, name, lxcpath, pid);
	if (ret < 0 && ERRNO_IS_NOT_SUPPORTED(ret))
		return ret_errno(ENOSYS);

	return ret;
}
3541
3542 /* Connects to command socket therefore isn't callable from command handler. */
3543 int cgroup_get(const char *name, const char *lxcpath, const char *key, char *buf, size_t len)
3544 {
3545 __do_close int dfd = -EBADF;
3546 struct cgroup_fd fd = {
3547 .fd = -EBADF,
3548 };
3549 size_t len_controller;
3550 int ret;
3551
3552 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3553 is_empty_string(key))
3554 return ret_errno(EINVAL);
3555
3556 if ((buf && !len) || (len && !buf))
3557 return ret_errno(EINVAL);
3558
3559 len_controller = strcspn(key, ".");
3560 len_controller++; /* Don't forget the \0 byte. */
3561 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3562 return ret_errno(EINVAL);
3563 (void)strlcpy(fd.controller, key, len_controller);
3564
3565 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3566 if (ret < 0) {
3567 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3568 return ret;
3569
3570 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3571 if (dfd < 0) {
3572 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3573 return ret;
3574
3575 return ret_errno(ENOSYS);
3576 }
3577 fd.type = UNIFIED_HIERARCHY;
3578 fd.fd = move_fd(dfd);
3579 }
3580 dfd = move_fd(fd.fd);
3581
3582 TRACE("Reading %s from %s cgroup hierarchy", key, cgroup_hierarchy_name(fd.type));
3583
3584 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices"))
3585 return ret_errno(EOPNOTSUPP);
3586 else
3587 ret = lxc_read_try_buf_at(dfd, key, buf, len);
3588
3589 return ret;
3590 }
3591
3592 /* Connects to command socket therefore isn't callable from command handler. */
3593 int cgroup_set(const char *name, const char *lxcpath, const char *key, const char *value)
3594 {
3595 __do_close int dfd = -EBADF;
3596 struct cgroup_fd fd = {
3597 .fd = -EBADF,
3598 };
3599 size_t len_controller;
3600 int ret;
3601
3602 if (is_empty_string(name) || is_empty_string(lxcpath) ||
3603 is_empty_string(key) || is_empty_string(value))
3604 return ret_errno(EINVAL);
3605
3606 len_controller = strcspn(key, ".");
3607 len_controller++; /* Don't forget the \0 byte. */
3608 if (len_controller >= MAX_CGROUP_ROOT_NAMELEN)
3609 return ret_errno(EINVAL);
3610 (void)strlcpy(fd.controller, key, len_controller);
3611
3612 ret = lxc_cmd_get_limit_cgroup_fd(name, lxcpath, sizeof(struct cgroup_fd), &fd);
3613 if (ret < 0) {
3614 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3615 return ret;
3616
3617 dfd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3618 if (dfd < 0) {
3619 if (!ERRNO_IS_NOT_SUPPORTED(ret))
3620 return ret;
3621
3622 return ret_errno(ENOSYS);
3623 }
3624 fd.type = UNIFIED_HIERARCHY;
3625 fd.fd = move_fd(dfd);
3626 }
3627 dfd = move_fd(fd.fd);
3628
3629 TRACE("Setting %s to %s in %s cgroup hierarchy", key, value, cgroup_hierarchy_name(fd.type));
3630
3631 if (fd.type == UNIFIED_HIERARCHY && strequal(fd.controller, "devices")) {
3632 struct device_item device = {};
3633
3634 ret = device_cgroup_rule_parse(&device, key, value);
3635 if (ret < 0)
3636 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
3637 key, value);
3638
3639 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3640 } else {
3641 ret = lxc_writeat(dfd, key, value, strlen(value));
3642 }
3643
3644 return ret;
3645 }
3646
3647 static int do_cgroup_freeze(int unified_fd,
3648 const char *state_string,
3649 int state_num,
3650 int timeout,
3651 const char *epoll_error,
3652 const char *wait_error)
3653 {
3654 __do_close int events_fd = -EBADF;
3655 call_cleaner(lxc_mainloop_close) struct lxc_async_descr *descr_ptr = NULL;
3656 int ret;
3657 struct lxc_async_descr descr = {};
3658
3659 if (timeout != 0) {
3660 ret = lxc_mainloop_open(&descr);
3661 if (ret)
3662 return log_error_errno(-1, errno, "%s", epoll_error);
3663
3664 /* automatically cleaned up now */
3665 descr_ptr = &descr;
3666
3667 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3668 if (events_fd < 0)
3669 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3670
3671 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI,
3672 freezer_cgroup_events_cb,
3673 default_cleanup_handler,
3674 INT_TO_PTR(state_num),
3675 "freezer_cgroup_events_cb");
3676 if (ret < 0)
3677 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3678 }
3679
3680 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3681 if (ret < 0)
3682 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3683
3684 if (timeout != 0) {
3685 ret = lxc_mainloop(&descr, timeout);
3686 if (ret)
3687 return log_error_errno(-1, errno, "%s", wait_error);
3688 }
3689
3690 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3691 }
3692
/* Freeze the cgroup behind @unified_fd, waiting up to @timeout if non-zero. */
static inline int __cgroup_freeze(int unified_fd, int timeout)
{
	return do_cgroup_freeze(unified_fd, "1", 1, timeout,
				"Failed to create epoll instance to wait for container freeze",
				"Failed to wait for container to be frozen");
}
3699
3700 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3701 {
3702 __do_close int unified_fd = -EBADF;
3703 int ret;
3704
3705 if (is_empty_string(name) || is_empty_string(lxcpath))
3706 return ret_errno(EINVAL);
3707
3708 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3709 if (unified_fd < 0)
3710 return ret_errno(ENOCGROUP2);
3711
3712 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3713 ret = __cgroup_freeze(unified_fd, timeout);
3714 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3715 return ret;
3716 }
3717
/* Unfreeze the cgroup behind @unified_fd, waiting up to @timeout if non-zero. */
int __cgroup_unfreeze(int unified_fd, int timeout)
{
	/* Bugfix: log messages were copy-pasted from the freeze path. */
	return do_cgroup_freeze(unified_fd, "0", 0, timeout,
				"Failed to create epoll instance to wait for container unfreeze",
				"Failed to wait for container to be unfrozen");
}
3724
3725 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3726 {
3727 __do_close int unified_fd = -EBADF;
3728 int ret;
3729
3730 if (is_empty_string(name) || is_empty_string(lxcpath))
3731 return ret_errno(EINVAL);
3732
3733 unified_fd = lxc_cmd_get_limit_cgroup2_fd(name, lxcpath);
3734 if (unified_fd < 0)
3735 return ret_errno(ENOCGROUP2);
3736
3737 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3738 ret = __cgroup_unfreeze(unified_fd, timeout);
3739 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3740 return ret;
3741 }