1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "log.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "mount_utils.h"
48 #include "storage/storage.h"
49 #include "string_utils.h"
50 #include "syscall_wrappers.h"
51 #include "utils.h"
52
53 #ifndef HAVE_STRLCPY
54 #include "include/strlcpy.h"
55 #endif
56
57 #ifndef HAVE_STRLCAT
58 #include "include/strlcat.h"
59 #endif
60
61 lxc_log_define(cgfsng, cgroup);
62
63 /*
64 * Given a pointer to a null-terminated array of pointers, realloc to add one
65 * entry, and point the new entry to NULL. Return the index of the
66 * second-to-last entry - that is, the one which is now available for use
67 * (keeping the list null-terminated) - or a negative errno on failure.
68 */
69 static int list_add(void ***list)
70 {
71 int idx = 0;
72 void **p;
73
74 if (*list)
75 for (; (*list)[idx]; idx++)
76 ;
77
78 p = realloc(*list, (idx + 2) * sizeof(void **));
79 if (!p)
80 return ret_errno(ENOMEM);
81
82 p[idx + 1] = NULL;
83 *list = p;
84
85 return idx;
86 }
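/*
 * Illustrative sketch, not part of the build: growing a null-terminated
 * string list with list_add(). The variable names are hypothetical.
 *
 *	char **names = NULL;
 *	int idx = list_add((void ***)&names);
 *	if (idx < 0)
 *		return idx;
 *	names[idx] = strdup("memory");	(names[idx + 1] is already NULL)
 */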
87
88 /* Given a null-terminated array of strings, check whether @entry is one of the
89 * strings.
90 */
91 static bool string_in_list(char **list, const char *entry)
92 {
93 if (!list)
94 return false;
95
96 for (int i = 0; list[i]; i++)
97 if (strequal(list[i], entry))
98 return true;
99
100 return false;
101 }
102
103 /* Given a handler's cgroup data, return the struct hierarchy for the controller
104 * @controller, or NULL if there is none.
105 */
106 static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
107 {
108 if (!ops->hierarchies)
110 return log_trace_errno(NULL, errno, "There are no usable cgroup controllers");
110
111 for (int i = 0; ops->hierarchies[i]; i++) {
112 if (!controller) {
113 /* This is the empty unified hierarchy. */
114 if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
115 return ops->hierarchies[i];
116
117 continue;
118 }
119
120 /*
121 * Handle controllers with significant implementation changes
122 * from cgroup to cgroup2.
123 */
124 if (pure_unified_layout(ops)) {
125 if (strequal(controller, "devices")) {
126 if (device_utility_controller(ops->unified))
127 return ops->unified;
128
129 break;
130 } else if (strequal(controller, "freezer")) {
131 if (freezer_utility_controller(ops->unified))
132 return ops->unified;
133
134 break;
135 }
136 }
137
138 if (string_in_list(ops->hierarchies[i]->controllers, controller))
139 return ops->hierarchies[i];
140 }
141
142 if (controller)
143 WARN("There is no usable %s controller", controller);
144 else
145 WARN("There is no empty unified cgroup hierarchy");
146
147 return ret_set_errno(NULL, ENOENT);
148 }
149
150 /* Taken over and modified from the kernel sources. */
151 #define NBITS 32 /* bits in uint32_t */
152 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
153 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
154
155 static void set_bit(unsigned bit, uint32_t *bitarr)
156 {
157 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
158 }
159
160 static void clear_bit(unsigned bit, uint32_t *bitarr)
161 {
162 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
163 }
164
165 static bool is_set(unsigned bit, uint32_t *bitarr)
166 {
167 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
168 }
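/*
 * Worked example (illustrative only): with NBITS == 32, bit 33 lands in the
 * second word of the array, since 33 / 32 == 1 and 33 % 32 == 1:
 *
 *	uint32_t bitarr[BITS_TO_LONGS(64)] = {0};	(DIV_ROUND_UP(64, 32) == 2 words)
 *	set_bit(33, bitarr);				(bitarr[1] |= 1 << 1)
 *
 * is_set(33, bitarr) is now true and clear_bit(33, bitarr) undoes it.
 */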
169
170 /* Create cpumask from cpulist aka turn:
171 *
172 * 0,2-3
173 *
174 * into bit array
175 *
176 * 1 0 1 1
177 */
178 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
179 {
180 __do_free uint32_t *bitarr = NULL;
181 char *token;
182 size_t arrlen;
183
184 arrlen = BITS_TO_LONGS(nbits);
185 bitarr = calloc(arrlen, sizeof(uint32_t));
186 if (!bitarr)
187 return ret_set_errno(NULL, ENOMEM);
188
189 lxc_iterate_parts(token, buf, ",") {
190 errno = 0;
191 unsigned end, start;
192 char *range;
193
194 start = strtoul(token, NULL, 0);
195 end = start;
196 range = strchr(token, '-');
197 if (range)
198 end = strtoul(range + 1, NULL, 0);
199
200 if (!(start <= end))
201 return ret_set_errno(NULL, EINVAL);
202
203 if (end >= nbits)
204 return ret_set_errno(NULL, EINVAL);
205
206 while (start <= end)
207 set_bit(start++, bitarr);
208 }
209
210 return move_ptr(bitarr);
211 }
212
213 /* Turn cpumask into simple, comma-separated cpulist. */
214 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
215 {
216 __do_free_string_list char **cpulist = NULL;
217 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
218 int ret;
219
220 for (size_t i = 0; i <= nbits; i++) {
221 if (!is_set(i, bitarr))
222 continue;
223
224 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
225 if (ret < 0)
226 return NULL;
227
228 ret = lxc_append_string(&cpulist, numstr);
229 if (ret < 0)
230 return ret_set_errno(NULL, ENOMEM);
231 }
232
233 if (!cpulist)
234 return ret_set_errno(NULL, ENOMEM);
235
236 return lxc_string_join(",", (const char **)cpulist, false);
237 }
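/*
 * Illustrative round trip, assuming nbits >= 4 and the hypothetical buffer
 * below; note that lxc_iterate_parts() modifies its input and that ranges
 * are not re-compressed on the way back:
 *
 *	char buf[] = "0,2-3";
 *	uint32_t *mask = lxc_cpumask(buf, 4);		(bits 0, 2 and 3 set)
 *	char *list = lxc_cpumask_to_cpulist(mask, 4);	(list == "0,2,3")
 */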
238
239 static ssize_t get_max_cpus(char *cpulist)
240 {
241 char *c1, *c2;
242 char *maxcpus = cpulist;
243 size_t cpus = 0;
244
245 c1 = strrchr(maxcpus, ',');
246 if (c1)
247 c1++;
248
249 c2 = strrchr(maxcpus, '-');
250 if (c2)
251 c2++;
252
253 if (!c1 && !c2)
254 c1 = maxcpus;
255 else if (c1 > c2)
256 c2 = c1;
257 else if (c1 < c2)
258 c1 = c2;
259 else if (!c1 && c2)
260 c1 = c2;
261
262 errno = 0;
263 cpus = strtoul(c1, NULL, 0);
264 if (errno != 0)
265 return -1;
266
267 return cpus;
268 }
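/*
 * Example (illustrative only): get_max_cpus() only inspects the characters
 * after the last ',' or '-' in the list, so for
 *
 *	char buf[] = "0-3,8-11";
 *	ssize_t max = get_max_cpus(buf);
 *
 * it parses "11" and returns 11.
 */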
269
270 static inline bool is_unified_hierarchy(const struct hierarchy *h)
271 {
272 return h->fs_type == UNIFIED_HIERARCHY;
273 }
274
275 /* Return true if the controller @entry is found in the null-terminated list of
276 * hierarchies @hlist.
277 */
278 static bool controller_available(struct hierarchy **hlist, char *entry)
279 {
280 if (!hlist)
281 return false;
282
283 for (int i = 0; hlist[i]; i++)
284 if (string_in_list(hlist[i]->controllers, entry))
285 return true;
286
287 return false;
288 }
289
290 static bool controllers_available(struct cgroup_ops *ops)
291 {
292 struct hierarchy **hlist;
293
294 if (!ops->cgroup_use)
295 return true;
296
297 hlist = ops->hierarchies;
298 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
299 if (!controller_available(hlist, *cur))
300 return log_error(false, "The %s controller was not found", *cur);
301
302 return true;
303 }
304
305 static char **list_new(void)
306 {
307 __do_free_string_list char **list = NULL;
308 int idx;
309
310 idx = list_add((void ***)&list);
311 if (idx < 0)
312 return NULL;
313
314 list[idx] = NULL;
315 return move_ptr(list);
316 }
317
318 static int list_add_string(char ***list, char *entry)
319 {
320 __do_free char *dup = NULL;
321 int idx;
322
323 dup = strdup(entry);
324 if (!dup)
325 return ret_errno(ENOMEM);
326
327 idx = list_add((void ***)list);
328 if (idx < 0)
329 return idx;
330
331 (*list)[idx] = move_ptr(dup);
332 return 0;
333 }
334
335 static char **list_add_controllers(char *controllers)
336 {
337 __do_free_string_list char **list = NULL;
338 char *it;
339
340 lxc_iterate_parts(it, controllers, " \t\n") {
341 int ret;
342
343 ret = list_add_string(&list, it);
344 if (ret < 0)
345 return NULL;
346 }
347
348 return move_ptr(list);
349 }
350
351 static char **unified_controllers(int dfd, const char *file)
352 {
353 __do_free char *buf = NULL;
354
355 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
356 if (!buf)
357 return NULL;
358
359 return list_add_controllers(buf);
360 }
361
362 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
363 {
364 if (!ops->cgroup_use)
365 return false;
366
367 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
368 bool found = false;
369
370 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
371 if (!strequal(*cur_use, *cur_ctrl))
372 continue;
373
374 found = true;
375 break;
376 }
377
378 if (found)
379 continue;
380
381 return true;
382 }
383
384 return false;
385 }
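/*
 * Example (hypothetical config value): with
 *
 *	lxc.cgroup.use = memory
 *
 * a hierarchy offering { "cpu", "cpuset" } is skipped while a hierarchy
 * offering { "memory" } is kept: a hierarchy is skipped as soon as it
 * contains any controller not listed in lxc.cgroup.use.
 */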
386
387 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
388 int dfd_base, char *base_cgroup,
389 char **controllers, cgroupfs_type_magic_t fs_type)
390 {
391 __do_free struct hierarchy *new = NULL;
392 int idx;
393
394 if (abspath(base_cgroup))
395 return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");
396
397 new = zalloc(sizeof(*new));
398 if (!new)
399 return ret_errno(ENOMEM);
400
401 new->dfd_con = -EBADF;
402 new->dfd_lim = -EBADF;
403 new->dfd_mon = -EBADF;
404
405 new->fs_type = fs_type;
406 new->controllers = controllers;
407 new->at_mnt = mnt;
408 new->at_base = base_cgroup;
409
410 new->dfd_mnt = dfd_mnt;
411 new->dfd_base = dfd_base;
412
413 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
414 mnt, maybe_empty(base_cgroup));
415 for (char *const *it = new->controllers; it && *it; it++)
416 TRACE("The hierarchy contains the %s controller", *it);
417
418 idx = list_add((void ***)&ops->hierarchies);
419 if (idx < 0)
420 return idx;
421
422 if (fs_type == UNIFIED_HIERARCHY)
423 ops->unified = new;
424 (ops->hierarchies)[idx] = move_ptr(new);
425
426 return 0;
427 }
428
429 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
430 {
431 if (!path_prune || !hierarchies)
432 return 0;
433
434 for (int i = 0; hierarchies[i]; i++) {
435 struct hierarchy *h = hierarchies[i];
436 int ret;
437
438 ret = cgroup_tree_prune(h->dfd_base, path_prune);
439 if (ret < 0)
440 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
441 else
442 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
443
444 free_equal(h->path_lim, h->path_con);
445 }
446
447 return 0;
448 }
449
450 struct generic_userns_exec_data {
451 struct hierarchy **hierarchies;
452 const char *path_prune;
453 struct lxc_conf *conf;
454 uid_t origuid; /* target uid in parent namespace */
455 char *path;
456 };
457
458 static int cgroup_tree_remove_wrapper(void *data)
459 {
460 struct generic_userns_exec_data *arg = data;
461 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
462 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
463 int ret;
464
465 if (!lxc_drop_groups() && errno != EPERM)
466 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
467
468 ret = setresgid(nsgid, nsgid, nsgid);
469 if (ret < 0)
470 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
471 (int)nsgid, (int)nsgid, (int)nsgid);
472
473 ret = setresuid(nsuid, nsuid, nsuid);
474 if (ret < 0)
475 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
476 (int)nsuid, (int)nsuid, (int)nsuid);
477
478 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
479 }
480
481 __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
482 struct lxc_handler *handler)
483 {
484 int ret;
485
486 if (!ops) {
487 ERROR("Called with uninitialized cgroup operations");
488 return;
489 }
490
491 if (!ops->hierarchies)
492 return;
493
494 if (!handler) {
495 ERROR("Called with uninitialized handler");
496 return;
497 }
498
499 if (!handler->conf) {
500 ERROR("Called with uninitialized conf");
501 return;
502 }
503
504 if (!ops->container_limit_cgroup) {
505 WARN("Uninitialized limit cgroup");
506 return;
507 }
508
509 ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
510 if (ret < 0)
511 WARN("Failed to detach bpf program from cgroup");
512
513 if (!lxc_list_empty(&handler->conf->id_map)) {
514 struct generic_userns_exec_data wrap = {
515 .conf = handler->conf,
516 .path_prune = ops->container_limit_cgroup,
517 .hierarchies = ops->hierarchies,
518 .origuid = 0,
519 };
520 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
521 &wrap, "cgroup_tree_remove_wrapper");
522 } else {
523 ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
524 }
525 if (ret < 0)
526 SYSWARN("Failed to destroy cgroups");
527 }
528
529 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
530 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
531 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
532 bool am_initialized)
533 {
534 __do_free char *cpulist = NULL, *isolcpus = NULL,
535 *offlinecpus = NULL, *posscpus = NULL;
536 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
537 *possmask = NULL;
538 int ret;
539 ssize_t i;
540 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
541 bool flipped_bit = false;
542
543 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
544 if (!posscpus)
545 return log_error_errno(false, errno, "Failed to read file %d(cpuset.cpus)", dfd_parent);
546
547 /* Get maximum number of cpus found in possible cpuset. */
548 maxposs = get_max_cpus(posscpus);
549 if (maxposs < 0 || maxposs >= INT_MAX - 1)
550 return false;
551
552 if (file_exists(__ISOL_CPUS)) {
553 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
554 if (!isolcpus)
555 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
556
557 if (isdigit(isolcpus[0])) {
558 /* Get maximum number of cpus found in isolated cpuset. */
559 maxisol = get_max_cpus(isolcpus);
560 if (maxisol < 0 || maxisol >= INT_MAX - 1)
561 return false;
562 }
563
564 if (maxposs < maxisol)
565 maxposs = maxisol;
566 maxposs++;
567 } else {
568 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
569 }
570
571 if (file_exists(__OFFLINE_CPUS)) {
572 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
573 if (!offlinecpus)
574 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
575
576 if (isdigit(offlinecpus[0])) {
577 /* Get maximum number of cpus found in offline cpuset. */
578 maxoffline = get_max_cpus(offlinecpus);
579 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
580 return false;
581 }
582
583 if (maxposs < maxoffline)
584 maxposs = maxoffline;
585 maxposs++;
586 } else {
587 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
588 }
589
590 if ((maxisol == 0) && (maxoffline == 0)) {
591 cpulist = move_ptr(posscpus);
592 goto copy_parent;
593 }
594
595 possmask = lxc_cpumask(posscpus, maxposs);
596 if (!possmask)
597 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
598
599 if (maxisol > 0) {
600 isolmask = lxc_cpumask(isolcpus, maxposs);
601 if (!isolmask)
602 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
603 }
604
605 if (maxoffline > 0) {
606 offlinemask = lxc_cpumask(offlinecpus, maxposs);
607 if (!offlinemask)
608 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
609 }
610
611 for (i = 0; i <= maxposs; i++) {
612 if ((isolmask && !is_set(i, isolmask)) ||
613 (offlinemask && !is_set(i, offlinemask)) ||
614 !is_set(i, possmask))
615 continue;
616
617 flipped_bit = true;
618 clear_bit(i, possmask);
619 }
620
621 if (!flipped_bit) {
622 cpulist = move_ptr(posscpus);
623 TRACE("No isolated or offline cpus present in cpuset");
624 } else {
625 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
626 TRACE("Removed isolated or offline cpus from cpuset");
627 }
628 if (!cpulist)
629 return log_error_errno(false, errno, "Failed to create cpu list");
630
631 copy_parent:
632 if (!am_initialized) {
633 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
634 if (ret < 0)
635 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
636
637 TRACE("Copied cpu settings of parent cgroup");
638 }
639
640 return true;
641 }
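/*
 * Worked example (hypothetical host state): given
 *
 *	cpuset.cpus of the parent:		"0-7"
 *	/sys/devices/system/cpu/isolated:	"3-4"
 *	/sys/devices/system/cpu/offline:	""
 *
 * the isolated cpus are cleared from the possible mask and, for an
 * uninitialized child cgroup, cpuset.cpus is set to "0,1,2,5,6,7".
 */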
642
643 static bool cpuset1_initialize(int dfd_base, int dfd_next)
644 {
645 char mems[PATH_MAX];
646 ssize_t bytes;
647 char v;
648
649 /*
650 * Determine whether the base cgroup has cpuset
651 * inheritance turned on.
652 */
653 bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
654 if (bytes < 0)
655 return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
656
657 /*
658 * Initialize cpuset.cpus and remove any isolated
659 * and offline cpus.
660 */
661 if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
662 return syserrno(false, "Failed to initialize cpuset.cpus");
663
664 /* Read cpuset.mems from parent... */
665 bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
666 if (bytes < 0)
667 return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
668
669 /* ... and copy to first cgroup in the tree... */
670 bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
671 if (bytes < 0)
672 return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
673
674 /* ... and finally turn on cpuset inheritance. */
675 bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
676 if (bytes < 0)
677 return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
678
679 return log_trace(true, "Initialized cpuset in the legacy hierarchy");
680 }
681
682 static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
683 bool cpuset_v1, bool eexist_ignore)
684 {
685 __do_close int dfd_final = -EBADF;
686 int dfd_cur = dfd_base;
687 int ret = 0;
688 size_t len;
689 char *cur;
690 char buf[PATH_MAX];
691
692 if (is_empty_string(path))
693 return ret_errno(EINVAL);
694
695 len = strlcpy(buf, path, sizeof(buf));
696 if (len >= sizeof(buf))
697 return ret_errno(E2BIG);
698
699 lxc_iterate_parts(cur, buf, "/") {
700 /*
701 * Even though we vetted the paths when we parsed the config
702 * we're paranoid here and check that the path is neither
703 * absolute nor walks upwards.
704 */
705 if (abspath(cur))
706 return syserrno_set(-EINVAL, "No absolute paths allowed");
707
708 if (strnequal(cur, "..", STRLITERALLEN("..")))
709 return syserrno_set(-EINVAL, "No upward walking paths allowed");
710
711 ret = mkdirat(dfd_cur, cur, mode);
712 if (ret < 0) {
713 if (errno != EEXIST)
714 return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
715
716 ret = -EEXIST;
717 }
718 TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);
719
720 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
721 if (dfd_final < 0)
722 return syserrno(-errno, "Fail to open%s directory %d(%s)",
723 !ret ? " newly created" : "", dfd_base, cur);
724 if (dfd_cur != dfd_base)
725 close(dfd_cur);
726 else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
727 return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
728 /*
729 * Leave dfd_final pointing to the last fd we opened so
730 * it will be automatically zapped if we return early.
731 */
732 dfd_cur = dfd_final;
733 }
734
735 /* The final cgroup must be successfully created by us. */
736 if (ret) {
737 if (ret != -EEXIST || !eexist_ignore)
738 return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
739 }
740
741 return move_fd(dfd_final);
742 }
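/*
 * Illustrative walk-through (hypothetical path): for path "lxc/c1/payload"
 * the loop above, relative to dfd_base, performs
 *
 *	mkdirat(dfd_base, "lxc", mode);		(may already exist)
 *	mkdirat(<fd of lxc>, "c1", mode);	(may already exist)
 *	mkdirat(<fd of c1>, "payload", mode);	(the final component)
 *
 * opening each component with open_at() as it goes and returning an fd to
 * the final directory. Unless eexist_ignore is true, a pre-existing final
 * component is treated as an error.
 */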
743
744 static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
745 struct hierarchy *h, const char *cgroup_limit_dir,
746 const char *cgroup_leaf, bool payload)
747 {
748 __do_close int fd_limit = -EBADF, fd_final = -EBADF;
749 __do_free char *path = NULL, *limit_path = NULL;
750 bool cpuset_v1 = false;
751
752 /*
753 * The legacy cpuset controller needs massaging in case inheriting
754 * settings from its immediate ancestor cgroup hasn't been turned on.
755 */
756 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
757
758 if (payload && cgroup_leaf) {
759 /* With isolation, neither part may already exist. */
760 fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
761 if (fd_limit < 0)
762 return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
763
764 TRACE("Created limit cgroup %d->%d(%s)",
765 fd_limit, h->dfd_base, cgroup_limit_dir);
766
767 /*
768 * With isolation the devices legacy cgroup needs to be
769 * initialized early, as it typically contains an 'a' (all)
770 * line, which is not possible once a subdirectory has been
771 * created.
772 */
773 if (string_in_list(h->controllers, "devices") &&
774 !ops->setup_limits_legacy(ops, conf, true))
775 return log_error(false, "Failed to setup legacy device limits");
776
777 limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
778 path = must_make_path(limit_path, cgroup_leaf, NULL);
779
780 /*
781 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
782 * cgroup the container actually resides in, is below fd_limit.
783 */
784 fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
785 if (fd_final < 0) {
786 /* Ensure we don't leave any garbage behind. */
787 if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
788 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
789 else
790 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
791 }
792 } else {
793 path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
794
795 fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
796 }
797 if (fd_final < 0)
798 return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
799
800 if (payload) {
801 h->dfd_con = move_fd(fd_final);
802 h->path_con = move_ptr(path);
803
804 if (fd_limit < 0)
805 h->dfd_lim = h->dfd_con;
806 else
807 h->dfd_lim = move_fd(fd_limit);
808
809 if (limit_path)
810 h->path_lim = move_ptr(limit_path);
811 else
812 h->path_lim = h->path_con;
813 } else {
814 h->dfd_mon = move_fd(fd_final);
815 }
816
817 return true;
818 }
819
820 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
821 bool payload)
822 {
823 bool prune = true;
824
825 if (payload) {
826 /* Check whether we actually created the cgroup to prune. */
827 if (h->dfd_lim < 0)
828 prune = false;
829
830 free_equal(h->path_con, h->path_lim);
831 close_equal(h->dfd_con, h->dfd_lim);
832 } else {
833 /* Check whether we actually created the cgroup to prune. */
834 if (h->dfd_mon < 0)
835 prune = false;
836
837 close_prot_errno_disarm(h->dfd_mon);
838 }
839
840 /* We didn't create this cgroup. */
841 if (!prune)
842 return;
843
844 if (cgroup_tree_prune(h->dfd_base, path_prune))
845 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
846 else
847 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
848 }
849
850 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
851 struct lxc_handler *handler)
852 {
853 int len;
854 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
855 const struct lxc_conf *conf;
856
857 if (!ops) {
858 ERROR("Called with uninitialized cgroup operations");
859 return;
860 }
861
862 if (!ops->hierarchies)
863 return;
864
865 if (!handler) {
866 ERROR("Called with uninitialized handler");
867 return;
868 }
869
870 if (!handler->conf) {
871 ERROR("Called with uninitialized conf");
872 return;
873 }
874 conf = handler->conf;
875
876 if (!ops->monitor_cgroup) {
877 WARN("Uninitialized monitor cgroup");
878 return;
879 }
880
881 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
882 if (len < 0)
883 return;
884
885 for (int i = 0; ops->hierarchies[i]; i++) {
886 __do_close int fd_pivot = -EBADF;
887 __do_free char *pivot_path = NULL;
888 struct hierarchy *h = ops->hierarchies[i];
889 bool cpuset_v1 = false;
890 int ret;
891
892 /* Monitor might have died before we entered the cgroup. */
893 if (handler->monitor_pid <= 0) {
894 WARN("No valid monitor process found while destroying cgroups");
895 goto cgroup_prune_tree;
896 }
897
898 if (conf->cgroup_meta.monitor_pivot_dir)
899 pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
900 else if (conf->cgroup_meta.dir)
901 pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
902 else
903 pivot_path = must_make_path(CGROUP_PIVOT, NULL);
904
905 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
906
907 fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
908 if (fd_pivot < 0) {
909 SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
910 continue;
911 }
912
913 ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
914 if (ret != 0) {
915 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
916 continue;
917 }
918
919 cgroup_prune_tree:
920 ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
921 if (ret < 0)
922 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
923 else
924 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
925 }
926 }
927
928 /*
929 * Check that lxc.cgroup.dir is not set together with the more granular
930 * monitor/payload/namespace cgroup directory options, and that the
931 * monitor and payload directories are either both set or both unset.
932 * Returns true if the configuration is consistent, false otherwise.
933 */
934 static bool check_cgroup_dir_config(struct lxc_conf *conf)
935 {
936 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
937 *container_dir = conf->cgroup_meta.container_dir,
938 *namespace_dir = conf->cgroup_meta.namespace_dir;
939
940 /* none of the new options are set, all is fine */
941 if (!monitor_dir && !container_dir && !namespace_dir)
942 return true;
943
944 /* some are set, make sure lxc.cgroup.dir is not also set */
945 if (conf->cgroup_meta.dir)
946 return log_error_errno(false, EINVAL,
947 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
948
949 /* make sure both monitor and payload are set */
950 if (!monitor_dir || !container_dir)
951 return log_error_errno(false, EINVAL,
952 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
953
954 /* namespace_dir may be empty */
955 return true;
956 }
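/*
 * Example (hypothetical values) of a consistent split-directory
 * configuration accepted by the check above:
 *
 *	lxc.cgroup.dir.monitor = lxc.monitor/c1
 *	lxc.cgroup.dir.container = lxc.payload/c1
 *
 * Setting lxc.cgroup.dir in addition to either of these is rejected.
 */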
957
958 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
959 {
960 __do_free char *monitor_cgroup = NULL;
961 int idx = 0;
962 int i;
963 size_t len;
964 char *suffix = NULL;
965 struct lxc_conf *conf;
966
967 if (!ops)
968 return ret_set_errno(false, ENOENT);
969
970 if (!ops->hierarchies)
971 return true;
972
973 if (ops->monitor_cgroup)
974 return ret_set_errno(false, EEXIST);
975
976 if (!handler || !handler->conf)
977 return ret_set_errno(false, EINVAL);
978
979 conf = handler->conf;
980
981 if (!check_cgroup_dir_config(conf))
982 return false;
983
984 if (conf->cgroup_meta.monitor_dir) {
985 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
986 } else if (conf->cgroup_meta.dir) {
987 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
988 DEFAULT_MONITOR_CGROUP_PREFIX,
989 handler->name,
990 CGROUP_CREATE_RETRY, NULL);
991 } else if (ops->cgroup_pattern) {
992 __do_free char *cgroup_tree = NULL;
993
994 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
995 if (!cgroup_tree)
996 return ret_set_errno(false, ENOMEM);
997
998 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
999 DEFAULT_MONITOR_CGROUP,
1000 CGROUP_CREATE_RETRY, NULL);
1001 } else {
1002 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1003 handler->name,
1004 CGROUP_CREATE_RETRY, NULL);
1005 }
1006 if (!monitor_cgroup)
1007 return ret_set_errno(false, ENOMEM);
1008
1009 if (!conf->cgroup_meta.monitor_dir) {
1010 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1011 *suffix = '\0';
1012 }
1013 do {
1014 if (idx && suffix)
1015 sprintf(suffix, "-%d", idx);
1016
1017 for (i = 0; ops->hierarchies[i]; i++) {
1018 if (cgroup_tree_create(ops, handler->conf,
1019 ops->hierarchies[i],
1020 monitor_cgroup, NULL, false))
1021 continue;
1022
1023 DEBUG("Failed to create cgroup \"%s\"", monitor_cgroup);
1024 for (int j = 0; j <= i; j++)
1025 cgroup_tree_prune_leaf(ops->hierarchies[j],
1026 monitor_cgroup, false);
1027
1028 idx++;
1029 break;
1030 }
1031 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1032
1033 if (idx == 1000 || (!suffix && idx != 0))
1034 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1035
1036 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1037 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1038 }
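/*
 * Example (hypothetical container name "c1"): with the default pattern the
 * monitor cgroup is created as "lxc.monitor.c1". If that name is already
 * taken in some hierarchy, the retry suffix is rewritten to produce
 * "lxc.monitor.c1-1", "lxc.monitor.c1-2", ... up to "-999" before giving
 * up with ERANGE.
 */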
1039
1040 /*
1041 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1042 * next cgroup_pattern-1, -2, ..., -999.
1043 */
1044 __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
1045 {
1046 __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
1047 char *limit_cgroup;
1048 int idx = 0;
1049 int i;
1050 size_t len;
1051 char *suffix = NULL;
1052 struct lxc_conf *conf;
1053
1054 if (!ops)
1055 return ret_set_errno(false, ENOENT);
1056
1057 if (!ops->hierarchies)
1058 return true;
1059
1060 if (ops->container_cgroup || ops->container_limit_cgroup)
1061 return ret_set_errno(false, EEXIST);
1062
1063 if (!handler || !handler->conf)
1064 return ret_set_errno(false, EINVAL);
1065
1066 conf = handler->conf;
1067
1068 if (!check_cgroup_dir_config(conf))
1069 return false;
1070
1071 if (conf->cgroup_meta.container_dir) {
1072 __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
1073 if (!__limit_cgroup)
1074 return ret_set_errno(false, ENOMEM);
1075
1076 if (conf->cgroup_meta.namespace_dir) {
1077 container_cgroup = must_make_path(__limit_cgroup,
1078 conf->cgroup_meta.namespace_dir,
1079 NULL);
1080 limit_cgroup = __limit_cgroup;
1081 } else {
1082 /* explicit paths but without isolation */
1083 limit_cgroup = move_ptr(__limit_cgroup);
1084 container_cgroup = limit_cgroup;
1085 }
1086 } else if (conf->cgroup_meta.dir) {
1087 limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1088 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1089 handler->name,
1090 CGROUP_CREATE_RETRY, NULL);
1091 container_cgroup = limit_cgroup;
1092 } else if (ops->cgroup_pattern) {
1093 __do_free char *cgroup_tree = NULL;
1094
1095 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1096 if (!cgroup_tree)
1097 return ret_set_errno(false, ENOMEM);
1098
1099 limit_cgroup = must_concat(&len, cgroup_tree, "/",
1100 DEFAULT_PAYLOAD_CGROUP,
1101 CGROUP_CREATE_RETRY, NULL);
1102 container_cgroup = limit_cgroup;
1103 } else {
1104 limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1105 handler->name,
1106 CGROUP_CREATE_RETRY, NULL);
1107 container_cgroup = limit_cgroup;
1108 }
1109 if (!limit_cgroup)
1110 return ret_set_errno(false, ENOMEM);
1111
1112 if (!conf->cgroup_meta.container_dir) {
1113 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1114 *suffix = '\0';
1115 }
1116 do {
1117 if (idx && suffix)
1118 sprintf(suffix, "-%d", idx);
1119
1120 for (i = 0; ops->hierarchies[i]; i++) {
1121 if (cgroup_tree_create(ops, handler->conf,
1122 ops->hierarchies[i], limit_cgroup,
1123 conf->cgroup_meta.namespace_dir,
1124 true))
1125 continue;
1126
1127 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
1128 for (int j = 0; j <= i; j++)
1129 cgroup_tree_prune_leaf(ops->hierarchies[j],
1130 limit_cgroup, true);
1131
1132 idx++;
1133 break;
1134 }
1135 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1136
1137 if (idx == 1000 || (!suffix && idx != 0))
1138 return log_error_errno(false, ERANGE, "Failed to create container cgroup");
1139
1140 ops->container_cgroup = move_ptr(container_cgroup);
1141 if (__limit_cgroup)
1142 ops->container_limit_cgroup = move_ptr(__limit_cgroup);
1143 else
1144 ops->container_limit_cgroup = ops->container_cgroup;
1145 INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
1146 ops->container_cgroup, ops->container_limit_cgroup);
1147 return true;
1148 }
1149
1150 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1151 struct lxc_handler *handler)
1152 {
1153 int monitor_len, transient_len = 0;
1154 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1155 transient[INTTYPE_TO_STRLEN(pid_t)];
1156
1157 if (!ops)
1158 return ret_set_errno(false, ENOENT);
1159
1160 if (!ops->hierarchies)
1161 return true;
1162
1163 if (!ops->monitor_cgroup)
1164 return ret_set_errno(false, ENOENT);
1165
1166 if (!handler || !handler->conf)
1167 return ret_set_errno(false, EINVAL);
1168
1169 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1170 if (monitor_len < 0)
1171 return false;
1172
1173 if (handler->transient_pid > 0) {
1174 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1175 if (transient_len < 0)
1176 return false;
1177 }
1178
1179 for (int i = 0; ops->hierarchies[i]; i++) {
1180 struct hierarchy *h = ops->hierarchies[i];
1181 int ret;
1182
1183 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1184 if (ret)
1185 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1186
1187 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
1188
1189 if (handler->transient_pid <= 0)
1190 continue;
1191
1192 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1193 if (ret)
1194 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1195
1196 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
1197
1198 /*
1199 * We don't keep the fds for non-unified hierarchies around,
1200 * mainly because we don't make use of them anymore after the
1201 * core cgroup setup is done, but also because there are quite
1202 * a lot of them.
1203 */
1204 if (!is_unified_hierarchy(h))
1205 close_prot_errno_disarm(h->dfd_mon);
1206 }
1207 handler->transient_pid = -1;
1208
1209 return true;
1210 }
1211
1212 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1213 struct lxc_handler *handler)
1214 {
1215 int len;
1216 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1217
1218 if (!ops)
1219 return ret_set_errno(false, ENOENT);
1220
1221 if (!ops->hierarchies)
1222 return true;
1223
1224 if (!ops->container_cgroup)
1225 return ret_set_errno(false, ENOENT);
1226
1227 if (!handler || !handler->conf)
1228 return ret_set_errno(false, EINVAL);
1229
1230 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1231 if (len < 0)
1232 return false;
1233
1234 for (int i = 0; ops->hierarchies[i]; i++) {
1235 struct hierarchy *h = ops->hierarchies[i];
1236 int ret;
1237
1238 if (is_unified_hierarchy(h) &&
1239 (handler->clone_flags & CLONE_INTO_CGROUP))
1240 continue;
1241
1242 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1243 if (ret != 0)
1244 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1245
1246 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1247 }
1248
1249 return true;
1250 }
1251
1252 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1253 gid_t chown_gid, mode_t chmod_mode)
1254 {
1255 int ret;
1256
1257 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1258 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1259 if (ret < 0)
1260 return log_warn_errno(-1,
1261 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1262 dirfd, path, (int)chown_uid,
1263 (int)chown_gid);
1264
1265 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1266 if (ret < 0)
1267 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1268 dirfd, path, (int)chmod_mode);
1269
1270 return 0;
1271 }
1272
1273 /* chgrp the container cgroups to container group. We leave
1274 * the container owner as cgroup owner. So we must make the
1275 * directories 775 so that the container can create sub-cgroups.
1276 *
1277 * Also chown the tasks and cgroup.procs files. Those may not
1278 * exist depending on kernel version.
1279 */
1280 static int chown_cgroup_wrapper(void *data)
1281 {
1282 int ret;
1283 uid_t destuid;
1284 struct generic_userns_exec_data *arg = data;
1285 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1286 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1287
1288 if (!lxc_drop_groups() && errno != EPERM)
1289 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1290
1291 ret = setresgid(nsgid, nsgid, nsgid);
1292 if (ret < 0)
1293 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
1294 (int)nsgid, (int)nsgid, (int)nsgid);
1295
1296 ret = setresuid(nsuid, nsuid, nsuid);
1297 if (ret < 0)
1298 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
1299 (int)nsuid, (int)nsuid, (int)nsuid);
1300
1301 destuid = get_ns_uid(arg->origuid);
1302 if (destuid == LXC_INVALID_UID)
1303 destuid = 0;
1304
1305 for (int i = 0; arg->hierarchies[i]; i++) {
1306 int dirfd = arg->hierarchies[i]->dfd_con;
1307
1308 if (dirfd < 0)
1309 return syserrno_set(-EBADF, "Invalid cgroup file descriptor");
1310
1311 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
1312
1313 /*
1314 * Failures to chown() these are inconvenient but not
1315 * detrimental. We leave these owned by the container launcher,
1316 * so that container root can write to the files to attach. We
1317 * chmod() them 664 so that container systemd can write to the
1318 * files (which systemd in wily insists on doing).
1319 */
1320
1321 if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
1322 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
1323
1324 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
1325
1326 if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
1327 continue;
1328
1329 for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
1330 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
1331 }
1332
1333 return 0;
1334 }
1335
1336 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1337 struct lxc_conf *conf)
1338 {
1339 struct generic_userns_exec_data wrap;
1340
1341 if (!ops)
1342 return ret_set_errno(false, ENOENT);
1343
1344 if (!ops->hierarchies)
1345 return true;
1346
1347 if (!ops->container_cgroup)
1348 return ret_set_errno(false, ENOENT);
1349
1350 if (!conf)
1351 return ret_set_errno(false, EINVAL);
1352
1353 if (lxc_list_empty(&conf->id_map))
1354 return true;
1355
1356 wrap.origuid = geteuid();
1357 wrap.path = NULL;
1358 wrap.hierarchies = ops->hierarchies;
1359 wrap.conf = conf;
1360
1361 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1362 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1363
1364 return true;
1365 }
1366
1367 __cgfsng_ops static void cgfsng_finalize(struct cgroup_ops *ops)
1368 {
1369 if (!ops)
1370 return;
1371
1372 if (!ops->hierarchies)
1373 return;
1374
1375 for (int i = 0; ops->hierarchies[i]; i++) {
1376 struct hierarchy *h = ops->hierarchies[i];
1377
1378 /* Close all monitor cgroup file descriptors. */
1379 close_prot_errno_disarm(h->dfd_mon);
1380 }
1381 /* Close the cgroup root file descriptor. */
1382 close_prot_errno_disarm(ops->dfd_mnt);
1383
1384 /*
1385 * The checking for freezer support should obviously be done at cgroup
1386 * initialization time but that doesn't work reliably. The freezer
1387 * controller has been demoted (rightly so) to a simple file located in
1388 * each non-root cgroup. At the time when the container is created we
1389 * might still be located in /sys/fs/cgroup and so checking for
1390 * cgroup.freeze won't tell us anything because this file doesn't exist
1391 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1392 * find an already existing cgroup and then check within that cgroup
1393 * for the existence of cgroup.freeze but that will only work on
1394 * systemd based hosts. Other init systems might not manage cgroups and
1395 * so no cgroup will exist. So we defer until we have created cgroups
1396 * for our container which means we check here.
1397 */
1398 if (pure_unified_layout(ops) &&
1399 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1400 AT_SYMLINK_NOFOLLOW)) {
1401 TRACE("Unified hierarchy supports freezer");
1402 ops->unified->utilities |= FREEZER_CONTROLLER;
1403 }
1404 }
1405
1406 /* cgroup-full:* is done, no need to create subdirs */
1407 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1408 {
1409 switch (cgroup_automount_type) {
1410 case LXC_AUTO_CGROUP_RO:
1411 return true;
1412 case LXC_AUTO_CGROUP_RW:
1413 return true;
1414 case LXC_AUTO_CGROUP_MIXED:
1415 return true;
1416 }
1417
1418 return false;
1419 }
1420
1421 /* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
1422 * remount the controller read-only if needed and bind-mount the cgroupfs
1423 * onto controller/the/cg/path.
1424 */
1425 static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
1426 char *hierarchy_mnt, char *cgpath,
1427 const char *container_cgroup)
1428 {
1429 __do_free char *sourcepath = NULL;
1430 int ret, remount_flags;
1431 int flags = MS_BIND;
1432
1433 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1434 (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
1435 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
1436 if (ret < 0)
1437 return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
1438 hierarchy_mnt, hierarchy_mnt);
1439
1440 remount_flags = add_required_remount_flags(hierarchy_mnt,
1441 hierarchy_mnt,
1442 flags | MS_REMOUNT);
1443 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
1444 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1445 NULL);
1446 if (ret < 0)
1447 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
1448
1449 INFO("Remounted %s read-only", hierarchy_mnt);
1450 }
1451
1452 sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
1453 if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
1454 flags |= MS_RDONLY;
1455
1456 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1457 if (ret < 0)
1458 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
1459 h->controllers[0], cgpath);
1460 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1461
1462 if (flags & MS_RDONLY) {
1463 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1464 flags | MS_REMOUNT);
1465 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1466 if (ret < 0)
1467 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
1468 INFO("Remounted %s read-only", cgpath);
1469 }
1470
1471 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1472 return 0;
1473 }
1474
1475 /* __cgroupfs_mount
1476 *
1477 * Mount cgroup hierarchies directly without using bind-mounts. The main
1478 * use cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1479 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1480 */
1481 static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1482 struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
1483 const char *hierarchy_mnt)
1484 {
1485 __do_close int fd_fs = -EBADF;
1486 unsigned int flags = 0;
1487 char *fstype;
1488 int ret;
1489
1490 if (dfd_mnt_cgroupfs < 0)
1491 return ret_errno(EINVAL);
1492
1493 flags |= MOUNT_ATTR_NOSUID;
1494 flags |= MOUNT_ATTR_NOEXEC;
1495 flags |= MOUNT_ATTR_NODEV;
1496 flags |= MOUNT_ATTR_RELATIME;
1497
1498 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1499 (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
1500 flags |= MOUNT_ATTR_RDONLY;
1501
1502 if (is_unified_hierarchy(h))
1503 fstype = "cgroup2";
1504 else
1505 fstype = "cgroup";
1506
1507 if (can_use_mount_api()) {
1508 fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
1509 if (fd_fs < 0)
1510 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
1511
1512 if (!is_unified_hierarchy(h)) {
1513 for (const char **it = (const char **)h->controllers; it && *it; it++) {
1514 if (strnequal(*it, "name=", STRLITERALLEN("name=")))
1515 ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
1516 else
1517 ret = fs_set_property(fd_fs, *it, "");
1518 if (ret < 0)
1519 return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
1520 }
1521 }
1522
1523 ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
1524 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
1525 flags);
1526 } else {
1527 __do_free char *controllers = NULL, *target = NULL;
1528 unsigned int old_flags = 0;
1529 const char *rootfs_mnt;
1530
1531 if (!is_unified_hierarchy(h)) {
1532 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1533 if (!controllers)
1534 return ret_errno(ENOMEM);
1535 }
1536
1537 rootfs_mnt = get_rootfs_mnt(rootfs);
1538 ret = mnt_attributes_old(flags, &old_flags);
1539 if (ret)
1540 return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
1541
1542 target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
1543 ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
1544 }
1545 if (ret < 0)
1546 return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
1547 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1548
1549 DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
1550 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1551 return 0;
1552 }
1553
1554 static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1555 struct lxc_rootfs *rootfs,
1556 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
1557 {
1558 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1559 dfd_mnt_cgroupfs, hierarchy_mnt);
1560 }
1561
1562 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1563 struct lxc_rootfs *rootfs,
1564 int dfd_mnt_cgroupfs,
1565 const char *hierarchy_mnt)
1566 {
1567 switch (cgroup_automount_type) {
1568 case LXC_AUTO_CGROUP_FULL_RO:
1569 break;
1570 case LXC_AUTO_CGROUP_FULL_RW:
1571 break;
1572 case LXC_AUTO_CGROUP_FULL_MIXED:
1573 break;
1574 default:
1575 return 0;
1576 }
1577
1578 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1579 dfd_mnt_cgroupfs, hierarchy_mnt);
1580 }
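/*
 * Example (container config): the cgroup_automount_type values handled above
 * correspond to lxc.mount.auto settings such as
 *
 *	lxc.mount.auto = cgroup:mixed
 *	lxc.mount.auto = cgroup-full:ro
 *	lxc.mount.auto = cgroup:rw:force
 *
 * where the optional "force" suffix requests that LXC mount the cgroup
 * filesystems itself even when a cgroup namespace is used.
 */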
1581
1582 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1583 struct lxc_handler *handler, int cg_flags)
1584 {
1585 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1586 __do_free char *cgroup_root = NULL;
1587 int cgroup_automount_type;
1588 bool in_cgroup_ns = false, wants_force_mount = false;
1589 struct lxc_conf *conf = handler->conf;
1590 struct lxc_rootfs *rootfs = &conf->rootfs;
1591 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1592 int ret;
1593
1594 if (!ops)
1595 return ret_set_errno(false, ENOENT);
1596
1597 if (!ops->hierarchies)
1598 return true;
1599
1600 if (!conf)
1601 return ret_set_errno(false, EINVAL);
1602
1603 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1604 return log_trace(true, "No cgroup mounts requested");
1605
1606 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1607 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1608 wants_force_mount = true;
1609 }
1610
1611 switch (cg_flags) {
1612 case LXC_AUTO_CGROUP_RO:
1613 TRACE("Read-only cgroup mounts requested");
1614 break;
1615 case LXC_AUTO_CGROUP_RW:
1616 TRACE("Read-write cgroup mounts requested");
1617 break;
1618 case LXC_AUTO_CGROUP_MIXED:
1619 TRACE("Mixed cgroup mounts requested");
1620 break;
1621 case LXC_AUTO_CGROUP_FULL_RO:
1622 TRACE("Full read-only cgroup mounts requested");
1623 break;
1624 case LXC_AUTO_CGROUP_FULL_RW:
1625 TRACE("Full read-write cgroup mounts requested");
1626 break;
1627 case LXC_AUTO_CGROUP_FULL_MIXED:
1628 TRACE("Full mixed cgroup mounts requested");
1629 break;
1630 default:
1631 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1632 }
1633 cgroup_automount_type = cg_flags;
1634
1635 if (!wants_force_mount) {
1636 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
1637
1638 /*
1639 * Most recent distro versions currently ship init systems that
1640 * support cgroup2 but do not mount it by default unless
1641 * explicitly told to, even if the host is cgroup2 only. That
1642 * means they often will fail to boot. Fix this by pre-mounting
1643 * cgroup2 by default. We will likely need to be doing this a
1644 * few years until all distros have switched over to cgroup2 at
1645 * which point we can safely assume that their init systems
1646 * will mount it themselves.
1647 */
1648 if (pure_unified_layout(ops))
1649 wants_force_mount = true;
1650 }
1651
1652 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
1653 in_cgroup_ns = true;
1654
1655 if (in_cgroup_ns && !wants_force_mount)
1656 return log_trace(true, "Mounting cgroups not requested or needed");
1657
1658 /* This is really the codepath that we want. */
1659 if (pure_unified_layout(ops)) {
1660 __do_close int dfd_mnt_unified = -EBADF;
1661
1662 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1663 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1664 if (dfd_mnt_unified < 0)
1665 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1666 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1667 /*
1668 * If cgroup namespaces are supported but the container will
1669 * not have CAP_SYS_ADMIN after it has started we need to mount
1670 * the cgroups manually.
1671 *
1672 * Note that here we know that wants_force_mount is true.
1673 * Otherwise we would've returned early above.
1674 */
1675 if (in_cgroup_ns) {
1676 /*
1677 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
1678 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
1679 * 3. cgroup:mixed:force -> See comment above how this
1680 * does not apply so
1681 * cgroup:mixed is equal to
1682 * cgroup:rw when cgroup
1683 * namespaces are supported.
1684 *
1685 * 4. cgroup:rw -> No-op; init system responsible for mounting.
1686 * 5. cgroup:ro -> No-op; init system responsible for mounting.
1687 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
1688 *
1689 * 7. cgroup-full:rw -> Not supported.
1690 * 8. cgroup-full:ro -> Not supported.
1691 * 9. cgroup-full:mixed -> Not supported.
1692 *
1693 * 10. cgroup-full:rw:force -> Not supported.
1694 * 11. cgroup-full:ro:force -> Not supported.
1695 * 12. cgroup-full:mixed:force -> Not supported.
1696 */
1697 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
1698 if (ret < 0)
1699 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
1700
1701 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
1702 } else {
1703 /*
1704 * Either no cgroup namespace is supported (highly
1705 * unlikely unless we're dealing with a Frankenkernel),
1706 * or the user requested to keep the cgroup namespace
1707 * of the host or another container.
1708 */
1709 if (wants_force_mount) {
1710 /*
1711 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
1712 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
1713 * 3. cgroup:mixed:force -> Bind-mount the cgroup2 filesystem and
1714 * make the parent directory of the
1715 * container's cgroup read-only but the
1716 * container's cgroup writable.
1717 *
1718 * 10. cgroup-full:rw:force ->
1719 * 11. cgroup-full:ro:force ->
1720 * 12. cgroup-full:mixed:force ->
1721 */
1722 errno = EOPNOTSUPP;
1723 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1724 } else {
1725 errno = EOPNOTSUPP;
1726 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1727 }
1728 }
1729
1730 return syserrno(false, "Failed to mount cgroups");
1731 }
1732
1733 /*
1734 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1735 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1736 * DEFAULT_CGROUP_MOUNTPOINT define.
1737 */
1738 if (can_use_mount_api()) {
1739 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1740 if (fd_fs < 0)
1741 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1742
1743 ret = fs_set_property(fd_fs, "mode", "0755");
1744 if (ret < 0)
1745 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1746
1747 ret = fs_set_property(fd_fs, "size", "10240k");
1748 if (ret < 0)
1749 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1750
1751 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1752 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1753 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1754 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
1755 } else {
1756 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1757 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1758 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1759 "size=10240k,mode=755", rootfs_mnt);
1760 }
1761 if (ret < 0)
1762 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1763 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1764
1765 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1766 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1767 if (dfd_mnt_tmpfs < 0)
1768 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1769 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1770
1771 for (int i = 0; ops->hierarchies[i]; i++) {
1772 __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
1773 struct hierarchy *h = ops->hierarchies[i];
1774
1775 ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
1776 if (ret < 0)
1777 return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
1778
1779 if (in_cgroup_ns && wants_force_mount) {
1780 /*
1781 * If cgroup namespaces are supported but the container
1782 * will not have CAP_SYS_ADMIN after it has started we
1783 * need to mount the cgroups manually.
1784 */
1785 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
1786 dfd_mnt_tmpfs, h->at_mnt);
1787 if (ret < 0)
1788 return false;
1789
1790 continue;
1791 }
1792
1793 /* Here is where the ancient kernel section begins. */
1794 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
1795 dfd_mnt_tmpfs, h->at_mnt);
1796 if (ret < 0)
1797 return false;
1798
1799 if (!cg_mount_needs_subdirs(cgroup_automount_type))
1800 continue;
1801
1802 if (!cgroup_root)
1803 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1804
1805 hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
1806 path2 = must_make_path(hierarchy_mnt, h->at_base,
1807 ops->container_cgroup, NULL);
1808 ret = mkdir_p(path2, 0755);
1809 if (ret < 0 && (errno != EEXIST))
1810 return false;
1811
1812 ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
1813 hierarchy_mnt, path2,
1814 ops->container_cgroup);
1815 if (ret < 0)
1816 return false;
1817 }
1818
1819 return true;
1820 }
1821
1822 /* Only root needs to escape to the cgroup of its init. */
1823 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1824 struct lxc_conf *conf)
1825 {
1826 if (!ops)
1827 return ret_set_errno(false, ENOENT);
1828
1829 if (!ops->hierarchies)
1830 return true;
1831
1832 if (!conf)
1833 return ret_set_errno(false, EINVAL);
1834
1835 if (conf->cgroup_meta.relative || geteuid())
1836 return true;
1837
1838 for (int i = 0; ops->hierarchies[i]; i++) {
1839 __do_free char *fullpath = NULL;
1840 int ret;
1841
1842 fullpath = make_cgroup_path(ops->hierarchies[i],
1843 ops->hierarchies[i]->at_base,
1844 "cgroup.procs", NULL);
1845 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1846 if (ret != 0)
1847 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1848 }
1849
1850 return true;
1851 }
1852
1853 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1854 {
1855 int i = 0;
1856
1857 if (!ops)
1858 return ret_set_errno(-1, ENOENT);
1859
1860 if (!ops->hierarchies)
1861 return 0;
1862
1863 for (; ops->hierarchies[i]; i++)
1864 ;
1865
1866 return i;
1867 }
1868
1869 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1870 int n, char ***out)
1871 {
1872 int i;
1873
1874 if (!ops)
1875 return ret_set_errno(false, ENOENT);
1876
1877 if (!ops->hierarchies)
1878 return ret_set_errno(false, ENOENT);
1879
1880 /* sanity check that all of hierarchies 0..n exist */
1881 for (i = 0; i <= n; i++)
1882 if (!ops->hierarchies[i])
1883 return ret_set_errno(false, ENOENT);
1884
1885 *out = ops->hierarchies[n]->controllers;
1886
1887 return true;
1888 }
1889
1890 static int cg_legacy_freeze(struct cgroup_ops *ops)
1891 {
1892 struct hierarchy *h;
1893
1894 h = get_hierarchy(ops, "freezer");
1895 if (!h)
1896 return ret_set_errno(-1, ENOENT);
1897
1898 return lxc_write_openat(h->path_con, "freezer.state",
1899 "FROZEN", STRLITERALLEN("FROZEN"));
1900 }
1901
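/*
 * Illustrative note: on cgroup2 the cgroup.events file contains key/value
 * lines such as "populated 1" and "frozen 0", and the kernel raises
 * EPOLLPRI on it when a value changes. The callback below re-reads the
 * file on every wakeup and looks for the "frozen <state>" line it is
 * waiting for.
 */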
1902 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1903 struct lxc_epoll_descr *descr)
1904 {
1905 __do_free char *line = NULL;
1906 __do_fclose FILE *f = NULL;
1907 int state = PTR_TO_INT(cbdata);
1908 size_t len = 0;
1909 const char *state_string;
1910
1911 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1912 if (!f)
1913 return LXC_MAINLOOP_ERROR;
1914
1915 if (state == 1)
1916 state_string = "frozen 1";
1917 else
1918 state_string = "frozen 0";
1919
1920 while (getline(&line, &len, f) != -1)
1921 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1922 return LXC_MAINLOOP_CLOSE;
1923
1924 rewind(f);
1925
1926 return LXC_MAINLOOP_CONTINUE;
1927 }
1928
1929 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1930 const char *state_string,
1931 int state_num,
1932 const char *epoll_error,
1933 const char *wait_error)
1934 {
1935 __do_close int fd = -EBADF;
1936 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1937 int ret;
1938 struct lxc_epoll_descr descr;
1939 struct hierarchy *h;
1940
1941 h = ops->unified;
1942 if (!h)
1943 return ret_set_errno(-1, ENOENT);
1944
1945 if (!h->path_con)
1946 return ret_set_errno(-1, EEXIST);
1947
1948 if (timeout != 0) {
1949 __do_free char *events_file = NULL;
1950
1951 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1952 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1953 if (fd < 0)
1954 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1955
1956 ret = lxc_mainloop_open(&descr);
1957 if (ret)
1958 return log_error_errno(-1, errno, "%s", epoll_error);
1959
1960 /* automatically cleaned up now */
1961 descr_ptr = &descr;
1962
1963 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
1964 if (ret < 0)
1965 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1966 }
1967
1968 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1969 if (ret < 0)
1970 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
1971
1972 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1973 return log_error_errno(-1, errno, "%s", wait_error);
1974
1975 return 0;
1976 }
1977
1978 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1979 {
1980 return cg_unified_freeze_do(ops, timeout, "1", 1,
1981 "Failed to create epoll instance to wait for container freeze",
1982 "Failed to wait for container to be frozen");
1983 }
1984
1985 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1986 {
1987 if (!ops->hierarchies)
1988 return ret_set_errno(-1, ENOENT);
1989
1990 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1991 return cg_legacy_freeze(ops);
1992
1993 return cg_unified_freeze(ops, timeout);
1994 }
1995
1996 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
1997 {
1998 struct hierarchy *h;
1999
2000 h = get_hierarchy(ops, "freezer");
2001 if (!h)
2002 return ret_set_errno(-1, ENOENT);
2003
2004 return lxc_write_openat(h->path_con, "freezer.state",
2005 "THAWED", STRLITERALLEN("THAWED"));
2006 }
2007
2008 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2009 {
2010 return cg_unified_freeze_do(ops, timeout, "0", 0,
2011 "Failed to create epoll instance to wait for container unfreeze",
2012 "Failed to wait for container to be unfrozen");
2013 }
2014
2015 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2016 {
2017 if (!ops->hierarchies)
2018 return ret_set_errno(-1, ENOENT);
2019
2020 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2021 return cg_legacy_unfreeze(ops);
2022
2023 return cg_unified_unfreeze(ops, timeout);
2024 }
2025
2026 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2027 const char *controller, bool limiting)
2028 {
2029 struct hierarchy *h;
2030 size_t len;
2031 const char *path;
2032
2033 h = get_hierarchy(ops, controller);
2034 if (!h)
2035 return log_warn_errno(NULL, ENOENT,
2036 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2037
2038 if (limiting)
2039 path = h->path_lim;
2040 else
2041 path = h->path_con;
2042 if (!path)
2043 return NULL;
2044
2045 len = strlen(h->at_mnt);
2046 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2047 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2048 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2049 path += strspn(path, "/");
2050 }
2051 return path + len;
2052 }
2053
2054 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2055 const char *controller)
2056 {
2057 return cgfsng_get_cgroup_do(ops, controller, false);
2058 }
2059
2060 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2061 const char *controller)
2062 {
2063 return cgfsng_get_cgroup_do(ops, controller, true);
2064 }
2065
2066 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2067 * which must be freed by the caller.
2068 */
2069 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2070 const char *inpath,
2071 const char *filename)
2072 {
2073 return make_cgroup_path(h, inpath, filename, NULL);
2074 }
2075
2076 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2077 {
2078 int idx = 1;
2079 int ret;
2080 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2081 ssize_t pidstr_len;
2082
2083 /* Create leaf cgroup. */
2084 ret = mkdirat(unified_fd, ".lxc", 0755);
2085 if (ret < 0 && errno != EEXIST)
2086 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2087
2088 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2089 if (pidstr_len < 0)
2090 return pidstr_len;
2091
2092 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2093 if (ret < 0)
2094 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2095 if (ret == 0)
2096 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2097
2098 /* this is a non-leaf node */
2099 if (errno != EBUSY)
2100 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2101
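/*
 * Illustrative note: EBUSY from cgroup.procs means the target cgroup
 * violates the cgroup2 "no internal processes" rule, i.e. it has
 * controllers enabled for its children and thus cannot hold tasks itself.
 * The loop below keeps creating fresh ".lxc-<n>" leaf cgroups beneath it
 * until one accepts the process.
 */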
2102 do {
2103 bool rm = false;
2104 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2105 char *slash = attach_cgroup;
2106
2107 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2108 if (ret < 0)
2109 return ret;
2110
2111 /*
2112 * This shouldn't really happen but the compiler might complain
2113 * that a short write would cause a buffer overrun. So be on
2114 * the safe side.
2115 */
2116 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2117 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer overrun");
2118
2119 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2120 *slash = '\0';
2121
2122 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2123 if (ret < 0 && errno != EEXIST)
2124 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2125 if (ret == 0)
2126 rm = true;
2127
2128 *slash = '/';
2129
2130 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2131 if (ret == 0)
2132 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2133
2134 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2135 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2136
2137 /* this is a non-leaf node */
2138 if (errno != EBUSY)
2139 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2140
2141 idx++;
2142 } while (idx < 1000);
2143
2144 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2145 }
2146
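/*
 * A sketch of the attach protocol used by the two wrappers below: the
 * child wrapper runs in the container's user namespace, opens
 * ".lxc/cgroup.procs" and "cgroup.procs" relative to the unified cgroup
 * fd, and passes both fds back over a socketpair via SCM_RIGHTS; the
 * parent wrapper then simply writes the pid into whichever fd accepts it.
 */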
2147 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2148 int unified_fd, int *sk_fd)
2149 {
2150 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2151 int target_fds[2];
2152 ssize_t ret;
2153
2154 /* Create leaf cgroup. */
2155 ret = mkdirat(unified_fd, ".lxc", 0755);
2156 if (ret < 0 && errno != EEXIST)
2157 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2158
2159 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2160 if (target_fd0 < 0)
2161 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2162 target_fds[0] = target_fd0;
2163
2164 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2165 if (target_fd1 < 0)
2166 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2167 target_fds[1] = target_fd1;
2168
2169 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2170 if (ret <= 0)
2171 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2172 target_fd0, target_fd1);
2173
2174 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2175 }
2176
2177 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2178 int *sk_fd, pid_t pid)
2179 {
2180 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2181 int target_fds[2];
2182 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2183 size_t pidstr_len;
2184 ssize_t ret;
2185
2186 ret = lxc_abstract_unix_recv_two_fds(sk, target_fds);
2187 if (ret < 0)
2188 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2189 target_fd0 = target_fds[0];
2190 target_fd1 = target_fds[1];
2191
2192 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2193
2194 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2195 if (ret > 0 && ret == pidstr_len)
2196 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2197
2198 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2199 if (ret > 0 && ret == pidstr_len)
2200 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2201
2202 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2203 target_fd0, target_fd1);
2204 }
2205
2206 struct userns_exec_unified_attach_data {
2207 const struct lxc_conf *conf;
2208 int unified_fd;
2209 int sk_pair[2];
2210 pid_t pid;
2211 };
2212
2213 static int cgroup_unified_attach_child_wrapper(void *data)
2214 {
2215 struct userns_exec_unified_attach_data *args = data;
2216
2217 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2218 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2219 return ret_errno(EINVAL);
2220
2221 close_prot_errno_disarm(args->sk_pair[0]);
2222 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2223 &args->sk_pair[1]);
2224 }
2225
2226 static int cgroup_unified_attach_parent_wrapper(void *data)
2227 {
2228 struct userns_exec_unified_attach_data *args = data;
2229
2230 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2231 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2232 return ret_errno(EINVAL);
2233
2234 close_prot_errno_disarm(args->sk_pair[1]);
2235 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2236 args->pid);
2237 }
2238
2239 /* Technically, we're always at a delegation boundary here (This is especially
2240 * true when cgroup namespaces are available.). The reasoning is that in order
2241 * for us to have been able to start a container in the first place the root
2242 * cgroup must have been a leaf node. Now, either the container's init system
2243 * has populated the cgroup and kept it as a leaf node or it has created
2244 * subtrees. In the former case we will simply attach to the leaf node we
2245 * created when we started the container; in the latter case we create our
2246 * own cgroup for the attaching process.
2247 */
2248 static int __cg_unified_attach(const struct hierarchy *h,
2249 const struct lxc_conf *conf, const char *name,
2250 const char *lxcpath, pid_t pid,
2251 const char *controller)
2252 {
2253 __do_close int unified_fd = -EBADF;
2254 __do_free char *path = NULL, *cgroup = NULL;
2255 int ret;
2256
2257 if (!conf || !name || !lxcpath || pid <= 0)
2258 return ret_errno(EINVAL);
2259
2260 ret = cgroup_attach(conf, name, lxcpath, pid);
2261 if (ret == 0)
2262 return log_trace(0, "Attached to unified cgroup via command handler");
2263 if (ret != -ENOCGROUP2)
2264 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2265
2266 /* Fall back to retrieving the path for the unified cgroup. */
2267 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2268 /* not running */
2269 if (!cgroup)
2270 return 0;
2271
2272 path = make_cgroup_path(h, cgroup, NULL);
2273
2274 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2275 if (unified_fd < 0)
2276 return ret_errno(EBADF);
2277
2278 if (!lxc_list_empty(&conf->id_map)) {
2279 struct userns_exec_unified_attach_data args = {
2280 .conf = conf,
2281 .unified_fd = unified_fd,
2282 .pid = pid,
2283 };
2284
2285 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2286 if (ret < 0)
2287 return -errno;
2288
2289 ret = userns_exec_minimal(conf,
2290 cgroup_unified_attach_parent_wrapper,
2291 &args,
2292 cgroup_unified_attach_child_wrapper,
2293 &args);
2294 } else {
2295 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2296 }
2297
2298 return ret;
2299 }
2300
2301 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2302 const struct lxc_conf *conf,
2303 const char *name, const char *lxcpath,
2304 pid_t pid)
2305 {
2306 int len, ret;
2307 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2308
2309 if (!ops)
2310 return ret_set_errno(false, ENOENT);
2311
2312 if (!ops->hierarchies)
2313 return true;
2314
2315 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2316 if (len < 0)
2317 return false;
2318
2319 for (int i = 0; ops->hierarchies[i]; i++) {
2320 __do_free char *fullpath = NULL, *path = NULL;
2321 struct hierarchy *h = ops->hierarchies[i];
2322
2323 if (h->fs_type == UNIFIED_HIERARCHY) {
2324 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2325 h->controllers[0]);
2326 if (ret < 0)
2327 return false;
2328
2329 continue;
2330 }
2331
2332 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2333 /* not running */
2334 if (!path)
2335 return false;
2336
2337 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2338 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2339 if (ret < 0)
2340 return log_error_errno(false, errno, "Failed to attach %d to %s",
2341 (int)pid, fullpath);
2342 }
2343
2344 return true;
2345 }
2346
2347 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2348 * don't have a cgroup_data set up, so we ask the running container through the
2349 * commands API for the cgroup path.
2350 */
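/*
 * For example, a @filename of "memory.limit_in_bytes" is truncated at the
 * first '.' to select the "memory" hierarchy; the full key is then read
 * from the container's directory in that hierarchy.
 */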
2351 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2352 char *value, size_t len, const char *name,
2353 const char *lxcpath)
2354 {
2355 __do_free char *path = NULL;
2356 __do_free char *controller = NULL;
2357 char *p;
2358 struct hierarchy *h;
2359 int ret = -1;
2360
2361 if (!ops)
2362 return ret_set_errno(-1, ENOENT);
2363
2364 controller = strdup(filename);
2365 if (!controller)
2366 return ret_errno(ENOMEM);
2367
2368 p = strchr(controller, '.');
2369 if (p)
2370 *p = '\0';
2371
2372 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2373 /* not running */
2374 if (!path)
2375 return -1;
2376
2377 h = get_hierarchy(ops, controller);
2378 if (h) {
2379 __do_free char *fullpath = NULL;
2380
2381 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2382 ret = lxc_read_from_file(fullpath, value, len);
2383 }
2384
2385 return ret;
2386 }
2387
2388 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2389 {
2390 for (int count = 0; count < 3; count++, val++) {
2391 switch (*val) {
2392 case 'r':
2393 device->access[count] = *val;
2394 break;
2395 case 'w':
2396 device->access[count] = *val;
2397 break;
2398 case 'm':
2399 device->access[count] = *val;
2400 break;
2401 case '\n':
2402 case '\0':
2403 count = 3;
2404 break;
2405 default:
2406 return ret_errno(EINVAL);
2407 }
2408 }
2409
2410 return 0;
2411 }
2412
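/*
 * For reference, devices cgroup v1 rules as written to devices.allow or
 * devices.deny look like:
 *	a		all devices
 *	c 1:3 rwm	the /dev/null character device
 *	b 8:* rw	any device with block major 8
 * which is the format the parser below expects in @val.
 */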
2413 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2414 const char *val)
2415 {
2416 int count, ret;
2417 char temp[50];
2418
2419 if (strequal("devices.allow", key))
2420 device->allow = 1; /* allow the device */
2421 else
2422 device->allow = 0; /* deny the device */
2423
2424 if (strequal(val, "a")) {
2425 /* global rule */
2426 device->type = 'a';
2427 device->major = -1;
2428 device->minor = -1;
2429 return 0;
2430 }
2431
2432 switch (*val) {
2433 case 'a':
2434 __fallthrough;
2435 case 'b':
2436 __fallthrough;
2437 case 'c':
2438 device->type = *val;
2439 break;
2440 default:
2441 return -1;
2442 }
2443
2444 val++;
2445 if (!isspace(*val))
2446 return -1;
2447 val++;
2448 if (*val == '*') {
2449 device->major = -1;
2450 val++;
2451 } else if (isdigit(*val)) {
2452 memset(temp, 0, sizeof(temp));
2453 for (count = 0; count < sizeof(temp) - 1; count++) {
2454 temp[count] = *val;
2455 val++;
2456 if (!isdigit(*val))
2457 break;
2458 }
2459 ret = lxc_safe_int(temp, &device->major);
2460 if (ret)
2461 return -1;
2462 } else {
2463 return -1;
2464 }
2465 if (*val != ':')
2466 return -1;
2467 val++;
2468
2469 /* read minor */
2470 if (*val == '*') {
2471 device->minor = -1;
2472 val++;
2473 } else if (isdigit(*val)) {
2474 memset(temp, 0, sizeof(temp));
2475 for (count = 0; count < sizeof(temp) - 1; count++) {
2476 temp[count] = *val;
2477 val++;
2478 if (!isdigit(*val))
2479 break;
2480 }
2481 ret = lxc_safe_int(temp, &device->minor);
2482 if (ret)
2483 return -1;
2484 } else {
2485 return -1;
2486 }
2487 if (!isspace(*val))
2488 return -1;
2489
2490 return device_cgroup_parse_access(device, ++val);
2491 }
2492
2493 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2494 * don't have a cgroup_data set up, so we ask the running container through the
2495 * commands API for the cgroup path.
2496 */
2497 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2498 const char *key, const char *value,
2499 const char *name, const char *lxcpath)
2500 {
2501 __do_free char *path = NULL;
2502 __do_free char *controller = NULL;
2503 char *p;
2504 struct hierarchy *h;
2505 int ret = -1;
2506
2507 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2508 is_empty_string(name) || is_empty_string(lxcpath))
2509 return ret_errno(EINVAL);
2510
2511 controller = strdup(key);
2512 if (!controller)
2513 return ret_errno(ENOMEM);
2514
2515 p = strchr(controller, '.');
2516 if (p)
2517 *p = '\0';
2518
2519 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2520 struct device_item device = {};
2521
2522 ret = device_cgroup_rule_parse(&device, key, value);
2523 if (ret < 0)
2524 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2525 key, value);
2526
2527 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2528 if (ret < 0)
2529 return -1;
2530
2531 return 0;
2532 }
2533
2534 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2535 /* not running */
2536 if (!path)
2537 return -1;
2538
2539 h = get_hierarchy(ops, controller);
2540 if (h) {
2541 __do_free char *fullpath = NULL;
2542
2543 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2544 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2545 }
2546
2547 return ret;
2548 }
2549
2550 /* Take a devices cgroup line such as
2551 * /dev/foo rwx
2552 * and convert it into a valid
2553 * type major:minor mode
2554 * rule. Return <0 on error. The destination buffer passed in by the
2555 * caller is preallocated and long enough to hold the output.
2556 */
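/*
 * For example, assuming the usual device numbers, the line "/dev/fuse rwm"
 * would be stat()ed and converted into the rule "c 10:229 rwm".
 */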
2557 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2558 const char *devpath)
2559 {
2560 __do_free char *path = NULL;
2561 char *mode = NULL;
2562 int n_parts, ret;
2563 char *p;
2564 struct stat sb;
2565
2566 path = strdup(devpath);
2567 if (!path)
2568 return ret_errno(ENOMEM);
2569
2570 /*
2571 * Read path followed by mode. Ignore any trailing text.
2572 * A ' # comment' would be legal. Technically other text is not
2573 * legal, we could check for that if we cared to.
2574 */
2575 for (n_parts = 1, p = path; *p; p++) {
2576 if (*p != ' ')
2577 continue;
2578 *p = '\0';
2579
2580 if (n_parts != 1)
2581 break;
2582 p++;
2583 n_parts++;
2584
2585 while (*p == ' ')
2586 p++;
2587
2588 mode = p;
2589
2590 if (*p == '\0')
2591 return ret_set_errno(-1, EINVAL);
2592 }
2593
2594 if (!mode)
2595 return ret_errno(EINVAL);
2596
2597 if (device_cgroup_parse_access(device, mode) < 0)
2598 return -1;
2599
2600 ret = stat(path, &sb);
2601 if (ret < 0)
2602 return ret_set_errno(-1, errno);
2603
2604 mode_t m = sb.st_mode & S_IFMT;
2605 switch (m) {
2606 case S_IFBLK:
2607 device->type = 'b';
2608 break;
2609 case S_IFCHR:
2610 device->type = 'c';
2611 break;
2612 default:
2613 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2614 }
2615
2616 device->major = MAJOR(sb.st_rdev);
2617 device->minor = MINOR(sb.st_rdev);
2618 device->allow = 1;
2619
2620 return 0;
2621 }
2622
2623 static int convert_devpath(const char *invalue, char *dest)
2624 {
2625 struct device_item device = {};
2626 int ret;
2627
2628 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2629 if (ret < 0)
2630 return -1;
2631
2632 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2633 device.minor, device.access);
2634 if (ret < 0)
2635 return log_error_errno(ret, -ret,
2636 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2637 device.type, device.major, device.minor,
2638 device.access);
2639
2640 return 0;
2641 }
2642
2643 /* Called from setup_limits - here we have the container's cgroup_data because
2644 * we created the cgroups.
2645 */
2646 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2647 const char *value, bool is_cpuset)
2648 {
2649 __do_free char *controller = NULL;
2650 char *p;
2651 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2652 char converted_value[50];
2653 struct hierarchy *h;
2654
2655 controller = strdup(filename);
2656 if (!controller)
2657 return ret_errno(ENOMEM);
2658
2659 p = strchr(controller, '.');
2660 if (p)
2661 *p = '\0';
2662
2663 if (strequal("devices.allow", filename) && value[0] == '/') {
2664 int ret;
2665
2666 ret = convert_devpath(value, converted_value);
2667 if (ret < 0)
2668 return ret;
2669 value = converted_value;
2670 }
2671
2672 h = get_hierarchy(ops, controller);
2673 if (!h)
2674 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2675
2676 if (is_cpuset) {
2677 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2678 if (ret)
2679 return ret;
2680 }
2681 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2682 }
2683
2684 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2685 struct lxc_conf *conf,
2686 bool do_devices)
2687 {
2688 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2689 struct lxc_list *cgroup_settings = &conf->cgroup;
2690 struct lxc_list *iterator, *next;
2691 struct lxc_cgroup *cg;
2692 bool ret = false;
2693
2694 if (!ops)
2695 return ret_set_errno(false, ENOENT);
2696
2697 if (!conf)
2698 return ret_set_errno(false, EINVAL);
2699
2700 cgroup_settings = &conf->cgroup;
2701 if (lxc_list_empty(cgroup_settings))
2702 return true;
2703
2704 if (!ops->hierarchies)
2705 return ret_set_errno(false, EINVAL);
2706
2707 if (pure_unified_layout(ops))
2708 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2709
2710 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2711 if (!sorted_cgroup_settings)
2712 return false;
2713
2714 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2715 cg = iterator->elem;
2716
2717 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2718 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2719 if (do_devices && (errno == EACCES || errno == EPERM)) {
2720 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2721 continue;
2722 }
2723 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2724 goto out;
2725 }
2726 DEBUG("Set controller \"%s\" set to \"%s\"", cg->subsystem, cg->value);
2727 }
2728 }
2729
2730 ret = true;
2731 INFO("Limits for the legacy cgroup hierarchies have been setup");
2732 out:
2733 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2734 lxc_list_del(iterator);
2735 free(iterator);
2736 }
2737
2738 return ret;
2739 }
2740
2741 /*
2742 * Some of the parsing logic comes from the original cgroup device v1
2743 * implementation in the kernel.
2744 */
2745 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2746 struct lxc_conf *conf, const char *key,
2747 const char *val)
2748 {
2749 struct device_item device_item = {};
2750 int ret;
2751
2752 if (strequal("devices.allow", key) && abspath(val))
2753 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2754 else
2755 ret = device_cgroup_rule_parse(&device_item, key, val);
2756 if (ret < 0)
2757 return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2758
2759 /*
2760 * Note that bpf_list_add_device() returns 1 if it altered the device
2761 * list and 0 if it didn't; both return values indicate success.
2762 * Only a negative return value indicates an error.
2763 */
2764 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2765 if (ret < 0)
2766 return -1;
2767
2768 return 0;
2769 }
2770
2771 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2772 struct lxc_handler *handler)
2773 {
2774 struct lxc_list *cgroup_settings, *iterator;
2775 struct hierarchy *h;
2776 struct lxc_conf *conf;
2777
2778 if (!ops)
2779 return ret_set_errno(false, ENOENT);
2780
2781 if (!ops->hierarchies)
2782 return true;
2783
2784 if (!ops->container_cgroup)
2785 return ret_set_errno(false, EINVAL);
2786
2787 if (!handler || !handler->conf)
2788 return ret_set_errno(false, EINVAL);
2789 conf = handler->conf;
2790
2791 cgroup_settings = &conf->cgroup2;
2792 if (lxc_list_empty(cgroup_settings))
2793 return true;
2794
2795 if (!pure_unified_layout(ops))
2796 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2797
2798 if (!ops->unified)
2799 return false;
2800 h = ops->unified;
2801
2802 lxc_list_for_each(iterator, cgroup_settings) {
2803 struct lxc_cgroup *cg = iterator->elem;
2804 int ret;
2805
2806 if (strnequal("devices", cg->subsystem, 7))
2807 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
2808 else
2809 ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
2810 if (ret < 0)
2811 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2812
2813 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2814 }
2815
2816 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2817 }
2818
2819 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2820 {
2821 struct lxc_conf *conf;
2822 struct hierarchy *unified;
2823
2824 if (!ops)
2825 return ret_set_errno(false, ENOENT);
2826
2827 if (!ops->hierarchies)
2828 return true;
2829
2830 if (!ops->container_cgroup)
2831 return ret_set_errno(false, EEXIST);
2832
2833 if (!handler || !handler->conf)
2834 return ret_set_errno(false, EINVAL);
2835 conf = handler->conf;
2836
2837 unified = ops->unified;
2838 if (!unified || !device_utility_controller(unified) ||
2839 !unified->path_con ||
2840 lxc_list_empty(&(conf->bpf_devices).device_item))
2841 return true;
2842
2843 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2844 }
2845
2846 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2847 {
2848 __do_close int dfd_final = -EBADF;
2849 __do_free char *add_controllers = NULL, *copy = NULL;
2850 size_t full_len = 0;
2851 struct hierarchy *unified;
2852 int dfd_cur, ret;
2853 char *cur;
2854 char **it;
2855
2856 if (!ops->hierarchies || !pure_unified_layout(ops))
2857 return true;
2858
2859 unified = ops->unified;
2860 if (!unified->controllers[0])
2861 return true;
2862
2863 /* For now we simply enable all controllers that we have detected by
2864 * creating a string like "+memory +pids +cpu +io".
2865 * TODO: In the near future we might want to support "-<controller>"
2866 * etc. but whether supporting semantics like this make sense will need
2867 * some thinking.
2868 */
2869 for (it = unified->controllers; it && *it; it++) {
2870 full_len += strlen(*it) + 2;
2871 add_controllers = must_realloc(add_controllers, full_len + 1);
2872
2873 if (unified->controllers[0] == *it)
2874 add_controllers[0] = '\0';
2875
2876 (void)strlcat(add_controllers, "+", full_len + 1);
2877 (void)strlcat(add_controllers, *it, full_len + 1);
2878
2879 if (*(it + 1))
2880 (void)strlcat(add_controllers, " ", full_len + 1);
2881 }
2882
2883 copy = strdup(cgroup);
2884 if (!copy)
2885 return false;
2886
2887 /*
2888 * Placing the write to cgroup.subtree_control before the open() is
2889 * intentional because of the cgroup2 delegation model. It enforces
2890 * that leaf cgroups don't have any controllers enabled for delegation.
2891 */
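/*
 * Example walk (hypothetical cgroup "lxc.monitor.c1/x"): the controllers
 * are enabled in cgroup.subtree_control of the base cgroup first, then in
 * "lxc.monitor.c1", but never inside the final "x" component, which
 * therefore stays a leaf that can still be delegated.
 */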
2892 dfd_cur = unified->dfd_base;
2893 lxc_iterate_parts(cur, copy, "/") {
2894 /*
2895 * Even though we vetted the paths when we parsed the config
2896 * we're paranoid here and check that the path is neither
2897 * absolute nor walks upwards.
2898 */
2899 if (abspath(cur))
2900 return syserrno_set(-EINVAL, "No absolute paths allowed");
2901
2902 if (strnequal(cur, "..", STRLITERALLEN("..")))
2903 return syserrno_set(-EINVAL, "No upward walking paths allowed");
2904
2905 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2906 if (ret < 0)
2907 return syserrno(-errno, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2908
2909 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2910
2911 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2912 if (dfd_final < 0)
2913 return syserrno(-errno, "Fail to open directory %d(%s)", dfd_cur, cur);
2914 if (dfd_cur != unified->dfd_base)
2915 close(dfd_cur);
2916 /*
2917 * Leave dfd_final pointing to the last fd we opened so
2918 * it will be automatically zapped if we return early.
2919 */
2920 dfd_cur = dfd_final;
2921 }
2922
2923 return true;
2924 }
2925
2926 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2927 {
2928 if (!ops)
2929 return ret_set_errno(false, ENOENT);
2930
2931 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2932 }
2933
2934 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2935 {
2936 if (!ops)
2937 return ret_set_errno(false, ENOENT);
2938
2939 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2940 }
2941
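/*
 * The helpers below parse /proc/{1,self}/cgroup lines. For reference, a
 * unified (cgroup2) entry looks like "0::/user.slice/user-1000.slice",
 * while legacy entries look like "4:memory:/some/path" or, for named
 * hierarchies, "1:name=systemd:/some/path".
 */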
2942 static inline bool unified_cgroup(const char *line)
2943 {
2944 return *line == '0';
2945 }
2946
2947 static inline char *current_unified_cgroup(bool relative, char *line)
2948 {
2949 char *current_cgroup;
2950
2951 line += STRLITERALLEN("0::");
2952
2953 if (!abspath(line))
2954 return ERR_PTR(-EINVAL);
2955
2956 /* remove init.scope */
2957 if (!relative)
2958 line = prune_init_scope(line);
2959
2960 /* create a relative path */
2961 line = deabs(line);
2962
2963 current_cgroup = strdup(line);
2964 if (!current_cgroup)
2965 return ERR_PTR(-ENOMEM);
2966
2967 return current_cgroup;
2968 }
2969
2970 static inline const char *unprefix(const char *controllers)
2971 {
2972 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2973 return controllers + STRLITERALLEN("name=");
2974 return controllers;
2975 }
2976
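/*
 * On most kernels /sys/kernel/cgroup/delegate contains something like:
 *	cgroup.procs
 *	cgroup.threads
 *	cgroup.subtree_control
 *	memory.oom.group
 * which is mirrored by the fallback list below.
 */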
2977 static int __list_cgroup_delegate(char ***delegate)
2978 {
2979 __do_free char **list = NULL;
2980 __do_free char *buf = NULL;
2981 char *standard[] = {
2982 "cgroup.procs",
2983 "cgroup.threads",
2984 "cgroup.subtree_control",
2985 "memory.oom.group",
2986 NULL,
2987 };
2988 char *token;
2989 int ret;
2990
2991 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
2992 if (!buf) {
2993 for (char **p = standard; p && *p; p++) {
2994 ret = list_add_string(&list, *p);
2995 if (ret < 0)
2996 return ret;
2997 }
2998
2999 *delegate = move_ptr(list);
3000 return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
3001 }
3002
3003 lxc_iterate_parts(token, buf, " \t\n") {
3004 /*
3005 * We always need to chown this for both cgroup and
3006 * cgroup2.
3007 */
3008 if (strequal(token, "cgroup.procs"))
3009 continue;
3010
3011 ret = list_add_string(&list, token);
3012 if (ret < 0)
3013 return ret;
3014 }
3015
3016 *delegate = move_ptr(list);
3017 return 0;
3018 }
3019
3020 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3021 {
3022 __do_free_string_list char **list = NULL;
3023 int ret;
3024
3025 ret = __list_cgroup_delegate(&list);
3026 if (ret < 0)
3027 return syserrno(false, "Failed to determine unified cgroup delegation requirements");
3028
3029 for (char *const *s = list; s && *s; s++) {
3030 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3031 continue;
3032
3033 return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s);
3034 }
3035
3036 *ret_files = move_ptr(list);
3037 return true;
3038 }
3039
3040 static bool legacy_hierarchy_delegated(int dfd_base)
3041 {
3042 if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) && errno != ENOENT)
3043 return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
3044
3045 return true;
3046 }
3047
3048 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3049 bool unprivileged)
3050 {
3051 __do_free char *cgroup_info = NULL;
3052 char *it;
3053
3054 /*
3055 * Root spawned containers escape the current cgroup, so use init's
3056 * cgroups as our base in that case.
3057 */
3058 if (!relative && (geteuid() == 0))
3059 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3060 else
3061 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3062 if (!cgroup_info)
3063 return ret_errno(ENOMEM);
3064
3065 lxc_iterate_parts(it, cgroup_info, "\n") {
3066 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3067 __do_free char *controllers = NULL, *current_cgroup = NULL;
3068 __do_free_string_list char **controller_list = NULL,
3069 **delegate = NULL;
3070 char *line;
3071 int dfd, ret, type;
3072
3073 /* Handle the unified cgroup hierarchy. */
3074 line = it;
3075 if (unified_cgroup(line)) {
3076 char *unified_mnt;
3077
3078 type = UNIFIED_HIERARCHY;
3079
3080 current_cgroup = current_unified_cgroup(relative, line);
3081 if (IS_ERR(current_cgroup))
3082 return PTR_ERR(current_cgroup);
3083
3084 if (unified_cgroup_fd(ops->dfd_mnt)) {
3085 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3086 unified_mnt = "";
3087 } else {
3088 dfd_mnt = open_at(ops->dfd_mnt,
3089 "unified",
3090 PROTECT_OPATH_DIRECTORY,
3091 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3092 unified_mnt = "unified";
3093 }
3094 if (dfd_mnt < 0) {
3095 if (errno != ENOENT)
3096 return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt);
3097
3098 SYSTRACE("Unified cgroup not mounted");
3099 continue;
3100 }
3101 dfd = dfd_mnt;
3102
3103 if (!is_empty_string(current_cgroup)) {
3104 dfd_base = open_at(dfd_mnt, current_cgroup,
3105 PROTECT_OPATH_DIRECTORY,
3106 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3107 if (dfd_base < 0)
3108 return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
3109 dfd = dfd_base;
3110 }
3111
3112 if (!unified_hierarchy_delegated(dfd, &delegate))
3113 continue;
3114
3115 controller_list = unified_controllers(dfd, "cgroup.controllers");
3116 if (!controller_list) {
3117 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3118 controller_list = list_new();
3119 if (!controller_list)
3120 return syserrno(-ENOMEM, "Failed to create empty controller list");
3121 }
3122
3123 controllers = strdup(unified_mnt);
3124 if (!controllers)
3125 return ret_errno(ENOMEM);
3126 } else {
3127 char *__controllers, *__current_cgroup;
3128
3129 type = LEGACY_HIERARCHY;
3130
3131 __controllers = strchr(line, ':');
3132 if (!__controllers)
3133 return ret_errno(EINVAL);
3134 __controllers++;
3135
3136 __current_cgroup = strchr(__controllers, ':');
3137 if (!__current_cgroup)
3138 return ret_errno(EINVAL);
3139 *__current_cgroup = '\0';
3140 __current_cgroup++;
3141
3142 controllers = strdup(unprefix(__controllers));
3143 if (!controllers)
3144 return ret_errno(ENOMEM);
3145
3146 dfd_mnt = open_at(ops->dfd_mnt,
3147 controllers, PROTECT_OPATH_DIRECTORY,
3148 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3149 if (dfd_mnt < 0) {
3150 if (errno != ENOENT)
3151 return syserrno(-errno, "Failed to open %d/%s",
3152 ops->dfd_mnt, controllers);
3153
3154 SYSTRACE("%s not mounted", controllers);
3155 continue;
3156 }
3157 dfd = dfd_mnt;
3158
3159 if (!abspath(__current_cgroup))
3160 return ret_errno(EINVAL);
3161
3162 /* remove init.scope */
3163 if (!relative)
3164 __current_cgroup = prune_init_scope(__current_cgroup);
3165
3166 /* create a relative path */
3167 __current_cgroup = deabs(__current_cgroup);
3168
3169 current_cgroup = strdup(__current_cgroup);
3170 if (!current_cgroup)
3171 return ret_errno(ENOMEM);
3172
3173 if (!is_empty_string(current_cgroup)) {
3174 dfd_base = open_at(dfd_mnt, current_cgroup,
3175 PROTECT_OPATH_DIRECTORY,
3176 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3177 if (dfd_base < 0)
3178 return syserrno(-errno, "Failed to open %d/%s",
3179 dfd_mnt, current_cgroup);
3180 dfd = dfd_base;
3181 }
3182
3183 if (!legacy_hierarchy_delegated(dfd))
3184 continue;
3185
3186 /*
3187 * We intentionally pass __controllers here and not the
3188 * unprefixed @controllers copy because we would otherwise
3189 * chop the "name=" prefix needed for named hierarchies.
3190 */
3191 controller_list = list_add_controllers(__controllers);
3192 if (!controller_list)
3193 return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers);
3194
3195 if (skip_hierarchy(ops, controller_list))
3196 continue;
3197
3198 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3199 }
3200
3201 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3202 current_cgroup, controller_list, type);
3203 if (ret < 0)
3204 return syserrno(ret, "Failed to add %s hierarchy", controllers);
3205
3206 /* Transfer ownership. */
3207 move_fd(dfd_mnt);
3208 move_fd(dfd_base);
3209 move_ptr(current_cgroup);
3210 move_ptr(controllers);
3211 move_ptr(controller_list);
3212 if (type == UNIFIED_HIERARCHY)
3213 ops->unified->delegate = move_ptr(delegate);
3214 }
3215
3216 /* Determine the overall cgroup layout: unified-only, hybrid, or legacy-only. */
3217 if (ops->unified) {
3218 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3219 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3220 } else {
3221 if (bpf_devices_cgroup_supported())
3222 ops->unified->utilities |= DEVICES_CONTROLLER;
3223 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3224 }
3225 }
3226
3227 if (!controllers_available(ops))
3228 return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3229
3230 return 0;
3231 }
3232
3233 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3234 {
3235 __do_close int dfd = -EBADF;
3236 int ret;
3237 const char *controllers_use;
3238
3239 if (ops->dfd_mnt >= 0)
3240 return ret_errno(EBUSY);
3241
3242 /*
3243 * I don't see the need for allowing symlinks here. If users want to
3244 * have their hierarchy available in different locations, I strongly
3245 * suggest bind-mounts.
3246 */
3247 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3248 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3249 if (dfd < 0)
3250 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3251
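/*
 * lxc.cgroup.use restricts which hierarchies LXC will use; e.g. a
 * hypothetical "lxc.cgroup.use = freezer,memory" would limit us to the
 * freezer and memory hierarchies.
 */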
3252 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3253 if (controllers_use) {
3254 __do_free char *dup = NULL;
3255 char *it;
3256
3257 dup = strdup(controllers_use);
3258 if (!dup)
3259 return -errno;
3260
3261 lxc_iterate_parts(it, dup, ",") {
3262 ret = list_add_string(&ops->cgroup_use, it);
3263 if (ret < 0)
3264 return ret;
3265 }
3266 }
3267
3268 /*
3269 * Keep dfd referenced by the cleanup function and actually move the fd
3270 * once we know the initialization succeeded. So if we fail we clean up
3271 * the dfd.
3272 */
3273 ops->dfd_mnt = dfd;
3274
3275 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3276 if (ret < 0)
3277 return syserrno(ret, "Failed to initialize cgroups");
3278
3279 /* Transfer ownership to cgroup_ops. */
3280 move_fd(dfd);
3281 return 0;
3282 }
3283
3284 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3285 {
3286 const char *cgroup_pattern;
3287
3288 if (!ops)
3289 return ret_set_errno(-1, ENOENT);
3290
3291 /* copy system-wide cgroup information */
3292 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3293 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3294 ops->cgroup_pattern = strdup(cgroup_pattern);
3295 if (!ops->cgroup_pattern)
3296 return ret_errno(ENOMEM);
3297 }
3298
3299 return 0;
3300 }
3301
3302 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3303 {
3304 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3305
3306 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3307 if (!cgfsng_ops)
3308 return ret_set_errno(NULL, ENOMEM);
3309
3310 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3311 cgfsng_ops->dfd_mnt = -EBADF;
3312
3313 if (initialize_cgroups(cgfsng_ops, conf))
3314 return NULL;
3315
3316 cgfsng_ops->data_init = cgfsng_data_init;
3317 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3318 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3319 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3320 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3321 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3322 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3323 cgfsng_ops->payload_create = cgfsng_payload_create;
3324 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3325 cgfsng_ops->finalize = cgfsng_finalize;
3326 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3327 cgfsng_ops->get = cgfsng_get;
3328 cgfsng_ops->set = cgfsng_set;
3329 cgfsng_ops->freeze = cgfsng_freeze;
3330 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3331 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3332 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3333 cgfsng_ops->driver = "cgfsng";
3334 cgfsng_ops->version = "1.0.0";
3335 cgfsng_ops->attach = cgfsng_attach;
3336 cgfsng_ops->chown = cgfsng_chown;
3337 cgfsng_ops->mount = cgfsng_mount;
3338 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3339 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
3340
3341 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3342 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3343 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3344
3345 return move_ptr(cgfsng_ops);
3346 }
3347
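/*
 * Attach @pid to the unified cgroup of container @name. Returns
 * -ENOCGROUP2 if no unified cgroup fd can be retrieved (e.g. on a pure
 * legacy layout) so that callers like __cg_unified_attach() can fall back
 * to path-based attaching.
 */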
3348 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3349 const char *lxcpath, pid_t pid)
3350 {
3351 __do_close int unified_fd = -EBADF;
3352 int ret;
3353
3354 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3355 return ret_errno(EINVAL);
3356
3357 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3358 if (unified_fd < 0)
3359 return ret_errno(ENOCGROUP2);
3360
3361 if (!lxc_list_empty(&conf->id_map)) {
3362 struct userns_exec_unified_attach_data args = {
3363 .conf = conf,
3364 .unified_fd = unified_fd,
3365 .pid = pid,
3366 };
3367
3368 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3369 if (ret < 0)
3370 return -errno;
3371
3372 ret = userns_exec_minimal(conf,
3373 cgroup_unified_attach_parent_wrapper,
3374 &args,
3375 cgroup_unified_attach_child_wrapper,
3376 &args);
3377 } else {
3378 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3379 }
3380
3381 return ret;
3382 }
3383
3384 /* Connects to command socket therefore isn't callable from command handler. */
3385 int cgroup_get(const char *name, const char *lxcpath,
3386 const char *filename, char *buf, size_t len)
3387 {
3388 __do_close int unified_fd = -EBADF;
3389 ssize_t ret;
3390
3391 if (is_empty_string(filename) || is_empty_string(name) ||
3392 is_empty_string(lxcpath))
3393 return ret_errno(EINVAL);
3394
3395 if ((buf && !len) || (len && !buf))
3396 return ret_errno(EINVAL);
3397
3398 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3399 if (unified_fd < 0)
3400 return ret_errno(ENOCGROUP2);
3401
3402 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3403 if (ret < 0)
3404 SYSERROR("Failed to read cgroup value");
3405
3406 return ret;
3407 }
3408
3409 /* Connects to command socket therefore isn't callable from command handler. */
3410 int cgroup_set(const char *name, const char *lxcpath,
3411 const char *filename, const char *value)
3412 {
3413 __do_close int unified_fd = -EBADF;
3414 ssize_t ret;
3415
3416 if (is_empty_string(filename) || is_empty_string(value) ||
3417 is_empty_string(name) || is_empty_string(lxcpath))
3418 return ret_errno(EINVAL);
3419
3420 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3421 if (unified_fd < 0)
3422 return ret_errno(ENOCGROUP2);
3423
3424 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3425 struct device_item device = {};
3426
3427 ret = device_cgroup_rule_parse(&device, filename, value);
3428 if (ret < 0)
3429 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3430
3431 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3432 } else {
3433 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3434 }
3435
3436 return ret;
3437 }
3438
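/*
 * A note on the mechanism: writing "1" to cgroup.freeze freezes the
 * cgroup's subtree and writing "0" thaws it. Completion is asynchronous,
 * so with a non-zero @timeout we wait for the matching "frozen <state>"
 * line to show up in cgroup.events.
 */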
3439 static int do_cgroup_freeze(int unified_fd,
3440 const char *state_string,
3441 int state_num,
3442 int timeout,
3443 const char *epoll_error,
3444 const char *wait_error)
3445 {
3446 __do_close int events_fd = -EBADF;
3447 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3448 int ret;
3449 struct lxc_epoll_descr descr = {};
3450
3451 if (timeout != 0) {
3452 ret = lxc_mainloop_open(&descr);
3453 if (ret)
3454 return log_error_errno(-1, errno, "%s", epoll_error);
3455
3456 /* automatically cleaned up now */
3457 descr_ptr = &descr;
3458
3459 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3460 if (events_fd < 0)
3461 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3462
3463 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3464 if (ret < 0)
3465 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3466 }
3467
3468 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3469 if (ret < 0)
3470 return log_error_errno(-1, errno, "Failed to open cgroup.freeze file");
3471
3472 if (timeout != 0) {
3473 ret = lxc_mainloop(&descr, timeout);
3474 if (ret)
3475 return log_error_errno(-1, errno, "%s", wait_error);
3476 }
3477
3478 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3479 }
3480
3481 static inline int __cgroup_freeze(int unified_fd, int timeout)
3482 {
3483 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3484 "Failed to create epoll instance to wait for container freeze",
3485 "Failed to wait for container to be frozen");
3486 }
3487
3488 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3489 {
3490 __do_close int unified_fd = -EBADF;
3491 int ret;
3492
3493 if (is_empty_string(name) || is_empty_string(lxcpath))
3494 return ret_errno(EINVAL);
3495
3496 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3497 if (unified_fd < 0)
3498 return ret_errno(ENOCGROUP2);
3499
3500 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3501 ret = __cgroup_freeze(unified_fd, timeout);
3502 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3503 return ret;
3504 }
3505
3506 int __cgroup_unfreeze(int unified_fd, int timeout)
3507 {
3508 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3509 "Failed to create epoll instance to wait for container freeze",
3510 "Failed to wait for container to be frozen");
3511 }
3512
3513 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3514 {
3515 __do_close int unified_fd = -EBADF;
3516 int ret;
3517
3518 if (is_empty_string(name) || is_empty_string(lxcpath))
3519 return ret_errno(EINVAL);
3520
3521 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3522 if (unified_fd < 0)
3523 return ret_errno(ENOCGROUP2);
3524
3525 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3526 ret = __cgroup_unfreeze(unified_fd, timeout);
3527 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3528 return ret;
3529 }