1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 /*
4 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
5 * cgroup backend. The original cgfs.c was designed to be as flexible
6 * as possible. It would try to find cgroup filesystems no matter where
7 * or how you had them mounted, and deduce the most usable mount for
8 * each controller.
9 *
10 * This new implementation assumes that cgroup filesystems are mounted
11 * under /sys/fs/cgroup/clist where clist is either the controller, or
12 * a comma-separated list of controllers.
13 */
14
15 #ifndef _GNU_SOURCE
16 #define _GNU_SOURCE 1
17 #endif
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <errno.h>
21 #include <grp.h>
22 #include <linux/kdev_t.h>
23 #include <linux/types.h>
24 #include <poll.h>
25 #include <signal.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/epoll.h>
31 #include <sys/types.h>
32 #include <unistd.h>
33
34 #include "af_unix.h"
35 #include "caps.h"
36 #include "cgroup.h"
37 #include "cgroup2_devices.h"
38 #include "cgroup_utils.h"
39 #include "commands.h"
40 #include "commands_utils.h"
41 #include "conf.h"
42 #include "config.h"
43 #include "log.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "mount_utils.h"
48 #include "storage/storage.h"
49 #include "string_utils.h"
50 #include "syscall_wrappers.h"
51 #include "utils.h"
52
53 #ifndef HAVE_STRLCPY
54 #include "include/strlcpy.h"
55 #endif
56
57 #ifndef HAVE_STRLCAT
58 #include "include/strlcat.h"
59 #endif
60
61 lxc_log_define(cgfsng, cgroup);
62
     63 /*
     64  * Given a pointer to a null-terminated array of pointers, realloc to add one
     65  * entry, and point the new entry to NULL. Return the index of the
     66  * second-to-last entry - that is, the one which is now available for use
     67  * (keeping the list null-terminated) - or a negative errno value on failure.
     68  */
69 static int list_add(void ***list)
70 {
71 int idx = 0;
72 void **p;
73
74 if (*list)
75 for (; (*list)[idx]; idx++)
76 ;
77
78 p = realloc(*list, (idx + 2) * sizeof(void **));
79 if (!p)
80 return ret_errno(ENOMEM);
81
82 p[idx + 1] = NULL;
83 *list = p;
84
85 return idx;
86 }
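
/*
 * A minimal usage sketch (hypothetical caller; list_add_string() below is
 * the real in-tree user):
 *
 *	idx = list_add((void ***)&list);	// [e0, NULL] -> [e0, ?, NULL]
 *	if (idx < 0)
 *		return idx;			// -ENOMEM
 *	list[idx] = new_entry;			// terminator stays in place
 */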
87
88 /* Given a null-terminated array of strings, check whether @entry is one of the
89 * strings.
90 */
91 static bool string_in_list(char **list, const char *entry)
92 {
93 if (!list)
94 return false;
95
96 for (int i = 0; list[i]; i++)
97 if (strequal(list[i], entry))
98 return true;
99
100 return false;
101 }
102
103 /* Given a handler's cgroup data, return the struct hierarchy for the controller
104 * @c, or NULL if there is none.
105 */
106 static struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
107 {
108 if (!ops->hierarchies)
109 return log_trace_errno(NULL, errno, "There are no useable cgroup controllers");
110
111 for (int i = 0; ops->hierarchies[i]; i++) {
112 if (!controller) {
113 /* This is the empty unified hierarchy. */
114 if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0])
115 return ops->hierarchies[i];
116
117 continue;
118 }
119
120 /*
121 * Handle controllers with significant implementation changes
122 * from cgroup to cgroup2.
123 */
124 if (pure_unified_layout(ops)) {
125 if (strequal(controller, "devices")) {
126 if (device_utility_controller(ops->unified))
127 return ops->unified;
128
129 break;
130 } else if (strequal(controller, "freezer")) {
131 if (freezer_utility_controller(ops->unified))
132 return ops->unified;
133
134 break;
135 }
136 }
137
138 if (string_in_list(ops->hierarchies[i]->controllers, controller))
139 return ops->hierarchies[i];
140 }
141
142 if (controller)
143 WARN("There is no useable %s controller", controller);
144 else
145 WARN("There is no empty unified cgroup hierarchy");
146
147 return ret_set_errno(NULL, ENOENT);
148 }
149
    150 /* Taken over and modified from the kernel sources. */
151 #define NBITS 32 /* bits in uint32_t */
152 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
153 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
154
155 static void set_bit(unsigned bit, uint32_t *bitarr)
156 {
157 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
158 }
159
160 static void clear_bit(unsigned bit, uint32_t *bitarr)
161 {
162 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
163 }
164
165 static bool is_set(unsigned bit, uint32_t *bitarr)
166 {
167 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
168 }
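
/*
 * Worked example (illustrative): bit 35 lives in word 35 / 32 == 1 at
 * offset 35 % 32 == 3, so set_bit(35, bitarr) performs
 * bitarr[1] |= (1 << 3) and is_set(35, bitarr) tests that same bit.
 */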
169
170 /* Create cpumask from cpulist aka turn:
171 *
172 * 0,2-3
173 *
174 * into bit array
175 *
176 * 1 0 1 1
177 */
178 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
179 {
180 __do_free uint32_t *bitarr = NULL;
181 char *token;
182 size_t arrlen;
183
184 arrlen = BITS_TO_LONGS(nbits);
185 bitarr = calloc(arrlen, sizeof(uint32_t));
186 if (!bitarr)
187 return ret_set_errno(NULL, ENOMEM);
188
189 lxc_iterate_parts(token, buf, ",") {
190 errno = 0;
191 unsigned end, start;
192 char *range;
193
194 start = strtoul(token, NULL, 0);
195 end = start;
196 range = strchr(token, '-');
197 if (range)
198 end = strtoul(range + 1, NULL, 0);
199
200 if (!(start <= end))
201 return ret_set_errno(NULL, EINVAL);
202
203 if (end >= nbits)
204 return ret_set_errno(NULL, EINVAL);
205
206 while (start <= end)
207 set_bit(start++, bitarr);
208 }
209
210 return move_ptr(bitarr);
211 }
212
213 /* Turn cpumask into simple, comma-separated cpulist. */
214 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
215 {
216 __do_free_string_list char **cpulist = NULL;
217 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
218 int ret;
219
220 for (size_t i = 0; i <= nbits; i++) {
221 if (!is_set(i, bitarr))
222 continue;
223
224 ret = strnprintf(numstr, sizeof(numstr), "%zu", i);
225 if (ret < 0)
226 return NULL;
227
228 ret = lxc_append_string(&cpulist, numstr);
229 if (ret < 0)
230 return ret_set_errno(NULL, ENOMEM);
231 }
232
233 if (!cpulist)
234 return ret_set_errno(NULL, ENOMEM);
235
236 return lxc_string_join(",", (const char **)cpulist, false);
237 }
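
#if 0	/* illustration only, never compiled */
/*
 * Hedged sketch of a round trip through the two helpers above on a
 * hypothetical 4-cpu system: "0,2-3" -> mask 1011 -> "0,2,3".
 */
static void example_cpumask_roundtrip(void)
{
	char buf[] = "0,2-3"; /* lxc_iterate_parts() tokenizes in place */
	__do_free uint32_t *mask = lxc_cpumask(buf, 4);
	__do_free char *list = mask ? lxc_cpumask_to_cpulist(mask, 4) : NULL;

	if (list)
		TRACE("Expanded cpulist: %s", list); /* prints "0,2,3" */
}
#endif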
238
239 static ssize_t get_max_cpus(char *cpulist)
240 {
241 char *c1, *c2;
242 char *maxcpus = cpulist;
243 size_t cpus = 0;
244
245 c1 = strrchr(maxcpus, ',');
246 if (c1)
247 c1++;
248
249 c2 = strrchr(maxcpus, '-');
250 if (c2)
251 c2++;
252
253 if (!c1 && !c2)
254 c1 = maxcpus;
255 else if (c1 > c2)
256 c2 = c1;
257 else if (c1 < c2)
258 c1 = c2;
259 else if (!c1 && c2)
260 c1 = c2;
261
262 errno = 0;
263 cpus = strtoul(c1, NULL, 0);
264 if (errno != 0)
265 return -1;
266
267 return cpus;
268 }
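
/*
 * Example (illustrative): for "0,2-3" the last ',' yields "2-3" and the
 * last '-' yields "3"; the pointer further into the string wins, so
 * get_max_cpus() returns 3, i.e. the highest cpu index in the list.
 */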
269
270 static inline bool is_unified_hierarchy(const struct hierarchy *h)
271 {
272 return h->fs_type == UNIFIED_HIERARCHY;
273 }
274
275 /* Return true if the controller @entry is found in the null-terminated list of
276 * hierarchies @hlist.
277 */
278 static bool controller_available(struct hierarchy **hlist, char *entry)
279 {
280 if (!hlist)
281 return false;
282
283 for (int i = 0; hlist[i]; i++)
284 if (string_in_list(hlist[i]->controllers, entry))
285 return true;
286
287 return false;
288 }
289
290 static bool controllers_available(struct cgroup_ops *ops)
291 {
292 struct hierarchy **hlist;
293
294 if (!ops->cgroup_use)
295 return true;
296
297 hlist = ops->hierarchies;
298 for (char **cur = ops->cgroup_use; cur && *cur; cur++)
299 if (!controller_available(hlist, *cur))
    300 			return log_error(false, "The %s controller was not found", *cur);
301
302 return true;
303 }
304
305 static char **list_new(void)
306 {
307 __do_free_string_list char **list = NULL;
308 int idx;
309
310 idx = list_add((void ***)&list);
311 if (idx < 0)
312 return NULL;
313
314 list[idx] = NULL;
315 return move_ptr(list);
316 }
317
318 static int list_add_string(char ***list, char *entry)
319 {
320 __do_free char *dup = NULL;
321 int idx;
322
323 dup = strdup(entry);
324 if (!dup)
325 return ret_errno(ENOMEM);
326
327 idx = list_add((void ***)list);
328 if (idx < 0)
329 return idx;
330
331 (*list)[idx] = move_ptr(dup);
332 return 0;
333 }
334
335 static char **list_add_controllers(char *controllers)
336 {
337 __do_free_string_list char **list = NULL;
338 char *it;
339
340 lxc_iterate_parts(it, controllers, " \t\n") {
341 int ret;
342
343 ret = list_add_string(&list, it);
344 if (ret < 0)
345 return NULL;
346 }
347
348 return move_ptr(list);
349 }
350
351 static char **unified_controllers(int dfd, const char *file)
352 {
353 __do_free char *buf = NULL;
354
355 buf = read_file_at(dfd, file, PROTECT_OPEN, 0);
356 if (!buf)
357 return NULL;
358
359 return list_add_controllers(buf);
360 }
361
362 static bool skip_hierarchy(const struct cgroup_ops *ops, char **controllers)
363 {
364 if (!ops->cgroup_use)
365 return false;
366
367 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
368 bool found = false;
369
370 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
371 if (!strequal(*cur_use, *cur_ctrl))
372 continue;
373
374 found = true;
375 break;
376 }
377
378 if (found)
379 continue;
380
381 return true;
382 }
383
384 return false;
385 }
386
387 static int cgroup_hierarchy_add(struct cgroup_ops *ops, int dfd_mnt, char *mnt,
388 int dfd_base, char *base_cgroup,
389 char **controllers, cgroupfs_type_magic_t fs_type)
390 {
391 __do_free struct hierarchy *new = NULL;
392 int idx;
393
394 if (abspath(base_cgroup))
395 return syserrno_set(-EINVAL, "Container base path must be relative to controller mount");
396
397 new = zalloc(sizeof(*new));
398 if (!new)
399 return ret_errno(ENOMEM);
400
401 new->dfd_con = -EBADF;
402 new->dfd_lim = -EBADF;
403 new->dfd_mon = -EBADF;
404
405 new->fs_type = fs_type;
406 new->controllers = controllers;
407 new->at_mnt = mnt;
408 new->at_base = base_cgroup;
409
410 new->dfd_mnt = dfd_mnt;
411 new->dfd_base = dfd_base;
412
413 TRACE("Adding cgroup hierarchy mounted at %s and base cgroup %s",
414 mnt, maybe_empty(base_cgroup));
415 for (char *const *it = new->controllers; it && *it; it++)
416 TRACE("The hierarchy contains the %s controller", *it);
417
418 idx = list_add((void ***)&ops->hierarchies);
419 if (idx < 0)
    420 		return idx;
421
422 if (fs_type == UNIFIED_HIERARCHY)
423 ops->unified = new;
424 (ops->hierarchies)[idx] = move_ptr(new);
425
426 return 0;
427 }
428
429 static int cgroup_tree_remove(struct hierarchy **hierarchies, const char *path_prune)
430 {
431 if (!path_prune || !hierarchies)
432 return 0;
433
434 for (int i = 0; hierarchies[i]; i++) {
435 struct hierarchy *h = hierarchies[i];
436 int ret;
437
438 ret = cgroup_tree_prune(h->dfd_base, path_prune);
439 if (ret < 0)
440 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
441 else
442 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
443
444 free_equal(h->path_lim, h->path_con);
445 }
446
447 return 0;
448 }
449
450 struct generic_userns_exec_data {
451 struct hierarchy **hierarchies;
452 const char *path_prune;
453 struct lxc_conf *conf;
454 uid_t origuid; /* target uid in parent namespace */
455 char *path;
456 };
457
458 static int cgroup_tree_remove_wrapper(void *data)
459 {
460 struct generic_userns_exec_data *arg = data;
461 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
462 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
463 int ret;
464
465 if (!lxc_drop_groups() && errno != EPERM)
466 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
467
468 ret = setresgid(nsgid, nsgid, nsgid);
469 if (ret < 0)
470 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
471 (int)nsgid, (int)nsgid, (int)nsgid);
472
473 ret = setresuid(nsuid, nsuid, nsuid);
474 if (ret < 0)
475 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
476 (int)nsuid, (int)nsuid, (int)nsuid);
477
478 return cgroup_tree_remove(arg->hierarchies, arg->path_prune);
479 }
480
481 __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
482 struct lxc_handler *handler)
483 {
484 int ret;
485
486 if (!ops) {
487 ERROR("Called with uninitialized cgroup operations");
488 return;
489 }
490
491 if (!ops->hierarchies)
492 return;
493
494 if (!handler) {
495 ERROR("Called with uninitialized handler");
496 return;
497 }
498
499 if (!handler->conf) {
500 ERROR("Called with uninitialized conf");
501 return;
502 }
503
504 if (!ops->container_limit_cgroup) {
505 WARN("Uninitialized limit cgroup");
506 return;
507 }
508
509 ret = bpf_program_cgroup_detach(handler->cgroup_ops->cgroup2_devices);
510 if (ret < 0)
511 WARN("Failed to detach bpf program from cgroup");
512
513 if (!lxc_list_empty(&handler->conf->id_map)) {
514 struct generic_userns_exec_data wrap = {
515 .conf = handler->conf,
516 .path_prune = ops->container_limit_cgroup,
517 .hierarchies = ops->hierarchies,
518 .origuid = 0,
519 };
520 ret = userns_exec_1(handler->conf, cgroup_tree_remove_wrapper,
521 &wrap, "cgroup_tree_remove_wrapper");
522 } else {
523 ret = cgroup_tree_remove(ops->hierarchies, ops->container_limit_cgroup);
524 }
525 if (ret < 0)
526 SYSWARN("Failed to destroy cgroups");
527 }
528
529 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
530 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
531 static bool cpuset1_cpus_initialize(int dfd_parent, int dfd_child,
532 bool am_initialized)
533 {
    534 	__do_free char *cpulist = NULL, *isolcpus = NULL,
535 *offlinecpus = NULL, *posscpus = NULL;
536 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
537 *possmask = NULL;
538 int ret;
539 ssize_t i;
540 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
541 bool flipped_bit = false;
542
543 posscpus = read_file_at(dfd_parent, "cpuset.cpus", PROTECT_OPEN, 0);
544 if (!posscpus)
545 return log_error_errno(false, errno, "Failed to read file \"%s\"", fpath);
546
547 /* Get maximum number of cpus found in possible cpuset. */
548 maxposs = get_max_cpus(posscpus);
549 if (maxposs < 0 || maxposs >= INT_MAX - 1)
550 return false;
551
552 if (file_exists(__ISOL_CPUS)) {
553 isolcpus = read_file_at(-EBADF, __ISOL_CPUS, PROTECT_OPEN, 0);
554 if (!isolcpus)
555 return log_error_errno(false, errno, "Failed to read file \"%s\"", __ISOL_CPUS);
556
557 if (isdigit(isolcpus[0])) {
558 /* Get maximum number of cpus found in isolated cpuset. */
559 maxisol = get_max_cpus(isolcpus);
560 if (maxisol < 0 || maxisol >= INT_MAX - 1)
561 return false;
562 }
563
564 if (maxposs < maxisol)
565 maxposs = maxisol;
566 maxposs++;
567 } else {
568 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
569 }
570
571 if (file_exists(__OFFLINE_CPUS)) {
572 offlinecpus = read_file_at(-EBADF, __OFFLINE_CPUS, PROTECT_OPEN, 0);
573 if (!offlinecpus)
574 return log_error_errno(false, errno, "Failed to read file \"%s\"", __OFFLINE_CPUS);
575
576 if (isdigit(offlinecpus[0])) {
577 /* Get maximum number of cpus found in offline cpuset. */
578 maxoffline = get_max_cpus(offlinecpus);
579 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
580 return false;
581 }
582
583 if (maxposs < maxoffline)
584 maxposs = maxoffline;
585 maxposs++;
586 } else {
587 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
588 }
589
590 if ((maxisol == 0) && (maxoffline == 0)) {
591 cpulist = move_ptr(posscpus);
592 goto copy_parent;
593 }
594
595 possmask = lxc_cpumask(posscpus, maxposs);
596 if (!possmask)
597 return log_error_errno(false, errno, "Failed to create cpumask for possible cpus");
598
599 if (maxisol > 0) {
600 isolmask = lxc_cpumask(isolcpus, maxposs);
601 if (!isolmask)
602 return log_error_errno(false, errno, "Failed to create cpumask for isolated cpus");
603 }
604
605 if (maxoffline > 0) {
606 offlinemask = lxc_cpumask(offlinecpus, maxposs);
607 if (!offlinemask)
608 return log_error_errno(false, errno, "Failed to create cpumask for offline cpus");
609 }
610
611 for (i = 0; i <= maxposs; i++) {
612 if ((isolmask && !is_set(i, isolmask)) ||
613 (offlinemask && !is_set(i, offlinemask)) ||
614 !is_set(i, possmask))
615 continue;
616
617 flipped_bit = true;
618 clear_bit(i, possmask);
619 }
620
    621 	if (!flipped_bit) {
    622 		cpulist = move_ptr(posscpus);
    623 		TRACE("No isolated or offline cpus present in cpuset");
    624 	} else {
    625 		cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
    626 		TRACE("Removed isolated or offline cpus from cpuset");
    627 	}
628 if (!cpulist)
629 return log_error_errno(false, errno, "Failed to create cpu list");
630
631 copy_parent:
632 if (!am_initialized) {
633 ret = lxc_writeat(dfd_child, "cpuset.cpus", cpulist, strlen(cpulist));
634 if (ret < 0)
635 return log_error_errno(false, errno, "Failed to write cpu list to \"%d/cpuset.cpus\"", dfd_child);
636
637 TRACE("Copied cpu settings of parent cgroup");
638 }
639
640 return true;
641 }
642
643 static bool cpuset1_initialize(int dfd_base, int dfd_next)
644 {
645 char mems[PATH_MAX];
646 ssize_t bytes;
647 char v;
648
649 /*
650 * Determine whether the base cgroup has cpuset
651 * inheritance turned on.
652 */
653 bytes = lxc_readat(dfd_base, "cgroup.clone_children", &v, 1);
654 if (bytes < 0)
655 return syserrno(false, "Failed to read file %d(cgroup.clone_children)", dfd_base);
656
657 /*
    658 	 * Initialize cpuset.cpus and remove any isolated
659 * and offline cpus.
660 */
661 if (!cpuset1_cpus_initialize(dfd_base, dfd_next, v == '1'))
662 return syserrno(false, "Failed to initialize cpuset.cpus");
663
664 /* Read cpuset.mems from parent... */
665 bytes = lxc_readat(dfd_base, "cpuset.mems", mems, sizeof(mems));
666 if (bytes < 0)
667 return syserrno(false, "Failed to read file %d(cpuset.mems)", dfd_base);
668
669 /* ... and copy to first cgroup in the tree... */
670 bytes = lxc_writeat(dfd_next, "cpuset.mems", mems, bytes);
671 if (bytes < 0)
672 return syserrno(false, "Failed to write %d(cpuset.mems)", dfd_next);
673
674 /* ... and finally turn on cpuset inheritance. */
675 bytes = lxc_writeat(dfd_next, "cgroup.clone_children", "1", 1);
676 if (bytes < 0)
677 return syserrno(false, "Failed to write %d(cgroup.clone_children)", dfd_next);
678
679 return log_trace(true, "Initialized cpuset in the legacy hierarchy");
680 }
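
/*
 * Rough shell equivalent of the above, for illustration only (the paths
 * stand in for the parent and child legacy cpuset cgroup directories):
 *
 *	cat parent/cpuset.cpus > child/cpuset.cpus	# minus isolated/offline
 *	cat parent/cpuset.mems > child/cpuset.mems
 *	echo 1 > child/cgroup.clone_children
 */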
681
682 static int __cgroup_tree_create(int dfd_base, const char *path, mode_t mode,
683 bool cpuset_v1, bool eexist_ignore)
684 {
685 __do_close int dfd_final = -EBADF;
686 int dfd_cur = dfd_base;
687 int ret = 0;
688 size_t len;
689 char *cur;
690 char buf[PATH_MAX];
691
692 if (is_empty_string(path))
693 return ret_errno(EINVAL);
694
695 len = strlcpy(buf, path, sizeof(buf));
696 if (len >= sizeof(buf))
697 return ret_errno(E2BIG);
698
699 lxc_iterate_parts(cur, buf, "/") {
700 /*
701 * Even though we vetted the paths when we parsed the config
702 * we're paranoid here and check that the path is neither
703 * absolute nor walks upwards.
704 */
705 if (abspath(cur))
706 return syserrno_set(-EINVAL, "No absolute paths allowed");
707
708 if (strnequal(cur, "..", STRLITERALLEN("..")))
709 return syserrno_set(-EINVAL, "No upward walking paths allowed");
710
711 ret = mkdirat(dfd_cur, cur, mode);
712 if (ret < 0) {
713 if (errno != EEXIST)
714 return syserrno(-errno, "Failed to create %d(%s)", dfd_cur, cur);
715
716 ret = -EEXIST;
717 }
718 TRACE("%s %d(%s) cgroup", !ret ? "Created" : "Reusing", dfd_cur, cur);
719
720 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
721 if (dfd_final < 0)
722 return syserrno(-errno, "Fail to open%s directory %d(%s)",
723 !ret ? " newly created" : "", dfd_base, cur);
724 if (dfd_cur != dfd_base)
725 close(dfd_cur);
726 else if (cpuset_v1 && !cpuset1_initialize(dfd_base, dfd_final))
727 return syserrno(-EINVAL, "Failed to initialize cpuset controller in the legacy hierarchy");
728 /*
729 * Leave dfd_final pointing to the last fd we opened so
730 * it will be automatically zapped if we return early.
731 */
732 dfd_cur = dfd_final;
733 }
734
    735 	/* The final cgroup must be successfully created by us. */
736 if (ret) {
737 if (ret != -EEXIST || !eexist_ignore)
738 return syserrno_set(ret, "Creating the final cgroup %d(%s) failed", dfd_base, path);
739 }
740
741 return move_fd(dfd_final);
742 }
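
#if 0	/* illustration only, never compiled */
/*
 * Hedged sketch: create the nested cgroup "a/b/c" beneath h->dfd_base and
 * receive an O_PATH fd to the final component. Intermediate directories
 * may pre-exist; the leaf must be created by this call (last argument
 * false) or the call fails with -EEXIST.
 */
__do_close int dfd_leaf = __cgroup_tree_create(h->dfd_base, "a/b/c", 0755,
					       false, false);
if (dfd_leaf < 0)
	SYSERROR("Failed to create cgroup subtree");
#endif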
743
744 static bool cgroup_tree_create(struct cgroup_ops *ops, struct lxc_conf *conf,
745 struct hierarchy *h, const char *cgroup_limit_dir,
746 const char *cgroup_leaf, bool payload)
747 {
748 __do_close int fd_limit = -EBADF, fd_final = -EBADF;
749 __do_free char *path = NULL, *limit_path = NULL;
750 bool cpuset_v1 = false;
751
752 /*
753 * The legacy cpuset controller needs massaging in case inheriting
754 * settings from its immediate ancestor cgroup hasn't been turned on.
755 */
756 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
757
758 if (payload && cgroup_leaf) {
759 /* With isolation both parts need to not already exist. */
760 fd_limit = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
761 if (fd_limit < 0)
762 return syserrno(false, "Failed to create limiting cgroup %d(%s)", h->dfd_base, cgroup_limit_dir);
763
764 TRACE("Created limit cgroup %d->%d(%s)",
765 fd_limit, h->dfd_base, cgroup_limit_dir);
766
767 /*
768 * With isolation the devices legacy cgroup needs to be
    769 		 * initialized early, as it typically contains an 'a' (all)
770 * line, which is not possible once a subdirectory has been
771 * created.
772 */
773 if (string_in_list(h->controllers, "devices") &&
774 !ops->setup_limits_legacy(ops, conf, true))
775 return log_error(false, "Failed to setup legacy device limits");
776
777 limit_path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
778 path = must_make_path(limit_path, cgroup_leaf, NULL);
779
780 /*
781 * If we use a separate limit cgroup, the leaf cgroup, i.e. the
782 * cgroup the container actually resides in, is below fd_limit.
783 */
784 fd_final = __cgroup_tree_create(fd_limit, cgroup_leaf, 0755, cpuset_v1, false);
785 if (fd_final < 0) {
786 /* Ensure we don't leave any garbage behind. */
787 if (cgroup_tree_prune(h->dfd_base, cgroup_limit_dir))
788 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, cgroup_limit_dir);
789 else
790 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, cgroup_limit_dir);
791 }
792 } else {
793 path = make_cgroup_path(h, h->at_base, cgroup_limit_dir, NULL);
794
795 fd_final = __cgroup_tree_create(h->dfd_base, cgroup_limit_dir, 0755, cpuset_v1, false);
796 }
797 if (fd_final < 0)
798 return syserrno(false, "Failed to create %s cgroup %d(%s)", payload ? "payload" : "monitor", h->dfd_base, cgroup_limit_dir);
799
800 if (payload) {
801 h->dfd_con = move_fd(fd_final);
802 h->path_con = move_ptr(path);
803
804 if (fd_limit < 0)
805 h->dfd_lim = h->dfd_con;
806 else
807 h->dfd_lim = move_fd(fd_limit);
808
809 if (limit_path)
810 h->path_lim = move_ptr(limit_path);
811 else
812 h->path_lim = h->path_con;
813 } else {
814 h->dfd_mon = move_fd(fd_final);
815 }
816
817 return true;
818 }
819
820 static void cgroup_tree_prune_leaf(struct hierarchy *h, const char *path_prune,
821 bool payload)
822 {
823 bool prune = true;
824
825 if (payload) {
826 /* Check whether we actually created the cgroup to prune. */
827 if (h->dfd_lim < 0)
828 prune = false;
829
830 free_equal(h->path_con, h->path_lim);
831 close_equal(h->dfd_con, h->dfd_lim);
832 } else {
833 /* Check whether we actually created the cgroup to prune. */
834 if (h->dfd_mon < 0)
835 prune = false;
836
837 close_prot_errno_disarm(h->dfd_mon);
838 }
839
840 /* We didn't create this cgroup. */
841 if (!prune)
842 return;
843
844 if (cgroup_tree_prune(h->dfd_base, path_prune))
845 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, path_prune);
846 else
847 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, path_prune);
848 }
849
850 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
851 struct lxc_handler *handler)
852 {
853 int len;
854 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
855 const struct lxc_conf *conf;
856
857 if (!ops) {
858 ERROR("Called with uninitialized cgroup operations");
859 return;
860 }
861
862 if (!ops->hierarchies)
863 return;
864
865 if (!handler) {
866 ERROR("Called with uninitialized handler");
867 return;
868 }
869
870 if (!handler->conf) {
871 ERROR("Called with uninitialized conf");
872 return;
873 }
874 conf = handler->conf;
875
876 if (!ops->monitor_cgroup) {
877 WARN("Uninitialized monitor cgroup");
878 return;
879 }
880
881 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
882 if (len < 0)
883 return;
884
885 for (int i = 0; ops->hierarchies[i]; i++) {
886 __do_close int fd_pivot = -EBADF;
887 __do_free char *pivot_path = NULL;
888 struct hierarchy *h = ops->hierarchies[i];
889 bool cpuset_v1 = false;
890 int ret;
891
892 /* Monitor might have died before we entered the cgroup. */
893 if (handler->monitor_pid <= 0) {
894 WARN("No valid monitor process found while destroying cgroups");
895 goto cgroup_prune_tree;
896 }
897
898 if (conf->cgroup_meta.monitor_pivot_dir)
899 pivot_path = must_make_path(conf->cgroup_meta.monitor_pivot_dir, CGROUP_PIVOT, NULL);
900 else if (conf->cgroup_meta.dir)
901 pivot_path = must_make_path(conf->cgroup_meta.dir, CGROUP_PIVOT, NULL);
902 else
903 pivot_path = must_make_path(CGROUP_PIVOT, NULL);
904
905 cpuset_v1 = !is_unified_hierarchy(h) && string_in_list(h->controllers, "cpuset");
906
907 fd_pivot = __cgroup_tree_create(h->dfd_base, pivot_path, 0755, cpuset_v1, true);
908 if (fd_pivot < 0) {
909 SYSWARN("Failed to create pivot cgroup %d(%s)", h->dfd_base, pivot_path);
910 continue;
911 }
912
913 ret = lxc_writeat(fd_pivot, "cgroup.procs", pidstr, len);
914 if (ret != 0) {
915 SYSWARN("Failed to move monitor %s to \"%s\"", pidstr, pivot_path);
916 continue;
917 }
918
919 cgroup_prune_tree:
920 ret = cgroup_tree_prune(h->dfd_base, ops->monitor_cgroup);
921 if (ret < 0)
922 SYSWARN("Failed to destroy %d(%s)", h->dfd_base, ops->monitor_cgroup);
923 else
924 TRACE("Removed cgroup tree %d(%s)", h->dfd_base, ops->monitor_cgroup);
925 }
926 }
927
928 /*
    929  * Check that lxc.cgroup.dir is not set together with the newer
    930  * lxc.cgroup.dir.{monitor,payload} options, and that when those are
    931  * used both the monitor and the payload directory are set.
    932  * Returns true if the configuration is consistent, false otherwise.
933 */
934 static bool check_cgroup_dir_config(struct lxc_conf *conf)
935 {
936 const char *monitor_dir = conf->cgroup_meta.monitor_dir,
937 *container_dir = conf->cgroup_meta.container_dir,
938 *namespace_dir = conf->cgroup_meta.namespace_dir;
939
940 /* none of the new options are set, all is fine */
941 if (!monitor_dir && !container_dir && !namespace_dir)
942 return true;
943
    944 	/* some are set, make sure lxc.cgroup.dir is not also set */
945 if (conf->cgroup_meta.dir)
946 return log_error_errno(false, EINVAL,
947 "lxc.cgroup.dir conflicts with lxc.cgroup.dir.payload/monitor");
948
949 /* make sure both monitor and payload are set */
950 if (!monitor_dir || !container_dir)
951 return log_error_errno(false, EINVAL,
952 "lxc.cgroup.dir.payload and lxc.cgroup.dir.monitor must both be set");
953
954 /* namespace_dir may be empty */
955 return true;
956 }
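
/*
 * Illustrative config (hypothetical values, key names as used in the
 * error messages above) that passes this check:
 *
 *	lxc.cgroup.dir.monitor = lxc.monitor/c1
 *	lxc.cgroup.dir.payload = lxc.payload/c1
 *
 * Adding lxc.cgroup.dir alongside either of them fails with EINVAL.
 */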
957
958 __cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
959 {
960 __do_free char *monitor_cgroup = NULL;
961 int idx = 0;
962 int i;
963 size_t len;
964 char *suffix = NULL;
965 struct lxc_conf *conf;
966
967 if (!ops)
968 return ret_set_errno(false, ENOENT);
969
970 if (!ops->hierarchies)
971 return true;
972
973 if (ops->monitor_cgroup)
974 return ret_set_errno(false, EEXIST);
975
976 if (!handler || !handler->conf)
977 return ret_set_errno(false, EINVAL);
978
979 conf = handler->conf;
980
981 if (!check_cgroup_dir_config(conf))
982 return false;
983
984 if (conf->cgroup_meta.monitor_dir) {
985 monitor_cgroup = strdup(conf->cgroup_meta.monitor_dir);
986 } else if (conf->cgroup_meta.dir) {
987 monitor_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
988 DEFAULT_MONITOR_CGROUP_PREFIX,
989 handler->name,
990 CGROUP_CREATE_RETRY, NULL);
991 } else if (ops->cgroup_pattern) {
992 __do_free char *cgroup_tree = NULL;
993
994 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
995 if (!cgroup_tree)
996 return ret_set_errno(false, ENOMEM);
997
998 monitor_cgroup = must_concat(&len, cgroup_tree, "/",
999 DEFAULT_MONITOR_CGROUP,
1000 CGROUP_CREATE_RETRY, NULL);
1001 } else {
1002 monitor_cgroup = must_concat(&len, DEFAULT_MONITOR_CGROUP_PREFIX,
1003 handler->name,
1004 CGROUP_CREATE_RETRY, NULL);
1005 }
1006 if (!monitor_cgroup)
1007 return ret_set_errno(false, ENOMEM);
1008
1009 if (!conf->cgroup_meta.monitor_dir) {
1010 suffix = monitor_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1011 *suffix = '\0';
1012 }
1013 do {
1014 if (idx && suffix)
1015 sprintf(suffix, "-%d", idx);
1016
1017 for (i = 0; ops->hierarchies[i]; i++) {
1018 if (cgroup_tree_create(ops, handler->conf,
1019 ops->hierarchies[i],
1020 monitor_cgroup, NULL, false))
1021 continue;
1022
1023 DEBUG("Failed to create cgroup %s)", monitor_cgroup);
1024 for (int j = 0; j <= i; j++)
1025 cgroup_tree_prune_leaf(ops->hierarchies[j],
1026 monitor_cgroup, false);
1027
1028 idx++;
1029 break;
1030 }
1031 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1032
1033 if (idx == 1000 || (!suffix && idx != 0))
1034 return log_error_errno(false, ERANGE, "Failed to create monitor cgroup");
1035
1036 ops->monitor_cgroup = move_ptr(monitor_cgroup);
1037 return log_info(true, "The monitor process uses \"%s\" as cgroup", ops->monitor_cgroup);
1038 }
1039
1040 /*
1041 * Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1042 * next cgroup_pattern-1, -2, ..., -999.
1043 */
1044 __cgfsng_ops static bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_handler *handler)
1045 {
1046 __do_free char *container_cgroup = NULL, *__limit_cgroup = NULL;
1047 char *limit_cgroup;
1048 int idx = 0;
1049 int i;
1050 size_t len;
1051 char *suffix = NULL;
1052 struct lxc_conf *conf;
1053
1054 if (!ops)
1055 return ret_set_errno(false, ENOENT);
1056
1057 if (!ops->hierarchies)
1058 return true;
1059
1060 if (ops->container_cgroup || ops->container_limit_cgroup)
1061 return ret_set_errno(false, EEXIST);
1062
1063 if (!handler || !handler->conf)
1064 return ret_set_errno(false, EINVAL);
1065
1066 conf = handler->conf;
1067
1068 if (!check_cgroup_dir_config(conf))
1069 return false;
1070
1071 if (conf->cgroup_meta.container_dir) {
1072 __limit_cgroup = strdup(conf->cgroup_meta.container_dir);
1073 if (!__limit_cgroup)
1074 return ret_set_errno(false, ENOMEM);
1075
1076 if (conf->cgroup_meta.namespace_dir) {
1077 container_cgroup = must_make_path(__limit_cgroup,
1078 conf->cgroup_meta.namespace_dir,
1079 NULL);
1080 limit_cgroup = __limit_cgroup;
1081 } else {
1082 /* explicit paths but without isolation */
1083 limit_cgroup = move_ptr(__limit_cgroup);
1084 container_cgroup = limit_cgroup;
1085 }
1086 } else if (conf->cgroup_meta.dir) {
1087 limit_cgroup = must_concat(&len, conf->cgroup_meta.dir, "/",
1088 DEFAULT_PAYLOAD_CGROUP_PREFIX,
1089 handler->name,
1090 CGROUP_CREATE_RETRY, NULL);
1091 container_cgroup = limit_cgroup;
1092 } else if (ops->cgroup_pattern) {
1093 __do_free char *cgroup_tree = NULL;
1094
1095 cgroup_tree = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1096 if (!cgroup_tree)
1097 return ret_set_errno(false, ENOMEM);
1098
1099 limit_cgroup = must_concat(&len, cgroup_tree, "/",
1100 DEFAULT_PAYLOAD_CGROUP,
1101 CGROUP_CREATE_RETRY, NULL);
1102 container_cgroup = limit_cgroup;
1103 } else {
1104 limit_cgroup = must_concat(&len, DEFAULT_PAYLOAD_CGROUP_PREFIX,
1105 handler->name,
1106 CGROUP_CREATE_RETRY, NULL);
1107 container_cgroup = limit_cgroup;
1108 }
1109 if (!limit_cgroup)
1110 return ret_set_errno(false, ENOMEM);
1111
1112 if (!conf->cgroup_meta.container_dir) {
1113 suffix = container_cgroup + len - CGROUP_CREATE_RETRY_LEN;
1114 *suffix = '\0';
1115 }
1116 do {
1117 if (idx && suffix)
1118 sprintf(suffix, "-%d", idx);
1119
1120 for (i = 0; ops->hierarchies[i]; i++) {
1121 if (cgroup_tree_create(ops, handler->conf,
1122 ops->hierarchies[i], limit_cgroup,
1123 conf->cgroup_meta.namespace_dir,
1124 true))
1125 continue;
1126
1127 DEBUG("Failed to create cgroup \"%s\"", ops->hierarchies[i]->path_con ?: "(null)");
1128 for (int j = 0; j <= i; j++)
1129 cgroup_tree_prune_leaf(ops->hierarchies[j],
1130 limit_cgroup, true);
1131
1132 idx++;
1133 break;
1134 }
1135 } while (ops->hierarchies[i] && idx > 0 && idx < 1000 && suffix);
1136
1137 if (idx == 1000 || (!suffix && idx != 0))
1138 return log_error_errno(false, ERANGE, "Failed to create container cgroup");
1139
1140 ops->container_cgroup = move_ptr(container_cgroup);
1141 if (__limit_cgroup)
1142 ops->container_limit_cgroup = move_ptr(__limit_cgroup);
1143 else
1144 ops->container_limit_cgroup = ops->container_cgroup;
1145 INFO("The container process uses \"%s\" as inner and \"%s\" as limit cgroup",
1146 ops->container_cgroup, ops->container_limit_cgroup);
1147 return true;
1148 }
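
/*
 * Example of the retry naming (illustrative, assuming the default
 * "lxc.payload." prefix): for a container named "c1" this tries
 * "lxc.payload.c1" first, then "lxc.payload.c1-1", "lxc.payload.c1-2",
 * and so on up to "-999" before giving up with ERANGE.
 */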
1149
1150 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
1151 struct lxc_handler *handler)
1152 {
1153 int monitor_len, transient_len = 0;
1154 char monitor[INTTYPE_TO_STRLEN(pid_t)],
1155 transient[INTTYPE_TO_STRLEN(pid_t)];
1156
1157 if (!ops)
1158 return ret_set_errno(false, ENOENT);
1159
1160 if (!ops->hierarchies)
1161 return true;
1162
1163 if (!ops->monitor_cgroup)
1164 return ret_set_errno(false, ENOENT);
1165
1166 if (!handler || !handler->conf)
1167 return ret_set_errno(false, EINVAL);
1168
1169 monitor_len = strnprintf(monitor, sizeof(monitor), "%d", handler->monitor_pid);
1170 if (monitor_len < 0)
1171 return false;
1172
1173 if (handler->transient_pid > 0) {
1174 transient_len = strnprintf(transient, sizeof(transient), "%d", handler->transient_pid);
1175 if (transient_len < 0)
1176 return false;
1177 }
1178
1179 for (int i = 0; ops->hierarchies[i]; i++) {
1180 struct hierarchy *h = ops->hierarchies[i];
1181 int ret;
1182
1183 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", monitor, monitor_len);
1184 if (ret)
1185 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1186
1187 TRACE("Moved monitor into cgroup %d", h->dfd_mon);
1188
1189 if (handler->transient_pid <= 0)
1190 continue;
1191
1192 ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
1193 if (ret)
1194 return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
1195
1196 TRACE("Moved transient process into cgroup %d", h->dfd_mon);
1197
1198 /*
1199 * we don't keep the fds for non-unified hierarchies around
1200 * mainly because we don't make use of them anymore after the
1201 * core cgroup setup is done but also because there are quite a
1202 * lot of them.
1203 */
1204 if (!is_unified_hierarchy(h))
1205 close_prot_errno_disarm(h->dfd_mon);
1206 }
1207 handler->transient_pid = -1;
1208
1209 return true;
1210 }
1211
1212 __cgfsng_ops static bool cgfsng_payload_enter(struct cgroup_ops *ops,
1213 struct lxc_handler *handler)
1214 {
1215 int len;
1216 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1217
1218 if (!ops)
1219 return ret_set_errno(false, ENOENT);
1220
1221 if (!ops->hierarchies)
1222 return true;
1223
1224 if (!ops->container_cgroup)
1225 return ret_set_errno(false, ENOENT);
1226
1227 if (!handler || !handler->conf)
1228 return ret_set_errno(false, EINVAL);
1229
1230 len = strnprintf(pidstr, sizeof(pidstr), "%d", handler->pid);
1231 if (len < 0)
1232 return false;
1233
1234 for (int i = 0; ops->hierarchies[i]; i++) {
1235 struct hierarchy *h = ops->hierarchies[i];
1236 int ret;
1237
1238 if (is_unified_hierarchy(h) &&
1239 (handler->clone_flags & CLONE_INTO_CGROUP))
1240 continue;
1241
1242 ret = lxc_writeat(h->dfd_con, "cgroup.procs", pidstr, len);
1243 if (ret != 0)
1244 return log_error_errno(false, errno, "Failed to enter cgroup \"%s\"", h->path_con);
1245
1246 TRACE("Moved container into %s cgroup via %d", h->path_con, h->dfd_con);
1247 }
1248
1249 return true;
1250 }
1251
1252 static int fchowmodat(int dirfd, const char *path, uid_t chown_uid,
1253 gid_t chown_gid, mode_t chmod_mode)
1254 {
1255 int ret;
1256
1257 ret = fchownat(dirfd, path, chown_uid, chown_gid,
1258 AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
1259 if (ret < 0)
1260 return log_warn_errno(-1,
1261 errno, "Failed to fchownat(%d, %s, %d, %d, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW )",
1262 dirfd, path, (int)chown_uid,
1263 (int)chown_gid);
1264
1265 ret = fchmodat(dirfd, (*path != '\0') ? path : ".", chmod_mode, 0);
1266 if (ret < 0)
1267 return log_warn_errno(-1, errno, "Failed to fchmodat(%d, %s, %d, AT_SYMLINK_NOFOLLOW)",
1268 dirfd, path, (int)chmod_mode);
1269
1270 return 0;
1271 }
1272
1273 /* chgrp the container cgroups to container group. We leave
1274 * the container owner as cgroup owner. So we must make the
1275 * directories 775 so that the container can create sub-cgroups.
1276 *
1277 * Also chown the tasks and cgroup.procs files. Those may not
1278 * exist depending on kernel version.
1279 */
1280 static int chown_cgroup_wrapper(void *data)
1281 {
1282 int ret;
1283 uid_t destuid;
1284 struct generic_userns_exec_data *arg = data;
1285 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1286 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1287
1288 if (!lxc_drop_groups() && errno != EPERM)
1289 return log_error_errno(-1, errno, "Failed to setgroups(0, NULL)");
1290
1291 ret = setresgid(nsgid, nsgid, nsgid);
1292 if (ret < 0)
1293 return log_error_errno(-1, errno, "Failed to setresgid(%d, %d, %d)",
1294 (int)nsgid, (int)nsgid, (int)nsgid);
1295
1296 ret = setresuid(nsuid, nsuid, nsuid);
1297 if (ret < 0)
1298 return log_error_errno(-1, errno, "Failed to setresuid(%d, %d, %d)",
1299 (int)nsuid, (int)nsuid, (int)nsuid);
1300
1301 destuid = get_ns_uid(arg->origuid);
1302 if (destuid == LXC_INVALID_UID)
1303 destuid = 0;
1304
1305 for (int i = 0; arg->hierarchies[i]; i++) {
1306 int dirfd = arg->hierarchies[i]->dfd_con;
1307
1308 if (dirfd < 0)
1309 return syserrno_set(-EBADF, "Invalid cgroup file descriptor");
1310
1311 (void)fchowmodat(dirfd, "", destuid, nsgid, 0775);
1312
1313 /*
1314 * Failures to chown() these are inconvenient but not
   1315 		 * detrimental. We leave these owned by the container launcher,
1316 * so that container root can write to the files to attach. We
1317 * chmod() them 664 so that container systemd can write to the
1318 * files (which systemd in wily insists on doing).
1319 */
1320
1321 if (arg->hierarchies[i]->fs_type == LEGACY_HIERARCHY)
1322 (void)fchowmodat(dirfd, "tasks", destuid, nsgid, 0664);
1323
1324 (void)fchowmodat(dirfd, "cgroup.procs", destuid, nsgid, 0664);
1325
1326 if (arg->hierarchies[i]->fs_type != UNIFIED_HIERARCHY)
1327 continue;
1328
1329 for (char **p = arg->hierarchies[i]->delegate; p && *p; p++)
1330 (void)fchowmodat(dirfd, *p, destuid, nsgid, 0664);
1331 }
1332
1333 return 0;
1334 }
1335
1336 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1337 struct lxc_conf *conf)
1338 {
1339 struct generic_userns_exec_data wrap;
1340
1341 if (!ops)
1342 return ret_set_errno(false, ENOENT);
1343
1344 if (!ops->hierarchies)
1345 return true;
1346
1347 if (!ops->container_cgroup)
1348 return ret_set_errno(false, ENOENT);
1349
1350 if (!conf)
1351 return ret_set_errno(false, EINVAL);
1352
1353 if (lxc_list_empty(&conf->id_map))
1354 return true;
1355
1356 wrap.origuid = geteuid();
1357 wrap.path = NULL;
1358 wrap.hierarchies = ops->hierarchies;
1359 wrap.conf = conf;
1360
1361 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0)
1362 return log_error_errno(false, errno, "Error requesting cgroup chown in new user namespace");
1363
1364 return true;
1365 }
1366
1367 __cgfsng_ops static void cgfsng_payload_finalize(struct cgroup_ops *ops)
1368 {
1369 if (!ops)
1370 return;
1371
1372 if (!ops->hierarchies)
1373 return;
1374
1375 for (int i = 0; ops->hierarchies[i]; i++) {
1376 struct hierarchy *h = ops->hierarchies[i];
1377 /*
1378 * we don't keep the fds for non-unified hierarchies around
1379 * mainly because we don't make use of them anymore after the
1380 * core cgroup setup is done but also because there are quite a
1381 * lot of them.
1382 */
1383 if (!is_unified_hierarchy(h))
1384 close_prot_errno_disarm(h->dfd_con);
1385 }
1386
1387 /*
1388 * The checking for freezer support should obviously be done at cgroup
   1389 	 * initialization time but that doesn't work reliably. The freezer
1390 * controller has been demoted (rightly so) to a simple file located in
1391 * each non-root cgroup. At the time when the container is created we
1392 * might still be located in /sys/fs/cgroup and so checking for
1393 * cgroup.freeze won't tell us anything because this file doesn't exist
1394 * in the root cgroup. We could then iterate through /sys/fs/cgroup and
1395 * find an already existing cgroup and then check within that cgroup
1396 * for the existence of cgroup.freeze but that will only work on
1397 * systemd based hosts. Other init systems might not manage cgroups and
1398 * so no cgroup will exist. So we defer until we have created cgroups
1399 * for our container which means we check here.
1400 */
1401 if (pure_unified_layout(ops) &&
1402 !faccessat(ops->unified->dfd_con, "cgroup.freeze", F_OK,
1403 AT_SYMLINK_NOFOLLOW)) {
1404 TRACE("Unified hierarchy supports freezer");
1405 ops->unified->utilities |= FREEZER_CONTROLLER;
1406 }
1407 }
1408
1409 /* cgroup-full:* is done, no need to create subdirs */
1410 static inline bool cg_mount_needs_subdirs(int cgroup_automount_type)
1411 {
1412 switch (cgroup_automount_type) {
1413 case LXC_AUTO_CGROUP_RO:
1414 return true;
1415 case LXC_AUTO_CGROUP_RW:
1416 return true;
1417 case LXC_AUTO_CGROUP_MIXED:
1418 return true;
1419 }
1420
1421 return false;
1422 }
1423
1424 /* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1425 * remount controller ro if needed and bindmount the cgroupfs onto
1426 * control/the/cg/path.
1427 */
1428 static int cg_legacy_mount_controllers(int cgroup_automount_type, struct hierarchy *h,
1429 char *hierarchy_mnt, char *cgpath,
1430 const char *container_cgroup)
1431 {
1432 __do_free char *sourcepath = NULL;
1433 int ret, remount_flags;
1434 int flags = MS_BIND;
1435
1436 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1437 (cgroup_automount_type == LXC_AUTO_CGROUP_MIXED)) {
1438 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup", MS_BIND, NULL);
1439 if (ret < 0)
1440 return log_error_errno(-1, errno, "Failed to bind mount \"%s\" onto \"%s\"",
1441 hierarchy_mnt, hierarchy_mnt);
1442
1443 remount_flags = add_required_remount_flags(hierarchy_mnt,
1444 hierarchy_mnt,
1445 flags | MS_REMOUNT);
1446 ret = mount(hierarchy_mnt, hierarchy_mnt, "cgroup",
1447 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1448 NULL);
1449 if (ret < 0)
1450 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", hierarchy_mnt);
1451
1452 INFO("Remounted %s read-only", hierarchy_mnt);
1453 }
1454
1455 sourcepath = make_cgroup_path(h, h->at_base, container_cgroup, NULL);
1456 if (cgroup_automount_type == LXC_AUTO_CGROUP_RO)
1457 flags |= MS_RDONLY;
1458
1459 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1460 if (ret < 0)
1461 return log_error_errno(-1, errno, "Failed to mount \"%s\" onto \"%s\"",
1462 h->controllers[0], cgpath);
1463 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1464
1465 if (flags & MS_RDONLY) {
1466 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1467 flags | MS_REMOUNT);
1468 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1469 if (ret < 0)
1470 return log_error_errno(-1, errno, "Failed to remount \"%s\" ro", cgpath);
1471 INFO("Remounted %s read-only", cgpath);
1472 }
1473
1474 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1475 return 0;
1476 }
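
/*
 * Rough shell equivalent for one legacy controller with cgroup:mixed
 * (paths illustrative only; $cg stands for $rootfs/sys/fs/cgroup): the
 * hierarchy mount is made read-only while the container's own cgroup is
 * bind-mounted writable on top:
 *
 *	mount --bind $cg/memory $cg/memory
 *	mount -o remount,bind,ro $cg/memory $cg/memory
 *	mount --bind /sys/fs/cgroup/memory/$base/$ct $cg/memory/$base/$ct
 */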
1477
1478 /* __cgroupfs_mount
1479 *
1480 * Mount cgroup hierarchies directly without using bind-mounts. The main
   1481  * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1482 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1483 */
1484 static int __cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1485 struct lxc_rootfs *rootfs, int dfd_mnt_cgroupfs,
1486 const char *hierarchy_mnt)
1487 {
1488 __do_close int fd_fs = -EBADF;
1489 unsigned int flags = 0;
1490 char *fstype;
1491 int ret;
1492
1493 if (dfd_mnt_cgroupfs < 0)
1494 return ret_errno(EINVAL);
1495
1496 flags |= MOUNT_ATTR_NOSUID;
1497 flags |= MOUNT_ATTR_NOEXEC;
1498 flags |= MOUNT_ATTR_NODEV;
1499 flags |= MOUNT_ATTR_RELATIME;
1500
1501 if ((cgroup_automount_type == LXC_AUTO_CGROUP_RO) ||
1502 (cgroup_automount_type == LXC_AUTO_CGROUP_FULL_RO))
1503 flags |= MOUNT_ATTR_RDONLY;
1504
1505 if (is_unified_hierarchy(h))
1506 fstype = "cgroup2";
1507 else
1508 fstype = "cgroup";
1509
1510 if (can_use_mount_api()) {
1511 fd_fs = fs_prepare(fstype, -EBADF, "", 0, 0);
1512 if (fd_fs < 0)
1513 return log_error_errno(-errno, errno, "Failed to prepare filesystem context for %s", fstype);
1514
1515 if (!is_unified_hierarchy(h)) {
1516 for (const char **it = (const char **)h->controllers; it && *it; it++) {
1517 if (strnequal(*it, "name=", STRLITERALLEN("name=")))
1518 ret = fs_set_property(fd_fs, "name", *it + STRLITERALLEN("name="));
1519 else
1520 ret = fs_set_property(fd_fs, *it, "");
1521 if (ret < 0)
1522 return log_error_errno(-errno, errno, "Failed to add %s controller to cgroup filesystem context %d(dev)", *it, fd_fs);
1523 }
1524 }
1525
1526 ret = fs_attach(fd_fs, dfd_mnt_cgroupfs, hierarchy_mnt,
1527 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH,
1528 flags);
1529 } else {
1530 __do_free char *controllers = NULL, *target = NULL;
1531 unsigned int old_flags = 0;
1532 const char *rootfs_mnt;
1533
1534 if (!is_unified_hierarchy(h)) {
1535 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1536 if (!controllers)
1537 return ret_errno(ENOMEM);
1538 }
1539
1540 rootfs_mnt = get_rootfs_mnt(rootfs);
1541 ret = mnt_attributes_old(flags, &old_flags);
1542 if (ret)
1543 return log_error_errno(-EINVAL, EINVAL, "Unsupported mount properties specified");
1544
1545 target = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, hierarchy_mnt, NULL);
1546 ret = safe_mount(NULL, target, fstype, old_flags, controllers, rootfs_mnt);
1547 }
1548 if (ret < 0)
1549 return log_error_errno(ret, errno, "Failed to mount %s filesystem onto %d(%s)",
1550 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1551
1552 DEBUG("Mounted cgroup filesystem %s onto %d(%s)",
1553 fstype, dfd_mnt_cgroupfs, maybe_empty(hierarchy_mnt));
1554 return 0;
1555 }
1556
1557 static inline int cgroupfs_mount(int cgroup_automount_type, struct hierarchy *h,
1558 struct lxc_rootfs *rootfs,
1559 int dfd_mnt_cgroupfs, const char *hierarchy_mnt)
1560 {
1561 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1562 dfd_mnt_cgroupfs, hierarchy_mnt);
1563 }
1564
1565 static inline int cgroupfs_bind_mount(int cgroup_automount_type, struct hierarchy *h,
1566 struct lxc_rootfs *rootfs,
1567 int dfd_mnt_cgroupfs,
1568 const char *hierarchy_mnt)
1569 {
1570 switch (cgroup_automount_type) {
1571 case LXC_AUTO_CGROUP_FULL_RO:
1572 break;
1573 case LXC_AUTO_CGROUP_FULL_RW:
1574 break;
1575 case LXC_AUTO_CGROUP_FULL_MIXED:
1576 break;
1577 default:
1578 return 0;
1579 }
1580
1581 return __cgroupfs_mount(cgroup_automount_type, h, rootfs,
1582 dfd_mnt_cgroupfs, hierarchy_mnt);
1583 }
1584
1585 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1586 struct lxc_handler *handler, int cg_flags)
1587 {
1588 __do_close int dfd_mnt_tmpfs = -EBADF, fd_fs = -EBADF;
1589 __do_free char *cgroup_root = NULL;
1590 int cgroup_automount_type;
1591 bool in_cgroup_ns = false, wants_force_mount = false;
1592 struct lxc_conf *conf = handler->conf;
1593 struct lxc_rootfs *rootfs = &conf->rootfs;
1594 const char *rootfs_mnt = get_rootfs_mnt(rootfs);
1595 int ret;
1596
1597 if (!ops)
1598 return ret_set_errno(false, ENOENT);
1599
1600 if (!ops->hierarchies)
1601 return true;
1602
1603 if (!conf)
1604 return ret_set_errno(false, EINVAL);
1605
1606 if ((cg_flags & LXC_AUTO_CGROUP_MASK) == 0)
1607 return log_trace(true, "No cgroup mounts requested");
1608
1609 if (cg_flags & LXC_AUTO_CGROUP_FORCE) {
1610 cg_flags &= ~LXC_AUTO_CGROUP_FORCE;
1611 wants_force_mount = true;
1612 }
1613
1614 switch (cg_flags) {
1615 case LXC_AUTO_CGROUP_RO:
1616 TRACE("Read-only cgroup mounts requested");
1617 break;
1618 case LXC_AUTO_CGROUP_RW:
1619 TRACE("Read-write cgroup mounts requested");
1620 break;
1621 case LXC_AUTO_CGROUP_MIXED:
1622 TRACE("Mixed cgroup mounts requested");
1623 break;
1624 case LXC_AUTO_CGROUP_FULL_RO:
1625 TRACE("Full read-only cgroup mounts requested");
1626 break;
1627 case LXC_AUTO_CGROUP_FULL_RW:
1628 TRACE("Full read-write cgroup mounts requested");
1629 break;
1630 case LXC_AUTO_CGROUP_FULL_MIXED:
1631 TRACE("Full mixed cgroup mounts requested");
1632 break;
1633 default:
1634 return log_error_errno(false, EINVAL, "Invalid cgroup mount options specified");
1635 }
1636 cgroup_automount_type = cg_flags;
1637
1638 if (!wants_force_mount) {
1639 wants_force_mount = !lxc_wants_cap(CAP_SYS_ADMIN, conf);
1640
1641 /*
   1642 		 * Most recent distro versions currently have init systems that
1643 * do support cgroup2 but do not mount it by default unless
1644 * explicitly told so even if the host is cgroup2 only. That
1645 * means they often will fail to boot. Fix this by pre-mounting
1646 * cgroup2 by default. We will likely need to be doing this a
1647 * few years until all distros have switched over to cgroup2 at
1648 * which point we can safely assume that their init systems
1649 * will mount it themselves.
1650 */
1651 if (pure_unified_layout(ops))
1652 wants_force_mount = true;
1653 }
1654
1655 if (cgns_supported() && container_uses_namespace(handler, CLONE_NEWCGROUP))
1656 in_cgroup_ns = true;
1657
1658 if (in_cgroup_ns && !wants_force_mount)
1659 return log_trace(true, "Mounting cgroups not requested or needed");
1660
1661 /* This is really the codepath that we want. */
1662 if (pure_unified_layout(ops)) {
1663 __do_close int dfd_mnt_unified = -EBADF;
1664
1665 dfd_mnt_unified = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1666 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1667 if (dfd_mnt_unified < 0)
1668 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1669 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1670 /*
1671 * If cgroup namespaces are supported but the container will
1672 * not have CAP_SYS_ADMIN after it has started we need to mount
1673 * the cgroups manually.
1674 *
1675 * Note that here we know that wants_force_mount is true.
1676 * Otherwise we would've returned early above.
1677 */
1678 if (in_cgroup_ns) {
1679 /*
1680 * 1. cgroup:rw:force -> Mount the cgroup2 filesystem.
1681 * 2. cgroup:ro:force -> Mount the cgroup2 filesystem read-only.
1682 * 3. cgroup:mixed:force -> See comment above how this
1683 * does not apply so
1684 * cgroup:mixed is equal to
1685 * cgroup:rw when cgroup
1686 * namespaces are supported.
1687
1688 * 4. cgroup:rw -> No-op; init system responsible for mounting.
1689 * 5. cgroup:ro -> No-op; init system responsible for mounting.
1690 * 6. cgroup:mixed -> No-op; init system responsible for mounting.
1691 *
1692 * 7. cgroup-full:rw -> Not supported.
1693 * 8. cgroup-full:ro -> Not supported.
1694 * 9. cgroup-full:mixed -> Not supported.
1695
1696 * 10. cgroup-full:rw:force -> Not supported.
1697 * 11. cgroup-full:ro:force -> Not supported.
1698 * 12. cgroup-full:mixed:force -> Not supported.
1699 */
1700 ret = cgroupfs_mount(cgroup_automount_type, ops->unified, rootfs, dfd_mnt_unified, "");
1701 if (ret < 0)
1702 return syserrno(false, "Failed to force mount cgroup filesystem in cgroup namespace");
1703
1704 return log_trace(true, "Force mounted cgroup filesystem in new cgroup namespace");
1705 } else {
1706 /*
   1707 			 * Either no cgroup namespace is supported (highly
   1708 			 * unlikely unless we're dealing with a Frankenkernel),
   1709 			 * or the user requested to keep the cgroup namespace
1710 * of the host or another container.
1711 */
1712 if (wants_force_mount) {
1713 /*
1714 * 1. cgroup:rw:force -> Bind-mount the cgroup2 filesystem writable.
1715 * 2. cgroup:ro:force -> Bind-mount the cgroup2 filesystem read-only.
   1716 				 * 3. cgroup:mixed:force -> bind-mount the cgroup2 filesystem
   1717 				 *                          and make the parent directory of the
1718 * container's cgroup read-only but the
1719 * container's cgroup writable.
1720 *
1721 * 10. cgroup-full:rw:force ->
1722 * 11. cgroup-full:ro:force ->
1723 * 12. cgroup-full:mixed:force ->
1724 */
1725 errno = EOPNOTSUPP;
1726 SYSWARN("Force-mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1727 } else {
1728 errno = EOPNOTSUPP;
1729 SYSWARN("Mounting the unified cgroup hierarchy without cgroup namespace support is currently not supported");
1730 }
1731 }
1732
1733 return syserrno(false, "Failed to mount cgroups");
1734 }
1735
1736 /*
1737 * Mount a tmpfs over DEFAULT_CGROUP_MOUNTPOINT. Note that we're
1738 * relying on RESOLVE_BENEATH so we need to skip the leading "/" in the
1739 * DEFAULT_CGROUP_MOUNTPOINT define.
1740 */
1741 if (can_use_mount_api()) {
1742 fd_fs = fs_prepare("tmpfs", -EBADF, "", 0, 0);
1743 if (fd_fs < 0)
1744 return log_error_errno(-errno, errno, "Failed to create new filesystem context for tmpfs");
1745
1746 ret = fs_set_property(fd_fs, "mode", "0755");
1747 if (ret < 0)
1748 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1749
1750 ret = fs_set_property(fd_fs, "size", "10240k");
1751 if (ret < 0)
1752 return log_error_errno(-errno, errno, "Failed to mount tmpfs onto %d(dev)", fd_fs);
1753
1754 ret = fs_attach(fd_fs, rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1755 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV,
1756 MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |
1757 MOUNT_ATTR_NOEXEC | MOUNT_ATTR_RELATIME);
1758 } else {
1759 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1760 ret = safe_mount(NULL, cgroup_root, "tmpfs",
1761 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1762 "size=10240k,mode=755", rootfs_mnt);
1763 }
1764 if (ret < 0)
1765 return log_error_errno(false, errno, "Failed to mount tmpfs on %s",
1766 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1767
1768 dfd_mnt_tmpfs = open_at(rootfs->dfd_mnt, DEFAULT_CGROUP_MOUNTPOINT_RELATIVE,
1769 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH_XDEV, 0);
1770 if (dfd_mnt_tmpfs < 0)
1771 return syserrno(-errno, "Failed to open %d(%s)", rootfs->dfd_mnt,
1772 DEFAULT_CGROUP_MOUNTPOINT_RELATIVE);
1773
1774 for (int i = 0; ops->hierarchies[i]; i++) {
1775 __do_free char *hierarchy_mnt = NULL, *path2 = NULL;
1776 struct hierarchy *h = ops->hierarchies[i];
1777
1778 ret = mkdirat(dfd_mnt_tmpfs, h->at_mnt, 0000);
1779 if (ret < 0)
1780 return syserrno(false, "Failed to create cgroup at_mnt %d(%s)", dfd_mnt_tmpfs, h->at_mnt);
1781
1782 if (in_cgroup_ns && wants_force_mount) {
1783 /*
1784 * If cgroup namespaces are supported but the container
1785 * will not have CAP_SYS_ADMIN after it has started we
1786 * need to mount the cgroups manually.
1787 */
1788 ret = cgroupfs_mount(cgroup_automount_type, h, rootfs,
1789 dfd_mnt_tmpfs, h->at_mnt);
1790 if (ret < 0)
1791 return false;
1792
1793 continue;
1794 }
1795
1796 /* Here is where the ancient kernel section begins. */
1797 ret = cgroupfs_bind_mount(cgroup_automount_type, h, rootfs,
1798 dfd_mnt_tmpfs, h->at_mnt);
1799 if (ret < 0)
1800 return false;
1801
1802 if (!cg_mount_needs_subdirs(cgroup_automount_type))
1803 continue;
1804
1805 if (!cgroup_root)
1806 cgroup_root = must_make_path(rootfs_mnt, DEFAULT_CGROUP_MOUNTPOINT, NULL);
1807
1808 hierarchy_mnt = must_make_path(cgroup_root, h->at_mnt, NULL);
1809 path2 = must_make_path(hierarchy_mnt, h->at_base,
1810 ops->container_cgroup, NULL);
1811 ret = mkdir_p(path2, 0755);
1812 if (ret < 0 && (errno != EEXIST))
1813 return false;
1814
1815 ret = cg_legacy_mount_controllers(cgroup_automount_type, h,
1816 hierarchy_mnt, path2,
1817 ops->container_cgroup);
1818 if (ret < 0)
1819 return false;
1820 }
1821
1822 return true;
1823 }
1824
1825 /* Only root needs to escape to the cgroup of its init. */
1826 __cgfsng_ops static bool cgfsng_criu_escape(const struct cgroup_ops *ops,
1827 struct lxc_conf *conf)
1828 {
1829 if (!ops)
1830 return ret_set_errno(false, ENOENT);
1831
1832 if (!ops->hierarchies)
1833 return true;
1834
1835 if (!conf)
1836 return ret_set_errno(false, EINVAL);
1837
1838 if (conf->cgroup_meta.relative || geteuid())
1839 return true;
1840
1841 for (int i = 0; ops->hierarchies[i]; i++) {
1842 __do_free char *fullpath = NULL;
1843 int ret;
1844
1845 fullpath = make_cgroup_path(ops->hierarchies[i],
1846 ops->hierarchies[i]->at_base,
1847 "cgroup.procs", NULL);
1848 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1849 if (ret != 0)
1850 return log_error_errno(false, errno, "Failed to escape to cgroup \"%s\"", fullpath);
1851 }
1852
1853 return true;
1854 }
1855
1856 __cgfsng_ops static int cgfsng_criu_num_hierarchies(struct cgroup_ops *ops)
1857 {
1858 int i = 0;
1859
1860 if (!ops)
1861 return ret_set_errno(-1, ENOENT);
1862
1863 if (!ops->hierarchies)
1864 return 0;
1865
1866 for (; ops->hierarchies[i]; i++)
1867 ;
1868
1869 return i;
1870 }
1871
1872 __cgfsng_ops static bool cgfsng_criu_get_hierarchies(struct cgroup_ops *ops,
1873 int n, char ***out)
1874 {
1875 int i;
1876
1877 if (!ops)
1878 return ret_set_errno(false, ENOENT);
1879
1880 if (!ops->hierarchies)
1881 return ret_set_errno(false, ENOENT);
1882
1883 /* Sanity check that all hierarchies up to and including index n exist. */
1884 for (i = 0; i <= n; i++)
1885 if (!ops->hierarchies[i])
1886 return ret_set_errno(false, ENOENT);
1887
1888 *out = ops->hierarchies[n]->controllers;
1889
1890 return true;
1891 }
1892
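/* Freeze a container on the legacy (cgroup1) layout by writing "FROZEN" to
 * the freezer controller's freezer.state file. Unlike the unified code path
 * below, this does not wait for the freeze to take effect.
 */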
1893 static int cg_legacy_freeze(struct cgroup_ops *ops)
1894 {
1895 struct hierarchy *h;
1896
1897 h = get_hierarchy(ops, "freezer");
1898 if (!h)
1899 return ret_set_errno(-1, ENOENT);
1900
1901 return lxc_write_openat(h->path_con, "freezer.state",
1902 "FROZEN", STRLITERALLEN("FROZEN"));
1903 }
1904
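/* Mainloop callback for cgroup.events notifications: re-read the file on
 * every wakeup and close the mainloop once the "frozen 0|1" line matches the
 * state passed in @cbdata.
 */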
1905 static int freezer_cgroup_events_cb(int fd, uint32_t events, void *cbdata,
1906 struct lxc_epoll_descr *descr)
1907 {
1908 __do_free char *line = NULL;
1909 __do_fclose FILE *f = NULL;
1910 int state = PTR_TO_INT(cbdata);
1911 size_t len = 0;
1912 const char *state_string;
1913
1914 f = fdopen_at(fd, "", "re", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH);
1915 if (!f)
1916 return LXC_MAINLOOP_ERROR;
1917
1918 if (state == 1)
1919 state_string = "frozen 1";
1920 else
1921 state_string = "frozen 0";
1922
1923 while (getline(&line, &len, f) != -1)
1924 if (strnequal(line, state_string, STRLITERALLEN("frozen") + 2))
1925 return LXC_MAINLOOP_CLOSE;
1926
1927 rewind(f);
1928
1929 return LXC_MAINLOOP_CONTINUE;
1930 }
1931
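/* Common helper for freezing and unfreezing on the unified layout. When a
 * timeout is given, an EPOLLPRI watch on cgroup.events is registered before
 * cgroup.freeze is written so that the resulting state transition cannot be
 * missed, and the mainloop then waits for confirmation.
 */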
1932 static int cg_unified_freeze_do(struct cgroup_ops *ops, int timeout,
1933 const char *state_string,
1934 int state_num,
1935 const char *epoll_error,
1936 const char *wait_error)
1937 {
1938 __do_close int fd = -EBADF;
1939 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
1940 int ret;
1941 struct lxc_epoll_descr descr;
1942 struct hierarchy *h;
1943
1944 h = ops->unified;
1945 if (!h)
1946 return ret_set_errno(-1, ENOENT);
1947
1948 if (!h->path_con)
1949 return ret_set_errno(-1, EEXIST);
1950
1951 if (timeout != 0) {
1952 __do_free char *events_file = NULL;
1953
1954 events_file = must_make_path(h->path_con, "cgroup.events", NULL);
1955 fd = open(events_file, O_RDONLY | O_CLOEXEC);
1956 if (fd < 0)
1957 return log_error_errno(-1, errno, "Failed to open cgroup.events file");
1958
1959 ret = lxc_mainloop_open(&descr);
1960 if (ret)
1961 return log_error_errno(-1, errno, "%s", epoll_error);
1962
1963 /* automatically cleaned up now */
1964 descr_ptr = &descr;
1965
1966 ret = lxc_mainloop_add_handler_events(&descr, fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
1967 if (ret < 0)
1968 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
1969 }
1970
1971 ret = lxc_write_openat(h->path_con, "cgroup.freeze", state_string, 1);
1972 if (ret < 0)
1973 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
1974
1975 if (timeout != 0 && lxc_mainloop(&descr, timeout))
1976 return log_error_errno(-1, errno, "%s", wait_error);
1977
1978 return 0;
1979 }
1980
1981 static int cg_unified_freeze(struct cgroup_ops *ops, int timeout)
1982 {
1983 return cg_unified_freeze_do(ops, timeout, "1", 1,
1984 "Failed to create epoll instance to wait for container freeze",
1985 "Failed to wait for container to be frozen");
1986 }
1987
1988 __cgfsng_ops static int cgfsng_freeze(struct cgroup_ops *ops, int timeout)
1989 {
1990 if (!ops->hierarchies)
1991 return ret_set_errno(-1, ENOENT);
1992
1993 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
1994 return cg_legacy_freeze(ops);
1995
1996 return cg_unified_freeze(ops, timeout);
1997 }
1998
1999 static int cg_legacy_unfreeze(struct cgroup_ops *ops)
2000 {
2001 struct hierarchy *h;
2002
2003 h = get_hierarchy(ops, "freezer");
2004 if (!h)
2005 return ret_set_errno(-1, ENOENT);
2006
2007 return lxc_write_openat(h->path_con, "freezer.state",
2008 "THAWED", STRLITERALLEN("THAWED"));
2009 }
2010
2011 static int cg_unified_unfreeze(struct cgroup_ops *ops, int timeout)
2012 {
2013 return cg_unified_freeze_do(ops, timeout, "0", 0,
2014 "Failed to create epoll instance to wait for container unfreeze",
2015 "Failed to wait for container to be unfrozen");
2016 }
2017
2018 __cgfsng_ops static int cgfsng_unfreeze(struct cgroup_ops *ops, int timeout)
2019 {
2020 if (!ops->hierarchies)
2021 return ret_set_errno(-1, ENOENT);
2022
2023 if (ops->cgroup_layout != CGROUP_LAYOUT_UNIFIED)
2024 return cg_legacy_unfreeze(ops);
2025
2026 return cg_unified_unfreeze(ops, timeout);
2027 }
2028
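/* Return the container's cgroup path relative to the mountpoint for the
 * given controller, selecting the limiting or payload cgroup depending on
 * @limiting. The result points into the hierarchy's stored path and must not
 * be freed by the caller.
 */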
2029 static const char *cgfsng_get_cgroup_do(struct cgroup_ops *ops,
2030 const char *controller, bool limiting)
2031 {
2032 struct hierarchy *h;
2033 size_t len;
2034 const char *path;
2035
2036 h = get_hierarchy(ops, controller);
2037 if (!h)
2038 return log_warn_errno(NULL, ENOENT,
2039 "Failed to find hierarchy for controller \"%s\"", maybe_empty(controller));
2040
2041 if (limiting)
2042 path = h->path_lim;
2043 else
2044 path = h->path_con;
2045 if (!path)
2046 return NULL;
2047
2048 len = strlen(h->at_mnt);
2049 if (!strnequal(h->at_mnt, DEFAULT_CGROUP_MOUNTPOINT,
2050 STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT))) {
2051 path += STRLITERALLEN(DEFAULT_CGROUP_MOUNTPOINT);
2052 path += strspn(path, "/");
2053 }
2054 return path + len;
2055 }
2056
2057 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
2058 const char *controller)
2059 {
2060 return cgfsng_get_cgroup_do(ops, controller, false);
2061 }
2062
2063 __cgfsng_ops static const char *cgfsng_get_limiting_cgroup(struct cgroup_ops *ops,
2064 const char *controller)
2065 {
2066 return cgfsng_get_cgroup_do(ops, controller, true);
2067 }
2068
2069 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2070 * which must be freed by the caller.
2071 */
2072 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2073 const char *inpath,
2074 const char *filename)
2075 {
2076 return make_cgroup_path(h, inpath, filename, NULL);
2077 }
2078
2079 static int cgroup_attach_leaf(const struct lxc_conf *conf, int unified_fd, pid_t pid)
2080 {
2081 int idx = 1;
2082 int ret;
2083 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2084 ssize_t pidstr_len;
2085
2086 /* Create leaf cgroup. */
2087 ret = mkdirat(unified_fd, ".lxc", 0755);
2088 if (ret < 0 && errno != EEXIST)
2089 return log_error_errno(-errno, errno, "Failed to create leaf cgroup \".lxc\"");
2090
2091 pidstr_len = strnprintf(pidstr, sizeof(pidstr), INT64_FMT, (int64_t)pid);
2092 if (pidstr_len < 0)
2093 return pidstr_len;
2094
2095 ret = lxc_writeat(unified_fd, ".lxc/cgroup.procs", pidstr, pidstr_len);
2096 if (ret < 0)
2097 ret = lxc_writeat(unified_fd, "cgroup.procs", pidstr, pidstr_len);
2098 if (ret == 0)
2099 return log_trace(0, "Moved process %s into cgroup %d(.lxc)", pidstr, unified_fd);
2100
2101 /* this is a non-leaf node */
2102 if (errno != EBUSY)
2103 return log_error_errno(-errno, errno, "Failed to attach to unified cgroup");
2104
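/*
 * The target cgroup already has children and can no longer take processes
 * directly. Probe for a usable leaf by creating numbered ".lxc-<idx>"
 * subcgroups and retrying the attach, giving up after 1000 attempts.
 */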
2105 do {
2106 bool rm = false;
2107 char attach_cgroup[STRLITERALLEN(".lxc-/cgroup.procs") + INTTYPE_TO_STRLEN(int) + 1];
2108 char *slash = attach_cgroup;
2109
2110 ret = strnprintf(attach_cgroup, sizeof(attach_cgroup), ".lxc-%d/cgroup.procs", idx);
2111 if (ret < 0)
2112 return ret;
2113
2114 /*
2115 * This shouldn't really happen but the compiler might complain
2116 * that a short write would cause a buffer overrun. So be on
2117 * the safe side.
2118 */
2119 if (ret < STRLITERALLEN(".lxc-/cgroup.procs"))
2120 return log_error_errno(-EINVAL, EINVAL, "Unexpected short write would cause buffer-overrun");
2121
2122 slash += (ret - STRLITERALLEN("/cgroup.procs"));
2123 *slash = '\0';
2124
2125 ret = mkdirat(unified_fd, attach_cgroup, 0755);
2126 if (ret < 0 && errno != EEXIST)
2127 return log_error_errno(-1, errno, "Failed to create cgroup %s", attach_cgroup);
2128 if (ret == 0)
2129 rm = true;
2130
2131 *slash = '/';
2132
2133 ret = lxc_writeat(unified_fd, attach_cgroup, pidstr, pidstr_len);
2134 if (ret == 0)
2135 return log_trace(0, "Moved process %s into cgroup %d(%s)", pidstr, unified_fd, attach_cgroup);
2136
2137 if (rm && unlinkat(unified_fd, attach_cgroup, AT_REMOVEDIR))
2138 SYSERROR("Failed to remove cgroup \"%d(%s)\"", unified_fd, attach_cgroup);
2139
2140 /* this is a non-leaf node */
2141 if (errno != EBUSY)
2142 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2143
2144 idx++;
2145 } while (idx < 1000);
2146
2147 return log_error_errno(-1, errno, "Failed to attach to unified cgroup");
2148 }
2149
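/* Child half of the namespaced attach: running inside the container's user
 * namespace, create the ".lxc" leaf cgroup and send both candidate
 * cgroup.procs file descriptors back to the parent over a unix socket.
 */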
2150 static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
2151 int unified_fd, int *sk_fd)
2152 {
2153 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2154 int target_fds[2];
2155 ssize_t ret;
2156
2157 /* Create leaf cgroup. */
2158 ret = mkdirat(unified_fd, ".lxc", 0755);
2159 if (ret < 0 && errno != EEXIST)
2160 return log_error_errno(-1, errno, "Failed to create leaf cgroup \".lxc\"");
2161
2162 target_fd0 = open_at(unified_fd, ".lxc/cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2163 if (target_fd0 < 0)
2164 return log_error_errno(-errno, errno, "Failed to open \".lxc/cgroup.procs\"");
2165 target_fds[0] = target_fd0;
2166
2167 target_fd1 = open_at(unified_fd, "cgroup.procs", PROTECT_OPEN_W, PROTECT_LOOKUP_BENEATH, 0);
2168 if (target_fd1 < 0)
2169 return log_error_errno(-errno, errno, "Failed to open \"cgroup.procs\"");
2170 target_fds[1] = target_fd1;
2171
2172 ret = lxc_abstract_unix_send_fds(sk, target_fds, 2, NULL, 0);
2173 if (ret <= 0)
2174 return log_error_errno(-errno, errno, "Failed to send \".lxc/cgroup.procs\" fds %d and %d",
2175 target_fd0, target_fd1);
2176
2177 return log_debug(0, "Sent target cgroup fds %d and %d", target_fd0, target_fd1);
2178 }
2179
2180 static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
2181 int *sk_fd, pid_t pid)
2182 {
2183 __do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
2184 int target_fds[2];
2185 char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
2186 size_t pidstr_len;
2187 ssize_t ret;
2188
2189 ret = lxc_abstract_unix_recv_fds(sk, target_fds, 2, NULL, 0);
2190 if (ret <= 0)
2191 return log_error_errno(-1, errno, "Failed to receive target cgroup fd");
2192 target_fd0 = target_fds[0];
2193 target_fd1 = target_fds[1];
2194
2195 pidstr_len = sprintf(pidstr, INT64_FMT, (int64_t)pid);
2196
2197 ret = lxc_write_nointr(target_fd0, pidstr, pidstr_len);
2198 if (ret > 0 && ret == pidstr_len)
2199 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd0);
2200
2201 ret = lxc_write_nointr(target_fd1, pidstr, pidstr_len);
2202 if (ret > 0 && ret == pidstr_len)
2203 return log_debug(0, "Moved process into target cgroup via fd %d", target_fd1);
2204
2205 return log_debug_errno(-1, errno, "Failed to move process into target cgroup via fd %d and %d",
2206 target_fd0, target_fd1);
2207 }
2208
2209 struct userns_exec_unified_attach_data {
2210 const struct lxc_conf *conf;
2211 int unified_fd;
2212 int sk_pair[2];
2213 pid_t pid;
2214 };
2215
2216 static int cgroup_unified_attach_child_wrapper(void *data)
2217 {
2218 struct userns_exec_unified_attach_data *args = data;
2219
2220 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2221 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2222 return ret_errno(EINVAL);
2223
2224 close_prot_errno_disarm(args->sk_pair[0]);
2225 return cgroup_attach_create_leaf(args->conf, args->unified_fd,
2226 &args->sk_pair[1]);
2227 }
2228
2229 static int cgroup_unified_attach_parent_wrapper(void *data)
2230 {
2231 struct userns_exec_unified_attach_data *args = data;
2232
2233 if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
2234 args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
2235 return ret_errno(EINVAL);
2236
2237 close_prot_errno_disarm(args->sk_pair[1]);
2238 return cgroup_attach_move_into_leaf(args->conf, &args->sk_pair[0],
2239 args->pid);
2240 }
2241
2242 /* Technically, we're always at a delegation boundary here (this is especially
2243 * true when cgroup namespaces are available). The reasoning is that in order
2244 * for us to have been able to start a container in the first place the root
2245 * cgroup must have been a leaf node. Now, either the container's init system
2246 * has populated the cgroup and kept it as a leaf node or it has created
2247 * subtrees. In the former case we simply attach to the leaf node we created
2248 * when we started the container; in the latter case we create our own cgroup
2249 * for the attaching process.
2250 */
2251 static int __cg_unified_attach(const struct hierarchy *h,
2252 const struct lxc_conf *conf, const char *name,
2253 const char *lxcpath, pid_t pid,
2254 const char *controller)
2255 {
2256 __do_close int unified_fd = -EBADF;
2257 __do_free char *path = NULL, *cgroup = NULL;
2258 int ret;
2259
2260 if (!conf || !name || !lxcpath || pid <= 0)
2261 return ret_errno(EINVAL);
2262
2263 ret = cgroup_attach(conf, name, lxcpath, pid);
2264 if (ret == 0)
2265 return log_trace(0, "Attached to unified cgroup via command handler");
2266 if (ret != -ENOCGROUP2)
2267 return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
2268
2269 /* Fall back to retrieving the path for the unified cgroup. */
2270 cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2271 /* not running */
2272 if (!cgroup)
2273 return 0;
2274
2275 path = make_cgroup_path(h, cgroup, NULL);
2276
2277 unified_fd = open(path, O_PATH | O_DIRECTORY | O_CLOEXEC);
2278 if (unified_fd < 0)
2279 return -errno;
2280
2281 if (!lxc_list_empty(&conf->id_map)) {
2282 struct userns_exec_unified_attach_data args = {
2283 .conf = conf,
2284 .unified_fd = unified_fd,
2285 .pid = pid,
2286 };
2287
2288 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
2289 if (ret < 0)
2290 return -errno;
2291
2292 ret = userns_exec_minimal(conf,
2293 cgroup_unified_attach_parent_wrapper,
2294 &args,
2295 cgroup_unified_attach_child_wrapper,
2296 &args);
2297 } else {
2298 ret = cgroup_attach_leaf(conf, unified_fd, pid);
2299 }
2300
2301 return ret;
2302 }
2303
2304 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops,
2305 const struct lxc_conf *conf,
2306 const char *name, const char *lxcpath,
2307 pid_t pid)
2308 {
2309 int len, ret;
2310 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2311
2312 if (!ops)
2313 return ret_set_errno(false, ENOENT);
2314
2315 if (!ops->hierarchies)
2316 return true;
2317
2318 len = strnprintf(pidstr, sizeof(pidstr), "%d", pid);
2319 if (len < 0)
2320 return false;
2321
2322 for (int i = 0; ops->hierarchies[i]; i++) {
2323 __do_free char *fullpath = NULL, *path = NULL;
2324 struct hierarchy *h = ops->hierarchies[i];
2325
2326 if (h->fs_type == UNIFIED_HIERARCHY) {
2327 ret = __cg_unified_attach(h, conf, name, lxcpath, pid,
2328 h->controllers[0]);
2329 if (ret < 0)
2330 return false;
2331
2332 continue;
2333 }
2334
2335 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2336 /* not running */
2337 if (!path)
2338 return false;
2339
2340 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2341 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2342 if (ret < 0)
2343 return log_error_errno(false, errno, "Failed to attach %d to %s",
2344 (int)pid, fullpath);
2345 }
2346
2347 return true;
2348 }
2349
2350 /* Called externally (e.g. from 'lxc-cgroup') to query cgroup limits. Here we
2351 * don't have a cgroup_data set up, so we ask the running container through the
2352 * commands API for the cgroup path.
2353 */
2354 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2355 char *value, size_t len, const char *name,
2356 const char *lxcpath)
2357 {
2358 __do_free char *path = NULL;
2359 __do_free char *controller = NULL;
2360 char *p;
2361 struct hierarchy *h;
2362 int ret = -1;
2363
2364 if (!ops)
2365 return ret_set_errno(-1, ENOENT);
2366
2367 controller = strdup(filename);
2368 if (!controller)
2369 return ret_errno(ENOMEM);
2370
2371 p = strchr(controller, '.');
2372 if (p)
2373 *p = '\0';
2374
2375 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2376 /* not running */
2377 if (!path)
2378 return -1;
2379
2380 h = get_hierarchy(ops, controller);
2381 if (h) {
2382 __do_free char *fullpath = NULL;
2383
2384 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2385 ret = lxc_read_from_file(fullpath, value, len);
2386 }
2387
2388 return ret;
2389 }
2390
2391 static int device_cgroup_parse_access(struct device_item *device, const char *val)
2392 {
2393 for (int count = 0; count < 3; count++, val++) {
2394 switch (*val) {
2395 case 'r':
2396 case 'w':
2397 case 'm':
2398 device->access[count] = *val;
2399 break;
2400 case '\n':
2401 case '\0':
2402 count = 3;
2403 break;
2404 default:
2405 return ret_errno(EINVAL);
2406 }
2407 }
2412
2413 return 0;
2414 }
2415
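/* Parse a devices cgroup rule as written to devices.allow/devices.deny, e.g.
 *	c 1:3 rwm
 * for character device 1:3 (/dev/null) with read, write, and mknod access.
 * The type is 'a', 'b', or 'c', major and minor are numeric or '*', and a
 * bare "a" is a global rule matching all devices.
 */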
2416 static int device_cgroup_rule_parse(struct device_item *device, const char *key,
2417 const char *val)
2418 {
2419 int count, ret;
2420 char temp[50];
2421
2422 if (strequal("devices.allow", key))
2423 device->allow = 1; /* allow the device */
2424 else
2425 device->allow = 0; /* deny the device */
2426
2427 if (strequal(val, "a")) {
2428 /* global rule */
2429 device->type = 'a';
2430 device->major = -1;
2431 device->minor = -1;
2432 return 0;
2433 }
2434
2435 switch (*val) {
2436 case 'a':
2437 __fallthrough;
2438 case 'b':
2439 __fallthrough;
2440 case 'c':
2441 device->type = *val;
2442 break;
2443 default:
2444 return -1;
2445 }
2446
2447 val++;
2448 if (!isspace(*val))
2449 return -1;
2450 val++;
2451 if (*val == '*') {
2452 device->major = -1;
2453 val++;
2454 } else if (isdigit(*val)) {
2455 memset(temp, 0, sizeof(temp));
2456 for (count = 0; count < sizeof(temp) - 1; count++) {
2457 temp[count] = *val;
2458 val++;
2459 if (!isdigit(*val))
2460 break;
2461 }
2462 ret = lxc_safe_int(temp, &device->major);
2463 if (ret)
2464 return -1;
2465 } else {
2466 return -1;
2467 }
2468 if (*val != ':')
2469 return -1;
2470 val++;
2471
2472 /* read minor */
2473 if (*val == '*') {
2474 device->minor = -1;
2475 val++;
2476 } else if (isdigit(*val)) {
2477 memset(temp, 0, sizeof(temp));
2478 for (count = 0; count < sizeof(temp) - 1; count++) {
2479 temp[count] = *val;
2480 val++;
2481 if (!isdigit(*val))
2482 break;
2483 }
2484 ret = lxc_safe_int(temp, &device->minor);
2485 if (ret)
2486 return -1;
2487 } else {
2488 return -1;
2489 }
2490 if (!isspace(*val))
2491 return -1;
2492
2493 return device_cgroup_parse_access(device, ++val);
2494 }
2495
2496 /* Called externally (e.g. from 'lxc-cgroup') to set new cgroup limits. Here we
2497 * don't have a cgroup_data set up, so we ask the running container through the
2498 * commands API for the cgroup path.
2499 */
2500 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2501 const char *key, const char *value,
2502 const char *name, const char *lxcpath)
2503 {
2504 __do_free char *path = NULL;
2505 __do_free char *controller = NULL;
2506 char *p;
2507 struct hierarchy *h;
2508 int ret = -1;
2509
2510 if (!ops || is_empty_string(key) || is_empty_string(value) ||
2511 is_empty_string(name) || is_empty_string(lxcpath))
2512 return ret_errno(EINVAL);
2513
2514 controller = strdup(key);
2515 if (!controller)
2516 return ret_errno(ENOMEM);
2517
2518 p = strchr(controller, '.');
2519 if (p)
2520 *p = '\0';
2521
2522 if (pure_unified_layout(ops) && strequal(controller, "devices")) {
2523 struct device_item device = {};
2524
2525 ret = device_cgroup_rule_parse(&device, key, value);
2526 if (ret < 0)
2527 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s",
2528 key, value);
2529
2530 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
2531 if (ret < 0)
2532 return -1;
2533
2534 return 0;
2535 }
2536
2537 path = lxc_cmd_get_limiting_cgroup_path(name, lxcpath, controller);
2538 /* not running */
2539 if (!path)
2540 return -1;
2541
2542 h = get_hierarchy(ops, controller);
2543 if (h) {
2544 __do_free char *fullpath = NULL;
2545
2546 fullpath = build_full_cgpath_from_monitorpath(h, path, key);
2547 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2548 }
2549
2550 return ret;
2551 }
2552
2553 /* Take a devices cgroup line of the form
2554 * /dev/foo rwx
2555 * and parse it into @device so it can be rendered as a valid
2556 * type major:minor mode
2557 * line. Return <0 on error. (convert_devpath() below does the rendering;
2558 * its dest buffer must be preallocated and long enough to hold the output.)
2559 */
2560 static int device_cgroup_rule_parse_devpath(struct device_item *device,
2561 const char *devpath)
2562 {
2563 __do_free char *path = NULL;
2564 char *mode = NULL;
2565 int n_parts, ret;
2566 char *p;
2567 struct stat sb;
2568
2569 path = strdup(devpath);
2570 if (!path)
2571 return ret_errno(ENOMEM);
2572
2573 /*
2574 * Read path followed by mode. Ignore any trailing text.
2575 * A ' # comment' would be legal. Technically other text is not
2576 * legal, we could check for that if we cared to.
2577 */
2578 for (n_parts = 1, p = path; *p; p++) {
2579 if (*p != ' ')
2580 continue;
2581 *p = '\0';
2582
2583 if (n_parts != 1)
2584 break;
2585 p++;
2586 n_parts++;
2587
2588 while (*p == ' ')
2589 p++;
2590
2591 mode = p;
2592
2593 if (*p == '\0')
2594 return ret_set_errno(-1, EINVAL);
2595 }
2596
2597 if (!mode)
2598 return ret_errno(EINVAL);
2599
2600 if (device_cgroup_parse_access(device, mode) < 0)
2601 return -1;
2602
2603 ret = stat(path, &sb);
2604 if (ret < 0)
2605 return ret_set_errno(-1, errno);
2606
2607 mode_t m = sb.st_mode & S_IFMT;
2608 switch (m) {
2609 case S_IFBLK:
2610 device->type = 'b';
2611 break;
2612 case S_IFCHR:
2613 device->type = 'c';
2614 break;
2615 default:
2616 return log_error_errno(-1, EINVAL, "Unsupported device type %i for \"%s\"", m, path);
2617 }
2618
2619 device->major = MAJOR(sb.st_rdev);
2620 device->minor = MINOR(sb.st_rdev);
2621 device->allow = 1;
2622
2623 return 0;
2624 }
2625
2626 static int convert_devpath(const char *invalue, char *dest)
2627 {
2628 struct device_item device = {};
2629 int ret;
2630
2631 ret = device_cgroup_rule_parse_devpath(&device, invalue);
2632 if (ret < 0)
2633 return -1;
2634
2635 ret = strnprintf(dest, 50, "%c %d:%d %s", device.type, device.major,
2636 device.minor, device.access);
2637 if (ret < 0)
2638 return log_error_errno(ret, -ret,
2639 "Error on configuration value \"%c %d:%d %s\" (max 50 chars)",
2640 device.type, device.major, device.minor,
2641 device.access);
2642
2643 return 0;
2644 }
2645
2646 /* Called from setup_limits - here we have the container's cgroup_data because
2647 * we created the cgroups.
2648 */
2649 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2650 const char *value, bool is_cpuset)
2651 {
2652 __do_free char *controller = NULL;
2653 char *p;
2654 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2655 char converted_value[50];
2656 struct hierarchy *h;
2657
2658 controller = strdup(filename);
2659 if (!controller)
2660 return ret_errno(ENOMEM);
2661
2662 p = strchr(controller, '.');
2663 if (p)
2664 *p = '\0';
2665
2666 if (strequal("devices.allow", filename) && value[0] == '/') {
2667 int ret;
2668
2669 ret = convert_devpath(value, converted_value);
2670 if (ret < 0)
2671 return ret;
2672 value = converted_value;
2673 }
2674
2675 h = get_hierarchy(ops, controller);
2676 if (!h)
2677 return log_error_errno(-ENOENT, ENOENT, "Failed to setup limits for the \"%s\" controller. The controller seems to be unused by \"cgfsng\" cgroup driver or not enabled on the cgroup hierarchy", controller);
2678
2679 if (is_cpuset) {
2680 int ret = lxc_write_openat(h->path_con, filename, value, strlen(value));
2681 if (ret)
2682 return ret;
2683 }
2684 return lxc_write_openat(h->path_lim, filename, value, strlen(value));
2685 }
2686
2687 __cgfsng_ops static bool cgfsng_setup_limits_legacy(struct cgroup_ops *ops,
2688 struct lxc_conf *conf,
2689 bool do_devices)
2690 {
2691 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2692 struct lxc_list *cgroup_settings;
2693 struct lxc_list *iterator, *next;
2694 struct lxc_cgroup *cg;
2695 bool ret = false;
2696
2697 if (!ops)
2698 return ret_set_errno(false, ENOENT);
2699
2700 if (!conf)
2701 return ret_set_errno(false, EINVAL);
2702
2703 cgroup_settings = &conf->cgroup;
2704 if (lxc_list_empty(cgroup_settings))
2705 return true;
2706
2707 if (!ops->hierarchies)
2708 return ret_set_errno(false, EINVAL);
2709
2710 if (pure_unified_layout(ops))
2711 return log_warn_errno(true, EINVAL, "Ignoring legacy cgroup limits on pure cgroup2 system");
2712
2713 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2714 if (!sorted_cgroup_settings)
2715 return false;
2716
2717 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2718 cg = iterator->elem;
2719
2720 if (do_devices == strnequal("devices", cg->subsystem, 7)) {
2721 if (cg_legacy_set_data(ops, cg->subsystem, cg->value, strnequal("cpuset", cg->subsystem, 6))) {
2722 if (do_devices && (errno == EACCES || errno == EPERM)) {
2723 SYSWARN("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2724 continue;
2725 }
2726 SYSERROR("Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2727 goto out;
2728 }
2729 DEBUG("Set controller \"%s\" to \"%s\"", cg->subsystem, cg->value);
2730 }
2731 }
2732
2733 ret = true;
2734 INFO("Limits for the legacy cgroup hierarchies have been set up");
2735 out:
2736 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2737 lxc_list_del(iterator);
2738 free(iterator);
2739 }
2740
2741 return ret;
2742 }
2743
2744 /*
2745 * Some of the parsing logic comes from the original cgroup device v1
2746 * implementation in the kernel.
2747 */
2748 static int bpf_device_cgroup_prepare(struct cgroup_ops *ops,
2749 struct lxc_conf *conf, const char *key,
2750 const char *val)
2751 {
2752 struct device_item device_item = {};
2753 int ret;
2754
2755 if (strequal("devices.allow", key) && abspath(val))
2756 ret = device_cgroup_rule_parse_devpath(&device_item, val);
2757 else
2758 ret = device_cgroup_rule_parse(&device_item, key, val);
2759 if (ret < 0)
2760 return syserrno_set(EINVAL, "Failed to parse device rule %s=%s", key, val);
2761
2762 /*
2763 * Note that bpf_list_add_device() returns 1 if it altered the device
2764 * list and 0 if it didn't; both return values indicate success.
2765 * Only a negative return value indicates an error.
2766 */
2767 ret = bpf_list_add_device(&conf->bpf_devices, &device_item);
2768 if (ret < 0)
2769 return -1;
2770
2771 return 0;
2772 }
2773
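/* Apply lxc.cgroup2.* settings on the unified hierarchy. Device rules are
 * not written to a cgroup file; they are collected into the bpf device list
 * and attached later via cgfsng_devices_activate().
 */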
2774 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2775 struct lxc_handler *handler)
2776 {
2777 struct lxc_list *cgroup_settings, *iterator;
2778 struct hierarchy *h;
2779 struct lxc_conf *conf;
2780
2781 if (!ops)
2782 return ret_set_errno(false, ENOENT);
2783
2784 if (!ops->hierarchies)
2785 return true;
2786
2787 if (!ops->container_cgroup)
2788 return ret_set_errno(false, EINVAL);
2789
2790 if (!handler || !handler->conf)
2791 return ret_set_errno(false, EINVAL);
2792 conf = handler->conf;
2793
2794 cgroup_settings = &conf->cgroup2;
2795 if (lxc_list_empty(cgroup_settings))
2796 return true;
2797
2798 if (!pure_unified_layout(ops))
2799 return log_warn_errno(true, EINVAL, "Ignoring cgroup2 limits on legacy cgroup system");
2800
2801 if (!ops->unified)
2802 return false;
2803 h = ops->unified;
2804
2805 lxc_list_for_each (iterator, cgroup_settings) {
2806 struct lxc_cgroup *cg = iterator->elem;
2807 int ret;
2808
2809 if (strnequal("devices", cg->subsystem, 7))
2810 ret = bpf_device_cgroup_prepare(ops, conf, cg->subsystem, cg->value);
2811 else
2812 ret = lxc_write_openat(h->path_lim, cg->subsystem, cg->value, strlen(cg->value));
2813 if (ret < 0)
2814 return log_error_errno(false, errno, "Failed to set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2815
2816 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2817 }
2818
2819 return log_info(true, "Limits for the unified cgroup hierarchy have been set up");
2820 }
2821
2822 __cgfsng_ops static bool cgfsng_devices_activate(struct cgroup_ops *ops, struct lxc_handler *handler)
2823 {
2824 struct lxc_conf *conf;
2825 struct hierarchy *unified;
2826
2827 if (!ops)
2828 return ret_set_errno(false, ENOENT);
2829
2830 if (!ops->hierarchies)
2831 return true;
2832
2833 if (!ops->container_cgroup)
2834 return ret_set_errno(false, EEXIST);
2835
2836 if (!handler || !handler->conf)
2837 return ret_set_errno(false, EINVAL);
2838 conf = handler->conf;
2839
2840 unified = ops->unified;
2841 if (!unified || !device_utility_controller(unified) ||
2842 !unified->path_con ||
2843 lxc_list_empty(&(conf->bpf_devices).device_item))
2844 return true;
2845
2846 return bpf_cgroup_devices_attach(ops, &conf->bpf_devices);
2847 }
2848
2849 static bool __cgfsng_delegate_controllers(struct cgroup_ops *ops, const char *cgroup)
2850 {
2851 __do_close int dfd_final = -EBADF;
2852 __do_free char *add_controllers = NULL, *copy = NULL;
2853 size_t full_len = 0;
2854 struct hierarchy *unified;
2855 int dfd_cur, ret;
2856 char *cur;
2857 char **it;
2858
2859 if (!ops->hierarchies || !pure_unified_layout(ops))
2860 return true;
2861
2862 unified = ops->unified;
2863 if (!unified->controllers[0])
2864 return true;
2865
2866 /* For now we simply enable all controllers that we have detected by
2867 * creating a string like "+memory +pids +cpu +io".
2868 * TODO: In the near future we might want to support "-<controller>"
2869 * etc. but whether supporting semantics like this make sense will need
2870 * some thinking.
2871 */
2872 for (it = unified->controllers; it && *it; it++) {
2873 full_len += strlen(*it) + 2;
2874 add_controllers = must_realloc(add_controllers, full_len + 1);
2875
2876 if (unified->controllers[0] == *it)
2877 add_controllers[0] = '\0';
2878
2879 (void)strlcat(add_controllers, "+", full_len + 1);
2880 (void)strlcat(add_controllers, *it, full_len + 1);
2881
2882 if (*(it + 1))
2883 (void)strlcat(add_controllers, " ", full_len + 1);
2884 }
2885
2886 copy = strdup(cgroup);
2887 if (!copy)
2888 return false;
2889
2890 /*
2891 * Placing the write to cgroup.subtree_control before the open() is
2892 * intentional because of the cgroup2 delegation model. It enforces
2893 * that leaf cgroups don't have any controllers enabled for delegation.
2894 */
2895 dfd_cur = unified->dfd_base;
2896 lxc_iterate_parts(cur, copy, "/") {
2897 /*
2898 * Even though we vetted the paths when we parsed the config
2899 * we're paranoid here and check that the path is neither
2900 * absolute nor walks upwards.
2901 */
2902 if (abspath(cur))
2903 return log_error_errno(false, EINVAL, "No absolute paths allowed");
2904
2905 if (strnequal(cur, "..", STRLITERALLEN("..")))
2906 return log_error_errno(false, EINVAL, "No upward walking paths allowed");
2907
2908 ret = lxc_writeat(dfd_cur, "cgroup.subtree_control", add_controllers, full_len);
2909 if (ret < 0)
2910 return syserrno(false, "Could not enable \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2911
2912 TRACE("Enabled \"%s\" controllers in the unified cgroup %d", add_controllers, dfd_cur);
2913
2914 dfd_final = open_at(dfd_cur, cur, PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_BENEATH, 0);
2915 if (dfd_final < 0)
2916 return syserrno(false, "Failed to open directory %d(%s)", dfd_cur, cur);
2917 if (dfd_cur != unified->dfd_base)
2918 close(dfd_cur);
2919 /*
2920 * Leave dfd_final pointing to the last fd we opened so
2921 * it will be automatically zapped if we return early.
2922 */
2923 dfd_cur = dfd_final;
2924 }
2925
2926 return true;
2927 }
2928
2929 __cgfsng_ops static bool cgfsng_monitor_delegate_controllers(struct cgroup_ops *ops)
2930 {
2931 if (!ops)
2932 return ret_set_errno(false, ENOENT);
2933
2934 return __cgfsng_delegate_controllers(ops, ops->monitor_cgroup);
2935 }
2936
2937 __cgfsng_ops static bool cgfsng_payload_delegate_controllers(struct cgroup_ops *ops)
2938 {
2939 if (!ops)
2940 return ret_set_errno(false, ENOENT);
2941
2942 return __cgfsng_delegate_controllers(ops, ops->container_cgroup);
2943 }
2944
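/* A /proc/<pid>/cgroup entry for the unified hierarchy carries hierarchy
 * ID 0, i.e. the line starts with "0::".
 */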
2945 static inline bool unified_cgroup(const char *line)
2946 {
2947 return *line == '0';
2948 }
2949
2950 static inline char *current_unified_cgroup(bool relative, char *line)
2951 {
2952 char *current_cgroup;
2953
2954 line += STRLITERALLEN("0::");
2955
2956 if (!abspath(line))
2957 return ERR_PTR(-EINVAL);
2958
2959 /* remove init.scope */
2960 if (!relative)
2961 line = prune_init_scope(line);
2962
2963 /* create a relative path */
2964 line = deabs(line);
2965
2966 current_cgroup = strdup(line);
2967 if (!current_cgroup)
2968 return ERR_PTR(-ENOMEM);
2969
2970 return current_cgroup;
2971 }
2972
2973 static inline const char *unprefix(const char *controllers)
2974 {
2975 if (strnequal(controllers, "name=", STRLITERALLEN("name=")))
2976 return controllers + STRLITERALLEN("name=");
2977 return controllers;
2978 }
2979
2980 static int __list_cgroup_delegate(char ***delegate)
2981 {
2982 __do_free char **list = NULL;
2983 __do_free char *buf = NULL;
2984 char *standard[] = {
2985 "cgroup.procs",
2986 "cgroup.threads",
2987 "cgroup.subtree_control",
2988 "memory.oom.group",
2989 NULL,
2990 };
2991 char *token;
2992 int ret;
2993
2994 buf = read_file_at(-EBADF, "/sys/kernel/cgroup/delegate", PROTECT_OPEN, 0);
2995 if (!buf) {
2996 for (char **p = standard; p && *p; p++) {
2997 ret = list_add_string(&list, *p);
2998 if (ret < 0)
2999 return ret;
3000 }
3001
3002 *delegate = move_ptr(list);
3003 return syswarn(0, "Failed to read /sys/kernel/cgroup/delegate");
3004 }
3005
3006 lxc_iterate_parts(token, buf, " \t\n") {
3007 /*
3008 * We always need to chown this for both cgroup and
3009 * cgroup2.
3010 */
3011 if (strequal(token, "cgroup.procs"))
3012 continue;
3013
3014 ret = list_add_string(&list, token);
3015 if (ret < 0)
3016 return ret;
3017 }
3018
3019 *delegate = move_ptr(list);
3020 return 0;
3021 }
3022
3023 static bool unified_hierarchy_delegated(int dfd_base, char ***ret_files)
3024 {
3025 __do_free_string_list char **list = NULL;
3026 int ret;
3027
3028 ret = __list_cgroup_delegate(&list);
3029 if (ret < 0)
3030 return syserrno(false, "Failed to determine unified cgroup delegation requirements");
3031
3032 for (char *const *s = list; s && *s; s++) {
3033 if (!faccessat(dfd_base, *s, W_OK, 0) || errno == ENOENT)
3034 continue;
3035
3036 return sysinfo(false, "The %s file is not writable, skipping unified hierarchy", *s);
3037 }
3038
3039 *ret_files = move_ptr(list);
3040 return true;
3041 }
3042
3043 static bool legacy_hierarchy_delegated(int dfd_base)
3044 {
3045 if (faccessat(dfd_base, "cgroup.procs", W_OK, 0) && errno != ENOENT)
3046 return sysinfo(false, "The cgroup.procs file is not writable, skipping legacy hierarchy");
3047
3048 return true;
3049 }
3050
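/* Detect all usable hierarchies by parsing /proc/1/cgroup (or
 * /proc/self/cgroup for relative or unprivileged configurations). Each line
 * has the form "<id>:<controller-list>:<cgroup-path>" where an ID of 0
 * denotes the unified hierarchy and anything else a legacy one. Hierarchies
 * that are not delegated to us are skipped.
 */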
3051 static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
3052 bool unprivileged)
3053 {
3054 __do_free char *cgroup_info = NULL;
3055 char *it;
3056
3057 /*
3058 * Root spawned containers escape the current cgroup, so use init's
3059 * cgroups as our base in that case.
3060 */
3061 if (!relative && (geteuid() == 0))
3062 cgroup_info = read_file_at(-EBADF, "/proc/1/cgroup", PROTECT_OPEN, 0);
3063 else
3064 cgroup_info = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
3065 if (!cgroup_info)
3066 return ret_errno(ENOMEM);
3067
3068 lxc_iterate_parts(it, cgroup_info, "\n") {
3069 __do_close int dfd_base = -EBADF, dfd_mnt = -EBADF;
3070 __do_free char *controllers = NULL, *current_cgroup = NULL;
3071 __do_free_string_list char **controller_list = NULL,
3072 **delegate = NULL;
3073 char *line;
3074 int dfd, ret, type;
3075
3076 /* Handle the unified cgroup hierarchy. */
3077 line = it;
3078 if (unified_cgroup(line)) {
3079 char *unified_mnt;
3080
3081 type = UNIFIED_HIERARCHY;
3082
3083 current_cgroup = current_unified_cgroup(relative, line);
3084 if (IS_ERR(current_cgroup))
3085 return PTR_ERR(current_cgroup);
3086
3087 if (unified_cgroup_fd(ops->dfd_mnt)) {
3088 dfd_mnt = dup_cloexec(ops->dfd_mnt);
3089 unified_mnt = "";
3090 } else {
3091 dfd_mnt = open_at(ops->dfd_mnt,
3092 "unified",
3093 PROTECT_OPATH_DIRECTORY,
3094 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3095 unified_mnt = "unified";
3096 }
3097 if (dfd_mnt < 0) {
3098 if (errno != ENOENT)
3099 return syserrno(-errno, "Failed to open %d/unified", ops->dfd_mnt);
3100
3101 SYSTRACE("Unified cgroup not mounted");
3102 continue;
3103 }
3104 dfd = dfd_mnt;
3105
3106 if (!is_empty_string(current_cgroup)) {
3107 dfd_base = open_at(dfd_mnt, current_cgroup,
3108 PROTECT_OPATH_DIRECTORY,
3109 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3110 if (dfd_base < 0)
3111 return syserrno(-errno, "Failed to open %d/%s", dfd_mnt, current_cgroup);
3112 dfd = dfd_base;
3113 }
3114
3115 if (!unified_hierarchy_delegated(dfd, &delegate))
3116 continue;
3117
3118 controller_list = unified_controllers(dfd, "cgroup.controllers");
3119 if (!controller_list) {
3120 TRACE("No controllers are enabled for delegation in the unified hierarchy");
3121 controller_list = list_new();
3122 if (!controller_list)
3123 return syserrno(-ENOMEM, "Failed to create empty controller list");
3124 }
3125
3126 controllers = strdup(unified_mnt);
3127 if (!controllers)
3128 return ret_errno(ENOMEM);
3129 } else {
3130 char *__controllers, *__current_cgroup;
3131
3132 type = LEGACY_HIERARCHY;
3133
3134 __controllers = strchr(line, ':');
3135 if (!__controllers)
3136 return ret_errno(EINVAL);
3137 __controllers++;
3138
3139 __current_cgroup = strchr(__controllers, ':');
3140 if (!__current_cgroup)
3141 return ret_errno(EINVAL);
3142 *__current_cgroup = '\0';
3143 __current_cgroup++;
3144
3145 controllers = strdup(unprefix(__controllers));
3146 if (!controllers)
3147 return ret_errno(ENOMEM);
3148
3149 dfd_mnt = open_at(ops->dfd_mnt,
3150 controllers, PROTECT_OPATH_DIRECTORY,
3151 PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3152 if (dfd_mnt < 0) {
3153 if (errno != ENOENT)
3154 return syserrno(-errno, "Failed to open %d/%s",
3155 ops->dfd_mnt, controllers);
3156
3157 SYSTRACE("%s not mounted", controllers);
3158 continue;
3159 }
3160 dfd = dfd_mnt;
3161
3162 if (!abspath(__current_cgroup))
3163 return ret_errno(EINVAL);
3164
3165 /* remove init.scope */
3166 if (!relative)
3167 __current_cgroup = prune_init_scope(__current_cgroup);
3168
3169 /* create a relative path */
3170 __current_cgroup = deabs(__current_cgroup);
3171
3172 current_cgroup = strdup(__current_cgroup);
3173 if (!current_cgroup)
3174 return ret_errno(ENOMEM);
3175
3176 if (!is_empty_string(current_cgroup)) {
3177 dfd_base = open_at(dfd_mnt, current_cgroup,
3178 PROTECT_OPATH_DIRECTORY,
3179 PROTECT_LOOKUP_BENEATH_XDEV, 0);
3180 if (dfd_base < 0)
3181 return syserrno(-errno, "Failed to open %d/%s",
3182 dfd_mnt, current_cgroup);
3183 dfd = dfd_base;
3184 }
3185
3186 if (!legacy_hierarchy_delegated(dfd))
3187 continue;
3188
3189 /*
3190 * We intentionally pass __controllers here and not controllers
3191 * because the latter is used as the mountpoint name below and
3192 * would otherwise get chopped up.
3193 */
3194 controller_list = list_add_controllers(__controllers);
3195 if (!controller_list)
3196 return syserrno(-ENOMEM, "Failed to create controller list from %s", __controllers);
3197
3198 if (skip_hierarchy(ops, controller_list))
3199 continue;
3200
3201 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
3202 }
3203
3204 ret = cgroup_hierarchy_add(ops, dfd_mnt, controllers, dfd,
3205 current_cgroup, controller_list, type);
3206 if (ret < 0)
3207 return syserrno(ret, "Failed to add %s hierarchy", controllers);
3208
3209 /* Transfer ownership. */
3210 move_fd(dfd_mnt);
3211 move_fd(dfd_base);
3212 move_ptr(current_cgroup);
3213 move_ptr(controllers);
3214 move_ptr(controller_list);
3215 if (type == UNIFIED_HIERARCHY)
3216 ops->unified->delegate = move_ptr(delegate);
3217 }
3218
3219 /* determine cgroup layout */
3220 if (ops->unified) {
3221 if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
3222 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
3223 } else {
3224 if (bpf_devices_cgroup_supported())
3225 ops->unified->utilities |= DEVICES_CONTROLLER;
3226 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
3227 }
3228 }
3229
3230 if (!controllers_available(ops))
3231 return syserrno_set(-ENOENT, "One or more requested controllers unavailable or not delegated");
3232
3233 return 0;
3234 }
3235
3236 static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
3237 {
3238 __do_close int dfd = -EBADF;
3239 int ret;
3240 const char *controllers_use;
3241
3242 if (ops->dfd_mnt >= 0)
3243 return ret_errno(EBUSY);
3244
3245 /*
3246 * I don't see the need for allowing symlinks here. If users want to
3247 * have their hierarchy available in different locations I strongly
3248 * suggest bind-mounts.
3249 */
3250 dfd = open_at(-EBADF, DEFAULT_CGROUP_MOUNTPOINT,
3251 PROTECT_OPATH_DIRECTORY, PROTECT_LOOKUP_ABSOLUTE_XDEV, 0);
3252 if (dfd < 0)
3253 return syserrno(-errno, "Failed to open " DEFAULT_CGROUP_MOUNTPOINT);
3254
3255 controllers_use = lxc_global_config_value("lxc.cgroup.use");
3256 if (controllers_use) {
3257 __do_free char *dup = NULL;
3258 char *it;
3259
3260 dup = strdup(controllers_use);
3261 if (!dup)
3262 return -errno;
3263
3264 lxc_iterate_parts(it, dup, ",") {
3265 ret = list_add_string(&ops->cgroup_use, it);
3266 if (ret < 0)
3267 return ret;
3268 }
3269 }
3270
3271 /*
3272 * Keep dfd referenced by the cleanup function and actually move the fd
3273 * once we know the initialization succeeded. So if we fail we clean up
3274 * the dfd.
3275 */
3276 ops->dfd_mnt = dfd;
3277
3278 ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !lxc_list_empty(&conf->id_map));
3279 if (ret < 0)
3280 return syserrno(ret, "Failed to initialize cgroups");
3281
3282 /* Transfer ownership to cgroup_ops. */
3283 move_fd(dfd);
3284 return 0;
3285 }
3286
3287 __cgfsng_ops static int cgfsng_data_init(struct cgroup_ops *ops)
3288 {
3289 const char *cgroup_pattern;
3290
3291 if (!ops)
3292 return ret_set_errno(-1, ENOENT);
3293
3294 /* copy system-wide cgroup information */
3295 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
3296 if (cgroup_pattern && !strequal(cgroup_pattern, "")) {
3297 ops->cgroup_pattern = strdup(cgroup_pattern);
3298 if (!ops->cgroup_pattern)
3299 return ret_errno(ENOMEM);
3300 }
3301
3302 return 0;
3303 }
3304
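/* Entry point for the cgfsng driver: allocate the ops struct, detect the
 * host's cgroup layout, and wire up the method table. Returns NULL on
 * failure.
 */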
3305 struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
3306 {
3307 __do_free struct cgroup_ops *cgfsng_ops = NULL;
3308
3309 cgfsng_ops = zalloc(sizeof(struct cgroup_ops));
3310 if (!cgfsng_ops)
3311 return ret_set_errno(NULL, ENOMEM);
3312
3313 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
3314 cgfsng_ops->dfd_mnt = -EBADF;
3315
3316 if (initialize_cgroups(cgfsng_ops, conf))
3317 return NULL;
3318
3319 cgfsng_ops->data_init = cgfsng_data_init;
3320 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
3321 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
3322 cgfsng_ops->monitor_create = cgfsng_monitor_create;
3323 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
3324 cgfsng_ops->monitor_delegate_controllers = cgfsng_monitor_delegate_controllers;
3325 cgfsng_ops->payload_delegate_controllers = cgfsng_payload_delegate_controllers;
3326 cgfsng_ops->payload_create = cgfsng_payload_create;
3327 cgfsng_ops->payload_enter = cgfsng_payload_enter;
3328 cgfsng_ops->payload_finalize = cgfsng_payload_finalize;
3329 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
3330 cgfsng_ops->get = cgfsng_get;
3331 cgfsng_ops->set = cgfsng_set;
3332 cgfsng_ops->freeze = cgfsng_freeze;
3333 cgfsng_ops->unfreeze = cgfsng_unfreeze;
3334 cgfsng_ops->setup_limits_legacy = cgfsng_setup_limits_legacy;
3335 cgfsng_ops->setup_limits = cgfsng_setup_limits;
3336 cgfsng_ops->driver = "cgfsng";
3337 cgfsng_ops->version = "1.0.0";
3338 cgfsng_ops->attach = cgfsng_attach;
3339 cgfsng_ops->chown = cgfsng_chown;
3340 cgfsng_ops->mount = cgfsng_mount;
3341 cgfsng_ops->devices_activate = cgfsng_devices_activate;
3342 cgfsng_ops->get_limiting_cgroup = cgfsng_get_limiting_cgroup;
3343
3344 cgfsng_ops->criu_escape = cgfsng_criu_escape;
3345 cgfsng_ops->criu_num_hierarchies = cgfsng_criu_num_hierarchies;
3346 cgfsng_ops->criu_get_hierarchies = cgfsng_criu_get_hierarchies;
3347
3348 return move_ptr(cgfsng_ops);
3349 }
3350
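/* Attach @pid to the container's unified cgroup using the cgroup2 fd handed
 * out over the container's command socket. Returns -ENOCGROUP2 when no
 * unified cgroup is available so that callers such as __cg_unified_attach()
 * can fall back to path-based attachment.
 */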
3351 int cgroup_attach(const struct lxc_conf *conf, const char *name,
3352 const char *lxcpath, pid_t pid)
3353 {
3354 __do_close int unified_fd = -EBADF;
3355 int ret;
3356
3357 if (!conf || is_empty_string(name) || is_empty_string(lxcpath) || pid <= 0)
3358 return ret_errno(EINVAL);
3359
3360 unified_fd = lxc_cmd_get_cgroup2_fd(name, lxcpath);
3361 if (unified_fd < 0)
3362 return ret_errno(ENOCGROUP2);
3363
3364 if (!lxc_list_empty(&conf->id_map)) {
3365 struct userns_exec_unified_attach_data args = {
3366 .conf = conf,
3367 .unified_fd = unified_fd,
3368 .pid = pid,
3369 };
3370
3371 ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
3372 if (ret < 0)
3373 return -errno;
3374
3375 ret = userns_exec_minimal(conf,
3376 cgroup_unified_attach_parent_wrapper,
3377 &args,
3378 cgroup_unified_attach_child_wrapper,
3379 &args);
3380 } else {
3381 ret = cgroup_attach_leaf(conf, unified_fd, pid);
3382 }
3383
3384 return ret;
3385 }
3386
3387 /* Connects to the command socket and therefore isn't callable from a command handler. */
3388 int cgroup_get(const char *name, const char *lxcpath,
3389 const char *filename, char *buf, size_t len)
3390 {
3391 __do_close int unified_fd = -EBADF;
3392 ssize_t ret;
3393
3394 if (is_empty_string(filename) || is_empty_string(name) ||
3395 is_empty_string(lxcpath))
3396 return ret_errno(EINVAL);
3397
3398 if ((buf && !len) || (len && !buf))
3399 return ret_errno(EINVAL);
3400
3401 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3402 if (unified_fd < 0)
3403 return ret_errno(ENOCGROUP2);
3404
3405 ret = lxc_read_try_buf_at(unified_fd, filename, buf, len);
3406 if (ret < 0)
3407 SYSERROR("Failed to read cgroup value");
3408
3409 return ret;
3410 }
3411
3412 /* Connects to the command socket and therefore isn't callable from a command handler. */
3413 int cgroup_set(const char *name, const char *lxcpath,
3414 const char *filename, const char *value)
3415 {
3416 __do_close int unified_fd = -EBADF;
3417 ssize_t ret;
3418
3419 if (is_empty_string(filename) || is_empty_string(value) ||
3420 is_empty_string(name) || is_empty_string(lxcpath))
3421 return ret_errno(EINVAL);
3422
3423 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3424 if (unified_fd < 0)
3425 return ret_errno(ENOCGROUP2);
3426
3427 if (strnequal(filename, "devices.", STRLITERALLEN("devices."))) {
3428 struct device_item device = {};
3429
3430 ret = device_cgroup_rule_parse(&device, filename, value);
3431 if (ret < 0)
3432 return log_error_errno(-1, EINVAL, "Failed to parse device string %s=%s", filename, value);
3433
3434 ret = lxc_cmd_add_bpf_device_cgroup(name, lxcpath, &device);
3435 } else {
3436 ret = lxc_writeat(unified_fd, filename, value, strlen(value));
3437 }
3438
3439 return ret;
3440 }
3441
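/* Like cg_unified_freeze_do() above but operates on a caller-provided
 * cgroup2 directory file descriptor instead of the cgroup_ops state.
 */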
3442 static int do_cgroup_freeze(int unified_fd,
3443 const char *state_string,
3444 int state_num,
3445 int timeout,
3446 const char *epoll_error,
3447 const char *wait_error)
3448 {
3449 __do_close int events_fd = -EBADF;
3450 call_cleaner(lxc_mainloop_close) struct lxc_epoll_descr *descr_ptr = NULL;
3451 int ret;
3452 struct lxc_epoll_descr descr = {};
3453
3454 if (timeout != 0) {
3455 ret = lxc_mainloop_open(&descr);
3456 if (ret)
3457 return log_error_errno(-1, errno, "%s", epoll_error);
3458
3459 /* automatically cleaned up now */
3460 descr_ptr = &descr;
3461
3462 events_fd = open_at(unified_fd, "cgroup.events", PROTECT_OPEN, PROTECT_LOOKUP_BENEATH, 0);
3463 if (events_fd < 0)
3464 return log_error_errno(-errno, errno, "Failed to open cgroup.events file");
3465
3466 ret = lxc_mainloop_add_handler_events(&descr, events_fd, EPOLLPRI, freezer_cgroup_events_cb, INT_TO_PTR(state_num));
3467 if (ret < 0)
3468 return log_error_errno(-1, errno, "Failed to add cgroup.events fd handler to mainloop");
3469 }
3470
3471 ret = lxc_writeat(unified_fd, "cgroup.freeze", state_string, 1);
3472 if (ret < 0)
3473 return log_error_errno(-1, errno, "Failed to write to cgroup.freeze file");
3474
3475 if (timeout != 0) {
3476 ret = lxc_mainloop(&descr, timeout);
3477 if (ret)
3478 return log_error_errno(-1, errno, "%s", wait_error);
3479 }
3480
3481 return log_trace(0, "Container now %s", (state_num == 1) ? "frozen" : "unfrozen");
3482 }
3483
3484 static inline int __cgroup_freeze(int unified_fd, int timeout)
3485 {
3486 return do_cgroup_freeze(unified_fd, "1", 1, timeout,
3487 "Failed to create epoll instance to wait for container freeze",
3488 "Failed to wait for container to be frozen");
3489 }
3490
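/* Freeze a container by name. State listeners are notified of the FREEZING
 * attempt up front and then told the outcome: FROZEN on success, RUNNING on
 * failure.
 *
 * A minimal usage sketch (assuming a container named "c1" under the default
 * lxcpath; the timeout is forwarded to lxc_mainloop()):
 *
 *	int ret;
 *
 *	ret = cgroup_freeze("c1", lxc_global_config_value("lxc.lxcpath"), 5000);
 *	if (ret < 0)
 *		SYSERROR("Failed to freeze container \"c1\"");
 */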
3491 int cgroup_freeze(const char *name, const char *lxcpath, int timeout)
3492 {
3493 __do_close int unified_fd = -EBADF;
3494 int ret;
3495
3496 if (is_empty_string(name) || is_empty_string(lxcpath))
3497 return ret_errno(EINVAL);
3498
3499 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3500 if (unified_fd < 0)
3501 return ret_errno(ENOCGROUP2);
3502
3503 lxc_cmd_notify_state_listeners(name, lxcpath, FREEZING);
3504 ret = __cgroup_freeze(unified_fd, timeout);
3505 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? FROZEN : RUNNING);
3506 return ret;
3507 }
3508
3509 int __cgroup_unfreeze(int unified_fd, int timeout)
3510 {
3511 return do_cgroup_freeze(unified_fd, "0", 0, timeout,
3512 "Failed to create epoll instance to wait for container freeze",
3513 "Failed to wait for container to be frozen");
3514 }
3515
3516 int cgroup_unfreeze(const char *name, const char *lxcpath, int timeout)
3517 {
3518 __do_close int unified_fd = -EBADF;
3519 int ret;
3520
3521 if (is_empty_string(name) || is_empty_string(lxcpath))
3522 return ret_errno(EINVAL);
3523
3524 unified_fd = lxc_cmd_get_limiting_cgroup2_fd(name, lxcpath);
3525 if (unified_fd < 0)
3526 return ret_errno(ENOCGROUP2);
3527
3528 lxc_cmd_notify_state_listeners(name, lxcpath, THAWED);
3529 ret = __cgroup_unfreeze(unified_fd, timeout);
3530 lxc_cmd_notify_state_listeners(name, lxcpath, !ret ? RUNNING : FROZEN);
3531 return ret;
3532 }