1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfsng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
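/*
 * For example, on a typical hybrid layout this means mountpoints such as:
 *
 *   /sys/fs/cgroup/cpu,cpuacct
 *   /sys/fs/cgroup/net_cls,net_prio
 *   /sys/fs/cgroup/systemd        (the named "name=systemd" hierarchy)
 *   /sys/fs/cgroup/unified        (the cgroup2 hierarchy)
 */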
36
37 #ifndef _GNU_SOURCE
38 #define _GNU_SOURCE 1
39 #endif
40 #include <ctype.h>
41 #include <dirent.h>
42 #include <errno.h>
43 #include <grp.h>
44 #include <linux/kdev_t.h>
45 #include <linux/types.h>
46 #include <stdint.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <sys/types.h>
51 #include <unistd.h>
52
53 #include "caps.h"
54 #include "cgroup.h"
55 #include "cgroup_utils.h"
56 #include "commands.h"
57 #include "conf.h"
58 #include "config.h"
59 #include "log.h"
60 #include "macro.h"
61 #include "memory_utils.h"
62 #include "storage/storage.h"
63 #include "utils.h"
64
65 #ifndef HAVE_STRLCPY
66 #include "include/strlcpy.h"
67 #endif
68
69 #ifndef HAVE_STRLCAT
70 #include "include/strlcat.h"
71 #endif
72
73 lxc_log_define(cgfsng, cgroup);
74
75 static void free_string_list(char **clist)
76 {
77 int i;
78
79 if (!clist)
80 return;
81
82 for (i = 0; clist[i]; i++)
83 free(clist[i]);
84
85 free(clist);
86 }
87
88 /* Given a pointer to a null-terminated array of pointers, realloc to add one
89 * entry, and set the new terminating entry to NULL. Do not fail. Return the
90 * index of the second-to-last entry - that is, the one which is now available
91 * (keeping the list null-terminated).
92 */
93 static int append_null_to_list(void ***list)
94 {
95 int newentry = 0;
96
97 if (*list)
98 for (; (*list)[newentry]; newentry++)
99 ;
100
101 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
102 (*list)[newentry + 1] = NULL;
103 return newentry;
104 }
105
106 /* Given a null-terminated array of strings, check whether @entry is one of the
107 * strings.
108 */
109 static bool string_in_list(char **list, const char *entry)
110 {
111 int i;
112
113 if (!list)
114 return false;
115
116 for (i = 0; list[i]; i++)
117 if (strcmp(list[i], entry) == 0)
118 return true;
119
120 return false;
121 }
122
123 /* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
124 * "name=systemd". Do not fail.
125 */
126 static char *cg_legacy_must_prefix_named(char *entry)
127 {
128 size_t len;
129 char *prefixed;
130
131 len = strlen(entry);
132 prefixed = must_realloc(NULL, len + 6);
133
134 memcpy(prefixed, "name=", STRLITERALLEN("name="));
135 memcpy(prefixed + STRLITERALLEN("name="), entry, len);
136 prefixed[len + 5] = '\0';
137
138 return prefixed;
139 }
140
141 /* Append an entry to the clist. Do not fail. @clist must be NULL the first time
142 * we are called.
143 *
144 * We also handle named subsystems here. Any controller which is not a kernel
145 * subsystem is prefixed with "name=". Any controller which is both a kernel and
146 * a named subsystem is refused, because we cannot tell which of the two we have.
147 * (TODO: We could work around this in some cases by just remounting to be
148 * unambiguous, or by comparing mountpoint contents with current cgroup.)
149 *
150 * The last entry will always be NULL.
151 */
152 static void must_append_controller(char **klist, char **nlist, char ***clist,
153 char *entry)
154 {
155 int newentry;
156 char *copy;
157
158 if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
159 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
160 ERROR("It is both a named and kernel subsystem");
161 return;
162 }
163
164 newentry = append_null_to_list((void ***)clist);
165
166 if (strncmp(entry, "name=", 5) == 0)
167 copy = must_copy_string(entry);
168 else if (string_in_list(klist, entry))
169 copy = must_copy_string(entry);
170 else
171 copy = cg_legacy_must_prefix_named(entry);
172
173 (*clist)[newentry] = copy;
174 }
175
176 /* Given the cgroup operations @ops, return the struct hierarchy for the
177 * controller @controller, or NULL if there is none.
178 */
179 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
180 {
181 int i;
182
183 errno = ENOENT;
184
185 if (!ops->hierarchies) {
186 TRACE("There are no useable cgroup controllers");
187 return NULL;
188 }
189
190 for (i = 0; ops->hierarchies[i]; i++) {
191 if (!controller) {
192 /* This is the empty unified hierarchy. */
193 if (ops->hierarchies[i]->controllers &&
194 !ops->hierarchies[i]->controllers[0])
195 return ops->hierarchies[i];
196
197 continue;
198 }
199
200 if (string_in_list(ops->hierarchies[i]->controllers, controller))
201 return ops->hierarchies[i];
202 }
203
204 if (controller)
205 WARN("There is no useable %s controller", controller);
206 else
207 WARN("There is no empty unified cgroup hierarchy");
208
209 return NULL;
210 }
211
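/* Helpers for read_file(): grow the destination buffer in BATCH_SIZE-byte
 * steps (batch_realloc) and append one line, including its terminating NUL
 * byte (append_line).
 */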
212 #define BATCH_SIZE 50
213 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
214 {
215 int newbatches = (newlen / BATCH_SIZE) + 1;
216 int oldbatches = (oldlen / BATCH_SIZE) + 1;
217
218 if (!*mem || newbatches > oldbatches) {
219 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
220 }
221 }
222
223 static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
224 {
225 size_t full = oldlen + newlen;
226
227 batch_realloc(dest, oldlen, full + 1);
228
229 memcpy(*dest + oldlen, new, newlen + 1);
230 }
231
232 /* Slurp in a whole file */
233 static char *read_file(const char *fnam)
234 {
235 __do_free char *line = NULL;
236 __do_fclose FILE *f = NULL;
237 int linelen;
238 char *buf = NULL;
239 size_t len = 0, fulllen = 0;
240
241 f = fopen(fnam, "r");
242 if (!f)
243 return NULL;
244 while ((linelen = getline(&line, &len, f)) != -1) {
245 append_line(&buf, fulllen, line, linelen);
246 fulllen += linelen;
247 }
248 return buf;
249 }
250
251 /* Taken over and modified from the kernel sources. */
252 #define NBITS 32 /* bits in uint32_t */
253 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
254 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
255
256 static void set_bit(unsigned bit, uint32_t *bitarr)
257 {
258 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
259 }
260
261 static void clear_bit(unsigned bit, uint32_t *bitarr)
262 {
263 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
264 }
265
266 static bool is_set(unsigned bit, uint32_t *bitarr)
267 {
268 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
269 }
270
271 /* Create a cpumask from a cpulist, i.e. turn:
272 *
273 * 0,2-3
274 *
275 * into the bit array
276 *
277 * 1 0 1 1 (bits 0, 2 and 3 set)
278 */
279 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
280 {
281 char *token;
282 size_t arrlen;
283 uint32_t *bitarr;
284
285 arrlen = BITS_TO_LONGS(nbits);
286 bitarr = calloc(arrlen, sizeof(uint32_t));
287 if (!bitarr)
288 return NULL;
289
290 lxc_iterate_parts(token, buf, ",") {
291 errno = 0;
292 unsigned end, start;
293 char *range;
294
295 start = strtoul(token, NULL, 0);
296 end = start;
297 range = strchr(token, '-');
298 if (range)
299 end = strtoul(range + 1, NULL, 0);
300
301 if (!(start <= end)) {
302 free(bitarr);
303 return NULL;
304 }
305
306 if (end >= nbits) {
307 free(bitarr);
308 return NULL;
309 }
310
311 while (start <= end)
312 set_bit(start++, bitarr);
313 }
314
315 return bitarr;
316 }
317
318 /* Turn cpumask into simple, comma-separated cpulist. */
319 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
320 {
321 int ret;
322 size_t i;
323 char **cpulist = NULL;
324 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
325
326 for (i = 0; i <= nbits; i++) {
327 if (!is_set(i, bitarr))
328 continue;
329
330 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
331 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
332 lxc_free_array((void **)cpulist, free);
333 return NULL;
334 }
335
336 ret = lxc_append_string(&cpulist, numstr);
337 if (ret < 0) {
338 lxc_free_array((void **)cpulist, free);
339 return NULL;
340 }
341 }
342
343 if (!cpulist)
344 return NULL;
345
346 return lxc_string_join(",", (const char **)cpulist, false);
347 }
348
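/* Return the highest cpu number referenced in an ordered cpulist such as
 * "0-3,8,10-11" (here: 11) by looking only at the last element or range.
 * Return -1 on parse errors.
 */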
349 static ssize_t get_max_cpus(char *cpulist)
350 {
351 char *c1, *c2;
352 char *maxcpus = cpulist;
353 size_t cpus = 0;
354
355 c1 = strrchr(maxcpus, ',');
356 if (c1)
357 c1++;
358
359 c2 = strrchr(maxcpus, '-');
360 if (c2)
361 c2++;
362
363 if (!c1 && !c2)
364 c1 = maxcpus;
365 else if (c1 > c2)
366 c2 = c1;
367 else if (c1 < c2)
368 c1 = c2;
369 else if (!c1 && c2)
370 c1 = c2;
371
372 errno = 0;
373 cpus = strtoul(c1, NULL, 0);
374 if (errno != 0)
375 return -1;
376
377 return cpus;
378 }
379
380 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
381 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
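/* Take the parent cgroup's cpuset.cpus, filter out any isolated and offline
 * cpus, and - unless the cpuset was already initialized for us
 * (@am_initialized) - write the resulting cpu list to @path/cpuset.cpus.
 */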
382 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
383 {
384 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
385 *offlinecpus = NULL, *posscpus = NULL;
386 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
387 *possmask = NULL;
388 int ret;
389 ssize_t i;
390 char oldv;
391 char *lastslash;
392 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
393 bool bret = false, flipped_bit = false;
394
395 lastslash = strrchr(path, '/');
396 if (!lastslash) {
397 ERROR("Failed to detect \"/\" in \"%s\"", path);
398 return bret;
399 }
400 oldv = *lastslash;
401 *lastslash = '\0';
402 fpath = must_make_path(path, "cpuset.cpus", NULL);
403 posscpus = read_file(fpath);
404 if (!posscpus) {
405 SYSERROR("Failed to read file \"%s\"", fpath);
406 return false;
407 }
408
409 /* Get maximum number of cpus found in possible cpuset. */
410 maxposs = get_max_cpus(posscpus);
411 if (maxposs < 0 || maxposs >= INT_MAX - 1)
412 return false;
413
414 if (file_exists(__ISOL_CPUS)) {
415 isolcpus = read_file(__ISOL_CPUS);
416 if (!isolcpus) {
417 SYSERROR("Failed to read file \"%s\"", __ISOL_CPUS);
418 return false;
419 }
420
421 if (isdigit(isolcpus[0])) {
422 /* Get maximum number of cpus found in isolated cpuset. */
423 maxisol = get_max_cpus(isolcpus);
424 if (maxisol < 0 || maxisol >= INT_MAX - 1)
425 return false;
426 }
427
428 if (maxposs < maxisol)
429 maxposs = maxisol;
430 maxposs++;
431 } else {
432 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
433 }
434
435 if (file_exists(__OFFLINE_CPUS)) {
436 offlinecpus = read_file(__OFFLINE_CPUS);
437 if (!offlinecpus) {
438 SYSERROR("Failed to read file \"%s\"", __OFFLINE_CPUS);
439 return false;
440 }
441
442 if (isdigit(offlinecpus[0])) {
443 /* Get maximum number of cpus found in offline cpuset. */
444 maxoffline = get_max_cpus(offlinecpus);
445 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
446 return false;
447 }
448
449 if (maxposs < maxoffline)
450 maxposs = maxoffline;
451 maxposs++;
452 } else {
453 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
454 }
455
456 if ((maxisol == 0) && (maxoffline == 0)) {
457 cpulist = move_ptr(posscpus);
458 goto copy_parent;
459 }
460
461 possmask = lxc_cpumask(posscpus, maxposs);
462 if (!possmask) {
463 ERROR("Failed to create cpumask for possible cpus");
464 return false;
465 }
466
467 if (maxisol > 0) {
468 isolmask = lxc_cpumask(isolcpus, maxposs);
469 if (!isolmask) {
470 ERROR("Failed to create cpumask for isolated cpus");
471 return false;
472 }
473 }
474
475 if (maxoffline > 0) {
476 offlinemask = lxc_cpumask(offlinecpus, maxposs);
477 if (!offlinemask) {
478 ERROR("Failed to create cpumask for offline cpus");
479 return false;
480 }
481 }
482
483 for (i = 0; i <= maxposs; i++) {
484 if ((isolmask && !is_set(i, isolmask)) ||
485 (offlinemask && !is_set(i, offlinemask)) ||
486 !is_set(i, possmask))
487 continue;
488
489 flipped_bit = true;
490 clear_bit(i, possmask);
491 }
492
493 if (!flipped_bit) {
494 DEBUG("No isolated or offline cpus present in cpuset");
495 return true;
496 }
497 DEBUG("Removed isolated or offline cpus from cpuset");
498
499 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
500 if (!cpulist) {
501 ERROR("Failed to create cpu list");
502 return false;
503 }
504
505 copy_parent:
506 if (!am_initialized) {
507 *lastslash = oldv;
508 fpath = must_make_path(path, "cpuset.cpus", NULL);
509 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false,
510 0666);
511 if (ret < 0) {
512 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
513 return false;
514 }
515
516 TRACE("Copied cpu settings of parent cgroup");
517 }
518
519 return true;
520 }
521
522 /* Copy contents of parent(@path)/@file to @path/@file */
523 static bool copy_parent_file(char *path, char *file)
524 {
525 __do_free char *child_path = NULL, *parent_path = NULL, *value = NULL;
526 int ret;
527 char oldv;
528 int len = 0;
529 char *lastslash = NULL;
530
531 lastslash = strrchr(path, '/');
532 if (!lastslash) {
533 ERROR("Failed to detect \"/\" in \"%s\"", path);
534 return false;
535 }
536 oldv = *lastslash;
537 *lastslash = '\0';
538 parent_path = must_make_path(path, file, NULL);
539 len = lxc_read_from_file(parent_path, NULL, 0);
540 if (len <= 0) {
541 SYSERROR("Failed to determine buffer size");
542 return false;
543 }
544
545 value = must_realloc(NULL, len + 1);
546 ret = lxc_read_from_file(parent_path, value, len);
547 if (ret != len) {
548 SYSERROR("Failed to read from parent file \"%s\"", parent_path);
549 return false;
550 }
551
552 *lastslash = oldv;
553 child_path = must_make_path(path, file, NULL);
554 ret = lxc_write_to_file(child_path, value, len, false, 0666);
555 if (ret < 0)
556 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, child_path);
557 return ret >= 0;
558 }
559
560 /* Initialize the cpuset hierarchy in the first directory of @cgname and set
561 * cgroup.clone_children so that children inherit settings. Since
562 * h->container_base_path is populated by init or ourselves, we know it is
563 * already initialized.
564 */
565 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
566 {
567 __do_free char *cgpath = NULL, *clonechildrenpath = NULL;
568 int ret;
569 char v;
570 char *slash;
571
572 if (!string_in_list(h->controllers, "cpuset"))
573 return true;
574
575 if (*cgname == '/')
576 cgname++;
577 slash = strchr(cgname, '/');
578 if (slash)
579 *slash = '\0';
580
581 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
582 if (slash)
583 *slash = '/';
584
585 ret = mkdir(cgpath, 0755);
586 if (ret < 0) {
587 if (errno != EEXIST) {
588 SYSERROR("Failed to create directory \"%s\"", cgpath);
589 return false;
590 }
591 }
592
593 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
594 /* unified hierarchy doesn't have clone_children */
595 if (!file_exists(clonechildrenpath))
596 return true;
597
598 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
599 if (ret < 0) {
600 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
601 return false;
602 }
603
604 /* Make sure any isolated cpus are removed from cpuset.cpus. */
605 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
606 SYSERROR("Failed to remove isolated cpus");
607 return false;
608 }
609
610 /* Already set for us by someone else. */
611 if (v == '1') {
612 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
613 return true;
614 }
615
616 /* copy parent's settings */
617 if (!copy_parent_file(cgpath, "cpuset.mems")) {
618 SYSERROR("Failed to copy \"cpuset.mems\" settings");
619 return false;
620 }
621
622 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
623 if (ret < 0) {
624 /* Set clone_children so children inherit our settings */
625 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
626 return false;
627 }
628
629 return true;
630 }
631
632 /* Given two null-terminated lists of strings, return true if any string is in
633 * both.
634 */
635 static bool controller_lists_intersect(char **l1, char **l2)
636 {
637 int i;
638
639 if (!l1 || !l2)
640 return false;
641
642 for (i = 0; l1[i]; i++) {
643 if (string_in_list(l2, l1[i]))
644 return true;
645 }
646
647 return false;
648 }
649
650 /* For a null-terminated list of controllers @clist, return true if any of those
651 * controllers is already listed in the null-terminated list of hierarchies @hlist.
652 * Realistically, if one is present, all must be present.
653 */
654 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
655 {
656 int i;
657
658 if (!hlist)
659 return false;
660
661 for (i = 0; hlist[i]; i++)
662 if (controller_lists_intersect(hlist[i]->controllers, clist))
663 return true;
664
665 return false;
666 }
667
668 /* Return true if the controller @entry is found in the null-terminated list of
669 * hierarchies @hlist.
670 */
671 static bool controller_found(struct hierarchy **hlist, char *entry)
672 {
673 int i;
674
675 if (!hlist)
676 return false;
677
678 for (i = 0; hlist[i]; i++)
679 if (string_in_list(hlist[i]->controllers, entry))
680 return true;
681
682 return false;
683 }
684
685 /* Return true if all of the controllers which we require have been found. The
686 * required list is anything listed in lxc.cgroup.use.
687 */
688 static bool all_controllers_found(struct cgroup_ops *ops)
689 {
690 char **cur;
691 struct hierarchy **hlist = ops->hierarchies;
692
693 if (!ops->cgroup_use)
694 return true;
695
696 for (cur = ops->cgroup_use; cur && *cur; cur++)
697 if (!controller_found(hlist, *cur)) {
698 ERROR("No %s controller mountpoint found", *cur);
699 return false;
700 }
701
702 return true;
703 }
704
705 /* Get the controllers from a mountinfo line. There are other ways we could get
706 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
707 * could parse the mount options. But we simply assume that the mountpoint must
708 * be /sys/fs/cgroup/controller-list.
709 */
710 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
711 int type)
712 {
713 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
714 * for legacy hierarchies.
715 */
716 int i;
717 char *p2, *tok;
718 char *p = line, *sep = ",";
719 char **aret = NULL;
720
721 for (i = 0; i < 4; i++) {
722 p = strchr(p, ' ');
723 if (!p)
724 return NULL;
725 p++;
726 }
727
728 /* Note, if we change how mountinfo works, then our caller will need to
729 * verify /sys/fs/cgroup/ in this field.
730 */
731 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
732 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
733 return NULL;
734 }
735
736 p += 15;
737 p2 = strchr(p, ' ');
738 if (!p2) {
739 ERROR("Corrupt mountinfo");
740 return NULL;
741 }
742 *p2 = '\0';
743
744 if (type == CGROUP_SUPER_MAGIC) {
745 __do_free char *dup = NULL;
746
747 /* strdup() here for v1 hierarchies. Otherwise
748 * lxc_iterate_parts() will destroy mountpoints such as
749 * "/sys/fs/cgroup/cpu,cpuacct".
750 */
751 dup = must_copy_string(p);
752 if (!dup)
753 return NULL;
754
755 lxc_iterate_parts (tok, dup, sep)
756 must_append_controller(klist, nlist, &aret, tok);
757 }
758 *p2 = ' ';
759
760 return aret;
761 }
762
763 static char **cg_unified_make_empty_controller(void)
764 {
765 int newentry;
766 char **aret = NULL;
767
768 newentry = append_null_to_list((void ***)&aret);
769 aret[newentry] = NULL;
770 return aret;
771 }
772
773 static char **cg_unified_get_controllers(const char *file)
774 {
775 __do_free char *buf = NULL;
776 char *tok;
777 char *sep = " \t\n";
778 char **aret = NULL;
779
780 buf = read_file(file);
781 if (!buf)
782 return NULL;
783
784 lxc_iterate_parts(tok, buf, sep) {
785 int newentry;
786 char *copy;
787
788 newentry = append_null_to_list((void ***)&aret);
789 copy = must_copy_string(tok);
790 aret[newentry] = copy;
791 }
792
793 return aret;
794 }
795
796 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
797 char *container_base_path, int type)
798 {
799 struct hierarchy *new;
800 int newentry;
801
802 new = must_realloc(NULL, sizeof(*new));
803 new->controllers = clist;
804 new->mountpoint = mountpoint;
805 new->container_base_path = container_base_path;
806 new->container_full_path = NULL;
807 new->monitor_full_path = NULL;
808 new->version = type;
809 new->cgroup2_chown = NULL;
810
811 newentry = append_null_to_list((void ***)h);
812 (*h)[newentry] = new;
813 return new;
814 }
815
816 /* Get a copy of the mountpoint from @line, which is a line from
817 * /proc/self/mountinfo.
818 */
819 static char *cg_hybrid_get_mountpoint(char *line)
820 {
821 int i;
822 size_t len;
823 char *p2;
824 char *p = line, *sret = NULL;
825
826 for (i = 0; i < 4; i++) {
827 p = strchr(p, ' ');
828 if (!p)
829 return NULL;
830 p++;
831 }
832
833 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
834 return NULL;
835
836 p2 = strchr(p + 15, ' ');
837 if (!p2)
838 return NULL;
839 *p2 = '\0';
840
841 len = strlen(p);
842 sret = must_realloc(NULL, len + 1);
843 memcpy(sret, p, len);
844 sret[len] = '\0';
845 return sret;
846 }
847
848 /* Given a multi-line string, return a null-terminated copy of the current line. */
849 static char *copy_to_eol(char *p)
850 {
851 char *p2 = strchr(p, '\n'), *sret;
852 size_t len;
853
854 if (!p2)
855 return NULL;
856
857 len = p2 - p;
858 sret = must_realloc(NULL, len + 1);
859 memcpy(sret, p, len);
860 sret[len] = '\0';
861 return sret;
862 }
863
864 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
865 * /proc/self/cgroup file. Check whether controller c is present.
866 */
867 static bool controller_in_clist(char *cgline, char *c)
868 {
869 __do_free char *tmp = NULL;
870 char *tok, *eol;
871 size_t len;
872
873 eol = strchr(cgline, ':');
874 if (!eol)
875 return false;
876
877 len = eol - cgline;
878 tmp = must_realloc(NULL, len + 1);
879 memcpy(tmp, cgline, len);
880 tmp[len] = '\0';
881
882 lxc_iterate_parts(tok, tmp, ",")
883 if (strcmp(tok, c) == 0)
884 return true;
885
886 return false;
887 }
888
889 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
890 * @controller.
891 */
892 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
893 int type)
894 {
895 char *p = basecginfo;
896
897 for (;;) {
898 bool is_cgv2_base_cgroup = false;
899
900 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
901 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
902 is_cgv2_base_cgroup = true;
903
904 p = strchr(p, ':');
905 if (!p)
906 return NULL;
907 p++;
908
909 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
910 p = strchr(p, ':');
911 if (!p)
912 return NULL;
913 p++;
914 return copy_to_eol(p);
915 }
916
917 p = strchr(p, '\n');
918 if (!p)
919 return NULL;
920 p++;
921 }
922 }
923
924 static void must_append_string(char ***list, char *entry)
925 {
926 int newentry;
927 char *copy;
928
929 newentry = append_null_to_list((void ***)list);
930 copy = must_copy_string(entry);
931 (*list)[newentry] = copy;
932 }
933
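/* Parse /proc/self/cgroup and record the subsystems in use, split into kernel
 * subsystems (@klist) and named subsystems (@nlist). A cgroup v2 entry is
 * recorded in @klist under the name "cgroup2".
 */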
934 static int get_existing_subsystems(char ***klist, char ***nlist)
935 {
936 __do_free char *line = NULL;
937 __do_fclose FILE *f = NULL;
938 size_t len = 0;
939
940 f = fopen("/proc/self/cgroup", "r");
941 if (!f)
942 return -1;
943
944 while (getline(&line, &len, f) != -1) {
945 char *p, *p2, *tok;
946 p = strchr(line, ':');
947 if (!p)
948 continue;
949 p++;
950 p2 = strchr(p, ':');
951 if (!p2)
952 continue;
953 *p2 = '\0';
954
955 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
956 * contains an entry of the form:
957 *
958 * 0::/some/path
959 *
960 * In this case we use "cgroup2" as controller name.
961 */
962 if ((p2 - p) == 0) {
963 must_append_string(klist, "cgroup2");
964 continue;
965 }
966
967 lxc_iterate_parts(tok, p, ",") {
968 if (strncmp(tok, "name=", 5) == 0)
969 must_append_string(nlist, tok);
970 else
971 must_append_string(klist, tok);
972 }
973 }
974
975 return 0;
976 }
977
978 static void trim(char *s)
979 {
980 size_t len;
981
982 len = strlen(s);
983 while ((len > 1) && (s[len - 1] == '\n'))
984 s[--len] = '\0';
985 }
986
987 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
988 {
989 int i;
990 struct hierarchy **it;
991
992 if (!ops->hierarchies) {
993 TRACE(" No hierarchies found");
994 return;
995 }
996
997 TRACE(" Hierarchies:");
998 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
999 int j;
1000 char **cit;
1001
1002 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
1003 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1004 TRACE(" controllers:");
1005 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1006 TRACE(" %d: %s", j, *cit);
1007 }
1008 }
1009
1010 static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
1011 char **nlist)
1012 {
1013 int k;
1014 char **it;
1015
1016 TRACE("basecginfo is:");
1017 TRACE("%s", basecginfo);
1018
1019 for (k = 0, it = klist; it && *it; it++, k++)
1020 TRACE("kernel subsystem %d: %s", k, *it);
1021
1022 for (k = 0, it = nlist; it && *it; it++, k++)
1023 TRACE("named subsystem %d: %s", k, *it);
1024 }
1025
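/* Recursively destroy the container's cgroup in every hierarchy in which it
 * was created. Failures are logged as warnings but do not abort the loop.
 */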
1026 static int cgroup_rmdir(struct hierarchy **hierarchies,
1027 const char *container_cgroup)
1028 {
1029 int i;
1030
1031 if (!container_cgroup || !hierarchies)
1032 return 0;
1033
1034 for (i = 0; hierarchies[i]; i++) {
1035 int ret;
1036 struct hierarchy *h = hierarchies[i];
1037
1038 if (!h->container_full_path)
1039 continue;
1040
1041 ret = recursive_destroy(h->container_full_path);
1042 if (ret < 0)
1043 WARN("Failed to destroy \"%s\"", h->container_full_path);
1044
1045 free(h->container_full_path);
1046 h->container_full_path = NULL;
1047 }
1048
1049 return 0;
1050 }
1051
1052 struct generic_userns_exec_data {
1053 struct hierarchy **hierarchies;
1054 const char *container_cgroup;
1055 struct lxc_conf *conf;
1056 uid_t origuid; /* target uid in parent namespace */
1057 char *path;
1058 };
1059
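/* Helper run via userns_exec_1() so that cgroup_rmdir() is executed with the
 * uid/gid of the container's user namespace.
 */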
1060 static int cgroup_rmdir_wrapper(void *data)
1061 {
1062 int ret;
1063 struct generic_userns_exec_data *arg = data;
1064 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1065 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1066
1067 ret = setresgid(nsgid, nsgid, nsgid);
1068 if (ret < 0) {
1069 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1070 (int)nsgid, (int)nsgid);
1071 return -1;
1072 }
1073
1074 ret = setresuid(nsuid, nsuid, nsuid);
1075 if (ret < 0) {
1076 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1077 (int)nsuid, (int)nsuid);
1078 return -1;
1079 }
1080
1081 ret = setgroups(0, NULL);
1082 if (ret < 0 && errno != EPERM) {
1083 SYSERROR("Failed to setgroups(0, NULL)");
1084 return -1;
1085 }
1086
1087 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1088 }
1089
1090 __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
1091 struct lxc_handler *handler)
1092 {
1093 int ret;
1094 struct generic_userns_exec_data wrap;
1095
1096 if (!ops->hierarchies)
1097 return;
1098
1099 wrap.origuid = 0;
1100 wrap.container_cgroup = ops->container_cgroup;
1101 wrap.hierarchies = ops->hierarchies;
1102 wrap.conf = handler->conf;
1103
1104 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1105 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1106 "cgroup_rmdir_wrapper");
1107 else
1108 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1109 if (ret < 0) {
1110 WARN("Failed to destroy cgroups");
1111 return;
1112 }
1113 }
1114
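/* Move the monitor process into a pivot cgroup and then recursively destroy
 * the cgroup it was originally placed in, for every hierarchy.
 */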
1115 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1116 struct lxc_handler *handler)
1117 {
1118 int len;
1119 struct lxc_conf *conf = handler->conf;
1120 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1121
1122 if (!ops->hierarchies)
1123 return;
1124
1125 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1126 if (len < 0 || (size_t)len >= sizeof(pidstr))
1127 return;
1128
1129 for (int i = 0; ops->hierarchies[i]; i++) {
1130 __do_free char *pivot_path = NULL;
1131 int ret;
1132 char *chop;
1133 char pivot_cgroup[] = PIVOT_CGROUP;
1134 struct hierarchy *h = ops->hierarchies[i];
1135
1136 if (!h->monitor_full_path)
1137 continue;
1138
1139 if (conf && conf->cgroup_meta.dir)
1140 pivot_path = must_make_path(h->mountpoint,
1141 h->container_base_path,
1142 conf->cgroup_meta.dir,
1143 PIVOT_CGROUP,
1144 "cgroup.procs", NULL);
1145 else
1146 pivot_path = must_make_path(h->mountpoint,
1147 h->container_base_path,
1148 PIVOT_CGROUP,
1149 "cgroup.procs", NULL);
1150
1151 chop = strrchr(pivot_path, '/');
1152 if (chop)
1153 *chop = '\0';
1154
1155 /*
1156 * Make sure not to pass in the ro string literal PIVOT_CGROUP
1157 * here.
1158 */
1159 if (!cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup)) {
1160 WARN("Failed to handle legacy cpuset controller");
1161 continue;
1162 }
1163
1164 ret = mkdir_p(pivot_path, 0755);
1165 if (ret < 0 && errno != EEXIST) {
1166 SYSWARN("Failed to create cgroup \"%s\"\n", pivot_path);
1167 continue;
1168 }
1169
1170 if (chop)
1171 *chop = '/';
1172
1173 /* Move the monitor process into the pivot cgroup so that its
1174 * original cgroup can be deleted.
1175 */
1176 ret = lxc_write_to_file(pivot_path, pidstr, len, false, 0666);
1177 if (ret != 0) {
1178 SYSWARN("Failed to move monitor %s to \"%s\"\n", pidstr, pivot_path);
1179 continue;
1180 }
1181
1182 ret = recursive_destroy(h->monitor_full_path);
1183 if (ret < 0)
1184 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
1185 }
1186 }
1187
1188 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1189 {
1190 __do_free char *add_controllers = NULL, *cgroup = NULL;
1191 size_t i, parts_len;
1192 char **it;
1193 size_t full_len = 0;
1194 char **parts = NULL;
1195 bool bret = false;
1196
1197 if (h->version != CGROUP2_SUPER_MAGIC)
1198 return true;
1199
1200 if (!h->controllers)
1201 return true;
1202
1203 /* For now we simply enable all controllers that we have detected by
1204 * creating a string like "+memory +pids +cpu +io".
1205 * TODO: In the near future we might want to support "-<controller>"
1206 * etc. but whether supporting semantics like this make sense will need
1207 * some thinking.
1208 */
1209 for (it = h->controllers; it && *it; it++) {
1210 full_len += strlen(*it) + 2;
1211 add_controllers = must_realloc(add_controllers, full_len + 1);
1212
1213 if (h->controllers[0] == *it)
1214 add_controllers[0] = '\0';
1215
1216 (void)strlcat(add_controllers, "+", full_len + 1);
1217 (void)strlcat(add_controllers, *it, full_len + 1);
1218
1219 if ((it + 1) && *(it + 1))
1220 (void)strlcat(add_controllers, " ", full_len + 1);
1221 }
1222
1223 parts = lxc_string_split(cgname, '/');
1224 if (!parts)
1225 goto on_error;
1226
1227 parts_len = lxc_array_len((void **)parts);
1228 if (parts_len > 0)
1229 parts_len--;
1230
1231 cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
1232 for (i = 0; i < parts_len; i++) {
1233 int ret;
1234 __do_free char *target = NULL;
1235
1236 cgroup = must_append_path(cgroup, parts[i], NULL);
1237 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1238 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1239 if (ret < 0) {
1240 SYSERROR("Could not enable \"%s\" controllers in the "
1241 "unified cgroup \"%s\"", add_controllers, cgroup);
1242 goto on_error;
1243 }
1244 }
1245
1246 bret = true;
1247
1248 on_error:
1249 lxc_free_array((void **)parts, free);
1250 return bret;
1251 }
1252
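/* Create @dir, creating missing parent directories along the way. An already
 * existing intermediate directory is fine; EEXIST is only treated as an error
 * for the final path component.
 */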
1253 static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1254 {
1255 const char *tmp = dir;
1256 const char *orig = dir;
1257 size_t orig_len;
1258
1259 orig_len = strlen(dir);
1260 do {
1261 __do_free char *makeme;
1262 int ret;
1263 size_t cur_len;
1264
1265 dir = tmp + strspn(tmp, "/");
1266 tmp = dir + strcspn(dir, "/");
1267
1268 errno = ENOMEM;
1269 cur_len = dir - orig;
1270 makeme = strndup(orig, cur_len);
1271 if (!makeme)
1272 return -1;
1273
1274 ret = mkdir(makeme, mode);
1275 if (ret < 0) {
1276 if ((errno != EEXIST) || (orig_len == cur_len)) {
1277 SYSERROR("Failed to create directory \"%s\"", makeme);
1278 return -1;
1279 }
1280 }
1281 } while (tmp != dir);
1282
1283 return 0;
1284 }
1285
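/* Create the monitor's cgroup in hierarchy @h: handle the legacy cpuset
 * quirks first, create the directory, and on cgroup2 enable the detected
 * controllers for the subtree via cgroup.subtree_control.
 * container_create_path_for_hierarchy() below does the same for the
 * container's cgroup.
 */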
1286 static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1287 {
1288 int ret;
1289
1290 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1291 ERROR("Failed to handle legacy cpuset controller");
1292 return false;
1293 }
1294
1295 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1296 ret = mkdir_eexist_on_last(h->monitor_full_path, 0755);
1297 if (ret < 0) {
1298 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
1299 return false;
1300 }
1301
1302 return cg_unified_create_cgroup(h, cgname);
1303 }
1304
1305 static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1306 {
1307 int ret;
1308
1309 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1310 ERROR("Failed to handle legacy cpuset controller");
1311 return false;
1312 }
1313
1314 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1315 ret = mkdir_eexist_on_last(h->container_full_path, 0755);
1316 if (ret < 0) {
1317 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
1318 return false;
1319 }
1320
1321 return cg_unified_create_cgroup(h, cgname);
1322 }
1323
1324 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
1325 {
1326 int ret;
1327 char *full_path;
1328
1329 if (monitor)
1330 full_path = h->monitor_full_path;
1331 else
1332 full_path = h->container_full_path;
1333
1334 ret = rmdir(full_path);
1335 if (ret < 0)
1336 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1337
1338 free(full_path);
1339
1340 if (monitor)
1341 h->monitor_full_path = NULL;
1342 else
1343 h->container_full_path = NULL;
1344 }
1345
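/* Create the cgroup the monitor process will run in, in every hierarchy. On a
 * name clash a "-NNN" suffix is appended and the creation is retried, up to
 * -999.
 */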
1346 __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
1347 struct lxc_handler *handler)
1348 {
1349 __do_free char *monitor_cgroup = NULL;
1350 char *offset, *tmp;
1351 int i, idx = 0;
1352 size_t len;
1353 struct lxc_conf *conf = handler->conf;
1354
1355 if (!conf)
1356 return false;
1357
1358 if (!ops->hierarchies)
1359 return true;
1360
1361 if (conf->cgroup_meta.dir)
1362 tmp = lxc_string_join("/",
1363 (const char *[]){conf->cgroup_meta.dir,
1364 ops->monitor_pattern,
1365 handler->name, NULL},
1366 false);
1367 else
1368 tmp = must_make_path(ops->monitor_pattern, handler->name, NULL);
1369 if (!tmp)
1370 return false;
1371
1372 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1373 monitor_cgroup = must_realloc(tmp, len);
1374 offset = monitor_cgroup + len - 5;
1375 *offset = 0;
1376
1377 do {
1378 if (idx) {
1379 int ret = snprintf(offset, 5, "-%d", idx);
1380 if (ret < 0 || (size_t)ret >= 5)
1381 return false;
1382 }
1383
1384 for (i = 0; ops->hierarchies[i]; i++) {
1385 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i],
1386 monitor_cgroup)) {
1387 ERROR("Failed to create cgroup \"%s\"",
1388 ops->hierarchies[i]->monitor_full_path);
1389 for (int j = 0; j < i; j++)
1390 remove_path_for_hierarchy(ops->hierarchies[j],
1391 monitor_cgroup,
1392 true);
1393
1394 idx++;
1395 break;
1396 }
1397 }
1398 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
1399
1400 if (idx == 1000)
1401 return false;
1402
1403 INFO("The monitor process uses \"%s\" as cgroup", monitor_cgroup);
1404 return true;
1405 }
1406
1407 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1408 * next cgroup_pattern-1, -2, ..., -999.
1409 */
1410 __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
1411 struct lxc_handler *handler)
1412 {
1413 __do_free char *container_cgroup = NULL, *tmp = NULL;
1414 int i;
1415 size_t len;
1416 char *offset;
1417 int idx = 0;
1418 struct lxc_conf *conf = handler->conf;
1419
1420 if (ops->container_cgroup)
1421 return false;
1422
1423 if (!conf)
1424 return false;
1425
1426 if (!ops->hierarchies)
1427 return true;
1428
1429 if (conf->cgroup_meta.dir)
1430 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1431 else
1432 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1433 if (!tmp) {
1434 ERROR("Failed expanding cgroup name pattern");
1435 return false;
1436 }
1437
1438 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1439 container_cgroup = must_realloc(NULL, len);
1440 (void)strlcpy(container_cgroup, tmp, len);
1441 offset = container_cgroup + len - 5;
1442
1443 do {
1444 if (idx) {
1445 int ret = snprintf(offset, 5, "-%d", idx);
1446 if (ret < 0 || (size_t)ret >= 5)
1447 return false;
1448 }
1449
1450 for (i = 0; ops->hierarchies[i]; i++) {
1451 if (!container_create_path_for_hierarchy(ops->hierarchies[i],
1452 container_cgroup)) {
1453 ERROR("Failed to create cgroup \"%s\"",
1454 ops->hierarchies[i]->container_full_path);
1455 for (int j = 0; j < i; j++)
1456 remove_path_for_hierarchy(ops->hierarchies[j],
1457 container_cgroup,
1458 false);
1459 idx++;
1460 break;
1461 }
1462 }
1463 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
1464
1465 if (idx == 1000)
1466 return false;
1467
1468 INFO("The container process uses \"%s\" as cgroup", container_cgroup);
1469 ops->container_cgroup = move_ptr(container_cgroup);
1470 return true;
1471 }
1472
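/* Write @pid into the cgroup.procs file of either the monitor cgroup or the
 * container cgroup in every hierarchy, depending on @monitor.
 */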
1473 __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
1474 bool monitor)
1475 {
1476 int len;
1477 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1478
1479 if (!ops->hierarchies)
1480 return true;
1481
1482 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
1483 if (len < 0 || (size_t)len >= sizeof(pidstr))
1484 return false;
1485
1486 for (int i = 0; ops->hierarchies[i]; i++) {
1487 int ret;
1488 __do_free char *path = NULL;
1489
1490 if (monitor)
1491 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1492 "cgroup.procs", NULL);
1493 else
1494 path = must_make_path(ops->hierarchies[i]->container_full_path,
1495 "cgroup.procs", NULL);
1496 ret = lxc_write_to_file(path, pidstr, len, false, 0666);
1497 if (ret != 0) {
1498 SYSERROR("Failed to enter cgroup \"%s\"", path);
1499 return false;
1500 }
1501 }
1502
1503 return true;
1504 }
1505
1506 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
1507 {
1508 return __do_cgroup_enter(ops, pid, true);
1509 }
1510
1511 static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
1512 {
1513 return __do_cgroup_enter(ops, pid, false);
1514 }
1515
1516 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
1517 mode_t chmod_mode)
1518 {
1519 int ret;
1520
1521 ret = chown(path, chown_uid, chown_gid);
1522 if (ret < 0) {
1523 SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
1524 return -1;
1525 }
1526
1527 ret = chmod(path, chmod_mode);
1528 if (ret < 0) {
1529 SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
1530 return -1;
1531 }
1532
1533 return 0;
1534 }
1535
1536 /* chgrp the container cgroups to the container's group. We leave
1537 * the container owner as the cgroup owner. So we must make the
1538 * directories 775 so that the container can create sub-cgroups.
1539 *
1540 * Also chown the tasks and cgroup.procs files. Those may not
1541 * exist depending on kernel version.
1542 */
1543 static int chown_cgroup_wrapper(void *data)
1544 {
1545 int i, ret;
1546 uid_t destuid;
1547 struct generic_userns_exec_data *arg = data;
1548 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1549 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1550
1551 ret = setresgid(nsgid, nsgid, nsgid);
1552 if (ret < 0) {
1553 SYSERROR("Failed to setresgid(%d, %d, %d)",
1554 (int)nsgid, (int)nsgid, (int)nsgid);
1555 return -1;
1556 }
1557
1558 ret = setresuid(nsuid, nsuid, nsuid);
1559 if (ret < 0) {
1560 SYSERROR("Failed to setresuid(%d, %d, %d)",
1561 (int)nsuid, (int)nsuid, (int)nsuid);
1562 return -1;
1563 }
1564
1565 ret = setgroups(0, NULL);
1566 if (ret < 0 && errno != EPERM) {
1567 SYSERROR("Failed to setgroups(0, NULL)");
1568 return -1;
1569 }
1570
1571 destuid = get_ns_uid(arg->origuid);
1572 if (destuid == LXC_INVALID_UID)
1573 destuid = 0;
1574
1575 for (i = 0; arg->hierarchies[i]; i++) {
1576 __do_free char *fullpath = NULL;
1577 char *path = arg->hierarchies[i]->container_full_path;
1578
1579 ret = chowmod(path, destuid, nsgid, 0775);
1580 if (ret < 0)
1581 return -1;
1582
1583 /* Failures to chown() these are inconvenient but not
1584 * detrimental. We leave these owned by the container launcher,
1585 * so that container root can write to the files to attach. We
1586 * chmod() them 664 so that container systemd can write to the
1587 * files (which systemd in wily insists on doing).
1588 */
1589
1590 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1591 fullpath = must_make_path(path, "tasks", NULL);
1592 (void)chowmod(fullpath, destuid, nsgid, 0664);
1593 }
1594
1595 fullpath = must_make_path(path, "cgroup.procs", NULL);
1596 (void)chowmod(fullpath, destuid, nsgid, 0664);
1597
1598 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1599 continue;
1600
1601 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
1602 fullpath = must_make_path(path, *p, NULL);
1603 (void)chowmod(fullpath, destuid, nsgid, 0664);
1604 }
1605 }
1606
1607 return 0;
1608 }
1609
1610 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1611 struct lxc_conf *conf)
1612 {
1613 struct generic_userns_exec_data wrap;
1614
1615 if (lxc_list_empty(&conf->id_map))
1616 return true;
1617
1618 if (!ops->hierarchies)
1619 return true;
1620
1621 wrap.origuid = geteuid();
1622 wrap.path = NULL;
1623 wrap.hierarchies = ops->hierarchies;
1624 wrap.conf = conf;
1625
1626 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1627 "chown_cgroup_wrapper") < 0) {
1628 ERROR("Error requesting cgroup chown in new user namespace");
1629 return false;
1630 }
1631
1632 return true;
1633 }
1634
1635 /* cgroup-full:* is done, no need to create subdirs */
1636 static bool cg_mount_needs_subdirs(int type)
1637 {
1638 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1639 return false;
1640
1641 return true;
1642 }
1643
1644 /* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
1645 * remount the controller read-only if needed and bind-mount the cgroupfs onto
1646 * controller/the/cg/path.
1647 */
1648 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1649 char *controllerpath, char *cgpath,
1650 const char *container_cgroup)
1651 {
1652 __do_free char *sourcepath = NULL;
1653 int ret, remount_flags;
1654 int flags = MS_BIND;
1655
1656 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1657 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1658 if (ret < 0) {
1659 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1660 controllerpath, controllerpath);
1661 return -1;
1662 }
1663
1664 remount_flags = add_required_remount_flags(controllerpath,
1665 controllerpath,
1666 flags | MS_REMOUNT);
1667 ret = mount(controllerpath, controllerpath, "cgroup",
1668 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1669 NULL);
1670 if (ret < 0) {
1671 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1672 return -1;
1673 }
1674
1675 INFO("Remounted %s read-only", controllerpath);
1676 }
1677
1678 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
1679 container_cgroup, NULL);
1680 if (type == LXC_AUTO_CGROUP_RO)
1681 flags |= MS_RDONLY;
1682
1683 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1684 if (ret < 0) {
1685 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1686 return -1;
1687 }
1688 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1689
1690 if (flags & MS_RDONLY) {
1691 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1692 flags | MS_REMOUNT);
1693 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1694 if (ret < 0) {
1695 SYSERROR("Failed to remount \"%s\" ro", cgpath);
1696 return -1;
1697 }
1698 INFO("Remounted %s read-only", cgpath);
1699 }
1700
1701 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1702 return 0;
1703 }
1704
1705 /* __cg_mount_direct
1706 *
1707 * Mount cgroup hierarchies directly without using bind-mounts. The main
1708 * use cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1709 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1710 */
1711 static int __cg_mount_direct(int type, struct hierarchy *h,
1712 const char *controllerpath)
1713 {
1714 int ret;
1715 __do_free char *controllers = NULL;
1716 char *fstype = "cgroup2";
1717 unsigned long flags = 0;
1718
1719 flags |= MS_NOSUID;
1720 flags |= MS_NOEXEC;
1721 flags |= MS_NODEV;
1722 flags |= MS_RELATIME;
1723
1724 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1725 flags |= MS_RDONLY;
1726
1727 if (h->version != CGROUP2_SUPER_MAGIC) {
1728 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1729 if (!controllers)
1730 return -ENOMEM;
1731 fstype = "cgroup";
1732 }
1733
1734 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1735 if (ret < 0) {
1736 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1737 return -1;
1738 }
1739
1740 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1741 return 0;
1742 }
1743
1744 static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
1745 const char *controllerpath)
1746 {
1747 return __cg_mount_direct(type, h, controllerpath);
1748 }
1749
1750 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1751 const char *controllerpath)
1752 {
1753 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1754 return 0;
1755
1756 return __cg_mount_direct(type, h, controllerpath);
1757 }
1758
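/* Mount a tmpfs on $root/sys/fs/cgroup and then, for each hierarchy, either
 * mount the cgroup filesystem directly (for cgroup namespaces with a forced
 * mount, or for cgroup-full automounts) or bind-mount the container's own
 * cgroup path into place. If the kernel supports cgroup namespaces and no
 * forced mount was requested, nothing needs to be mounted here at all.
 */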
1759 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1760 struct lxc_handler *handler,
1761 const char *root, int type)
1762 {
1763 __do_free char *tmpfspath = NULL;
1764 int i, ret;
1765 bool has_cgns = false, retval = false, wants_force_mount = false;
1766
1767 if (!ops->hierarchies)
1768 return true;
1769
1770 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1771 return true;
1772
1773 if (type & LXC_AUTO_CGROUP_FORCE) {
1774 type &= ~LXC_AUTO_CGROUP_FORCE;
1775 wants_force_mount = true;
1776 }
1777
1778 if (!wants_force_mount){
1779 if (!lxc_list_empty(&handler->conf->keepcaps))
1780 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1781 else
1782 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1783 }
1784
1785 has_cgns = cgns_supported();
1786 if (has_cgns && !wants_force_mount)
1787 return true;
1788
1789 if (type == LXC_AUTO_CGROUP_NOSPEC)
1790 type = LXC_AUTO_CGROUP_MIXED;
1791 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1792 type = LXC_AUTO_CGROUP_FULL_MIXED;
1793
1794 /* Mount tmpfs */
1795 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1796 ret = safe_mount(NULL, tmpfspath, "tmpfs",
1797 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1798 "size=10240k,mode=755", root);
1799 if (ret < 0)
1800 goto on_error;
1801
1802 for (i = 0; ops->hierarchies[i]; i++) {
1803 __do_free char *controllerpath = NULL, *path2 = NULL;
1804 struct hierarchy *h = ops->hierarchies[i];
1805 char *controller = strrchr(h->mountpoint, '/');
1806
1807 if (!controller)
1808 continue;
1809 controller++;
1810
1811 controllerpath = must_make_path(tmpfspath, controller, NULL);
1812 if (dir_exists(controllerpath))
1813 continue;
1814
1815 ret = mkdir(controllerpath, 0755);
1816 if (ret < 0) {
1817 SYSERROR("Error creating cgroup path: %s", controllerpath);
1818 goto on_error;
1819 }
1820
1821 if (has_cgns && wants_force_mount) {
1822 /* If cgroup namespaces are supported but the container
1823 * will not have CAP_SYS_ADMIN after it has started we
1824 * need to mount the cgroups manually.
1825 */
1826 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1827 if (ret < 0)
1828 goto on_error;
1829
1830 continue;
1831 }
1832
1833 ret = cg_mount_cgroup_full(type, h, controllerpath);
1834 if (ret < 0)
1835 goto on_error;
1836
1837 if (!cg_mount_needs_subdirs(type))
1838 continue;
1839
1840 path2 = must_make_path(controllerpath, h->container_base_path,
1841 ops->container_cgroup, NULL);
1842 ret = mkdir_p(path2, 0755);
1843 if (ret < 0)
1844 goto on_error;
1845
1846 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1847 path2, ops->container_cgroup);
1848 if (ret < 0)
1849 goto on_error;
1850 }
1851 retval = true;
1852
1853 on_error:
1854 return retval;
1855 }
1856
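/* Count the pids listed in @dirname/cgroup.procs and in the cgroup.procs
 * files of all sub-cgroups below it.
 */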
1857 static int recursive_count_nrtasks(char *dirname)
1858 {
1859 __do_free char *path = NULL;
1860 __do_closedir DIR *dir = NULL;
1861 struct dirent *direntp;
1862 int count = 0, ret;
1863
1864 dir = opendir(dirname);
1865 if (!dir)
1866 return 0;
1867
1868 while ((direntp = readdir(dir))) {
1869 struct stat mystat;
1870
1871 if (!strcmp(direntp->d_name, ".") ||
1872 !strcmp(direntp->d_name, ".."))
1873 continue;
1874
1875 path = must_make_path(dirname, direntp->d_name, NULL);
1876
1877 if (lstat(path, &mystat))
1878 continue;
1879
1880 if (!S_ISDIR(mystat.st_mode))
1881 continue;
1882
1883 count += recursive_count_nrtasks(path);
1884 }
1885
1886 path = must_make_path(dirname, "cgroup.procs", NULL);
1887 ret = lxc_count_file_lines(path);
1888 if (ret != -1)
1889 count += ret;
1890
1891 return count;
1892 }
1893
1894 __cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
1895 {
1896 __do_free char *path = NULL;
1897 int count;
1898
1899 if (!ops->container_cgroup || !ops->hierarchies)
1900 return -1;
1901
1902 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
1903 count = recursive_count_nrtasks(path);
1904 return count;
1905 }
1906
1907 /* Only root needs to escape to the cgroup of its init. */
1908 __cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
1909 struct lxc_conf *conf)
1910 {
1911 int i;
1912
1913 if (conf->cgroup_meta.relative || geteuid() || !ops->hierarchies)
1914 return true;
1915
1916 for (i = 0; ops->hierarchies[i]; i++) {
1917 int ret;
1918 __do_free char *fullpath = NULL;
1919
1920 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1921 ops->hierarchies[i]->container_base_path,
1922 "cgroup.procs", NULL);
1923 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1924 if (ret != 0) {
1925 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1926 return false;
1927 }
1928 }
1929
1930 return true;
1931 }
1932
1933 __cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1934 {
1935 int i = 0;
1936
1937 if (!ops->hierarchies)
1938 return 0;
1939
1940 for (; ops->hierarchies[i]; i++)
1941 ;
1942
1943 return i;
1944 }
1945
1946 __cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1947 {
1948 int i;
1949
1950 if (!ops->hierarchies)
1951 return false;
1952
1953 /* sanity check n */
1954 for (i = 0; i < n; i++)
1955 if (!ops->hierarchies[i])
1956 return false;
1957
1958 *out = ops->hierarchies[i]->controllers;
1959
1960 return true;
1961 }
1962
1963 #define THAWED "THAWED"
1964 #define THAWED_LEN (strlen(THAWED))
1965
1966 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1967 * to be adapted.
1968 */
1969 __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1970 {
1971 int ret;
1972 __do_free char *fullpath = NULL;
1973 struct hierarchy *h;
1974
1975 h = get_hierarchy(ops, "freezer");
1976 if (!h)
1977 return false;
1978
1979 fullpath = must_make_path(h->container_full_path, "freezer.state", NULL);
1980 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1981 if (ret < 0)
1982 return false;
1983
1984 return true;
1985 }
1986
1987 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1988 const char *controller)
1989 {
1990 struct hierarchy *h;
1991
1992 h = get_hierarchy(ops, controller);
1993 if (!h) {
1994 WARN("Failed to find hierarchy for controller \"%s\"",
1995 controller ? controller : "(null)");
1996 return NULL;
1997 }
1998
1999 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
2000 }
2001
2002 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2003 * which must be freed by the caller.
2004 */
2005 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2006 const char *inpath,
2007 const char *filename)
2008 {
2009 return must_make_path(h->mountpoint, inpath, filename, NULL);
2010 }
2011
2012 /* Technically, we're always at a delegation boundary here (this is especially
2013 * true when cgroup namespaces are available). The reasoning is that in order
2014 * for us to have been able to start a container in the first place, the root
2015 * cgroup must have been a leaf node. Now, either the container's init system
2016 * has populated the cgroup and kept it as a leaf node, or it has created
2017 * subtrees. In the former case we simply attach to the leaf node we created
2018 * when we started the container; in the latter case we create our own cgroup
2019 * for the attaching process.
2020 */
2021 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2022 const char *lxcpath, const char *pidstr,
2023 size_t pidstr_len, const char *controller)
2024 {
2025 __do_free char *base_path = NULL, *container_cgroup = NULL,
2026 *full_path = NULL;
2027 int ret;
2028 size_t len;
2029 int fret = -1, idx = 0;
2030
2031 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2032 /* not running */
2033 if (!container_cgroup)
2034 return 0;
2035
2036 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2037 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2038 /* cgroup is populated */
2039 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
2040 if (ret < 0 && errno != EBUSY)
2041 goto on_error;
2042
2043 if (ret == 0)
2044 goto on_success;
2045
2046 len = strlen(base_path) + STRLITERALLEN("/lxc-1000") +
2047 STRLITERALLEN("/cgroup-procs");
2048 full_path = must_realloc(NULL, len + 1);
2049 do {
2050 if (idx)
2051 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2052 base_path, idx);
2053 else
2054 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2055 if (ret < 0 || (size_t)ret >= len + 1)
2056 goto on_error;
2057
2058 ret = mkdir_p(full_path, 0755);
2059 if (ret < 0 && errno != EEXIST)
2060 goto on_error;
2061
2062 (void)strlcat(full_path, "/cgroup.procs", len + 1);
2063 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
2064 if (ret == 0)
2065 goto on_success;
2066
2067 /* this is a non-leaf node */
2068 if (errno != EBUSY)
2069 goto on_error;
2070
2071 idx++;
2072 } while (idx < 1000);
2073
2074 on_success:
2075 if (idx < 1000)
2076 fret = 0;
2077
2078 on_error:
2079 return fret;
2080 }
2081
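/* Attach @pid to the container's cgroup in every hierarchy. On the unified
 * hierarchy this may require creating a separate leaf cgroup for the
 * attaching process (see __cg_unified_attach() above).
 */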
2082 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
2083 const char *lxcpath, pid_t pid)
2084 {
2085 int i, len, ret;
2086 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2087
2088 if (!ops->hierarchies)
2089 return true;
2090
2091 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2092 if (len < 0 || (size_t)len >= sizeof(pidstr))
2093 return false;
2094
2095 for (i = 0; ops->hierarchies[i]; i++) {
2096 __do_free char *path = NULL;
2097 char *fullpath = NULL;
2098 struct hierarchy *h = ops->hierarchies[i];
2099
2100 if (h->version == CGROUP2_SUPER_MAGIC) {
2101 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2102 h->controllers[0]);
2103 if (ret < 0)
2104 return false;
2105
2106 continue;
2107 }
2108
2109 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2110 /* not running */
2111 if (!path)
2112 continue;
2113
2114 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2115 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2116 if (ret < 0) {
2117 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2118 return false;
2119 }
2120 }
2121
2122 return true;
2123 }
2124
/* Called externally (e.g. from 'lxc-cgroup') to query cgroup limits. Here we
2126 * don't have a cgroup_data set up, so we ask the running container through the
2127 * commands API for the cgroup path.
2128 */
2129 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2130 char *value, size_t len, const char *name,
2131 const char *lxcpath)
2132 {
2133 __do_free char *path = NULL;
2134 __do_free char *controller = NULL;
2135 char *p;
2136 struct hierarchy *h;
2137 int ret = -1;
2138
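	/* The controller name is the part of the file name before the first
	 * dot, e.g. "memory.limit_in_bytes" -> "memory".
	 */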
2139 controller = must_copy_string(filename);
2140 p = strchr(controller, '.');
2141 if (p)
2142 *p = '\0';
2143
2144 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2145 /* not running */
2146 if (!path)
2147 return -1;
2148
2149 h = get_hierarchy(ops, controller);
2150 if (h) {
2151 __do_free char *fullpath = NULL;
2152
2153 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2154 ret = lxc_read_from_file(fullpath, value, len);
2155 }
2156
2157 return ret;
2158 }
2159
/* Called externally (e.g. from 'lxc-cgroup') to set new cgroup limits. Here we
2161 * don't have a cgroup_data set up, so we ask the running container through the
2162 * commands API for the cgroup path.
2163 */
2164 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2165 const char *filename, const char *value,
2166 const char *name, const char *lxcpath)
2167 {
2168 __do_free char *path = NULL;
2169 __do_free char *controller = NULL;
2170 char *p;
2171 struct hierarchy *h;
2172 int ret = -1;
2173
2174 controller = must_copy_string(filename);
2175 p = strchr(controller, '.');
2176 if (p)
2177 *p = '\0';
2178
2179 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2180 /* not running */
2181 if (!path)
2182 return -1;
2183
2184 h = get_hierarchy(ops, controller);
2185 if (h) {
2186 __do_free char *fullpath = NULL;
2187
2188 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2189 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2190 }
2191
2192 return ret;
2193 }
2194
/* Take a devices cgroup line such as
 *	/dev/foo rwm
 * and convert it into a valid
 *	type major:minor mode
 * line, e.g. "c 1:3 rwm". Return <0 on error. @dest is a preallocated buffer
 * large enough to hold the output.
 */
2202 static int convert_devpath(const char *invalue, char *dest)
2203 {
2204 __do_free char *path = NULL;
2205 int n_parts;
2206 char *p, type;
2207 unsigned long minor, major;
2208 struct stat sb;
2209 int ret = -EINVAL;
2210 char *mode = NULL;
2211
2212 path = must_copy_string(invalue);
2213
2214 /* Read path followed by mode. Ignore any trailing text.
2215 * A ' # comment' would be legal. Technically other text is not
2216 * legal, we could check for that if we cared to.
2217 */
2218 for (n_parts = 1, p = path; *p; p++) {
2219 if (*p != ' ')
2220 continue;
2221 *p = '\0';
2222
2223 if (n_parts != 1)
2224 break;
2225 p++;
2226 n_parts++;
2227
2228 while (*p == ' ')
2229 p++;
2230
2231 mode = p;
2232
2233 if (*p == '\0')
2234 goto out;
2235 }
2236
2237 if (n_parts == 1)
2238 goto out;
2239
2240 ret = stat(path, &sb);
2241 if (ret < 0)
2242 goto out;
2243
2244 mode_t m = sb.st_mode & S_IFMT;
2245 switch (m) {
2246 case S_IFBLK:
2247 type = 'b';
2248 break;
2249 case S_IFCHR:
2250 type = 'c';
2251 break;
2252 default:
2253 ERROR("Unsupported device type %i for \"%s\"", m, path);
2254 ret = -EINVAL;
2255 goto out;
2256 }
2257
2258 major = MAJOR(sb.st_rdev);
2259 minor = MINOR(sb.st_rdev);
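	/* The caller passes a 50-byte buffer (see converted_value[] in
	 * cg_legacy_set_data()), hence the hard-coded size below.
	 */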
2260 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2261 if (ret < 0 || ret >= 50) {
2262 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2263 "chars)", type, major, minor, mode);
2264 ret = -ENAMETOOLONG;
2265 goto out;
2266 }
2267 ret = 0;
2268
2269 out:
2270 return ret;
2271 }
2272
2273 /* Called from setup_limits - here we have the container's cgroup_data because
2274 * we created the cgroups.
2275 */
2276 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2277 const char *value)
2278 {
2279 __do_free char *controller = NULL;
2280 __do_free char *fullpath = NULL;
2281 char *p;
2282 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2283 char converted_value[50];
2284 struct hierarchy *h;
2285 int ret = 0;
2286
2287 controller = must_copy_string(filename);
2288 p = strchr(controller, '.');
2289 if (p)
2290 *p = '\0';
2291
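	/* The devices controller expects entries of the form
	 * "type major:minor access" (e.g. "c 1:3 rwm"), so translate a
	 * "/dev/<node> <access>" value into that form first.
	 */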
2292 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2293 ret = convert_devpath(value, converted_value);
2294 if (ret < 0)
2295 return ret;
2296 value = converted_value;
2297 }
2298
2299 h = get_hierarchy(ops, controller);
2300 if (!h) {
2301 ERROR("Failed to setup limits for the \"%s\" controller. "
2302 "The controller seems to be unused by \"cgfsng\" cgroup "
2303 "driver or not enabled on the cgroup hierarchy",
2304 controller);
2305 errno = ENOENT;
2306 return -ENOENT;
2307 }
2308
2309 fullpath = must_make_path(h->container_full_path, filename, NULL);
2310 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2311 return ret;
2312 }
2313
2314 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2315 struct lxc_list *cgroup_settings,
2316 bool do_devices)
2317 {
2318 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2319 struct lxc_list *iterator, *next;
2320 struct lxc_cgroup *cg;
2321 bool ret = false;
2322
2323 if (lxc_list_empty(cgroup_settings))
2324 return true;
2325
2326 if (!ops->hierarchies)
2327 return false;
2328
2329 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2330 if (!sorted_cgroup_settings)
2331 return false;
2332
2333 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2334 cg = iterator->elem;
2335
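		/* Only act on the subset selected by @do_devices: device
		 * entries when true, all other settings when false.
		 */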
2336 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2337 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2338 if (do_devices && (errno == EACCES || errno == EPERM)) {
2339 WARN("Failed to set \"%s\" to \"%s\"",
2340 cg->subsystem, cg->value);
2341 continue;
2342 }
2343 WARN("Failed to set \"%s\" to \"%s\"",
2344 cg->subsystem, cg->value);
2345 goto out;
2346 }
2347 DEBUG("Set controller \"%s\" set to \"%s\"",
2348 cg->subsystem, cg->value);
2349 }
2350 }
2351
2352 ret = true;
2353 INFO("Limits for the legacy cgroup hierarchies have been setup");
2354 out:
2355 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2356 lxc_list_del(iterator);
2357 free(iterator);
2358 }
2359
2360 return ret;
2361 }
2362
2363 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2364 struct lxc_list *cgroup_settings)
2365 {
2366 struct lxc_list *iterator;
2367 struct hierarchy *h = ops->unified;
2368
2369 if (lxc_list_empty(cgroup_settings))
2370 return true;
2371
2372 if (!h)
2373 return false;
2374
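	/* For the unified hierarchy the setting name is the full file name
	 * (e.g. "memory.max"), so it can be appended to the container's
	 * cgroup path directly.
	 */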
2375 lxc_list_for_each(iterator, cgroup_settings) {
2376 __do_free char *fullpath = NULL;
2377 int ret;
2378 struct lxc_cgroup *cg = iterator->elem;
2379
2380 fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
2381 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2382 if (ret < 0) {
2383 SYSERROR("Failed to set \"%s\" to \"%s\"",
2384 cg->subsystem, cg->value);
2385 return false;
2386 }
2387 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2388 }
2389
2390 INFO("Limits for the unified cgroup hierarchy have been setup");
2391 return true;
2392 }
2393
2394 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2395 struct lxc_conf *conf,
2396 bool do_devices)
2397 {
2398 bool bret;
2399
2400 bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2401 if (!bret)
2402 return false;
2403
2404 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2405 }
2406
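/* Return true if no lxc.cgroup.use list was configured or if every controller
 * in @controllers appears in that list.
 */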
2407 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2408 char **controllers)
2409 {
2410 char **cur_ctrl, **cur_use;
2411
2412 if (!ops->cgroup_use)
2413 return true;
2414
2415 for (cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2416 bool found = false;
2417
2418 for (cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2419 if (strcmp(*cur_use, *cur_ctrl) != 0)
2420 continue;
2421
2422 found = true;
2423 break;
2424 }
2425
2426 if (found)
2427 continue;
2428
2429 return false;
2430 }
2431
2432 return true;
2433 }
2434
2435 static void cg_unified_delegate(char ***delegate)
2436 {
2437 __do_free char *tmp = NULL;
2438 int idx;
2439 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2440
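	/* Prefer the kernel's own list of delegatable files; fall back to a
	 * hard-coded set on kernels that do not provide
	 * /sys/kernel/cgroup/delegate.
	 */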
2441 tmp = read_file("/sys/kernel/cgroup/delegate");
2442 if (!tmp) {
2443 for (char **p = standard; p && *p; p++) {
2444 idx = append_null_to_list((void ***)delegate);
2445 (*delegate)[idx] = must_copy_string(*p);
2446 }
2447 } else {
2448 char *token;
2449 lxc_iterate_parts (token, tmp, " \t\n") {
2450 /*
2451 * We always need to chown this for both cgroup and
2452 * cgroup2.
2453 */
2454 if (strcmp(token, "cgroup.procs") == 0)
2455 continue;
2456
2457 idx = append_null_to_list((void ***)delegate);
2458 (*delegate)[idx] = must_copy_string(token);
2459 }
2460 }
2461 }
2462
/* At startup, cg_hybrid_init() finds all the info we need about cgroup
 * mountpoints and the current cgroups, and stores it in @ops.
 */
2466 static bool cg_hybrid_init(struct cgroup_ops *ops, bool relative,
2467 bool unprivileged)
2468 {
2469 __do_free char *basecginfo = NULL;
2470 __do_free char *line = NULL;
2471 __do_fclose FILE *f = NULL;
2472 int ret;
2473 size_t len = 0;
2474 char **klist = NULL, **nlist = NULL;
2475
2476 /* Root spawned containers escape the current cgroup, so use init's
2477 * cgroups as our base in that case.
2478 */
2479 if (!relative && (geteuid() == 0))
2480 basecginfo = read_file("/proc/1/cgroup");
2481 else
2482 basecginfo = read_file("/proc/self/cgroup");
2483 if (!basecginfo)
2484 return false;
2485
2486 ret = get_existing_subsystems(&klist, &nlist);
2487 if (ret < 0) {
2488 ERROR("Failed to retrieve available legacy cgroup controllers");
2489 return false;
2490 }
2491
2492 f = fopen("/proc/self/mountinfo", "r");
2493 if (!f) {
2494 ERROR("Failed to open \"/proc/self/mountinfo\"");
2495 return false;
2496 }
2497
2498 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2499
2500 while (getline(&line, &len, f) != -1) {
2501 int type;
2502 bool writeable;
2503 struct hierarchy *new;
2504 char *base_cgroup = NULL, *mountpoint = NULL;
2505 char **controller_list = NULL;
2506
2507 type = get_cgroup_version(line);
2508 if (type == 0)
2509 continue;
2510
2511 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2512 continue;
2513
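		/* Track the overall layout: only legacy (v1) mounts seen so
		 * far means CGROUP_LAYOUT_LEGACY, only a cgroup2 mount means
		 * CGROUP_LAYOUT_UNIFIED, and a mix of both means
		 * CGROUP_LAYOUT_HYBRID.
		 */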
2514 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2515 if (type == CGROUP2_SUPER_MAGIC)
2516 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2517 else if (type == CGROUP_SUPER_MAGIC)
2518 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2519 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2520 if (type == CGROUP_SUPER_MAGIC)
2521 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2522 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2523 if (type == CGROUP2_SUPER_MAGIC)
2524 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2525 }
2526
2527 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2528 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2529 continue;
2530
2531 if (type == CGROUP_SUPER_MAGIC)
2532 if (controller_list_is_dup(ops->hierarchies, controller_list))
2533 goto next;
2534
2535 mountpoint = cg_hybrid_get_mountpoint(line);
2536 if (!mountpoint) {
2537 ERROR("Failed parsing mountpoint from \"%s\"", line);
2538 goto next;
2539 }
2540
2541 if (type == CGROUP_SUPER_MAGIC)
2542 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2543 else
2544 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2545 if (!base_cgroup) {
2546 ERROR("Failed to find current cgroup");
2547 goto next;
2548 }
2549
2550 trim(base_cgroup);
2551 prune_init_scope(base_cgroup);
2552 if (type == CGROUP2_SUPER_MAGIC)
2553 writeable = test_writeable_v2(mountpoint, base_cgroup);
2554 else
2555 writeable = test_writeable_v1(mountpoint, base_cgroup);
2556 if (!writeable)
2557 goto next;
2558
2559 if (type == CGROUP2_SUPER_MAGIC) {
2560 char *cgv2_ctrl_path;
2561
2562 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2563 "cgroup.controllers",
2564 NULL);
2565
2566 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2567 free(cgv2_ctrl_path);
2568 if (!controller_list) {
2569 controller_list = cg_unified_make_empty_controller();
2570 TRACE("No controllers are enabled for "
2571 "delegation in the unified hierarchy");
2572 }
2573 }
2574
2575 /* Exclude all controllers that cgroup use does not want. */
2576 if (!cgroup_use_wants_controllers(ops, controller_list))
2577 goto next;
2578
2579 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2580 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
2581 if (unprivileged)
2582 cg_unified_delegate(&new->cgroup2_chown);
2583 ops->unified = new;
2584 }
2585
2586 continue;
2587
2588 next:
2589 free_string_list(controller_list);
2590 free(mountpoint);
2591 free(base_cgroup);
2592 }
2593
2594 free_string_list(klist);
2595 free_string_list(nlist);
2596
2597 TRACE("Writable cgroup hierarchies:");
2598 lxc_cgfsng_print_hierarchies(ops);
2599
	/* Verify that all controllers listed in lxc.cgroup.use and all crucial
	 * controllers are accounted for.
	 */
2603 if (!all_controllers_found(ops))
2604 return false;
2605
2606 return true;
2607 }
2608
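/* Check whether /sys/fs/cgroup itself is a cgroup2 mount, i.e. the host uses a
 * pure unified cgroup layout.
 */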
2609 static int cg_is_pure_unified(void)
2610 {
2612 int ret;
2613 struct statfs fs;
2614
2615 ret = statfs("/sys/fs/cgroup", &fs);
2616 if (ret < 0)
2617 return -ENOMEDIUM;
2618
2619 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2620 return CGROUP2_SUPER_MAGIC;
2621
2622 return 0;
2623 }
2624
/* Get the current cgroup in the cgroup v2 hierarchy from /proc/1/cgroup (when
 * privileged and not relative) or /proc/self/cgroup.
 */
2626 static char *cg_unified_get_current_cgroup(bool relative)
2627 {
2628 __do_free char *basecginfo = NULL;
2629 char *base_cgroup;
2630 char *copy = NULL;
2631
2632 if (!relative && (geteuid() == 0))
2633 basecginfo = read_file("/proc/1/cgroup");
2634 else
2635 basecginfo = read_file("/proc/self/cgroup");
2636 if (!basecginfo)
2637 return NULL;
2638
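	/* The unified hierarchy is listed in /proc/<pid>/cgroup with hierarchy
	 * ID 0 and an empty controller list, i.e. a line of the form
	 * "0::/some/path".
	 */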
2639 base_cgroup = strstr(basecginfo, "0::/");
2640 if (!base_cgroup)
2641 goto cleanup_on_err;
2642
2643 base_cgroup = base_cgroup + 3;
2644 copy = copy_to_eol(base_cgroup);
2645 if (!copy)
2646 goto cleanup_on_err;
2647
2648 cleanup_on_err:
2649 if (copy)
2650 trim(copy);
2651
2652 return copy;
2653 }
2654
2655 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
2656 bool unprivileged)
2657 {
2658 __do_free char *subtree_path = NULL;
2659 int ret;
2660 char *mountpoint;
2661 char **delegatable;
2662 struct hierarchy *new;
2663 char *base_cgroup = NULL;
2664
2665 ret = cg_is_pure_unified();
2666 if (ret == -ENOMEDIUM)
2667 return -ENOMEDIUM;
2668
2669 if (ret != CGROUP2_SUPER_MAGIC)
2670 return 0;
2671
2672 base_cgroup = cg_unified_get_current_cgroup(relative);
2673 if (!base_cgroup)
2674 return -EINVAL;
2675 prune_init_scope(base_cgroup);
2676
2677 /* We assume that we have already been given controllers to delegate
 * further down the hierarchy. If not, it is up to the user to delegate
2679 * them to us.
2680 */
2681 mountpoint = must_copy_string("/sys/fs/cgroup");
2682 subtree_path = must_make_path(mountpoint, base_cgroup,
2683 "cgroup.subtree_control", NULL);
2684 delegatable = cg_unified_get_controllers(subtree_path);
2685 if (!delegatable)
2686 delegatable = cg_unified_make_empty_controller();
2687 if (!delegatable[0])
2688 TRACE("No controllers are enabled for delegation");
2689
	/* TODO: If the user requested specific controllers via lxc.cgroup.use
	 * we should verify that here. The reason I'm not doing it right now is
	 * that I'm not convinced lxc.cgroup.use is the future, since it is a
	 * global property. I'd much rather have an option that lets you
	 * request controllers per container.
	 */
2696
2697 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
	/* Only unprivileged containers need the delegated files chowned (cf.
	 * cg_hybrid_init()).
	 */
	if (unprivileged)
		cg_unified_delegate(&new->cgroup2_chown);
2700
2701 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2702 ops->unified = new;
2703 return CGROUP2_SUPER_MAGIC;
2704 }
2705
2706 static bool cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2707 {
2708 int ret;
2709 const char *tmp;
2710 bool relative = conf->cgroup_meta.relative;
2711
2712 tmp = lxc_global_config_value("lxc.cgroup.use");
2713 if (tmp) {
2714 __do_free char *pin = NULL;
2715 char *chop, *cur;
2716
2717 pin = must_copy_string(tmp);
2718 chop = pin;
2719
2720 lxc_iterate_parts(cur, chop, ",")
2721 must_append_string(&ops->cgroup_use, cur);
2722 }
2723
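	/* Probe for a pure cgroup2 host first; if the layout is not purely
	 * unified, fall back to hybrid/legacy detection.
	 */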
2724 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2725 if (ret < 0)
2726 return false;
2727
2728 if (ret == CGROUP2_SUPER_MAGIC)
2729 return true;
2730
2731 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2732 }
2733
2734 __cgfsng_ops static bool cgfsng_data_init(struct cgroup_ops *ops)
2735 {
2736 const char *cgroup_pattern;
2737
2738 /* copy system-wide cgroup information */
2739 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2740 if (!cgroup_pattern) {
2741 /* lxc.cgroup.pattern is only NULL on error. */
2742 ERROR("Failed to retrieve cgroup pattern");
2743 return false;
2744 }
2745 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2746 ops->monitor_pattern = MONITOR_CGROUP;
2747
2748 return true;
2749 }
2750
2751 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2752 {
2753 struct cgroup_ops *cgfsng_ops;
2754
	cgfsng_ops = calloc(1, sizeof(struct cgroup_ops));
	if (!cgfsng_ops)
		return NULL;

	cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2761
2762 if (!cg_init(cgfsng_ops, conf)) {
2763 free(cgfsng_ops);
2764 return NULL;
2765 }
2766
2767 cgfsng_ops->data_init = cgfsng_data_init;
2768 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
2769 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
2770 cgfsng_ops->monitor_create = cgfsng_monitor_create;
2771 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
2772 cgfsng_ops->payload_create = cgfsng_payload_create;
2773 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2774 cgfsng_ops->escape = cgfsng_escape;
2775 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2776 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2777 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2778 cgfsng_ops->get = cgfsng_get;
2779 cgfsng_ops->set = cgfsng_set;
2780 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2781 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2782 cgfsng_ops->driver = "cgfsng";
2783 cgfsng_ops->version = "1.0.0";
2784 cgfsng_ops->attach = cgfsng_attach;
2785 cgfsng_ops->chown = cgfsng_chown;
2786 cgfsng_ops->mount = cgfsng_mount;
2787 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2788
2789 return cgfsng_ops;
2790 }