1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfs-ng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
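/* For example (illustrative; the exact grouping of controllers varies by
 * distribution), a typical legacy layout this code expects looks like:
 *
 *	/sys/fs/cgroup/cpu,cpuacct	"cpu" and "cpuacct" controllers
 *	/sys/fs/cgroup/memory		"memory" controller
 *	/sys/fs/cgroup/systemd		named "name=systemd" hierarchy
 */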
36
37 #ifndef _GNU_SOURCE
38 #define _GNU_SOURCE 1
39 #endif
40 #include <ctype.h>
41 #include <dirent.h>
42 #include <errno.h>
43 #include <grp.h>
44 #include <linux/kdev_t.h>
45 #include <linux/types.h>
46 #include <stdint.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <sys/types.h>
51 #include <unistd.h>
52
53 #include "caps.h"
54 #include "cgroup.h"
55 #include "cgroup_utils.h"
56 #include "commands.h"
57 #include "conf.h"
58 #include "config.h"
59 #include "log.h"
60 #include "macro.h"
61 #include "memory_utils.h"
62 #include "storage/storage.h"
63 #include "utils.h"
64
65 #ifndef HAVE_STRLCPY
66 #include "include/strlcpy.h"
67 #endif
68
69 #ifndef HAVE_STRLCAT
70 #include "include/strlcat.h"
71 #endif
72
73 lxc_log_define(cgfsng, cgroup);
74
75 static void free_string_list(char **clist)
76 {
77 int i;
78
79 if (!clist)
80 return;
81
82 for (i = 0; clist[i]; i++)
83 free(clist[i]);
84
85 free(clist);
86 }
87
88 /* Given a pointer to a null-terminated array of pointers, realloc to add one
89 * entry, and point the new entry to NULL. Do not fail. Return the index to the
90 * second-to-last entry - that is, the one which is now available for use
91 * (keeping the list null-terminated).
92 */
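/* Usage sketch (illustrative):
 *
 *	char **list = NULL;
 *	int idx = append_null_to_list((void ***)&list);
 *	// idx == 0 and list[1] == NULL, so the list stays null-terminated
 *	list[idx] = must_copy_string("memory");
 */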
93 static int append_null_to_list(void ***list)
94 {
95 int newentry = 0;
96
97 if (*list)
98 for (; (*list)[newentry]; newentry++)
99 ;
100
101 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
102 (*list)[newentry + 1] = NULL;
103 return newentry;
104 }
105
106 /* Given a null-terminated array of strings, check whether @entry is one of the
107 * strings.
108 */
109 static bool string_in_list(char **list, const char *entry)
110 {
111 int i;
112
113 if (!list)
114 return false;
115
116 for (i = 0; list[i]; i++)
117 if (strcmp(list[i], entry) == 0)
118 return true;
119
120 return false;
121 }
122
123 /* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
124 * "name=systemd". Do not fail.
125 */
126 static char *cg_legacy_must_prefix_named(char *entry)
127 {
128 size_t len;
129 char *prefixed;
130
131 len = strlen(entry);
132 prefixed = must_realloc(NULL, len + 6);
133
134 memcpy(prefixed, "name=", STRLITERALLEN("name="));
135 memcpy(prefixed + STRLITERALLEN("name="), entry, len);
136 prefixed[len + 5] = '\0';
137
138 return prefixed;
139 }
140
141 /* Append an entry to the clist. Do not fail. @clist must be NULL the first time
142 * we are called.
143 *
144 * We also handle named subsystems here. Any controller which is not a kernel
145 * subsystem is prefixed with "name=". Any controller which is both a kernel and
146 * a named subsystem is refused, because we cannot tell which one we have here.
147 * (TODO: We could work around this in some cases by just remounting to be
148 * unambiguous, or by comparing mountpoint contents with current cgroup.)
149 *
150 * The last entry will always be NULL.
151 */
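/* For example (sketch): assuming "memory" is in @klist and "systemd" is not,
 *
 *	must_append_controller(klist, nlist, &clist, "memory");  appends "memory"
 *	must_append_controller(klist, nlist, &clist, "systemd"); appends "name=systemd"
 *
 * while an entry present in both @klist and @nlist is refused as ambiguous.
 */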
152 static void must_append_controller(char **klist, char **nlist, char ***clist,
153 char *entry)
154 {
155 int newentry;
156 char *copy;
157
158 if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
159 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
160 ERROR("It is both a named and kernel subsystem");
161 return;
162 }
163
164 newentry = append_null_to_list((void ***)clist);
165
166 if (strncmp(entry, "name=", 5) == 0)
167 copy = must_copy_string(entry);
168 else if (string_in_list(klist, entry))
169 copy = must_copy_string(entry);
170 else
171 copy = cg_legacy_must_prefix_named(entry);
172
173 (*clist)[newentry] = copy;
174 }
175
176 /* Given a handler's cgroup data, return the struct hierarchy for the controller
177 * @c, or NULL if there is none.
178 */
179 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *controller)
180 {
181 int i;
182
183 errno = ENOENT;
184
185 if (!ops->hierarchies) {
186 TRACE("There are no useable cgroup controllers");
187 return NULL;
188 }
189
190 for (i = 0; ops->hierarchies[i]; i++) {
191 if (!controller) {
192 /* This is the empty unified hierarchy. */
193 if (ops->hierarchies[i]->controllers &&
194 !ops->hierarchies[i]->controllers[0])
195 return ops->hierarchies[i];
196
197 continue;
198 }
199
200 if (string_in_list(ops->hierarchies[i]->controllers, controller))
201 return ops->hierarchies[i];
202 }
203
204 if (controller)
205 WARN("There is no useable %s controller", controller);
206 else
207 WARN("There is no empty unified cgroup hierarchy");
208
209 return NULL;
210 }
211
212 #define BATCH_SIZE 50
213 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
214 {
215 int newbatches = (newlen / BATCH_SIZE) + 1;
216 int oldbatches = (oldlen / BATCH_SIZE) + 1;
217
218 if (!*mem || newbatches > oldbatches) {
219 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
220 }
221 }
222
223 static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
224 {
225 size_t full = oldlen + newlen;
226
227 batch_realloc(dest, oldlen, full + 1);
228
229 memcpy(*dest + oldlen, new, newlen + 1);
230 }
231
232 /* Slurp in a whole file */
233 static char *read_file(const char *fnam)
234 {
235 __do_free char *line = NULL;
236 __do_fclose FILE *f = NULL;
237 int linelen;
238 char *buf = NULL;
239 size_t len = 0, fulllen = 0;
240
241 f = fopen(fnam, "r");
242 if (!f)
243 return NULL;
244 while ((linelen = getline(&line, &len, f)) != -1) {
245 append_line(&buf, fulllen, line, linelen);
246 fulllen += linelen;
247 }
248 return buf;
249 }
250
251 /* Taken over and modified from the kernel sources. */
252 #define NBITS 32 /* bits in uint32_t */
253 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
254 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
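/* e.g. with NBITS == 32: BITS_TO_LONGS(1) == 1 and BITS_TO_LONGS(33) == 2 */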
255
256 static void set_bit(unsigned bit, uint32_t *bitarr)
257 {
258 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
259 }
260
261 static void clear_bit(unsigned bit, uint32_t *bitarr)
262 {
263 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
264 }
265
266 static bool is_set(unsigned bit, uint32_t *bitarr)
267 {
268 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
269 }
270
271 /* Create cpumask from cpulist aka turn:
272 *
273 * 0,2-3
274 *
275 * into bit array
276 *
277 * 1 0 1 1
278 */
279 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
280 {
281 char *token;
282 size_t arrlen;
283 uint32_t *bitarr;
284
285 arrlen = BITS_TO_LONGS(nbits);
286 bitarr = calloc(arrlen, sizeof(uint32_t));
287 if (!bitarr)
288 return NULL;
289
290 lxc_iterate_parts(token, buf, ",") {
291 errno = 0;
292 unsigned end, start;
293 char *range;
294
295 start = strtoul(token, NULL, 0);
296 end = start;
297 range = strchr(token, '-');
298 if (range)
299 end = strtoul(range + 1, NULL, 0);
300
301 if (!(start <= end)) {
302 free(bitarr);
303 return NULL;
304 }
305
306 if (end >= nbits) {
307 free(bitarr);
308 return NULL;
309 }
310
311 while (start <= end)
312 set_bit(start++, bitarr);
313 }
314
315 return bitarr;
316 }
317
318 /* Turn cpumask into simple, comma-separated cpulist. */
319 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
320 {
321 int ret;
322 size_t i;
323 char *tmp = NULL;
324 char **cpulist = NULL;
325 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
326
327 for (i = 0; i <= nbits; i++) {
328 if (!is_set(i, bitarr))
329 continue;
330
331 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
332 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
333 lxc_free_array((void **)cpulist, free);
334 return NULL;
335 }
336
337 ret = lxc_append_string(&cpulist, numstr);
338 if (ret < 0) {
339 lxc_free_array((void **)cpulist, free);
340 return NULL;
341 }
342 }
343
344 if (!cpulist)
345 return NULL;
346
347 tmp = lxc_string_join(",", (const char **)cpulist, false);
348 lxc_free_array((void **)cpulist, free);
349
350 return tmp;
351 }
352
353 static ssize_t get_max_cpus(char *cpulist)
354 {
355 char *c1, *c2;
356 char *maxcpus = cpulist;
357 size_t cpus = 0;
358
359 c1 = strrchr(maxcpus, ',');
360 if (c1)
361 c1++;
362
363 c2 = strrchr(maxcpus, '-');
364 if (c2)
365 c2++;
366
367 if (!c1 && !c2)
368 c1 = maxcpus;
369 else if (c1 > c2)
370 c2 = c1;
371 else if (c1 < c2)
372 c1 = c2;
373 else if (!c1 && c2)
374 c1 = c2;
375
376 errno = 0;
377 cpus = strtoul(c1, NULL, 0);
378 if (errno != 0)
379 return -1;
380
381 return cpus;
382 }
383
384 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
385 #define __OFFLINE_CPUS "/sys/devices/system/cpu/offline"
386 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
387 {
388 __do_free char *cpulist = NULL, *fpath = NULL, *isolcpus = NULL,
389 *offlinecpus = NULL, *posscpus = NULL;
390 __do_free uint32_t *isolmask = NULL, *offlinemask = NULL,
391 *possmask = NULL;
392 int ret;
393 ssize_t i;
394 char oldv;
395 char *lastslash;
396 ssize_t maxisol = 0, maxoffline = 0, maxposs = 0;
397 bool bret = false, flipped_bit = false;
398
399 lastslash = strrchr(path, '/');
400 if (!lastslash) {
401 ERROR("Failed to detect \"/\" in \"%s\"", path);
402 return bret;
403 }
404 oldv = *lastslash;
405 *lastslash = '\0';
406 fpath = must_make_path(path, "cpuset.cpus", NULL);
407 *lastslash = oldv;
408 posscpus = read_file(fpath);
409 if (!posscpus) {
410 SYSERROR("Failed to read file \"%s\"", fpath);
411 return false;
412 }
413
414 /* Get maximum number of cpus found in possible cpuset. */
415 maxposs = get_max_cpus(posscpus);
416 if (maxposs < 0 || maxposs >= INT_MAX - 1)
417 return false;
418
419 if (file_exists(__ISOL_CPUS)) {
420 isolcpus = read_file(__ISOL_CPUS);
421 if (!isolcpus) {
422 SYSERROR("Failed to read file \"%s\"", __ISOL_CPUS);
423 return false;
424 }
425
426 if (isdigit(isolcpus[0])) {
427 /* Get maximum number of cpus found in isolated cpuset. */
428 maxisol = get_max_cpus(isolcpus);
429 if (maxisol < 0 || maxisol >= INT_MAX - 1)
430 return false;
431 }
432
433 if (maxposs < maxisol)
434 maxposs = maxisol;
435 maxposs++;
436 } else {
437 TRACE("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
438 }
439
440 if (file_exists(__OFFLINE_CPUS)) {
441 offlinecpus = read_file(__OFFLINE_CPUS);
442 if (!offlinecpus) {
443 SYSERROR("Failed to read file \"%s\"", __OFFLINE_CPUS);
444 return false;
445 }
446
447 if (isdigit(offlinecpus[0])) {
448 /* Get maximum number of cpus found in offline cpuset. */
449 maxoffline = get_max_cpus(offlinecpus);
450 if (maxoffline < 0 || maxoffline >= INT_MAX - 1)
451 return false;
452 }
453
454 if (maxposs < maxoffline)
455 maxposs = maxoffline;
456 maxposs++;
457 } else {
458 TRACE("The path \""__OFFLINE_CPUS"\" to read offline cpus from does not exist");
459 }
460
461 if ((maxisol == 0) && (maxoffline == 0)) {
462 cpulist = move_ptr(posscpus);
463 goto copy_parent;
464 }
465
466 possmask = lxc_cpumask(posscpus, maxposs);
467 if (!possmask) {
468 ERROR("Failed to create cpumask for possible cpus");
469 return false;
470 }
471
472 if (maxisol > 0) {
473 isolmask = lxc_cpumask(isolcpus, maxposs);
474 if (!isolmask) {
475 ERROR("Failed to create cpumask for isolated cpus");
476 return false;
477 }
478 }
479
480 if (maxoffline > 0) {
481 offlinemask = lxc_cpumask(offlinecpus, maxposs);
482 if (!offlinemask) {
483 ERROR("Failed to create cpumask for offline cpus");
484 return false;
485 }
486 }
487
488 for (i = 0; i <= maxposs; i++) {
489 if ((isolmask && !is_set(i, isolmask)) ||
490 (offlinemask && !is_set(i, offlinemask)) ||
491 !is_set(i, possmask))
492 continue;
493
494 flipped_bit = true;
495 clear_bit(i, possmask);
496 }
497
498 if (!flipped_bit) {
499 DEBUG("No isolated or offline cpus present in cpuset");
500 return true;
501 }
502 DEBUG("Removed isolated or offline cpus from cpuset");
503
504 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
505 if (!cpulist) {
506 ERROR("Failed to create cpu list");
507 return false;
508 }
509
510 copy_parent:
511 if (!am_initialized) {
512 fpath = must_make_path(path, "cpuset.cpus", NULL);
513 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false,
514 0666);
515 if (ret < 0) {
516 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
517 return false;
518 }
519
520 TRACE("Copied cpu settings of parent cgroup");
521 }
522
523 return true;
524 }
525
526 /* Copy contents of parent(@path)/@file to @path/@file */
527 static bool copy_parent_file(char *path, char *file)
528 {
529 __do_free char *child_path = NULL, *parent_path = NULL, *value = NULL;
530 int ret;
531 char oldv;
532 int len = 0;
533 char *lastslash = NULL;
534
535 lastslash = strrchr(path, '/');
536 if (!lastslash) {
537 ERROR("Failed to detect \"/\" in \"%s\"", path);
538 return false;
539 }
540 oldv = *lastslash;
541 *lastslash = '\0';
542 parent_path = must_make_path(path, file, NULL);
543 len = lxc_read_from_file(parent_path, NULL, 0);
544 if (len <= 0) {
545 SYSERROR("Failed to determine buffer size");
546 return false;
547 }
548
549 value = must_realloc(NULL, len + 1);
550 ret = lxc_read_from_file(parent_path, value, len);
551 if (ret != len) {
552 SYSERROR("Failed to read from parent file \"%s\"", parent_path);
553 return false;
554 }
555
556 *lastslash = oldv;
557 child_path = must_make_path(path, file, NULL);
558 ret = lxc_write_to_file(child_path, value, len, false, 0666);
559 if (ret < 0)
560 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, child_path);
561 return ret >= 0;
562 }
563
564 /* Initialize the cpuset hierarchy in the first directory of @cgname and set
565 * cgroup.clone_children so that children inherit settings. Since the
566 * h->container_base_path is populated by init or ourselves, we know it is already
567 * initialized.
568 */
569 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
570 {
571 __do_free char *cgpath = NULL, *clonechildrenpath = NULL;
572 int ret;
573 char v;
574 char *slash;
575
576 if (!string_in_list(h->controllers, "cpuset"))
577 return true;
578
579 if (*cgname == '/')
580 cgname++;
581 slash = strchr(cgname, '/');
582 if (slash)
583 *slash = '\0';
584
585 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
586 if (slash)
587 *slash = '/';
588
589 ret = mkdir(cgpath, 0755);
590 if (ret < 0) {
591 if (errno != EEXIST) {
592 SYSERROR("Failed to create directory \"%s\"", cgpath);
593 return false;
594 }
595 }
596
597 clonechildrenpath = must_make_path(cgpath, "cgroup.clone_children", NULL);
598 /* unified hierarchy doesn't have clone_children */
599 if (!file_exists(clonechildrenpath))
600 return true;
601
602 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
603 if (ret < 0) {
604 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
605 return false;
606 }
607
608 /* Make sure any isolated cpus are removed from cpuset.cpus. */
609 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
610 SYSERROR("Failed to remove isolated cpus");
611 return false;
612 }
613
614 /* Already set for us by someone else. */
615 if (v == '1') {
616 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
617 return true;
618 }
619
620 /* copy parent's settings */
621 if (!copy_parent_file(cgpath, "cpuset.mems")) {
622 SYSERROR("Failed to copy \"cpuset.mems\" settings");
623 return false;
624 }
625
626 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
627 if (ret < 0) {
628 /* Set clone_children so children inherit our settings */
629 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
630 return false;
631 }
632
633 return true;
634 }
635
636 /* Given two null-terminated lists of strings, return true if any string is in
637 * both.
638 */
639 static bool controller_lists_intersect(char **l1, char **l2)
640 {
641 int i;
642
643 if (!l1 || !l2)
644 return false;
645
646 for (i = 0; l1[i]; i++) {
647 if (string_in_list(l2, l1[i]))
648 return true;
649 }
650
651 return false;
652 }
653
654 /* For a null-terminated list of controllers @clist, return true if any of those
655 * controllers is already listed in the null-terminated list of hierarchies @hlist.
656 * Realistically, if one is present, all must be present.
657 */
658 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
659 {
660 int i;
661
662 if (!hlist)
663 return false;
664
665 for (i = 0; hlist[i]; i++)
666 if (controller_lists_intersect(hlist[i]->controllers, clist))
667 return true;
668
669 return false;
670 }
671
672 /* Return true if the controller @entry is found in the null-terminated list of
673 * hierarchies @hlist.
674 */
675 static bool controller_found(struct hierarchy **hlist, char *entry)
676 {
677 int i;
678
679 if (!hlist)
680 return false;
681
682 for (i = 0; hlist[i]; i++)
683 if (string_in_list(hlist[i]->controllers, entry))
684 return true;
685
686 return false;
687 }
688
689 /* Return true if all of the controllers which we require have been found. The
690 * required list is freezer and anything in lxc.cgroup.use.
691 */
692 static bool all_controllers_found(struct cgroup_ops *ops)
693 {
694 char **cur;
695 struct hierarchy **hlist = ops->hierarchies;
696
697 if (!ops->cgroup_use)
698 return true;
699
700 for (cur = ops->cgroup_use; cur && *cur; cur++)
701 if (!controller_found(hlist, *cur)) {
702 ERROR("No %s controller mountpoint found", *cur);
703 return false;
704 }
705
706 return true;
707 }
708
709 /* Get the controllers from a mountinfo line. There are other ways we could get
710 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
711 * could parse the mount options. But we simply assume that the mountpoint must
712 * be /sys/fs/cgroup/controller-list
713 */
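/* A representative mountinfo line for a legacy cgroup mount (the values are
 * illustrative); the mountpoint field parsed below is the one starting with
 * /sys/fs/cgroup/:
 *
 * 34 25 0:30 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,cpu,cpuacct
 */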
714 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
715 int type)
716 {
717 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
718 * for legacy hierarchies.
719 */
720 int i;
721 char *p2, *tok;
722 char *p = line, *sep = ",";
723 char **aret = NULL;
724
725 for (i = 0; i < 4; i++) {
726 p = strchr(p, ' ');
727 if (!p)
728 return NULL;
729 p++;
730 }
731
732 /* Note, if we change how mountinfo works, then our caller will need to
733 * verify /sys/fs/cgroup/ in this field.
734 */
735 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
736 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
737 return NULL;
738 }
739
740 p += 15;
741 p2 = strchr(p, ' ');
742 if (!p2) {
743 ERROR("Corrupt mountinfo");
744 return NULL;
745 }
746 *p2 = '\0';
747
748 if (type == CGROUP_SUPER_MAGIC) {
749 __do_free char *dup = NULL;
750
751 /* strdup() here for v1 hierarchies. Otherwise
752 * lxc_iterate_parts() will destroy mountpoints such as
753 * "/sys/fs/cgroup/cpu,cpuacct".
754 */
755 dup = must_copy_string(p);
756 if (!dup)
757 return NULL;
758
759 lxc_iterate_parts (tok, dup, sep)
760 must_append_controller(klist, nlist, &aret, tok);
761 }
762 *p2 = ' ';
763
764 return aret;
765 }
766
767 static char **cg_unified_make_empty_controller(void)
768 {
769 int newentry;
770 char **aret = NULL;
771
772 newentry = append_null_to_list((void ***)&aret);
773 aret[newentry] = NULL;
774 return aret;
775 }
776
777 static char **cg_unified_get_controllers(const char *file)
778 {
779 __do_free char *buf = NULL;
780 char *tok;
781 char *sep = " \t\n";
782 char **aret = NULL;
783
784 buf = read_file(file);
785 if (!buf)
786 return NULL;
787
788 lxc_iterate_parts(tok, buf, sep) {
789 int newentry;
790 char *copy;
791
792 newentry = append_null_to_list((void ***)&aret);
793 copy = must_copy_string(tok);
794 aret[newentry] = copy;
795 }
796
797 return aret;
798 }
799
800 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
801 char *container_base_path, int type)
802 {
803 struct hierarchy *new;
804 int newentry;
805
806 new = must_realloc(NULL, sizeof(*new));
807 new->controllers = clist;
808 new->mountpoint = mountpoint;
809 new->container_base_path = container_base_path;
810 new->container_full_path = NULL;
811 new->monitor_full_path = NULL;
812 new->version = type;
813 new->cgroup2_chown = NULL;
814
815 newentry = append_null_to_list((void ***)h);
816 (*h)[newentry] = new;
817 return new;
818 }
819
820 /* Get a copy of the mountpoint from @line, which is a line from
821 * /proc/self/mountinfo.
822 */
823 static char *cg_hybrid_get_mountpoint(char *line)
824 {
825 int i;
826 size_t len;
827 char *p2;
828 char *p = line, *sret = NULL;
829
830 for (i = 0; i < 4; i++) {
831 p = strchr(p, ' ');
832 if (!p)
833 return NULL;
834 p++;
835 }
836
837 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
838 return NULL;
839
840 p2 = strchr(p + 15, ' ');
841 if (!p2)
842 return NULL;
843 *p2 = '\0';
844
845 len = strlen(p);
846 sret = must_realloc(NULL, len + 1);
847 memcpy(sret, p, len);
848 sret[len] = '\0';
849 return sret;
850 }
851
852 /* Given a multi-line string, return a null-terminated copy of the current line. */
853 static char *copy_to_eol(char *p)
854 {
855 char *p2 = strchr(p, '\n'), *sret;
856 size_t len;
857
858 if (!p2)
859 return NULL;
860
861 len = p2 - p;
862 sret = must_realloc(NULL, len + 1);
863 memcpy(sret, p, len);
864 sret[len] = '\0';
865 return sret;
866 }
867
868 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
869 * /proc/self/cgroup file. Check whether controller c is present.
870 */
871 static bool controller_in_clist(char *cgline, char *c)
872 {
873 __do_free char *tmp = NULL;
874 char *tok, *eol;
875 size_t len;
876
877 eol = strchr(cgline, ':');
878 if (!eol)
879 return false;
880
881 len = eol - cgline;
882 tmp = must_realloc(NULL, len + 1);
883 memcpy(tmp, cgline, len);
884 tmp[len] = '\0';
885
886 lxc_iterate_parts(tok, tmp, ",")
887 if (strcmp(tok, c) == 0)
888 return true;
889
890 return false;
891 }
892
893 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
894 * @controller.
895 */
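/* Illustrative /proc/<pid>/cgroup excerpt (hierarchy-ID:controller-list:path):
 *
 *	4:memory:/lxc/c1
 *	2:cpu,cpuacct:/lxc/c1
 *	0::/lxc/c1
 *
 * The cgroup v2 entry has hierarchy ID 0 and an empty controller list.
 */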
896 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
897 int type)
898 {
899 char *p = basecginfo;
900
901 for (;;) {
902 bool is_cgv2_base_cgroup = false;
903
904 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
905 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
906 is_cgv2_base_cgroup = true;
907
908 p = strchr(p, ':');
909 if (!p)
910 return NULL;
911 p++;
912
913 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
914 p = strchr(p, ':');
915 if (!p)
916 return NULL;
917 p++;
918 return copy_to_eol(p);
919 }
920
921 p = strchr(p, '\n');
922 if (!p)
923 return NULL;
924 p++;
925 }
926 }
927
928 static void must_append_string(char ***list, char *entry)
929 {
930 int newentry;
931 char *copy;
932
933 newentry = append_null_to_list((void ***)list);
934 copy = must_copy_string(entry);
935 (*list)[newentry] = copy;
936 }
937
938 static int get_existing_subsystems(char ***klist, char ***nlist)
939 {
940 __do_free char *line = NULL;
941 __do_fclose FILE *f = NULL;
942 size_t len = 0;
943
944 f = fopen("/proc/self/cgroup", "r");
945 if (!f)
946 return -1;
947
948 while (getline(&line, &len, f) != -1) {
949 char *p, *p2, *tok;
950 p = strchr(line, ':');
951 if (!p)
952 continue;
953 p++;
954 p2 = strchr(p, ':');
955 if (!p2)
956 continue;
957 *p2 = '\0';
958
959 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
960 * contains an entry of the form:
961 *
962 * 0::/some/path
963 *
964 * In this case we use "cgroup2" as controller name.
965 */
966 if ((p2 - p) == 0) {
967 must_append_string(klist, "cgroup2");
968 continue;
969 }
970
971 lxc_iterate_parts(tok, p, ",") {
972 if (strncmp(tok, "name=", 5) == 0)
973 must_append_string(nlist, tok);
974 else
975 must_append_string(klist, tok);
976 }
977 }
978
979 return 0;
980 }
981
982 static void trim(char *s)
983 {
984 size_t len;
985
986 len = strlen(s);
987 while ((len > 1) && (s[len - 1] == '\n'))
988 s[--len] = '\0';
989 }
990
991 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
992 {
993 int i;
994 struct hierarchy **it;
995
996 if (!ops->hierarchies) {
997 TRACE(" No hierarchies found");
998 return;
999 }
1000
1001 TRACE(" Hierarchies:");
1002 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
1003 int j;
1004 char **cit;
1005
1006 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
1007 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1008 TRACE(" controllers:");
1009 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1010 TRACE(" %d: %s", j, *cit);
1011 }
1012 }
1013
1014 static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
1015 char **nlist)
1016 {
1017 int k;
1018 char **it;
1019
1020 TRACE("basecginfo is:");
1021 TRACE("%s", basecginfo);
1022
1023 for (k = 0, it = klist; it && *it; it++, k++)
1024 TRACE("kernel subsystem %d: %s", k, *it);
1025
1026 for (k = 0, it = nlist; it && *it; it++, k++)
1027 TRACE("named subsystem %d: %s", k, *it);
1028 }
1029
1030 static int cgroup_rmdir(struct hierarchy **hierarchies,
1031 const char *container_cgroup)
1032 {
1033 int i;
1034
1035 if (!container_cgroup || !hierarchies)
1036 return 0;
1037
1038 for (i = 0; hierarchies[i]; i++) {
1039 int ret;
1040 struct hierarchy *h = hierarchies[i];
1041
1042 if (!h->container_full_path)
1043 continue;
1044
1045 ret = recursive_destroy(h->container_full_path);
1046 if (ret < 0)
1047 WARN("Failed to destroy \"%s\"", h->container_full_path);
1048
1049 free(h->container_full_path);
1050 h->container_full_path = NULL;
1051 }
1052
1053 return 0;
1054 }
1055
1056 struct generic_userns_exec_data {
1057 struct hierarchy **hierarchies;
1058 const char *container_cgroup;
1059 struct lxc_conf *conf;
1060 uid_t origuid; /* target uid in parent namespace */
1061 char *path;
1062 };
1063
1064 static int cgroup_rmdir_wrapper(void *data)
1065 {
1066 int ret;
1067 struct generic_userns_exec_data *arg = data;
1068 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1069 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1070
1071 ret = setresgid(nsgid, nsgid, nsgid);
1072 if (ret < 0) {
1073 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1074 (int)nsgid, (int)nsgid);
1075 return -1;
1076 }
1077
1078 ret = setresuid(nsuid, nsuid, nsuid);
1079 if (ret < 0) {
1080 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1081 (int)nsuid, (int)nsuid);
1082 return -1;
1083 }
1084
1085 ret = setgroups(0, NULL);
1086 if (ret < 0 && errno != EPERM) {
1087 SYSERROR("Failed to setgroups(0, NULL)");
1088 return -1;
1089 }
1090
1091 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1092 }
1093
1094 __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
1095 struct lxc_handler *handler)
1096 {
1097 int ret;
1098 struct generic_userns_exec_data wrap;
1099
1100 if (!ops->hierarchies)
1101 return;
1102
1103 wrap.origuid = 0;
1104 wrap.container_cgroup = ops->container_cgroup;
1105 wrap.hierarchies = ops->hierarchies;
1106 wrap.conf = handler->conf;
1107
1108 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1109 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1110 "cgroup_rmdir_wrapper");
1111 else
1112 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1113 if (ret < 0) {
1114 WARN("Failed to destroy cgroups");
1115 return;
1116 }
1117 }
1118
1119 __cgfsng_ops static void cgfsng_monitor_destroy(struct cgroup_ops *ops,
1120 struct lxc_handler *handler)
1121 {
1122 int len;
1123 struct lxc_conf *conf = handler->conf;
1124 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1125
1126 if (!ops->hierarchies)
1127 return;
1128
1129 len = snprintf(pidstr, sizeof(pidstr), "%d", handler->monitor_pid);
1130 if (len < 0 || (size_t)len >= sizeof(pidstr))
1131 return;
1132
1133 for (int i = 0; ops->hierarchies[i]; i++) {
1134 __do_free char *pivot_path = NULL;
1135 int ret;
1136 char *chop;
1137 char pivot_cgroup[] = PIVOT_CGROUP;
1138 struct hierarchy *h = ops->hierarchies[i];
1139
1140 if (!h->monitor_full_path)
1141 continue;
1142
1143 if (conf && conf->cgroup_meta.dir)
1144 pivot_path = must_make_path(h->mountpoint,
1145 h->container_base_path,
1146 conf->cgroup_meta.dir,
1147 PIVOT_CGROUP,
1148 "cgroup.procs", NULL);
1149 else
1150 pivot_path = must_make_path(h->mountpoint,
1151 h->container_base_path,
1152 PIVOT_CGROUP,
1153 "cgroup.procs", NULL);
1154
1155 chop = strrchr(pivot_path, '/');
1156 if (chop)
1157 *chop = '\0';
1158
1159 /*
1160 * Make sure not to pass in the read-only string literal PIVOT_CGROUP
1161 * here.
1162 */
1163 if (!cg_legacy_handle_cpuset_hierarchy(h, pivot_cgroup)) {
1164 WARN("Failed to handle legacy cpuset controller");
1165 continue;
1166 }
1167
1168 ret = mkdir_p(pivot_path, 0755);
1169 if (ret < 0 && errno != EEXIST) {
1170 SYSWARN("Failed to create cgroup \"%s\"\n", pivot_path);
1171 continue;
1172 }
1173
1174 if (chop)
1175 *chop = '/';
1176
1177 /* Move ourselves into the pivot cgroup to delete our own
1178 * cgroup.
1179 */
1180 ret = lxc_write_to_file(pivot_path, pidstr, len, false, 0666);
1181 if (ret != 0) {
1182 SYSWARN("Failed to move monitor %s to \"%s\"\n", pidstr, pivot_path);
1183 continue;
1184 }
1185
1186 ret = recursive_destroy(h->monitor_full_path);
1187 if (ret < 0)
1188 WARN("Failed to destroy \"%s\"", h->monitor_full_path);
1189 }
1190 }
1191
1192 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1193 {
1194 __do_free char *add_controllers = NULL, *cgroup = NULL;
1195 size_t i, parts_len;
1196 char **it;
1197 size_t full_len = 0;
1198 char **parts = NULL;
1199 bool bret = false;
1200
1201 if (h->version != CGROUP2_SUPER_MAGIC)
1202 return true;
1203
1204 if (!h->controllers)
1205 return true;
1206
1207 /* For now we simply enable all controllers that we have detected by
1208 * creating a string like "+memory +pids +cpu +io".
1209 * TODO: In the near future we might want to support "-<controller>"
1210 * etc. but whether supporting semantics like this make sense will need
1211 * some thinking.
1212 */
1213 for (it = h->controllers; it && *it; it++) {
1214 full_len += strlen(*it) + 2;
1215 add_controllers = must_realloc(add_controllers, full_len + 1);
1216
1217 if (h->controllers[0] == *it)
1218 add_controllers[0] = '\0';
1219
1220 (void)strlcat(add_controllers, "+", full_len + 1);
1221 (void)strlcat(add_controllers, *it, full_len + 1);
1222
1223 if ((it + 1) && *(it + 1))
1224 (void)strlcat(add_controllers, " ", full_len + 1);
1225 }
1226
1227 parts = lxc_string_split(cgname, '/');
1228 if (!parts)
1229 goto on_error;
1230
1231 parts_len = lxc_array_len((void **)parts);
1232 if (parts_len > 0)
1233 parts_len--;
1234
1235 cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
1236 for (i = 0; i < parts_len; i++) {
1237 int ret;
1238 __do_free char *target = NULL;
1239
1240 cgroup = must_append_path(cgroup, parts[i], NULL);
1241 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1242 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1243 if (ret < 0) {
1244 SYSERROR("Could not enable \"%s\" controllers in the "
1245 "unified cgroup \"%s\"", add_controllers, cgroup);
1246 goto on_error;
1247 }
1248 }
1249
1250 bret = true;
1251
1252 on_error:
1253 lxc_free_array((void **)parts, free);
1254 return bret;
1255 }
1256
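/* Create @dir and any missing parent directories. An existing directory is
 * only treated as an error (EEXIST) for the final path component; existing
 * parents are silently accepted.
 */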
1257 static int mkdir_eexist_on_last(const char *dir, mode_t mode)
1258 {
1259 const char *tmp = dir;
1260 const char *orig = dir;
1261 size_t orig_len;
1262
1263 orig_len = strlen(dir);
1264 do {
1265 __do_free char *makeme;
1266 int ret;
1267 size_t cur_len;
1268
1269 dir = tmp + strspn(tmp, "/");
1270 tmp = dir + strcspn(dir, "/");
1271
1272 errno = ENOMEM;
1273 cur_len = dir - orig;
1274 makeme = strndup(orig, cur_len);
1275 if (!makeme)
1276 return -1;
1277
1278 ret = mkdir(makeme, mode);
1279 if (ret < 0) {
1280 if ((errno != EEXIST) || (orig_len == cur_len)) {
1281 SYSERROR("Failed to create directory \"%s\"", makeme);
1282 return -1;
1283 }
1284 }
1285 } while (tmp != dir);
1286
1287 return 0;
1288 }
1289
1290 static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1291 {
1292 int ret;
1293
1294 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1295 ERROR("Failed to handle legacy cpuset controller");
1296 return false;
1297 }
1298
1299 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1300 ret = mkdir_eexist_on_last(h->monitor_full_path, 0755);
1301 if (ret < 0) {
1302 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
1303 return false;
1304 }
1305
1306 return cg_unified_create_cgroup(h, cgname);
1307 }
1308
1309 static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1310 {
1311 int ret;
1312
1313 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1314 ERROR("Failed to handle legacy cpuset controller");
1315 return false;
1316 }
1317
1318 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1319 ret = mkdir_eexist_on_last(h->container_full_path, 0755);
1320 if (ret < 0) {
1321 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
1322 return false;
1323 }
1324
1325 return cg_unified_create_cgroup(h, cgname);
1326 }
1327
1328 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
1329 {
1330 int ret;
1331 char *full_path;
1332
1333 if (monitor)
1334 full_path = h->monitor_full_path;
1335 else
1336 full_path = h->container_full_path;
1337
1338 ret = rmdir(full_path);
1339 if (ret < 0)
1340 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1341
1342 free(full_path);
1343
1344 if (monitor)
1345 h->monitor_full_path = NULL;
1346 else
1347 h->container_full_path = NULL;
1348 }
1349
1350 __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
1351 struct lxc_handler *handler)
1352 {
1353 __do_free char *monitor_cgroup = NULL;
1354 char *offset, *tmp;
1355 int i, idx = 0;
1356 size_t len;
1357 struct lxc_conf *conf = handler->conf;
1358
1359 if (!conf)
1360 return false;
1361
1362 if (!ops->hierarchies)
1363 return true;
1364
1365 if (conf->cgroup_meta.dir)
1366 tmp = lxc_string_join("/",
1367 (const char *[]){conf->cgroup_meta.dir,
1368 ops->monitor_pattern,
1369 handler->name, NULL},
1370 false);
1371 else
1372 tmp = must_make_path(ops->monitor_pattern, handler->name, NULL);
1373 if (!tmp)
1374 return false;
1375
1376 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1377 monitor_cgroup = must_realloc(tmp, len);
1378 offset = monitor_cgroup + len - 5;
1379 *offset = 0;
1380
1381 do {
1382 if (idx) {
1383 int ret = snprintf(offset, 5, "-%d", idx);
1384 if (ret < 0 || (size_t)ret >= 5)
1385 return false;
1386 }
1387
1388 for (i = 0; ops->hierarchies[i]; i++) {
1389 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i],
1390 monitor_cgroup)) {
1391 ERROR("Failed to create cgroup \"%s\"",
1392 ops->hierarchies[i]->monitor_full_path);
1393 for (int j = 0; j < i; j++)
1394 remove_path_for_hierarchy(ops->hierarchies[j],
1395 monitor_cgroup,
1396 true);
1397
1398 idx++;
1399 break;
1400 }
1401 }
1402 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
1403
1404 if (idx == 1000)
1405 return false;
1406
1407 INFO("The monitor process uses \"%s\" as cgroup", monitor_cgroup);
1408 return true;
1409 }
1410
1411 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1412 * next cgroup_pattern-1, -2, ..., -999.
1413 */
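/* For example (sketch): with a cgroup pattern of "lxc/%n" and a container
 * named "c1", the candidates tried are "lxc/c1", then "lxc/c1-1", "lxc/c1-2",
 * ... up to "lxc/c1-999".
 */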
1414 __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
1415 struct lxc_handler *handler)
1416 {
1417 __do_free char *container_cgroup = NULL, *tmp = NULL;
1418 int i;
1419 size_t len;
1420 char *offset;
1421 int idx = 0;
1422 struct lxc_conf *conf = handler->conf;
1423
1424 if (ops->container_cgroup)
1425 return false;
1426
1427 if (!conf)
1428 return false;
1429
1430 if (!ops->hierarchies)
1431 return true;
1432
1433 if (conf->cgroup_meta.dir)
1434 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1435 else
1436 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1437 if (!tmp) {
1438 ERROR("Failed expanding cgroup name pattern");
1439 return false;
1440 }
1441
1442 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1443 container_cgroup = must_realloc(NULL, len);
1444 (void)strlcpy(container_cgroup, tmp, len);
1445 offset = container_cgroup + len - 5;
1446
1447 do {
1448 if (idx) {
1449 int ret = snprintf(offset, 5, "-%d", idx);
1450 if (ret < 0 || (size_t)ret >= 5)
1451 return false;
1452 }
1453
1454 for (i = 0; ops->hierarchies[i]; i++) {
1455 if (!container_create_path_for_hierarchy(ops->hierarchies[i],
1456 container_cgroup)) {
1457 ERROR("Failed to create cgroup \"%s\"",
1458 ops->hierarchies[i]->container_full_path);
1459 for (int j = 0; j < i; j++)
1460 remove_path_for_hierarchy(ops->hierarchies[j],
1461 container_cgroup,
1462 false);
1463 idx++;
1464 break;
1465 }
1466 }
1467 } while (ops->hierarchies[i] && idx > 0 && idx < 1000);
1468
1469 if (idx == 1000)
1470 return false;
1471
1472 INFO("The container process uses \"%s\" as cgroup", container_cgroup);
1473 ops->container_cgroup = move_ptr(container_cgroup);
1474 return true;
1475 }
1476
1477 __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
1478 bool monitor)
1479 {
1480 int len;
1481 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
1482
1483 if (!ops->hierarchies)
1484 return true;
1485
1486 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
1487 if (len < 0 || (size_t)len >= sizeof(pidstr))
1488 return false;
1489
1490 for (int i = 0; ops->hierarchies[i]; i++) {
1491 int ret;
1492 __do_free char *path = NULL;
1493
1494 if (monitor)
1495 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1496 "cgroup.procs", NULL);
1497 else
1498 path = must_make_path(ops->hierarchies[i]->container_full_path,
1499 "cgroup.procs", NULL);
1500 ret = lxc_write_to_file(path, pidstr, len, false, 0666);
1501 if (ret != 0) {
1502 SYSERROR("Failed to enter cgroup \"%s\"", path);
1503 return false;
1504 }
1505 }
1506
1507 return true;
1508 }
1509
1510 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
1511 {
1512 return __do_cgroup_enter(ops, pid, true);
1513 }
1514
1515 static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
1516 {
1517 return __do_cgroup_enter(ops, pid, false);
1518 }
1519
1520 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
1521 mode_t chmod_mode)
1522 {
1523 int ret;
1524
1525 ret = chown(path, chown_uid, chown_gid);
1526 if (ret < 0) {
1527 SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
1528 return -1;
1529 }
1530
1531 ret = chmod(path, chmod_mode);
1532 if (ret < 0) {
1533 SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
1534 return -1;
1535 }
1536
1537 return 0;
1538 }
1539
1540 /* chgrp the container cgroups to container group. We leave
1541 * the container owner as cgroup owner. So we must make the
1542 * directories 775 so that the container can create sub-cgroups.
1543 *
1544 * Also chown the tasks and cgroup.procs files. Those may not
1545 * exist depending on kernel version.
1546 */
1547 static int chown_cgroup_wrapper(void *data)
1548 {
1549 int i, ret;
1550 uid_t destuid;
1551 struct generic_userns_exec_data *arg = data;
1552 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1553 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1554
1555 ret = setresgid(nsgid, nsgid, nsgid);
1556 if (ret < 0) {
1557 SYSERROR("Failed to setresgid(%d, %d, %d)",
1558 (int)nsgid, (int)nsgid, (int)nsgid);
1559 return -1;
1560 }
1561
1562 ret = setresuid(nsuid, nsuid, nsuid);
1563 if (ret < 0) {
1564 SYSERROR("Failed to setresuid(%d, %d, %d)",
1565 (int)nsuid, (int)nsuid, (int)nsuid);
1566 return -1;
1567 }
1568
1569 ret = setgroups(0, NULL);
1570 if (ret < 0 && errno != EPERM) {
1571 SYSERROR("Failed to setgroups(0, NULL)");
1572 return -1;
1573 }
1574
1575 destuid = get_ns_uid(arg->origuid);
1576 if (destuid == LXC_INVALID_UID)
1577 destuid = 0;
1578
1579 for (i = 0; arg->hierarchies[i]; i++) {
1580 __do_free char *fullpath = NULL;
1581 char *path = arg->hierarchies[i]->container_full_path;
1582
1583 ret = chowmod(path, destuid, nsgid, 0775);
1584 if (ret < 0)
1585 return -1;
1586
1587 /* Failures to chown() these are inconvenient but not
1588 * detrimental. We leave these owned by the container launcher,
1589 * so that container root can write to the files to attach. We
1590 * chmod() them 664 so that container systemd can write to the
1591 * files (which systemd in wily insists on doing).
1592 */
1593
1594 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1595 fullpath = must_make_path(path, "tasks", NULL);
1596 (void)chowmod(fullpath, destuid, nsgid, 0664);
1597 }
1598
1599 fullpath = must_make_path(path, "cgroup.procs", NULL);
1600 (void)chowmod(fullpath, destuid, nsgid, 0664);
1601
1602 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1603 continue;
1604
1605 for (char **p = arg->hierarchies[i]->cgroup2_chown; p && *p; p++) {
1606 fullpath = must_make_path(path, *p, NULL);
1607 (void)chowmod(fullpath, destuid, nsgid, 0664);
1608 }
1609 }
1610
1611 return 0;
1612 }
1613
1614 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1615 struct lxc_conf *conf)
1616 {
1617 struct generic_userns_exec_data wrap;
1618
1619 if (lxc_list_empty(&conf->id_map))
1620 return true;
1621
1622 if (!ops->hierarchies)
1623 return true;
1624
1625 wrap.origuid = geteuid();
1626 wrap.path = NULL;
1627 wrap.hierarchies = ops->hierarchies;
1628 wrap.conf = conf;
1629
1630 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1631 "chown_cgroup_wrapper") < 0) {
1632 ERROR("Error requesting cgroup chown in new user namespace");
1633 return false;
1634 }
1635
1636 return true;
1637 }
1638
1639 /* cgroup-full:* is done, no need to create subdirs */
1640 static bool cg_mount_needs_subdirs(int type)
1641 {
1642 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1643 return false;
1644
1645 return true;
1646 }
1647
1648 /* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
1649 * remount controller ro if needed and bindmount the cgroupfs onto
1650 * controller/the/cg/path.
1651 */
1652 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1653 char *controllerpath, char *cgpath,
1654 const char *container_cgroup)
1655 {
1656 __do_free char *sourcepath = NULL;
1657 int ret, remount_flags;
1658 int flags = MS_BIND;
1659
1660 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1661 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1662 if (ret < 0) {
1663 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1664 controllerpath, controllerpath);
1665 return -1;
1666 }
1667
1668 remount_flags = add_required_remount_flags(controllerpath,
1669 controllerpath,
1670 flags | MS_REMOUNT);
1671 ret = mount(controllerpath, controllerpath, "cgroup",
1672 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1673 NULL);
1674 if (ret < 0) {
1675 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1676 return -1;
1677 }
1678
1679 INFO("Remounted %s read-only", controllerpath);
1680 }
1681
1682 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
1683 container_cgroup, NULL);
1684 if (type == LXC_AUTO_CGROUP_RO)
1685 flags |= MS_RDONLY;
1686
1687 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1688 if (ret < 0) {
1689 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1690 return -1;
1691 }
1692 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1693
1694 if (flags & MS_RDONLY) {
1695 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1696 flags | MS_REMOUNT);
1697 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1698 if (ret < 0) {
1699 SYSERROR("Failed to remount \"%s\" ro", cgpath);
1700 return -1;
1701 }
1702 INFO("Remounted %s read-only", cgpath);
1703 }
1704
1705 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1706 return 0;
1707 }
1708
1709 /* __cg_mount_direct
1710 *
1711 * Mount cgroup hierarchies directly without using bind-mounts. The main
1712 * use cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1713 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1714 */
1715 static int __cg_mount_direct(int type, struct hierarchy *h,
1716 const char *controllerpath)
1717 {
1718 int ret;
1719 __do_free char *controllers = NULL;
1720 char *fstype = "cgroup2";
1721 unsigned long flags = 0;
1722
1723 flags |= MS_NOSUID;
1724 flags |= MS_NOEXEC;
1725 flags |= MS_NODEV;
1726 flags |= MS_RELATIME;
1727
1728 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1729 flags |= MS_RDONLY;
1730
1731 if (h->version != CGROUP2_SUPER_MAGIC) {
1732 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1733 if (!controllers)
1734 return -ENOMEM;
1735 fstype = "cgroup";
1736 }
1737
1738 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1739 if (ret < 0) {
1740 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1741 return -1;
1742 }
1743
1744 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1745 return 0;
1746 }
1747
1748 static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
1749 const char *controllerpath)
1750 {
1751 return __cg_mount_direct(type, h, controllerpath);
1752 }
1753
1754 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1755 const char *controllerpath)
1756 {
1757 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1758 return 0;
1759
1760 return __cg_mount_direct(type, h, controllerpath);
1761 }
1762
1763 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1764 struct lxc_handler *handler,
1765 const char *root, int type)
1766 {
1767 __do_free char *tmpfspath = NULL;
1768 int i, ret;
1769 bool has_cgns = false, retval = false, wants_force_mount = false;
1770
1771 if (!ops->hierarchies)
1772 return true;
1773
1774 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1775 return true;
1776
1777 if (type & LXC_AUTO_CGROUP_FORCE) {
1778 type &= ~LXC_AUTO_CGROUP_FORCE;
1779 wants_force_mount = true;
1780 }
1781
1782 if (!wants_force_mount){
1783 if (!lxc_list_empty(&handler->conf->keepcaps))
1784 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1785 else
1786 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1787 }
1788
1789 has_cgns = cgns_supported();
1790 if (has_cgns && !wants_force_mount)
1791 return true;
1792
1793 if (type == LXC_AUTO_CGROUP_NOSPEC)
1794 type = LXC_AUTO_CGROUP_MIXED;
1795 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1796 type = LXC_AUTO_CGROUP_FULL_MIXED;
1797
1798 /* Mount tmpfs */
1799 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1800 ret = safe_mount(NULL, tmpfspath, "tmpfs",
1801 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1802 "size=10240k,mode=755", root);
1803 if (ret < 0)
1804 goto on_error;
1805
1806 for (i = 0; ops->hierarchies[i]; i++) {
1807 __do_free char *controllerpath = NULL, *path2 = NULL;
1808 struct hierarchy *h = ops->hierarchies[i];
1809 char *controller = strrchr(h->mountpoint, '/');
1810
1811 if (!controller)
1812 continue;
1813 controller++;
1814
1815 controllerpath = must_make_path(tmpfspath, controller, NULL);
1816 if (dir_exists(controllerpath))
1817 continue;
1818
1819 ret = mkdir(controllerpath, 0755);
1820 if (ret < 0) {
1821 SYSERROR("Error creating cgroup path: %s", controllerpath);
1822 goto on_error;
1823 }
1824
1825 if (has_cgns && wants_force_mount) {
1826 /* If cgroup namespaces are supported but the container
1827 * will not have CAP_SYS_ADMIN after it has started we
1828 * need to mount the cgroups manually.
1829 */
1830 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1831 if (ret < 0)
1832 goto on_error;
1833
1834 continue;
1835 }
1836
1837 ret = cg_mount_cgroup_full(type, h, controllerpath);
1838 if (ret < 0)
1839 goto on_error;
1840
1841 if (!cg_mount_needs_subdirs(type))
1842 continue;
1843
1844 path2 = must_make_path(controllerpath, h->container_base_path,
1845 ops->container_cgroup, NULL);
1846 ret = mkdir_p(path2, 0755);
1847 if (ret < 0)
1848 goto on_error;
1849
1850 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1851 path2, ops->container_cgroup);
1852 if (ret < 0)
1853 goto on_error;
1854 }
1855 retval = true;
1856
1857 on_error:
1858 return retval;
1859 }
1860
1861 static int recursive_count_nrtasks(char *dirname)
1862 {
1863 __do_free char *path = NULL;
1864 __do_closedir DIR *dir = NULL;
1865 struct dirent *direntp;
1866 int count = 0, ret;
1867
1868 dir = opendir(dirname);
1869 if (!dir)
1870 return 0;
1871
1872 while ((direntp = readdir(dir))) {
1873 struct stat mystat;
1874
1875 if (!strcmp(direntp->d_name, ".") ||
1876 !strcmp(direntp->d_name, ".."))
1877 continue;
1878
1879 path = must_make_path(dirname, direntp->d_name, NULL);
1880
1881 if (lstat(path, &mystat))
1882 continue;
1883
1884 if (!S_ISDIR(mystat.st_mode))
1885 continue;
1886
1887 count += recursive_count_nrtasks(path);
1888 }
1889
1890 path = must_make_path(dirname, "cgroup.procs", NULL);
1891 ret = lxc_count_file_lines(path);
1892 if (ret != -1)
1893 count += ret;
1894
1895 return count;
1896 }
1897
1898 __cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
1899 {
1900 __do_free char *path = NULL;
1901 int count;
1902
1903 if (!ops->container_cgroup || !ops->hierarchies)
1904 return -1;
1905
1906 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
1907 count = recursive_count_nrtasks(path);
1908 return count;
1909 }
1910
1911 /* Only root needs to escape to the cgroup of its init. */
1912 __cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
1913 struct lxc_conf *conf)
1914 {
1915 int i;
1916
1917 if (conf->cgroup_meta.relative || geteuid() || !ops->hierarchies)
1918 return true;
1919
1920 for (i = 0; ops->hierarchies[i]; i++) {
1921 int ret;
1922 __do_free char *fullpath = NULL;
1923
1924 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1925 ops->hierarchies[i]->container_base_path,
1926 "cgroup.procs", NULL);
1927 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1928 if (ret != 0) {
1929 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1930 return false;
1931 }
1932 }
1933
1934 return true;
1935 }
1936
1937 __cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1938 {
1939 int i = 0;
1940
1941 if (!ops->hierarchies)
1942 return 0;
1943
1944 for (; ops->hierarchies[i]; i++)
1945 ;
1946
1947 return i;
1948 }
1949
1950 __cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1951 {
1952 int i;
1953
1954 if (!ops->hierarchies)
1955 return false;
1956
1957 /* sanity check n */
1958 for (i = 0; i < n; i++)
1959 if (!ops->hierarchies[i])
1960 return false;
1961
1962 *out = ops->hierarchies[i]->controllers;
1963
1964 return true;
1965 }
1966
1967 #define THAWED "THAWED"
1968 #define THAWED_LEN (strlen(THAWED))
1969
1970 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1971 * to be adapted.
1972 */
1973 __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1974 {
1975 int ret;
1976 __do_free char *fullpath = NULL;
1977 struct hierarchy *h;
1978
1979 h = get_hierarchy(ops, "freezer");
1980 if (!h)
1981 return false;
1982
1983 fullpath = must_make_path(h->container_full_path, "freezer.state", NULL);
1984 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1985 if (ret < 0)
1986 return false;
1987
1988 return true;
1989 }
1990
1991 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1992 const char *controller)
1993 {
1994 struct hierarchy *h;
1995
1996 h = get_hierarchy(ops, controller);
1997 if (!h) {
1998 WARN("Failed to find hierarchy for controller \"%s\"",
1999 controller ? controller : "(null)");
2000 return NULL;
2001 }
2002
2003 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
2004 }
2005
2006 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
2007 * which must be freed by the caller.
2008 */
2009 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
2010 const char *inpath,
2011 const char *filename)
2012 {
2013 return must_make_path(h->mountpoint, inpath, filename, NULL);
2014 }
2015
2016 /* Technically, we're always at a delegation boundary here (This is especially
2017 * true when cgroup namespaces are available.). The reasoning is that in order
2018 * for us to have been able to start a container in the first place the root
2019 * cgroup must have been a leaf node. Now, either the container's init system
2020 * has populated the cgroup and kept it as a leaf node or it has created
2021 * subtrees. In the former case we will simply attach to the leaf node we
2022 * created when we started the container; in the latter case we create our own
2023 * cgroup for the attaching process.
2024 */
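/* For example (sketch): if writing the pid into the container cgroup's own
 * cgroup.procs fails with EBUSY because that cgroup has become an inner node,
 * we retry in a leaf we create ourselves: <container-cgroup>/lxc, then
 * <container-cgroup>/lxc-1, /lxc-2, ... as needed.
 */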
2025 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
2026 const char *lxcpath, const char *pidstr,
2027 size_t pidstr_len, const char *controller)
2028 {
2029 __do_free char *base_path = NULL, *container_cgroup = NULL,
2030 *full_path = NULL;
2031 int ret;
2032 size_t len;
2033 int fret = -1, idx = 0;
2034
2035 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2036 /* not running */
2037 if (!container_cgroup)
2038 return 0;
2039
2040 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
2041 full_path = must_make_path(base_path, "cgroup.procs", NULL);
2042 /* cgroup is populated */
2043 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
2044 if (ret < 0 && errno != EBUSY)
2045 goto on_error;
2046
2047 if (ret == 0)
2048 goto on_success;
2049
2050 len = strlen(base_path) + STRLITERALLEN("/lxc-1000") +
2051 STRLITERALLEN("/cgroup.procs");
2052 full_path = must_realloc(NULL, len + 1);
2053 do {
2054 if (idx)
2055 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
2056 base_path, idx);
2057 else
2058 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
2059 if (ret < 0 || (size_t)ret >= len + 1)
2060 goto on_error;
2061
2062 ret = mkdir_p(full_path, 0755);
2063 if (ret < 0 && errno != EEXIST)
2064 goto on_error;
2065
2066 (void)strlcat(full_path, "/cgroup.procs", len + 1);
2067 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
2068 if (ret == 0)
2069 goto on_success;
2070
2071 /* this is a non-leaf node */
2072 if (errno != EBUSY)
2073 goto on_error;
2074
2075 idx++;
2076 } while (idx < 1000);
2077
2078 on_success:
2079 if (idx < 1000)
2080 fret = 0;
2081
2082 on_error:
2083 return fret;
2084 }
2085
2086 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
2087 const char *lxcpath, pid_t pid)
2088 {
2089 int i, len, ret;
2090 char pidstr[INTTYPE_TO_STRLEN(pid_t)];
2091
2092 if (!ops->hierarchies)
2093 return true;
2094
2095 len = snprintf(pidstr, sizeof(pidstr), "%d", pid);
2096 if (len < 0 || (size_t)len >= sizeof(pidstr))
2097 return false;
2098
2099 for (i = 0; ops->hierarchies[i]; i++) {
2100 __do_free char *fullpath = NULL, *path = NULL;
2101 struct hierarchy *h = ops->hierarchies[i];
2102
2103 if (h->version == CGROUP2_SUPER_MAGIC) {
2104 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2105 h->controllers[0]);
2106 if (ret < 0)
2107 return false;
2108
2109 continue;
2110 }
2111
2112 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2113 /* not running */
2114 if (!path)
2115 continue;
2116
2117 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2118 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2119 if (ret < 0) {
2120 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2121 return false;
2122 }
2123 }
2124
2125 return true;
2126 }
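/*
 * Illustrative sketch only (kept under #if 0, never compiled): attaching a
 * task through the ops vtable that cgfsng_ops_init() wires up below. The
 * container name and lxcpath are hypothetical.
 */
#if 0
static bool example_attach(struct cgroup_ops *ops, pid_t pid)
{
	/* Succeeds trivially when no writable hierarchies were found and
	 * fails on the first hierarchy the task cannot be moved into.
	 */
	return ops->attach(ops, "c1", "/var/lib/lxc", pid);
}
#endif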
2127
2128 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2129 * don't have a cgroup_data set up, so we ask the running container through the
2130 * commands API for the cgroup path.
2131 */
2132 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2133 char *value, size_t len, const char *name,
2134 const char *lxcpath)
2135 {
2136 __do_free char *path = NULL;
2137 __do_free char *controller = NULL;
2138 char *p;
2139 struct hierarchy *h;
2140 int ret = -1;
2141
2142 controller = must_copy_string(filename);
2143 p = strchr(controller, '.');
2144 if (p)
2145 *p = '\0';
2146
2147 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2148 /* not running */
2149 if (!path)
2150 return -1;
2151
2152 h = get_hierarchy(ops, controller);
2153 if (h) {
2154 __do_free char *fullpath = NULL;
2155
2156 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2157 ret = lxc_read_from_file(fullpath, value, len);
2158 }
2159
2160 return ret;
2161 }
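/*
 * Illustrative sketch only (kept under #if 0, never compiled): reading a
 * limit through the ops vtable. The controller is derived from the part of
 * the filename before the first '.', so "memory.limit_in_bytes" is looked
 * up in the "memory" hierarchy. Container name and lxcpath are hypothetical.
 */
#if 0
static void example_cgroup_get(struct cgroup_ops *ops)
{
	char value[4096];
	int ret;

	ret = ops->get(ops, "memory.limit_in_bytes", value, sizeof(value),
		       "c1", "/var/lib/lxc");
	if (ret >= 0 && (size_t)ret < sizeof(value)) {
		value[ret] = '\0';
		TRACE("memory.limit_in_bytes = %s", value);
	}
}
#endif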
2162
2163 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2164 * don't have a cgroup_data set up, so we ask the running container through the
2165 * commands API for the cgroup path.
2166 */
2167 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2168 const char *filename, const char *value,
2169 const char *name, const char *lxcpath)
2170 {
2171 __do_free char *path = NULL;
2172 __do_free char *controller = NULL;
2173 char *p;
2174 struct hierarchy *h;
2175 int ret = -1;
2176
2177 controller = must_copy_string(filename);
2178 p = strchr(controller, '.');
2179 if (p)
2180 *p = '\0';
2181
2182 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2183 /* not running */
2184 if (!path)
2185 return -1;
2186
2187 h = get_hierarchy(ops, controller);
2188 if (h) {
2189 __do_free char *fullpath = NULL;
2190
2191 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2192 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2193 }
2194
2195 return ret;
2196 }
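/*
 * Illustrative sketch only (kept under #if 0, never compiled): the
 * write-side counterpart, roughly what 'lxc-cgroup' does when it is given a
 * value. The container name, lxcpath and limit are hypothetical.
 */
#if 0
static void example_cgroup_set(struct cgroup_ops *ops)
{
	if (ops->set(ops, "memory.limit_in_bytes", "536870912",
		     "c1", "/var/lib/lxc") < 0)
		ERROR("Failed to set the memory limit");
}
#endif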
2197
2198 /* Take a devices cgroup line such as
2199 * /dev/foo rwx
2200 * and convert it to a valid
2201 * type major:minor mode
2202 * line. Return <0 on error. @dest is a preallocated buffer long enough to
2203 * hold the output.
2204 */
2205 static int convert_devpath(const char *invalue, char *dest)
2206 {
2207 __do_free char *path = NULL;
2208 int n_parts;
2209 char *p, type;
2210 unsigned long minor, major;
2211 struct stat sb;
2212 int ret = -EINVAL;
2213 char *mode = NULL;
2214
2215 path = must_copy_string(invalue);
2216
2217 /* Read path followed by mode. Ignore any trailing text.
2218 * A ' # comment' would be legal. Technically other text is not
2219 * legal, we could check for that if we cared to.
2220 */
2221 for (n_parts = 1, p = path; *p; p++) {
2222 if (*p != ' ')
2223 continue;
2224 *p = '\0';
2225
2226 if (n_parts != 1)
2227 break;
2228 p++;
2229 n_parts++;
2230
2231 while (*p == ' ')
2232 p++;
2233
2234 mode = p;
2235
2236 if (*p == '\0')
2237 goto out;
2238 }
2239
2240 if (n_parts == 1)
2241 goto out;
2242
2243 ret = stat(path, &sb);
2244 if (ret < 0)
2245 goto out;
2246
2247 mode_t m = sb.st_mode & S_IFMT;
2248 switch (m) {
2249 case S_IFBLK:
2250 type = 'b';
2251 break;
2252 case S_IFCHR:
2253 type = 'c';
2254 break;
2255 default:
2256 ERROR("Unsupported device type %i for \"%s\"", (int)m, path);
2257 ret = -EINVAL;
2258 goto out;
2259 }
2260
2261 major = MAJOR(sb.st_rdev);
2262 minor = MINOR(sb.st_rdev);
2263 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2264 if (ret < 0 || ret >= 50) {
2265 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2266 "chars)", type, major, minor, mode);
2267 ret = -ENAMETOOLONG;
2268 goto out;
2269 }
2270 ret = 0;
2271
2272 out:
2273 return ret;
2274 }
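/*
 * Illustrative sketch only (kept under #if 0, never compiled):
 * convert_devpath() turns a path-based rule into the numeric form the
 * legacy devices controller expects. The resulting major:minor pair depends
 * on the running system.
 */
#if 0
static void example_convert_devpath(void)
{
	/* Same size as the caller's buffer in cg_legacy_set_data(). */
	char converted[50];

	/* E.g. "/dev/fuse rwm" typically becomes "c 10:229 rwm". */
	if (convert_devpath("/dev/fuse rwm", converted) == 0)
		TRACE("devices.allow value: \"%s\"", converted);
}
#endif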
2275
2276 /* Called from setup_limits - here we have the container's cgroup_data because
2277 * we created the cgroups.
2278 */
2279 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2280 const char *value)
2281 {
2282 __do_free char *controller = NULL;
2283 __do_free char *fullpath = NULL;
2284 char *p;
2285 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2286 char converted_value[50];
2287 struct hierarchy *h;
2288 int ret = 0;
2289
2290 controller = must_copy_string(filename);
2291 p = strchr(controller, '.');
2292 if (p)
2293 *p = '\0';
2294
2295 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2296 ret = convert_devpath(value, converted_value);
2297 if (ret < 0)
2298 return ret;
2299 value = converted_value;
2300 }
2301
2302 h = get_hierarchy(ops, controller);
2303 if (!h) {
2304 ERROR("Failed to setup limits for the \"%s\" controller. "
2305 "The controller seems to be unused by \"cgfsng\" cgroup "
2306 "driver or not enabled on the cgroup hierarchy",
2307 controller);
2308 errno = ENOENT;
2309 return -ENOENT;
2310 }
2311
2312 fullpath = must_make_path(h->container_full_path, filename, NULL);
2313 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2314 return ret;
2315 }
2316
2317 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2318 struct lxc_list *cgroup_settings,
2319 bool do_devices)
2320 {
2321 __do_free struct lxc_list *sorted_cgroup_settings = NULL;
2322 struct lxc_list *iterator, *next;
2323 struct lxc_cgroup *cg;
2324 bool ret = false;
2325
2326 if (lxc_list_empty(cgroup_settings))
2327 return true;
2328
2329 if (!ops->hierarchies)
2330 return false;
2331
2332 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2333 if (!sorted_cgroup_settings)
2334 return false;
2335
2336 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2337 cg = iterator->elem;
2338
2339 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2340 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2341 if (do_devices && (errno == EACCES || errno == EPERM)) {
2342 WARN("Failed to set \"%s\" to \"%s\"",
2343 cg->subsystem, cg->value);
2344 continue;
2345 }
2346 WARN("Failed to set \"%s\" to \"%s\"",
2347 cg->subsystem, cg->value);
2348 goto out;
2349 }
2350 DEBUG("Set controller \"%s\" to \"%s\"",
2351 cg->subsystem, cg->value);
2352 }
2353 }
2354
2355 ret = true;
2356 INFO("Limits for the legacy cgroup hierarchies have been set up");
2357 out:
2358 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2359 lxc_list_del(iterator);
2360 free(iterator);
2361 }
2362
2363 return ret;
2364 }
2365
2366 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2367 struct lxc_list *cgroup_settings)
2368 {
2369 struct lxc_list *iterator;
2370 struct hierarchy *h = ops->unified;
2371
2372 if (lxc_list_empty(cgroup_settings))
2373 return true;
2374
2375 if (!h)
2376 return false;
2377
2378 lxc_list_for_each(iterator, cgroup_settings) {
2379 __do_free char *fullpath = NULL;
2380 int ret;
2381 struct lxc_cgroup *cg = iterator->elem;
2382
2383 fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
2384 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2385 if (ret < 0) {
2386 SYSERROR("Failed to set \"%s\" to \"%s\"",
2387 cg->subsystem, cg->value);
2388 return false;
2389 }
2390 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2391 }
2392
2393 INFO("Limits for the unified cgroup hierarchy have been set up");
2394 return true;
2395 }
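/*
 * Illustrative sketch only (kept under #if 0, never compiled): for a config
 * line along the lines of
 *     lxc.cgroup2.memory.max = 536870912
 * the parser stores "memory.max" as the subsystem and "536870912" as the
 * value, and the loop above performs the equivalent of the direct write
 * below. The limit is hypothetical.
 */
#if 0
static int example_unified_limit(struct hierarchy *unified)
{
	__do_free char *fullpath = NULL;

	fullpath = must_make_path(unified->container_full_path,
				  "memory.max", NULL);
	return lxc_write_to_file(fullpath, "536870912",
				 STRLITERALLEN("536870912"), false, 0666);
}
#endif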
2396
2397 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2398 struct lxc_conf *conf,
2399 bool do_devices)
2400 {
2401 if (!__cg_legacy_setup_limits(ops, &conf->cgroup, do_devices))
2402 return false;
2403
2404 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2405 }
2406
2407 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2408 char **controllers)
2409 {
2410 if (!ops->cgroup_use)
2411 return true;
2412
2413 for (char **cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2414 bool found = false;
2415
2416 for (char **cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2417 if (strcmp(*cur_use, *cur_ctrl) != 0)
2418 continue;
2419
2420 found = true;
2421 break;
2422 }
2423
2424 if (found)
2425 continue;
2426
2427 return false;
2428 }
2429
2430 return true;
2431 }
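/*
 * Illustrative sketch only (kept under #if 0, never compiled): how the
 * filter above behaves for a hypothetical controller list parsed from one
 * mountinfo line.
 */
#if 0
static void example_cgroup_use_filter(struct cgroup_ops *ops)
{
	char *ctrls[] = {"cpu", "cpuacct", NULL};

	/* With lxc.cgroup.use = "memory,pids" this returns false and the
	 * hierarchy is skipped; with lxc.cgroup.use unset it returns true.
	 */
	if (!cgroup_use_wants_controllers(ops, ctrls))
		TRACE("Skipping hierarchy");
}
#endif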
2432
2433 static void cg_unified_delegate(char ***delegate)
2434 {
2435 __do_free char *tmp = NULL;
2436 int idx;
2437 char *standard[] = {"cgroup.subtree_control", "cgroup.threads", NULL};
2438
2439 tmp = read_file("/sys/kernel/cgroup/delegate");
2440 if (!tmp) {
2441 for (char **p = standard; p && *p; p++) {
2442 idx = append_null_to_list((void ***)delegate);
2443 (*delegate)[idx] = must_copy_string(*p);
2444 }
2445 } else {
2446 char *token;
2447 lxc_iterate_parts (token, tmp, " \t\n") {
2448 /*
2449 * We always need to chown this for both cgroup and
2450 * cgroup2.
2451 */
2452 if (strcmp(token, "cgroup.procs") == 0)
2453 continue;
2454
2455 idx = append_null_to_list((void ***)delegate);
2456 (*delegate)[idx] = must_copy_string(token);
2457 }
2458 }
2459 }
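/*
 * Illustrative sketch only (kept under #if 0, never compiled): on kernels
 * that expose /sys/kernel/cgroup/delegate the resulting list typically
 * contains entries such as "cgroup.subtree_control" and "cgroup.threads";
 * "cgroup.procs" is filtered out above because it is chowned
 * unconditionally for both cgroup and cgroup2.
 */
#if 0
static void example_delegate_files(void)
{
	char **files = NULL;

	cg_unified_delegate(&files);
	for (char **it = files; it && *it; it++)
		TRACE("Will chown \"%s\"", *it);
	free_string_list(files);
}
#endif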
2460
2461 /* At startup, cg_hybrid_init() finds all the info we need about cgroup
2462 * mountpoints and current cgroups, and stores it in @ops.
2463 */
2464 static bool cg_hybrid_init(struct cgroup_ops *ops, bool relative,
2465 bool unprivileged)
2466 {
2467 __do_free char *basecginfo = NULL;
2468 __do_free char *line = NULL;
2469 __do_fclose FILE *f = NULL;
2470 int ret;
2471 size_t len = 0;
2472 char **klist = NULL, **nlist = NULL;
2473
2474 /* Root spawned containers escape the current cgroup, so use init's
2475 * cgroups as our base in that case.
2476 */
2477 if (!relative && (geteuid() == 0))
2478 basecginfo = read_file("/proc/1/cgroup");
2479 else
2480 basecginfo = read_file("/proc/self/cgroup");
2481 if (!basecginfo)
2482 return false;
2483
2484 ret = get_existing_subsystems(&klist, &nlist);
2485 if (ret < 0) {
2486 ERROR("Failed to retrieve available legacy cgroup controllers");
2487 return false;
2488 }
2489
2490 f = fopen("/proc/self/mountinfo", "r");
2491 if (!f) {
2492 ERROR("Failed to open \"/proc/self/mountinfo\"");
2493 return false;
2494 }
2495
2496 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2497
2498 while (getline(&line, &len, f) != -1) {
2499 int type;
2500 bool writeable;
2501 struct hierarchy *new;
2502 char *base_cgroup = NULL, *mountpoint = NULL;
2503 char **controller_list = NULL;
2504
2505 type = get_cgroup_version(line);
2506 if (type == 0)
2507 continue;
2508
2509 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2510 continue;
2511
2512 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2513 if (type == CGROUP2_SUPER_MAGIC)
2514 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2515 else if (type == CGROUP_SUPER_MAGIC)
2516 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2517 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2518 if (type == CGROUP_SUPER_MAGIC)
2519 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2520 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2521 if (type == CGROUP2_SUPER_MAGIC)
2522 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2523 }
2524
2525 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2526 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2527 continue;
2528
2529 if (type == CGROUP_SUPER_MAGIC)
2530 if (controller_list_is_dup(ops->hierarchies, controller_list))
2531 goto next;
2532
2533 mountpoint = cg_hybrid_get_mountpoint(line);
2534 if (!mountpoint) {
2535 ERROR("Failed parsing mountpoint from \"%s\"", line);
2536 goto next;
2537 }
2538
2539 if (type == CGROUP_SUPER_MAGIC)
2540 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2541 else
2542 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2543 if (!base_cgroup) {
2544 ERROR("Failed to find current cgroup");
2545 goto next;
2546 }
2547
2548 trim(base_cgroup);
2549 prune_init_scope(base_cgroup);
2550 if (type == CGROUP2_SUPER_MAGIC)
2551 writeable = test_writeable_v2(mountpoint, base_cgroup);
2552 else
2553 writeable = test_writeable_v1(mountpoint, base_cgroup);
2554 if (!writeable)
2555 goto next;
2556
2557 if (type == CGROUP2_SUPER_MAGIC) {
2558 char *cgv2_ctrl_path;
2559
2560 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2561 "cgroup.controllers",
2562 NULL);
2563
2564 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2565 free(cgv2_ctrl_path);
2566 if (!controller_list) {
2567 controller_list = cg_unified_make_empty_controller();
2568 TRACE("No controllers are enabled for "
2569 "delegation in the unified hierarchy");
2570 }
2571 }
2572
2573 /* Exclude all controllers that cgroup use does not want. */
2574 if (!cgroup_use_wants_controllers(ops, controller_list))
2575 goto next;
2576
2577 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2578 if (type == CGROUP2_SUPER_MAGIC && !ops->unified) {
2579 if (unprivileged)
2580 cg_unified_delegate(&new->cgroup2_chown);
2581 ops->unified = new;
2582 }
2583
2584 continue;
2585
2586 next:
2587 free_string_list(controller_list);
2588 free(mountpoint);
2589 free(base_cgroup);
2590 }
2591
2592 free_string_list(klist);
2593 free_string_list(nlist);
2594
2595 TRACE("Writable cgroup hierarchies:");
2596 lxc_cgfsng_print_hierarchies(ops);
2597
2598 /* verify that all controllers in cgroup.use and all crucial
2599 * controllers are accounted for
2600 */
2601 if (!all_controllers_found(ops))
2602 return false;
2603
2604 return true;
2605 }
2606
2607 static int cg_is_pure_unified(void)
2608 {
2609
2610 int ret;
2611 struct statfs fs;
2612
2613 ret = statfs("/sys/fs/cgroup", &fs);
2614 if (ret < 0)
2615 return -ENOMEDIUM;
2616
2617 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2618 return CGROUP2_SUPER_MAGIC;
2619
2620 return 0;
2621 }
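/*
 * Illustrative sketch only (kept under #if 0, never compiled): how the
 * probe above is interpreted.
 */
#if 0
static void example_detect_layout(void)
{
	/* CGROUP2_SUPER_MAGIC: /sys/fs/cgroup itself is a cgroup2 mount,
	 * i.e. a pure unified layout; 0: legacy or hybrid layout;
	 * -ENOMEDIUM: /sys/fs/cgroup could not be statfs'ed.
	 */
	if (cg_is_pure_unified() == CGROUP2_SUPER_MAGIC)
		TRACE("Running on a pure unified cgroup layout");
}
#endif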
2622
2623 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
2624 static char *cg_unified_get_current_cgroup(bool relative)
2625 {
2626 __do_free char *basecginfo = NULL;
2627 char *base_cgroup;
2628 char *copy = NULL;
2629
2630 if (!relative && (geteuid() == 0))
2631 basecginfo = read_file("/proc/1/cgroup");
2632 else
2633 basecginfo = read_file("/proc/self/cgroup");
2634 if (!basecginfo)
2635 return NULL;
2636
2637 base_cgroup = strstr(basecginfo, "0::/");
2638 if (!base_cgroup)
2639 goto cleanup_on_err;
2640
2641 base_cgroup = base_cgroup + 3;
2642 copy = copy_to_eol(base_cgroup);
2643 if (!copy)
2644 goto cleanup_on_err;
2645
2646 cleanup_on_err:
2647 if (copy)
2648 trim(copy);
2649
2650 return copy;
2651 }
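/*
 * Illustrative sketch only (kept under #if 0, never compiled): for a
 * /proc/self/cgroup entry of "0::/user.slice/user-1000.slice" the helper
 * above returns "/user.slice/user-1000.slice" (the leading slash is
 * preserved). The path is hypothetical.
 */
#if 0
static void example_current_unified_cgroup(void)
{
	__do_free char *cur = NULL;

	cur = cg_unified_get_current_cgroup(false);
	if (cur)
		TRACE("Current unified cgroup is \"%s\"", cur);
}
#endif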
2652
2653 static int cg_unified_init(struct cgroup_ops *ops, bool relative,
2654 bool unprivileged)
2655 {
2656 __do_free char *subtree_path = NULL;
2657 int ret;
2658 char *mountpoint;
2659 char **delegatable;
2660 struct hierarchy *new;
2661 char *base_cgroup = NULL;
2662
2663 ret = cg_is_pure_unified();
2664 if (ret == -ENOMEDIUM)
2665 return -ENOMEDIUM;
2666
2667 if (ret != CGROUP2_SUPER_MAGIC)
2668 return 0;
2669
2670 base_cgroup = cg_unified_get_current_cgroup(relative);
2671 if (!base_cgroup)
2672 return -EINVAL;
2673 prune_init_scope(base_cgroup);
2674
2675 /* We assume that we have already been given controllers to delegate
2676 * further down the hierarchy. If not it is up to the user to delegate
2677 * them to us.
2678 */
2679 mountpoint = must_copy_string("/sys/fs/cgroup");
2680 subtree_path = must_make_path(mountpoint, base_cgroup,
2681 "cgroup.subtree_control", NULL);
2682 delegatable = cg_unified_get_controllers(subtree_path);
2683 if (!delegatable)
2684 delegatable = cg_unified_make_empty_controller();
2685 if (!delegatable[0])
2686 TRACE("No controllers are enabled for delegation");
2687
2688 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2689 * we should verify that here. The reason I'm not doing it right now is
2690 * that I'm not convinced lxc.cgroup.use is the way forward, since it is
2691 * a global property. I'd much rather have an option that lets you
2692 * request controllers per container.
2693 */
2694
2695 new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2696 if (unprivileged)
2697 cg_unified_delegate(&new->cgroup2_chown);
2698
2699 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2700 ops->unified = new;
2701 return CGROUP2_SUPER_MAGIC;
2702 }
2703
2704 static bool cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2705 {
2706 int ret;
2707 const char *tmp;
2708 bool relative = conf->cgroup_meta.relative;
2709
2710 tmp = lxc_global_config_value("lxc.cgroup.use");
2711 if (tmp) {
2712 __do_free char *pin = NULL;
2713 char *chop, *cur;
2714
2715 pin = must_copy_string(tmp);
2716 chop = pin;
2717
2718 lxc_iterate_parts(cur, chop, ",")
2719 must_append_string(&ops->cgroup_use, cur);
2720 }
2721
2722 ret = cg_unified_init(ops, relative, !lxc_list_empty(&conf->id_map));
2723 if (ret < 0)
2724 return false;
2725
2726 if (ret == CGROUP2_SUPER_MAGIC)
2727 return true;
2728
2729 return cg_hybrid_init(ops, relative, !lxc_list_empty(&conf->id_map));
2730 }
2731
2732 __cgfsng_ops static bool cgfsng_data_init(struct cgroup_ops *ops)
2733 {
2734 const char *cgroup_pattern;
2735
2736 /* copy system-wide cgroup information */
2737 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2738 if (!cgroup_pattern) {
2739 /* lxc.cgroup.pattern is only NULL on error. */
2740 ERROR("Failed to retrieve cgroup pattern");
2741 return false;
2742 }
2743 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2744 ops->monitor_pattern = MONITOR_CGROUP;
2745
2746 return true;
2747 }
2748
2749 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2750 {
2751 __do_free struct cgroup_ops *cgfsng_ops = NULL;
2752
2753 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2754 if (!cgfsng_ops)
2755 return NULL;
2756
2757 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2758 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2759
2760 if (!cg_init(cgfsng_ops, conf))
2761 return NULL;
2762
2763 cgfsng_ops->data_init = cgfsng_data_init;
2764 cgfsng_ops->payload_destroy = cgfsng_payload_destroy;
2765 cgfsng_ops->monitor_destroy = cgfsng_monitor_destroy;
2766 cgfsng_ops->monitor_create = cgfsng_monitor_create;
2767 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
2768 cgfsng_ops->payload_create = cgfsng_payload_create;
2769 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2770 cgfsng_ops->escape = cgfsng_escape;
2771 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2772 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2773 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2774 cgfsng_ops->get = cgfsng_get;
2775 cgfsng_ops->set = cgfsng_set;
2776 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2777 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2778 cgfsng_ops->driver = "cgfsng";
2779 cgfsng_ops->version = "1.0.0";
2780 cgfsng_ops->attach = cgfsng_attach;
2781 cgfsng_ops->chown = cgfsng_chown;
2782 cgfsng_ops->mount = cgfsng_mount;
2783 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2784
2785 return move_ptr(cgfsng_ops);
2786 }
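/*
 * Illustrative sketch only (kept under #if 0, never compiled): a minimal
 * way a caller might obtain and initialize the driver. A real caller would
 * also free the ops structure on the failure paths.
 */
#if 0
static struct cgroup_ops *example_ops_init(struct lxc_conf *conf)
{
	struct cgroup_ops *ops;

	ops = cgfsng_ops_init(conf);
	if (!ops)
		return NULL;

	/* Copy the system-wide cgroup pattern before using the driver. */
	if (!ops->data_init(ops))
		return NULL;

	return ops;
}
#endif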