src/lxc/cgroups/cgfsng.c
1 /*
2 * lxc: linux Container library
3 *
4 * Copyright © 2016 Canonical Ltd.
5 *
6 * Authors:
7 * Serge Hallyn <serge.hallyn@ubuntu.com>
8 * Christian Brauner <christian.brauner@ubuntu.com>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with this library; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /*
26 * cgfsng.c: this is a new, simplified implementation of a filesystem
27 * cgroup backend. The original cgfs.c was designed to be as flexible
28 * as possible. It would try to find cgroup filesystems no matter where
29 * or how you had them mounted, and deduce the most usable mount for
30 * each controller.
31 *
32 * This new implementation assumes that cgroup filesystems are mounted
33 * under /sys/fs/cgroup/clist where clist is either the controller, or
34 * a comma-separated list of controllers.
35 */
36
37 #include "config.h"
38
39 #include <ctype.h>
40 #include <dirent.h>
41 #include <errno.h>
42 #include <grp.h>
43 #include <stdint.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <unistd.h>
48 #include <linux/kdev_t.h>
49 #include <linux/types.h>
50 #include <sys/types.h>
51
52 #include "caps.h"
53 #include "cgroup.h"
54 #include "cgroup_utils.h"
55 #include "commands.h"
56 #include "conf.h"
57 #include "log.h"
58 #include "macro.h"
59 #include "storage/storage.h"
60 #include "utils.h"
61
62 #ifndef HAVE_STRLCPY
63 #include "include/strlcpy.h"
64 #endif
65
66 #ifndef HAVE_STRLCAT
67 #include "include/strlcat.h"
68 #endif
69
70 lxc_log_define(cgfsng, cgroup);
71
72 static void free_string_list(char **clist)
73 {
74 int i;
75
76 if (!clist)
77 return;
78
79 for (i = 0; clist[i]; i++)
80 free(clist[i]);
81
82 free(clist);
83 }
84
85 /* Allocate memory, do not fail. */
86 static void *must_alloc(size_t sz)
87 {
88 return must_realloc(NULL, sz);
89 }
90
91 /* Given a pointer to a null-terminated array of pointers, realloc to add one
92 * entry, and point the new entry to NULL. Do not fail. Return the index to the
93 * second-to-last entry - that is, the one which is now available for use
94 * (keeping the list null-terminated).
95 */
96 static int append_null_to_list(void ***list)
97 {
98 int newentry = 0;
99
100 if (*list)
101 for (; (*list)[newentry]; newentry++)
102 ;
103
104 *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
105 (*list)[newentry + 1] = NULL;
106 return newentry;
107 }
108
109 /* Given a null-terminated array of strings, check whether @entry is one of the
110 * strings.
111 */
112 static bool string_in_list(char **list, const char *entry)
113 {
114 int i;
115
116 if (!list)
117 return false;
118
119 for (i = 0; list[i]; i++)
120 if (strcmp(list[i], entry) == 0)
121 return true;
122
123 return false;
124 }
125
126 /* Return a copy of @entry prepending "name=", i.e. turn "systemd" into
127 * "name=systemd". Do not fail.
128 */
129 static char *cg_legacy_must_prefix_named(char *entry)
130 {
131 size_t len;
132 char *prefixed;
133
134 len = strlen(entry);
135 prefixed = must_alloc(len + 6);
136
137 memcpy(prefixed, "name=", sizeof("name=") - 1);
138 memcpy(prefixed + sizeof("name=") - 1, entry, len);
139 prefixed[len + 5] = '\0';
140 return prefixed;
141 }
142
143 /* Append an entry to the clist. Do not fail. @clist must be NULL the first time
144 * we are called.
145 *
146 * We also handle named subsystems here. Any controller which is not a kernel
147 * subsystem we prefix with "name=". Any controller which is both a kernel and a
148 * named subsystem we refuse to use, because we're not sure which one we have here.
149 * (TODO: We could work around this in some cases by just remounting to be
150 * unambiguous, or by comparing mountpoint contents with current cgroup.)
151 *
152 * The last entry will always be NULL.
153 */
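/* For example (hypothetical lists): with klist containing "cpu" and "memory",
 * the entry "cpu" is appended unchanged, while an entry "systemd" (not a
 * kernel subsystem) is appended as "name=systemd".
 */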
154 static void must_append_controller(char **klist, char **nlist, char ***clist,
155 char *entry)
156 {
157 int newentry;
158 char *copy;
159
160 if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
161 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
162 ERROR("It is both a named and kernel subsystem");
163 return;
164 }
165
166 newentry = append_null_to_list((void ***)clist);
167
168 if (strncmp(entry, "name=", 5) == 0)
169 copy = must_copy_string(entry);
170 else if (string_in_list(klist, entry))
171 copy = must_copy_string(entry);
172 else
173 copy = cg_legacy_must_prefix_named(entry);
174
175 (*clist)[newentry] = copy;
176 }
177
178 /* Given a handler's cgroup data, return the struct hierarchy for the controller
179 * @c, or NULL if there is none.
180 */
181 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *c)
182 {
183 int i;
184
185 if (!ops->hierarchies)
186 return NULL;
187
188 for (i = 0; ops->hierarchies[i]; i++) {
189 if (!c) {
190 /* This is the empty unified hierarchy. */
191 if (ops->hierarchies[i]->controllers &&
192 !ops->hierarchies[i]->controllers[0])
193 return ops->hierarchies[i];
194
195 continue;
196 }
197
198 if (string_in_list(ops->hierarchies[i]->controllers, c))
199 return ops->hierarchies[i];
200 }
201
202 return NULL;
203 }
204
205 #define BATCH_SIZE 50
206 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
207 {
208 int newbatches = (newlen / BATCH_SIZE) + 1;
209 int oldbatches = (oldlen / BATCH_SIZE) + 1;
210
211 if (!*mem || newbatches > oldbatches) {
212 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
213 }
214 }
215
216 static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
217 {
218 size_t full = oldlen + newlen;
219
220 batch_realloc(dest, oldlen, full + 1);
221
222 memcpy(*dest + oldlen, new, newlen + 1);
223 }
224
225 /* Slurp in a whole file */
226 static char *read_file(const char *fnam)
227 {
228 FILE *f;
229 char *line = NULL, *buf = NULL;
230 size_t len = 0, fulllen = 0;
231 int linelen;
232
233 f = fopen(fnam, "r");
234 if (!f)
235 return NULL;
236 while ((linelen = getline(&line, &len, f)) != -1) {
237 append_line(&buf, fulllen, line, linelen);
238 fulllen += linelen;
239 }
240 fclose(f);
241 free(line);
242 return buf;
243 }
244
245 /* Taken and modified from the kernel sources. */
246 #define NBITS 32 /* bits in uint32_t */
247 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
248 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
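/* Note: despite its name, BITS_TO_LONGS counts uint32_t words here, since
 * NBITS is defined as the number of bits in a uint32_t.
 */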
249
250 static void set_bit(unsigned bit, uint32_t *bitarr)
251 {
252 bitarr[bit / NBITS] |= (1 << (bit % NBITS));
253 }
254
255 static void clear_bit(unsigned bit, uint32_t *bitarr)
256 {
257 bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
258 }
259
260 static bool is_set(unsigned bit, uint32_t *bitarr)
261 {
262 return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
263 }
264
265 /* Create cpumask from cpulist aka turn:
266 *
267 * 0,2-3
268 *
269 * into bit array
270 *
271 * 1 0 1 1
272 */
273 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
274 {
275 char *token;
276 size_t arrlen;
277 uint32_t *bitarr;
278
279 arrlen = BITS_TO_LONGS(nbits);
280 bitarr = calloc(arrlen, sizeof(uint32_t));
281 if (!bitarr)
282 return NULL;
283
284 lxc_iterate_parts(token, buf, ",") {
285 errno = 0;
286 unsigned end, start;
287 char *range;
288
289 start = strtoul(token, NULL, 0);
290 end = start;
291 range = strchr(token, '-');
292 if (range)
293 end = strtoul(range + 1, NULL, 0);
294
295 if (!(start <= end)) {
296 free(bitarr);
297 return NULL;
298 }
299
300 if (end >= nbits) {
301 free(bitarr);
302 return NULL;
303 }
304
305 while (start <= end)
306 set_bit(start++, bitarr);
307 }
308
309 return bitarr;
310 }
311
312 /* Turn cpumask into simple, comma-separated cpulist. */
313 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
314 {
315 int ret;
316 size_t i;
317 char **cpulist = NULL;
318 char numstr[INTTYPE_TO_STRLEN(size_t)] = {0};
319
320 for (i = 0; i <= nbits; i++) {
321 if (!is_set(i, bitarr))
322 continue;
323
324 ret = snprintf(numstr, sizeof(numstr), "%zu", i);
325 if (ret < 0 || (size_t)ret >= sizeof(numstr)) {
326 lxc_free_array((void **)cpulist, free);
327 return NULL;
328 }
329
330 ret = lxc_append_string(&cpulist, numstr);
331 if (ret < 0) {
332 lxc_free_array((void **)cpulist, free);
333 return NULL;
334 }
335 }
336
337 if (!cpulist)
338 return NULL;
339
340 return lxc_string_join(",", (const char **)cpulist, false);
341 }
342
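/* Return the number of the highest cpu in a cpulist, e.g. 7 for "0-3,7".
 * This assumes the list is ordered, i.e. that the last entry names the
 * highest cpu.
 */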
343 static ssize_t get_max_cpus(char *cpulist)
344 {
345 char *c1, *c2;
346 char *maxcpus = cpulist;
347 size_t cpus = 0;
348
349 c1 = strrchr(maxcpus, ',');
350 if (c1)
351 c1++;
352
353 c2 = strrchr(maxcpus, '-');
354 if (c2)
355 c2++;
356
357 if (!c1 && !c2)
358 c1 = maxcpus;
359 else if (c1 > c2)
360 c2 = c1;
361 else if (c1 < c2)
362 c1 = c2;
363 else if (!c1 && c2)
364 c1 = c2;
365
366 errno = 0;
367 cpus = strtoul(c1, NULL, 0);
368 if (errno != 0)
369 return -1;
370
371 return cpus;
372 }
373
374 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
375 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
376 {
377 int ret;
378 ssize_t i;
379 char *lastslash, *fpath, oldv;
380 ssize_t maxisol = 0, maxposs = 0;
381 char *cpulist = NULL, *isolcpus = NULL, *posscpus = NULL;
382 uint32_t *isolmask = NULL, *possmask = NULL;
383 bool bret = false, flipped_bit = false;
384
385 lastslash = strrchr(path, '/');
386 if (!lastslash) {
387 ERROR("Failed to detect \"/\" in \"%s\"", path);
388 return bret;
389 }
390 oldv = *lastslash;
391 *lastslash = '\0';
392 fpath = must_make_path(path, "cpuset.cpus", NULL);
393 posscpus = read_file(fpath);
394 if (!posscpus) {
395 SYSERROR("Failed to read file \"%s\"", fpath);
396 goto on_error;
397 }
398
399 /* Get maximum number of cpus found in possible cpuset. */
400 maxposs = get_max_cpus(posscpus);
401 if (maxposs < 0 || maxposs >= INT_MAX - 1)
402 goto on_error;
403
404 if (!file_exists(__ISOL_CPUS)) {
405 /* This system doesn't expose isolated cpus. */
406 DEBUG("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
407 cpulist = posscpus;
408 /* No isolated cpus but we weren't already initialized by
409 * someone. We should simply copy the parent's cpuset.cpus
410 * values.
411 */
412 if (!am_initialized) {
413 DEBUG("Copying cpu settings of parent cgroup");
414 goto copy_parent;
415 }
416 /* No isolated cpus but we were already initialized by someone.
417 * Nothing more to do for us.
418 */
419 goto on_success;
420 }
421
422 isolcpus = read_file(__ISOL_CPUS);
423 if (!isolcpus) {
424 SYSERROR("Failed to read file \""__ISOL_CPUS"\"");
425 goto on_error;
426 }
427 if (!isdigit(isolcpus[0])) {
428 TRACE("No isolated cpus detected");
429 cpulist = posscpus;
430 /* No isolated cpus but we weren't already initialized by
431 * someone. We should simply copy the parent's cpuset.cpus
432 * values.
433 */
434 if (!am_initialized) {
435 DEBUG("Copying cpu settings of parent cgroup");
436 goto copy_parent;
437 }
438 /* No isolated cpus but we were already initialized by someone.
439 * Nothing more to do for us.
440 */
441 goto on_success;
442 }
443
444 /* Get maximum number of cpus found in isolated cpuset. */
445 maxisol = get_max_cpus(isolcpus);
446 if (maxisol < 0 || maxisol >= INT_MAX - 1)
447 goto on_error;
448
449 if (maxposs < maxisol)
450 maxposs = maxisol;
451 maxposs++;
452
453 possmask = lxc_cpumask(posscpus, maxposs);
454 if (!possmask) {
455 ERROR("Failed to create cpumask for possible cpus");
456 goto on_error;
457 }
458
459 isolmask = lxc_cpumask(isolcpus, maxposs);
460 if (!isolmask) {
461 ERROR("Failed to create cpumask for isolated cpus");
462 goto on_error;
463 }
464
465 for (i = 0; i <= maxposs; i++) {
466 if (!is_set(i, isolmask) || !is_set(i, possmask))
467 continue;
468
469 flipped_bit = true;
470 clear_bit(i, possmask);
471 }
472
473 if (!flipped_bit) {
474 DEBUG("No isolated cpus present in cpuset");
475 goto on_success;
476 }
477 DEBUG("Removed isolated cpus from cpuset");
478
479 cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
480 if (!cpulist) {
481 ERROR("Failed to create cpu list");
482 goto on_error;
483 }
484
485 copy_parent:
486 *lastslash = oldv;
487 free(fpath);
488 fpath = must_make_path(path, "cpuset.cpus", NULL);
489 ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false, 0666);
490 if (ret < 0) {
491 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
492 goto on_error;
493 }
494
495 on_success:
496 bret = true;
497
498 on_error:
499 free(fpath);
500
501 free(isolcpus);
502 free(isolmask);
503
504 if (posscpus != cpulist)
505 free(posscpus);
506 free(possmask);
507
508 free(cpulist);
509 return bret;
510 }
511
512 /* Copy contents of parent(@path)/@file to @path/@file */
513 static bool copy_parent_file(char *path, char *file)
514 {
515 int ret;
516 char *fpath, *lastslash, oldv;
517 int len = 0;
518 char *value = NULL;
519
520 lastslash = strrchr(path, '/');
521 if (!lastslash) {
522 ERROR("Failed to detect \"/\" in \"%s\"", path);
523 return false;
524 }
525 oldv = *lastslash;
526 *lastslash = '\0';
527 fpath = must_make_path(path, file, NULL);
528 len = lxc_read_from_file(fpath, NULL, 0);
529 if (len <= 0)
530 goto on_error;
531
532 value = must_alloc(len + 1);
533 ret = lxc_read_from_file(fpath, value, len);
534 if (ret != len)
535 goto on_error;
536 free(fpath);
537
538 *lastslash = oldv;
539 fpath = must_make_path(path, file, NULL);
540 ret = lxc_write_to_file(fpath, value, len, false, 0666);
541 if (ret < 0)
542 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath);
543 free(fpath);
544 free(value);
545 return ret >= 0;
546
547 on_error:
548 SYSERROR("Failed to read file \"%s\"", fpath);
549 free(fpath);
550 free(value);
551 return false;
552 }
553
554 /* Initialize the cpuset hierarchy in the first directory of @cgname and set
555 * cgroup.clone_children so that children inherit settings. Since the
556 * h->container_base_path is populated by init or ourselves, we know it is already
557 * initialized.
558 */
559 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
560 {
561 int ret;
562 char v;
563 char *cgpath, *clonechildrenpath, *slash;
564
565 if (!string_in_list(h->controllers, "cpuset"))
566 return true;
567
568 if (*cgname == '/')
569 cgname++;
570 slash = strchr(cgname, '/');
571 if (slash)
572 *slash = '\0';
573
574 cgpath = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
575 if (slash)
576 *slash = '/';
577
578 ret = mkdir(cgpath, 0755);
579 if (ret < 0) {
580 if (errno != EEXIST) {
581 SYSERROR("Failed to create directory \"%s\"", cgpath);
582 free(cgpath);
583 return false;
584 }
585 }
586
587 clonechildrenpath =
588 must_make_path(cgpath, "cgroup.clone_children", NULL);
589 /* unified hierarchy doesn't have clone_children */
590 if (!file_exists(clonechildrenpath)) {
591 free(clonechildrenpath);
592 free(cgpath);
593 return true;
594 }
595
596 ret = lxc_read_from_file(clonechildrenpath, &v, 1);
597 if (ret < 0) {
598 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
599 free(clonechildrenpath);
600 free(cgpath);
601 return false;
602 }
603
604 /* Make sure any isolated cpus are removed from cpuset.cpus. */
605 if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
606 SYSERROR("Failed to remove isolated cpus");
607 free(clonechildrenpath);
608 free(cgpath);
609 return false;
610 }
611
612 /* Already set for us by someone else. */
613 if (v == '1') {
614 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
615 free(clonechildrenpath);
616 free(cgpath);
617 return true;
618 }
619
620 /* copy parent's settings */
621 if (!copy_parent_file(cgpath, "cpuset.mems")) {
622 SYSERROR("Failed to copy \"cpuset.mems\" settings");
623 free(cgpath);
624 free(clonechildrenpath);
625 return false;
626 }
627 free(cgpath);
628
629 ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
630 if (ret < 0) {
631 /* Set clone_children so children inherit our settings */
632 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
633 free(clonechildrenpath);
634 return false;
635 }
636 free(clonechildrenpath);
637 return true;
638 }
639
640 /* Given two null-terminated lists of strings, return true if any string is in
641 * both.
642 */
643 static bool controller_lists_intersect(char **l1, char **l2)
644 {
645 int i;
646
647 if (!l1 || !l2)
648 return false;
649
650 for (i = 0; l1[i]; i++) {
651 if (string_in_list(l2, l1[i]))
652 return true;
653 }
654
655 return false;
656 }
657
658 /* For a null-terminated list of controllers @clist, return true if any of those
659 * controllers is already listed in the null-terminated list of hierarchies @hlist.
660 * Realistically, if one is present, all must be present.
661 */
662 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
663 {
664 int i;
665
666 if (!hlist)
667 return false;
668
669 for (i = 0; hlist[i]; i++)
670 if (controller_lists_intersect(hlist[i]->controllers, clist))
671 return true;
672
673 return false;
674 }
675
676 /* Return true if the controller @entry is found in the null-terminated list of
677 * hierarchies @hlist.
678 */
679 static bool controller_found(struct hierarchy **hlist, char *entry)
680 {
681 int i;
682
683 if (!hlist)
684 return false;
685
686 for (i = 0; hlist[i]; i++)
687 if (string_in_list(hlist[i]->controllers, entry))
688 return true;
689
690 return false;
691 }
692
693 /* Return true if all of the controllers which we require have been found. The
694 * required list is freezer and anything in lxc.cgroup.use.
695 */
696 static bool all_controllers_found(struct cgroup_ops *ops)
697 {
698 char **cur;
699 struct hierarchy **hlist = ops->hierarchies;
700
701 if (!controller_found(hlist, "freezer")) {
702 ERROR("No freezer controller mountpoint found");
703 return false;
704 }
705
706 if (!ops->cgroup_use)
707 return true;
708
709 for (cur = ops->cgroup_use; cur && *cur; cur++)
710 if (!controller_found(hlist, *cur)) {
711 ERROR("No %s controller mountpoint found", *cur);
712 return false;
713 }
714
715 return true;
716 }
717
718 /* Get the controllers from a mountinfo line. There are other ways we could get
719 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
720 * could parse the mount options. But we simply assume that the mountpoint must
721 * be /sys/fs/cgroup/controller-list.
722 */
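/* A shortened, hypothetical mountinfo line for a legacy hierarchy looks like:
 *
 *   34 25 0:29 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid - cgroup cgroup rw,cpu,cpuacct
 *
 * The code below skips the first four space-separated fields (mount id,
 * parent id, major:minor, root) and then expects the mountpoint to start
 * with "/sys/fs/cgroup/".
 */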
723 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
724 int type)
725 {
726 /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
727 * for legacy hierarchies.
728 */
729 int i;
730 char *dup, *p2, *tok;
731 char *p = line, *sep = ",";
732 char **aret = NULL;
733
734 for (i = 0; i < 4; i++) {
735 p = strchr(p, ' ');
736 if (!p)
737 return NULL;
738 p++;
739 }
740
741 /* Note, if we change how mountinfo works, then our caller will need to
742 * verify /sys/fs/cgroup/ in this field.
743 */
744 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
745 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
746 return NULL;
747 }
748
749 p += 15;
750 p2 = strchr(p, ' ');
751 if (!p2) {
752 ERROR("Corrupt mountinfo");
753 return NULL;
754 }
755 *p2 = '\0';
756
757 if (type == CGROUP_SUPER_MAGIC) {
758 /* strdup() here for v1 hierarchies. Otherwise
759 * lxc_iterate_parts() will destroy mountpoints such as
760 * "/sys/fs/cgroup/cpu,cpuacct".
761 */
762 dup = strdup(p);
763 if (!dup)
764 return NULL;
765
766 lxc_iterate_parts(tok, dup, sep) {
767 must_append_controller(klist, nlist, &aret, tok);
768 }
769
770 free(dup);
771 }
772 *p2 = ' ';
773
774 return aret;
775 }
776
777 static char **cg_unified_make_empty_controller(void)
778 {
779 int newentry;
780 char **aret = NULL;
781
782 newentry = append_null_to_list((void ***)&aret);
783 aret[newentry] = NULL;
784 return aret;
785 }
786
787 static char **cg_unified_get_controllers(const char *file)
788 {
789 char *buf, *tok;
790 char *sep = " \t\n";
791 char **aret = NULL;
792
793 buf = read_file(file);
794 if (!buf)
795 return NULL;
796
797 lxc_iterate_parts(tok, buf, sep) {
798 int newentry;
799 char *copy;
800
801 newentry = append_null_to_list((void ***)&aret);
802 copy = must_copy_string(tok);
803 aret[newentry] = copy;
804 }
805
806 free(buf);
807 return aret;
808 }
809
810 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
811 char *container_base_path, int type)
812 {
813 struct hierarchy *new;
814 int newentry;
815
816 new = must_alloc(sizeof(*new));
817 new->controllers = clist;
818 new->mountpoint = mountpoint;
819 new->container_base_path = container_base_path;
820 new->container_full_path = NULL;
821 new->monitor_full_path = NULL;
822 new->version = type;
823
824 newentry = append_null_to_list((void ***)h);
825 (*h)[newentry] = new;
826 return new;
827 }
828
829 /* Get a copy of the mountpoint from @line, which is a line from
830 * /proc/self/mountinfo.
831 */
832 static char *cg_hybrid_get_mountpoint(char *line)
833 {
834 int i;
835 size_t len;
836 char *p2;
837 char *p = line, *sret = NULL;
838
839 for (i = 0; i < 4; i++) {
840 p = strchr(p, ' ');
841 if (!p)
842 return NULL;
843 p++;
844 }
845
846 if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
847 return NULL;
848
849 p2 = strchr(p + 15, ' ');
850 if (!p2)
851 return NULL;
852 *p2 = '\0';
853
854 len = strlen(p);
855 sret = must_alloc(len + 1);
856 memcpy(sret, p, len);
857 sret[len] = '\0';
858 return sret;
859 }
860
861 /* Given a multi-line string, return a null-terminated copy of the current line. */
862 static char *copy_to_eol(char *p)
863 {
864 char *p2 = strchr(p, '\n'), *sret;
865 size_t len;
866
867 if (!p2)
868 return NULL;
869
870 len = p2 - p;
871 sret = must_alloc(len + 1);
872 memcpy(sret, p, len);
873 sret[len] = '\0';
874 return sret;
875 }
876
877 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
878 * /proc/self/cgroup file. Check whether controller c is present.
879 */
880 static bool controller_in_clist(char *cgline, char *c)
881 {
882 char *tok, *eol, *tmp;
883 size_t len;
884
885 eol = strchr(cgline, ':');
886 if (!eol)
887 return false;
888
889 len = eol - cgline;
890 tmp = alloca(len + 1);
891 memcpy(tmp, cgline, len);
892 tmp[len] = '\0';
893
894 lxc_iterate_parts(tok, tmp, ",") {
895 if (strcmp(tok, c) == 0)
896 return true;
897 }
898
899 return false;
900 }
901
902 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
903 * @controller.
904 */
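/* For illustration (hypothetical paths): a legacy entry in /proc/self/cgroup
 * looks like "4:cpu,cpuacct:/lxc/c1", a cgroup2 entry like "0::/lxc/c1"; the
 * part after the second ':' is what this function returns.
 */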
905 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
906 int type)
907 {
908 char *p = basecginfo;
909
910 for (;;) {
911 bool is_cgv2_base_cgroup = false;
912
913 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
914 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
915 is_cgv2_base_cgroup = true;
916
917 p = strchr(p, ':');
918 if (!p)
919 return NULL;
920 p++;
921
922 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
923 p = strchr(p, ':');
924 if (!p)
925 return NULL;
926 p++;
927 return copy_to_eol(p);
928 }
929
930 p = strchr(p, '\n');
931 if (!p)
932 return NULL;
933 p++;
934 }
935 }
936
937 static void must_append_string(char ***list, char *entry)
938 {
939 int newentry;
940 char *copy;
941
942 newentry = append_null_to_list((void ***)list);
943 copy = must_copy_string(entry);
944 (*list)[newentry] = copy;
945 }
946
947 static int get_existing_subsystems(char ***klist, char ***nlist)
948 {
949 FILE *f;
950 char *line = NULL;
951 size_t len = 0;
952
953 f = fopen("/proc/self/cgroup", "r");
954 if (!f)
955 return -1;
956
957 while (getline(&line, &len, f) != -1) {
958 char *p, *p2, *tok;
959 p = strchr(line, ':');
960 if (!p)
961 continue;
962 p++;
963 p2 = strchr(p, ':');
964 if (!p2)
965 continue;
966 *p2 = '\0';
967
968 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
969 * contains an entry of the form:
970 *
971 * 0::/some/path
972 *
973 * In this case we use "cgroup2" as controller name.
974 */
975 if ((p2 - p) == 0) {
976 must_append_string(klist, "cgroup2");
977 continue;
978 }
979
980 lxc_iterate_parts(tok, p, ",") {
981 if (strncmp(tok, "name=", 5) == 0)
982 must_append_string(nlist, tok);
983 else
984 must_append_string(klist, tok);
985 }
986 }
987
988 free(line);
989 fclose(f);
990 return 0;
991 }
992
993 static void trim(char *s)
994 {
995 size_t len;
996
997 len = strlen(s);
998 while ((len > 1) && (s[len - 1] == '\n'))
999 s[--len] = '\0';
1000 }
1001
1002 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
1003 {
1004 int i;
1005 struct hierarchy **it;
1006
1007 if (!ops->hierarchies) {
1008 TRACE(" No hierarchies found");
1009 return;
1010 }
1011
1012 TRACE(" Hierarchies:");
1013 for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
1014 int j;
1015 char **cit;
1016
1017 TRACE(" %d: base_cgroup: %s", i, (*it)->container_base_path ? (*it)->container_base_path : "(null)");
1018 TRACE(" mountpoint: %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1019 TRACE(" controllers:");
1020 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1021 TRACE(" %d: %s", j, *cit);
1022 }
1023 }
1024
1025 static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
1026 char **nlist)
1027 {
1028 int k;
1029 char **it;
1030
1031 TRACE("basecginfo is:");
1032 TRACE("%s", basecginfo);
1033
1034 for (k = 0, it = klist; it && *it; it++, k++)
1035 TRACE("kernel subsystem %d: %s", k, *it);
1036
1037 for (k = 0, it = nlist; it && *it; it++, k++)
1038 TRACE("named subsystem %d: %s", k, *it);
1039 }
1040
1041 static int cgroup_rmdir(struct hierarchy **hierarchies,
1042 const char *container_cgroup)
1043 {
1044 int i;
1045
1046 if (!container_cgroup || !hierarchies)
1047 return 0;
1048
1049 for (i = 0; hierarchies[i]; i++) {
1050 int ret;
1051 struct hierarchy *h = hierarchies[i];
1052
1053 if (!h->container_full_path)
1054 continue;
1055
1056 ret = recursive_destroy(h->container_full_path);
1057 if (ret < 0)
1058 WARN("Failed to destroy \"%s\"", h->container_full_path);
1059
1060 free(h->container_full_path);
1061 h->container_full_path = NULL;
1062 }
1063
1064 return 0;
1065 }
1066
1067 struct generic_userns_exec_data {
1068 struct hierarchy **hierarchies;
1069 const char *container_cgroup;
1070 struct lxc_conf *conf;
1071 uid_t origuid; /* target uid in parent namespace */
1072 char *path;
1073 };
1074
1075 static int cgroup_rmdir_wrapper(void *data)
1076 {
1077 int ret;
1078 struct generic_userns_exec_data *arg = data;
1079 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1080 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1081
1082 ret = setresgid(nsgid, nsgid, nsgid);
1083 if (ret < 0) {
1084 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1085 (int)nsgid, (int)nsgid);
1086 return -1;
1087 }
1088
1089 ret = setresuid(nsuid, nsuid, nsuid);
1090 if (ret < 0) {
1091 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1092 (int)nsuid, (int)nsuid);
1093 return -1;
1094 }
1095
1096 ret = setgroups(0, NULL);
1097 if (ret < 0 && errno != EPERM) {
1098 SYSERROR("Failed to setgroups(0, NULL)");
1099 return -1;
1100 }
1101
1102 return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1103 }
1104
1105 __cgfsng_ops static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
1106 {
1107 int ret;
1108 struct generic_userns_exec_data wrap;
1109
1110 wrap.origuid = 0;
1111 wrap.container_cgroup = ops->container_cgroup;
1112 wrap.hierarchies = ops->hierarchies;
1113 wrap.conf = handler->conf;
1114
1115 if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1116 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1117 "cgroup_rmdir_wrapper");
1118 else
1119 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1120 if (ret < 0) {
1121 WARN("Failed to destroy cgroups");
1122 return;
1123 }
1124 }
1125
1126 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1127 {
1128 size_t i, parts_len;
1129 char **it;
1130 size_t full_len = 0;
1131 char *add_controllers = NULL, *cgroup = NULL;
1132 char **parts = NULL;
1133 bool bret = false;
1134
1135 if (h->version != CGROUP2_SUPER_MAGIC)
1136 return true;
1137
1138 if (!h->controllers)
1139 return true;
1140
1141 /* For now we simply enable all controllers that we have detected by
1142 * creating a string like "+memory +pids +cpu +io".
1143 * TODO: In the near future we might want to support "-<controller>"
1144 * etc. but whether supporting semantics like this makes sense will need
1145 * some thinking.
1146 */
1147 for (it = h->controllers; it && *it; it++) {
1148 full_len += strlen(*it) + 2;
1149 add_controllers = must_realloc(add_controllers, full_len + 1);
1150
1151 if (h->controllers[0] == *it)
1152 add_controllers[0] = '\0';
1153
1154 (void)strlcat(add_controllers, "+", full_len + 1);
1155 (void)strlcat(add_controllers, *it, full_len + 1);
1156
1157 if ((it + 1) && *(it + 1))
1158 (void)strlcat(add_controllers, " ", full_len + 1);
1159 }
1160
1161 parts = lxc_string_split(cgname, '/');
1162 if (!parts)
1163 goto on_error;
1164
1165 parts_len = lxc_array_len((void **)parts);
1166 if (parts_len > 0)
1167 parts_len--;
1168
1169 cgroup = must_make_path(h->mountpoint, h->container_base_path, NULL);
1170 for (i = 0; i < parts_len; i++) {
1171 int ret;
1172 char *target;
1173
1174 cgroup = must_append_path(cgroup, parts[i], NULL);
1175 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1176 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1177 free(target);
1178 if (ret < 0) {
1179 SYSERROR("Could not enable \"%s\" controllers in the "
1180 "unified cgroup \"%s\"", add_controllers, cgroup);
1181 goto on_error;
1182 }
1183 }
1184
1185 bret = true;
1186
1187 on_error:
1188 lxc_free_array((void **)parts, free);
1189 free(add_controllers);
1190 free(cgroup);
1191 return bret;
1192 }
1193
1194 static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1195 {
1196 int ret;
1197
1198 h->monitor_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1199 if (dir_exists(h->monitor_full_path))
1200 return true;
1201
1202 ret = mkdir_p(h->monitor_full_path, 0755);
1203 if (ret < 0) {
1204 ERROR("Failed to create cgroup \"%s\"", h->monitor_full_path);
1205 return false;
1206 }
1207
1208 return cg_unified_create_cgroup(h, cgname);
1209 }
1210
1211 static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1212 {
1213 int ret;
1214
1215 h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
1216 if (dir_exists(h->container_full_path)) {
1217 ERROR("The cgroup \"%s\" already existed", h->container_full_path);
1218 return false;
1219 }
1220
1221 if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1222 ERROR("Failed to handle legacy cpuset controller");
1223 return false;
1224 }
1225
1226 ret = mkdir_p(h->container_full_path, 0755);
1227 if (ret < 0) {
1228 ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
1229 return false;
1230 }
1231
1232 return cg_unified_create_cgroup(h, cgname);
1233 }
1234
1235 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname, bool monitor)
1236 {
1237 int ret;
1238 char *full_path;
1239
1240 if (monitor)
1241 full_path = h->monitor_full_path;
1242 else
1243 full_path = h->container_full_path;
1244
1245 ret = rmdir(full_path);
1246 if (ret < 0)
1247 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", full_path);
1248
1249 free(full_path);
1250
1251 if (monitor)
1252 h->monitor_full_path = NULL;
1253 else
1254 h->container_full_path = NULL;
1255 }
1256
1257 __cgfsng_ops static inline bool cgfsng_monitor_create(struct cgroup_ops *ops,
1258 struct lxc_handler *handler)
1259 {
1260 char *monitor_cgroup;
1261 bool bret = false;
1262 struct lxc_conf *conf = handler->conf;
1263
1264 if (!conf)
1265 return bret;
1266
1267 if (conf->cgroup_meta.dir)
1268 monitor_cgroup = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, ops->monitor_pattern, handler->name, NULL}, false);
1269 else
1270 monitor_cgroup = must_make_path(ops->monitor_pattern, handler->name, NULL);
1271 if (!monitor_cgroup)
1272 return bret;
1273
1274 for (int i = 0; ops->hierarchies[i]; i++) {
1275 if (!monitor_create_path_for_hierarchy(ops->hierarchies[i], monitor_cgroup)) {
1276 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->monitor_full_path);
1277 free(ops->hierarchies[i]->container_full_path);
1278 ops->hierarchies[i]->container_full_path = NULL;
1279 for (int j = 0; j < i; j++)
1280 remove_path_for_hierarchy(ops->hierarchies[j], monitor_cgroup, true);
1281 goto on_error;
1282 }
1283 }
1284
1285 bret = true;
1286
1287 on_error:
1288 free(monitor_cgroup);
1289
1290 return bret;
1291 }
1292
1293 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1294 * next cgroup_pattern-1, -2, ..., -999.
1295 */
1296 __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
1297 struct lxc_handler *handler)
1298 {
1299 int i;
1300 size_t len;
1301 char *container_cgroup, *offset, *tmp;
1302 int idx = 0;
1303 struct lxc_conf *conf = handler->conf;
1304
1305 if (ops->container_cgroup) {
1306 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
1307 return false;
1308 }
1309
1310 if (!conf)
1311 return false;
1312
1313 if (conf->cgroup_meta.dir)
1314 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1315 else
1316 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1317 if (!tmp) {
1318 ERROR("Failed expanding cgroup name pattern");
1319 return false;
1320 }
1321
1322 len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1323 container_cgroup = must_alloc(len);
1324 (void)strlcpy(container_cgroup, tmp, len);
1325 free(tmp);
1326 offset = container_cgroup + len - 5;
1327
1328 again:
1329 if (idx == 1000) {
1330 ERROR("Too many conflicting cgroup names");
1331 goto out_free;
1332 }
1333
1334 if (idx) {
1335 int ret;
1336
1337 ret = snprintf(offset, 5, "-%d", idx);
1338 if (ret < 0 || (size_t)ret >= 5) {
1339 FILE *f = fopen("/dev/null", "w");
1340 if (f) {
1341 fprintf(f, "Workaround for GCC7 bug: "
1342 "https://gcc.gnu.org/bugzilla/"
1343 "show_bug.cgi?id=78969");
1344 fclose(f);
1345 }
1346 }
1347 }
1348
1349 for (i = 0; ops->hierarchies[i]; i++) {
1350 if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
1351 ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path);
1352 free(ops->hierarchies[i]->container_full_path);
1353 ops->hierarchies[i]->container_full_path = NULL;
1354 for (int j = 0; j < i; j++)
1355 remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup, false);
1356 idx++;
1357 goto again;
1358 }
1359 }
1360
1361 ops->container_cgroup = container_cgroup;
1362
1363 return true;
1364
1365 out_free:
1366 free(container_cgroup);
1367
1368 return false;
1369 }
1370
1371 __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
1372 bool monitor)
1373 {
1374 int len;
1375 char pidstr[25];
1376
1377 len = snprintf(pidstr, 25, "%d", pid);
1378 if (len < 0 || len >= 25)
1379 return false;
1380
1381 for (int i = 0; ops->hierarchies[i]; i++) {
1382 int ret;
1383 char *path;
1384
1385 if (monitor)
1386 path = must_make_path(ops->hierarchies[i]->monitor_full_path,
1387 "cgroup.procs", NULL);
1388 else
1389 path = must_make_path(ops->hierarchies[i]->container_full_path,
1390 "cgroup.procs", NULL);
1391 ret = lxc_write_to_file(path, pidstr, len, false, 0666);
1392 if (ret != 0) {
1393 SYSERROR("Failed to enter cgroup \"%s\"", path);
1394 free(path);
1395 return false;
1396 }
1397 free(path);
1398 }
1399
1400 return true;
1401 }
1402
1403 __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
1404 {
1405 return __do_cgroup_enter(ops, pid, true);
1406 }
1407
1408 static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
1409 {
1410 return __do_cgroup_enter(ops, pid, false);
1411 }
1412
1413 static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
1414 mode_t chmod_mode)
1415 {
1416 int ret;
1417
1418 ret = chown(path, chown_uid, chown_gid);
1419 if (ret < 0) {
1420 SYSWARN("Failed to chown(%s, %d, %d)", path, (int)chown_uid, (int)chown_gid);
1421 return -1;
1422 }
1423
1424 ret = chmod(path, chmod_mode);
1425 if (ret < 0) {
1426 SYSWARN("Failed to chmod(%s, %d)", path, (int)chmod_mode);
1427 return -1;
1428 }
1429
1430 return 0;
1431 }
1432
1433 /* chgrp the container cgroups to the container's group. We leave
1434 * the container owner as the cgroup owner, so we must make the
1435 * directories 775 so that the container can create sub-cgroups.
1436 *
1437 * Also chown the tasks and cgroup.procs files. Those may not
1438 * exist depending on kernel version.
1439 */
1440 static int chown_cgroup_wrapper(void *data)
1441 {
1442 int i, ret;
1443 uid_t destuid;
1444 struct generic_userns_exec_data *arg = data;
1445 uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1446 gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1447
1448 ret = setresgid(nsgid, nsgid, nsgid);
1449 if (ret < 0) {
1450 SYSERROR("Failed to setresgid(%d, %d, %d)",
1451 (int)nsgid, (int)nsgid, (int)nsgid);
1452 return -1;
1453 }
1454
1455 ret = setresuid(nsuid, nsuid, nsuid);
1456 if (ret < 0) {
1457 SYSERROR("Failed to setresuid(%d, %d, %d)",
1458 (int)nsuid, (int)nsuid, (int)nsuid);
1459 return -1;
1460 }
1461
1462 ret = setgroups(0, NULL);
1463 if (ret < 0 && errno != EPERM) {
1464 SYSERROR("Failed to setgroups(0, NULL)");
1465 return -1;
1466 }
1467
1468 destuid = get_ns_uid(arg->origuid);
1469 if (destuid == LXC_INVALID_UID)
1470 destuid = 0;
1471
1472 for (i = 0; arg->hierarchies[i]; i++) {
1473 char *fullpath;
1474 char *path = arg->hierarchies[i]->container_full_path;
1475
1476 ret = chowmod(path, destuid, nsgid, 0775);
1477 if (ret < 0)
1478 return -1;
1479
1480 /* Failures to chown() these are inconvenient but not
1481 * detrimental. We leave these owned by the container launcher,
1482 * so that container root can write to the files to attach. We
1483 * chmod() them 664 so that container systemd can write to the
1484 * files (which systemd in wily insists on doing).
1485 */
1486
1487 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1488 fullpath = must_make_path(path, "tasks", NULL);
1489 (void)chowmod(fullpath, destuid, nsgid, 0664);
1490 free(fullpath);
1491 }
1492
1493 fullpath = must_make_path(path, "cgroup.procs", NULL);
1494 (void)chowmod(fullpath, destuid, nsgid, 0664);
1495 free(fullpath);
1496
1497 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1498 continue;
1499
1500 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
1501 (void)chowmod(fullpath, destuid, nsgid, 0664);
1502 free(fullpath);
1503
1504 fullpath = must_make_path(path, "cgroup.threads", NULL);
1505 (void)chowmod(fullpath, destuid, nsgid, 0664);
1506 free(fullpath);
1507 }
1508
1509 return 0;
1510 }
1511
1512 __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
1513 struct lxc_conf *conf)
1514 {
1515 struct generic_userns_exec_data wrap;
1516
1517 if (lxc_list_empty(&conf->id_map))
1518 return true;
1519
1520 wrap.origuid = geteuid();
1521 wrap.path = NULL;
1522 wrap.hierarchies = ops->hierarchies;
1523 wrap.conf = conf;
1524
1525 if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1526 "chown_cgroup_wrapper") < 0) {
1527 ERROR("Error requesting cgroup chown in new user namespace");
1528 return false;
1529 }
1530
1531 return true;
1532 }
1533
1534 /* cgroup-full:* is done, no need to create subdirs */
1535 static bool cg_mount_needs_subdirs(int type)
1536 {
1537 if (type >= LXC_AUTO_CGROUP_FULL_RO)
1538 return false;
1539
1540 return true;
1541 }
1542
1543 /* After $rootfs/sys/fs/cgroup/controller/the/cg/path has been created,
1544 * remount controller ro if needed and bindmount the cgroupfs onto
1545 * controller/the/cg/path.
1546 */
1547 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1548 char *controllerpath, char *cgpath,
1549 const char *container_cgroup)
1550 {
1551 int ret, remount_flags;
1552 char *sourcepath;
1553 int flags = MS_BIND;
1554
1555 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1556 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1557 if (ret < 0) {
1558 SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1559 controllerpath, controllerpath);
1560 return -1;
1561 }
1562
1563 remount_flags = add_required_remount_flags(controllerpath,
1564 controllerpath,
1565 flags | MS_REMOUNT);
1566 ret = mount(controllerpath, controllerpath, "cgroup",
1567 remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1568 NULL);
1569 if (ret < 0) {
1570 SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1571 return -1;
1572 }
1573
1574 INFO("Remounted %s read-only", controllerpath);
1575 }
1576
1577 sourcepath = must_make_path(h->mountpoint, h->container_base_path,
1578 container_cgroup, NULL);
1579 if (type == LXC_AUTO_CGROUP_RO)
1580 flags |= MS_RDONLY;
1581
1582 ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1583 if (ret < 0) {
1584 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1585 free(sourcepath);
1586 return -1;
1587 }
1588 INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1589
1590 if (flags & MS_RDONLY) {
1591 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1592 flags | MS_REMOUNT);
1593 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1594 if (ret < 0) {
1595 SYSERROR("Failed to remount \"%s\" ro", cgpath);
1596 free(sourcepath);
1597 return -1;
1598 }
1599 INFO("Remounted %s read-only", cgpath);
1600 }
1601
1602 free(sourcepath);
1603 INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1604 return 0;
1605 }
1606
1607 /* __cg_mount_direct
1608 *
1609 * Mount cgroup hierarchies directly without using bind-mounts. The main
1610 * use-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1611 * cgroups for the LXC_AUTO_CGROUP_FULL option.
1612 */
1613 static int __cg_mount_direct(int type, struct hierarchy *h,
1614 const char *controllerpath)
1615 {
1616 int ret;
1617 char *controllers = NULL;
1618 char *fstype = "cgroup2";
1619 unsigned long flags = 0;
1620
1621 flags |= MS_NOSUID;
1622 flags |= MS_NOEXEC;
1623 flags |= MS_NODEV;
1624 flags |= MS_RELATIME;
1625
1626 if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1627 flags |= MS_RDONLY;
1628
1629 if (h->version != CGROUP2_SUPER_MAGIC) {
1630 controllers = lxc_string_join(",", (const char **)h->controllers, false);
1631 if (!controllers)
1632 return -ENOMEM;
1633 fstype = "cgroup";
1634 }
1635
1636 ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1637 free(controllers);
1638 if (ret < 0) {
1639 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1640 return -1;
1641 }
1642
1643 DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1644 return 0;
1645 }
1646
1647 static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
1648 const char *controllerpath)
1649 {
1650 return __cg_mount_direct(type, h, controllerpath);
1651 }
1652
1653 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1654 const char *controllerpath)
1655 {
1656 if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1657 return 0;
1658
1659 return __cg_mount_direct(type, h, controllerpath);
1660 }
1661
1662 __cgfsng_ops static bool cgfsng_mount(struct cgroup_ops *ops,
1663 struct lxc_handler *handler,
1664 const char *root, int type)
1665 {
1666 int i, ret;
1667 char *tmpfspath = NULL;
1668 bool has_cgns = false, retval = false, wants_force_mount = false;
1669
1670 if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1671 return true;
1672
1673 if (type & LXC_AUTO_CGROUP_FORCE) {
1674 type &= ~LXC_AUTO_CGROUP_FORCE;
1675 wants_force_mount = true;
1676 }
1677
1678 if (!wants_force_mount){
1679 if (!lxc_list_empty(&handler->conf->keepcaps))
1680 wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1681 else
1682 wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1683 }
1684
1685 has_cgns = cgns_supported();
1686 if (has_cgns && !wants_force_mount)
1687 return true;
1688
1689 if (type == LXC_AUTO_CGROUP_NOSPEC)
1690 type = LXC_AUTO_CGROUP_MIXED;
1691 else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1692 type = LXC_AUTO_CGROUP_FULL_MIXED;
1693
1694 /* Mount tmpfs */
1695 tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1696 ret = safe_mount(NULL, tmpfspath, "tmpfs",
1697 MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1698 "size=10240k,mode=755", root);
1699 if (ret < 0)
1700 goto on_error;
1701
1702 for (i = 0; ops->hierarchies[i]; i++) {
1703 char *controllerpath, *path2;
1704 struct hierarchy *h = ops->hierarchies[i];
1705 char *controller = strrchr(h->mountpoint, '/');
1706
1707 if (!controller)
1708 continue;
1709 controller++;
1710
1711 controllerpath = must_make_path(tmpfspath, controller, NULL);
1712 if (dir_exists(controllerpath)) {
1713 free(controllerpath);
1714 continue;
1715 }
1716
1717 ret = mkdir(controllerpath, 0755);
1718 if (ret < 0) {
1719 SYSERROR("Error creating cgroup path: %s", controllerpath);
1720 free(controllerpath);
1721 goto on_error;
1722 }
1723
1724 if (has_cgns && wants_force_mount) {
1725 /* If cgroup namespaces are supported but the container
1726 * will not have CAP_SYS_ADMIN after it has started we
1727 * need to mount the cgroups manually.
1728 */
1729 ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1730 free(controllerpath);
1731 if (ret < 0)
1732 goto on_error;
1733
1734 continue;
1735 }
1736
1737 ret = cg_mount_cgroup_full(type, h, controllerpath);
1738 if (ret < 0) {
1739 free(controllerpath);
1740 goto on_error;
1741 }
1742
1743 if (!cg_mount_needs_subdirs(type)) {
1744 free(controllerpath);
1745 continue;
1746 }
1747
1748 path2 = must_make_path(controllerpath, h->container_base_path,
1749 ops->container_cgroup, NULL);
1750 ret = mkdir_p(path2, 0755);
1751 if (ret < 0) {
1752 free(controllerpath);
1753 free(path2);
1754 goto on_error;
1755 }
1756
1757 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1758 path2, ops->container_cgroup);
1759 free(controllerpath);
1760 free(path2);
1761 if (ret < 0)
1762 goto on_error;
1763 }
1764 retval = true;
1765
1766 on_error:
1767 free(tmpfspath);
1768 return retval;
1769 }
1770
1771 static int recursive_count_nrtasks(char *dirname)
1772 {
1773 struct dirent *direntp;
1774 DIR *dir;
1775 int count = 0, ret;
1776 char *path;
1777
1778 dir = opendir(dirname);
1779 if (!dir)
1780 return 0;
1781
1782 while ((direntp = readdir(dir))) {
1783 struct stat mystat;
1784
1785 if (!strcmp(direntp->d_name, ".") ||
1786 !strcmp(direntp->d_name, ".."))
1787 continue;
1788
1789 path = must_make_path(dirname, direntp->d_name, NULL);
1790
1791 if (lstat(path, &mystat))
1792 goto next;
1793
1794 if (!S_ISDIR(mystat.st_mode))
1795 goto next;
1796
1797 count += recursive_count_nrtasks(path);
1798 next:
1799 free(path);
1800 }
1801
1802 path = must_make_path(dirname, "cgroup.procs", NULL);
1803 ret = lxc_count_file_lines(path);
1804 if (ret != -1)
1805 count += ret;
1806 free(path);
1807
1808 (void)closedir(dir);
1809
1810 return count;
1811 }
1812
1813 __cgfsng_ops static int cgfsng_nrtasks(struct cgroup_ops *ops)
1814 {
1815 int count;
1816 char *path;
1817
1818 if (!ops->container_cgroup || !ops->hierarchies)
1819 return -1;
1820
1821 path = must_make_path(ops->hierarchies[0]->container_full_path, NULL);
1822 count = recursive_count_nrtasks(path);
1823 free(path);
1824 return count;
1825 }
1826
1827 /* Only root needs to escape to the cgroup of its init. */
1828 __cgfsng_ops static bool cgfsng_escape(const struct cgroup_ops *ops,
1829 struct lxc_conf *conf)
1830 {
1831 int i;
1832
1833 if (conf->cgroup_meta.relative || geteuid())
1834 return true;
1835
1836 for (i = 0; ops->hierarchies[i]; i++) {
1837 int ret;
1838 char *fullpath;
1839
1840 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1841 ops->hierarchies[i]->container_base_path,
1842 "cgroup.procs", NULL);
1843 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1844 if (ret != 0) {
1845 SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1846 free(fullpath);
1847 return false;
1848 }
1849 free(fullpath);
1850 }
1851
1852 return true;
1853 }
1854
1855 __cgfsng_ops static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1856 {
1857 int i;
1858
1859 for (i = 0; ops->hierarchies[i]; i++)
1860 ;
1861
1862 return i;
1863 }
1864
1865 __cgfsng_ops static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1866 {
1867 int i;
1868
1869 /* sanity check n */
1870 for (i = 0; i < n; i++)
1871 if (!ops->hierarchies[i])
1872 return false;
1873
1874 *out = ops->hierarchies[i]->controllers;
1875
1876 return true;
1877 }
1878
1879 #define THAWED "THAWED"
1880 #define THAWED_LEN (strlen(THAWED))
1881
1882 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1883 * to be adapted.
1884 */
1885 __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1886 {
1887 int ret;
1888 char *fullpath;
1889 struct hierarchy *h;
1890
1891 h = get_hierarchy(ops, "freezer");
1892 if (!h)
1893 return false;
1894
1895 fullpath = must_make_path(h->container_full_path, "freezer.state", NULL);
1896 ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1897 free(fullpath);
1898 if (ret < 0)
1899 return false;
1900
1901 return true;
1902 }
1903
1904 __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1905 const char *controller)
1906 {
1907 struct hierarchy *h;
1908
1909 h = get_hierarchy(ops, controller);
1910 if (!h) {
1911 WARN("Failed to find hierarchy for controller \"%s\"",
1912 controller ? controller : "(null)");
1913 return NULL;
1914 }
1915
1916 return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
1917 }
1918
1919 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
1920 * which must be freed by the caller.
1921 */
1922 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1923 const char *inpath,
1924 const char *filename)
1925 {
1926 return must_make_path(h->mountpoint, inpath, filename, NULL);
1927 }
1928
1929 /* Technically, we're always at a delegation boundary here (this is especially
1930 * true when cgroup namespaces are available). The reasoning is that in order
1931 * for us to have been able to start a container in the first place, the root
1932 * cgroup must have been a leaf node. Now, either the container's init system
1933 * has populated the cgroup and kept it as a leaf node or it has created
1934 * subtrees. In the former case we simply attach to the leaf node we created
1935 * when we started the container; in the latter case we create our own cgroup
1936 * for the attaching process.
1937 */
1938 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
1939 const char *lxcpath, const char *pidstr,
1940 size_t pidstr_len, const char *controller)
1941 {
1942 int ret;
1943 size_t len;
1944 int fret = -1, idx = 0;
1945 char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
1946
1947 container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
1948 /* not running */
1949 if (!container_cgroup)
1950 return 0;
1951
1952 base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
1953 full_path = must_make_path(base_path, "cgroup.procs", NULL);
1954 /* cgroup is populated */
1955 ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
1956 if (ret < 0 && errno != EBUSY)
1957 goto on_error;
1958
1959 if (ret == 0)
1960 goto on_success;
1961
1962 free(full_path);
1963
1964 len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
1965 sizeof("/cgroup-procs") - 1;
1966 full_path = must_alloc(len + 1);
1967 do {
1968 if (idx)
1969 ret = snprintf(full_path, len + 1, "%s/lxc-%d",
1970 base_path, idx);
1971 else
1972 ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
1973 if (ret < 0 || (size_t)ret >= len + 1)
1974 goto on_error;
1975
1976 ret = mkdir_p(full_path, 0755);
1977 if (ret < 0 && errno != EEXIST)
1978 goto on_error;
1979
1980 (void)strlcat(full_path, "/cgroup.procs", len + 1);
1981 ret = lxc_write_to_file(full_path, pidstr, len, false, 0666);
1982 if (ret == 0)
1983 goto on_success;
1984
1985 /* this is a non-leaf node */
1986 if (errno != EBUSY)
1987 goto on_error;
1988
1989 } while (++idx > 0 && idx < 1000);
1990
1991 on_success:
1992 if (idx < 1000)
1993 fret = 0;
1994
1995 on_error:
1996 free(base_path);
1997 free(container_cgroup);
1998 free(full_path);
1999
2000 return fret;
2001 }
2002
2003 __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
2004 const char *lxcpath, pid_t pid)
2005 {
2006 int i, len, ret;
2007 char pidstr[25];
2008
2009 len = snprintf(pidstr, 25, "%d", pid);
2010 if (len < 0 || len >= 25)
2011 return false;
2012
2013 for (i = 0; ops->hierarchies[i]; i++) {
2014 char *path;
2015 char *fullpath = NULL;
2016 struct hierarchy *h = ops->hierarchies[i];
2017
2018 if (h->version == CGROUP2_SUPER_MAGIC) {
2019 ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
2020 h->controllers[0]);
2021 if (ret < 0)
2022 return false;
2023
2024 continue;
2025 }
2026
2027 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2028 /* not running */
2029 if (!path)
2030 continue;
2031
2032 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2033 free(path);
2034 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2035 if (ret < 0) {
2036 SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2037 free(fullpath);
2038 return false;
2039 }
2040 free(fullpath);
2041 }
2042
2043 return true;
2044 }
2045
2046 /* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits. Here we
2047 * don't have a cgroup_data set up, so we ask the running container through the
2048 * commands API for the cgroup path.
2049 */
2050 __cgfsng_ops static int cgfsng_get(struct cgroup_ops *ops, const char *filename,
2051 char *value, size_t len, const char *name,
2052 const char *lxcpath)
2053 {
2054 int ret = -1;
2055 size_t controller_len;
2056 char *controller, *p, *path;
2057 struct hierarchy *h;
2058
2059 controller_len = strlen(filename);
2060 controller = alloca(controller_len + 1);
2061 (void)strlcpy(controller, filename, controller_len + 1);
2062
2063 p = strchr(controller, '.');
2064 if (p)
2065 *p = '\0';
2066
2067 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2068 /* not running */
2069 if (!path)
2070 return -1;
2071
2072 h = get_hierarchy(ops, controller);
2073 if (h) {
2074 char *fullpath;
2075
2076 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2077 ret = lxc_read_from_file(fullpath, value, len);
2078 free(fullpath);
2079 }
2080 free(path);
2081
2082 return ret;
2083 }
2084
2085 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits. Here we
2086 * don't have a cgroup_data set up, so we ask the running container through the
2087 * commands API for the cgroup path.
2088 */
2089 __cgfsng_ops static int cgfsng_set(struct cgroup_ops *ops,
2090 const char *filename, const char *value,
2091 const char *name, const char *lxcpath)
2092 {
2093 int ret = -1;
2094 size_t controller_len;
2095 char *controller, *p, *path;
2096 struct hierarchy *h;
2097
2098 controller_len = strlen(filename);
2099 controller = alloca(controller_len + 1);
2100 (void)strlcpy(controller, filename, controller_len + 1);
2101
2102 p = strchr(controller, '.');
2103 if (p)
2104 *p = '\0';
2105
2106 path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2107 /* not running */
2108 if (!path)
2109 return -1;
2110
2111 h = get_hierarchy(ops, controller);
2112 if (h) {
2113 char *fullpath;
2114
2115 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2116 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2117 free(fullpath);
2118 }
2119 free(path);
2120
2121 return ret;
2122 }
2123
2124 /* Take a devices cgroup line such as
2125 * /dev/foo rwx
2126 * and convert it to a valid
2127 * type major:minor mode
2128 * line. Return <0 on error. @dest is a preallocated buffer long enough to hold
2129 * the output.
2130 */
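/* For example, assuming /dev/null is the usual character device 1:3, the
 * input line
 *
 *	/dev/null rwm
 *
 * is converted to
 *
 *	c 1:3 rwm
 */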
2131 static int convert_devpath(const char *invalue, char *dest)
2132 {
2133 int n_parts;
2134 char *p, *path, type;
2135 unsigned long minor, major;
2136 struct stat sb;
2137 int ret = -EINVAL;
2138 char *mode = NULL;
2139
2140 path = must_copy_string(invalue);
2141
2142 /* Read path followed by mode. Ignore any trailing text.
2143 * A ' # comment' would be legal. Technically other text is not
2144 * legal; we could check for that if we cared to.
2145 */
2146 for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2147 if (*p != ' ')
2148 continue;
2149 *p = '\0';
2150
2151 if (n_parts != 1)
2152 break;
2153 p++;
2154 n_parts++;
2155
2156 while (*p == ' ')
2157 p++;
2158
2159 mode = p;
2160
2161 if (*p == '\0')
2162 goto out;
2163 }
2164
2165 if (n_parts == 1)
2166 goto out;
2167
2168 ret = stat(path, &sb);
2169 if (ret < 0)
2170 goto out;
2171
2172 mode_t m = sb.st_mode & S_IFMT;
2173 switch (m) {
2174 case S_IFBLK:
2175 type = 'b';
2176 break;
2177 case S_IFCHR:
2178 type = 'c';
2179 break;
2180 default:
2181 ERROR("Unsupported device type %i for \"%s\"", m, path);
2182 ret = -EINVAL;
2183 goto out;
2184 }
2185
2186 major = MAJOR(sb.st_rdev);
2187 minor = MINOR(sb.st_rdev);
2188 ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2189 if (ret < 0 || ret >= 50) {
2190 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2191 "chars)", type, major, minor, mode);
2192 ret = -ENAMETOOLONG;
2193 goto out;
2194 }
2195 ret = 0;
2196
2197 out:
2198 free(path);
2199 return ret;
2200 }
2201
2202 /* Called from setup_limits - here we have the container's cgroup_data because
2203 * we created the cgroups.
2204 */
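/* For example, a container config line such as
 *
 *	lxc.cgroup.memory.limit_in_bytes = 256M
 *
 * arrives here with @filename = "memory.limit_in_bytes" and @value = "256M",
 * and the value is written to that file in the container's "memory" cgroup.
 */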
2205 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2206 const char *value)
2207 {
2208 size_t len;
2209 char *fullpath, *p;
2210 /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2211 char converted_value[50];
2212 struct hierarchy *h;
2213 int ret = 0;
2214 char *controller = NULL;
2215
2216 len = strlen(filename);
2217 controller = alloca(len + 1);
2218 (void)strlcpy(controller, filename, len + 1);
2219
2220 p = strchr(controller, '.');
2221 if (p)
2222 *p = '\0';
2223
2224 if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2225 ret = convert_devpath(value, converted_value);
2226 if (ret < 0)
2227 return ret;
2228 value = converted_value;
2229 }
2230
2231 h = get_hierarchy(ops, controller);
2232 if (!h) {
2233 ERROR("Failed to set up limits for the \"%s\" controller. "
2234 "The controller seems to be unused by the \"cgfsng\" cgroup "
2235 "driver or not enabled on the cgroup hierarchy",
2236 controller);
2237 errno = ENOENT;
2238 return -ENOENT;
2239 }
2240
2241 fullpath = must_make_path(h->container_full_path, filename, NULL);
2242 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2243 free(fullpath);
2244 return ret;
2245 }
2246
2247 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2248 struct lxc_list *cgroup_settings,
2249 bool do_devices)
2250 {
2251 struct lxc_list *iterator, *next, *sorted_cgroup_settings;
2252 struct lxc_cgroup *cg;
2253 bool ret = false;
2254
2255 if (lxc_list_empty(cgroup_settings))
2256 return true;
2257
2258 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2259 if (!sorted_cgroup_settings)
2260 return false;
2261
2262 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2263 cg = iterator->elem;
2264
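/* Handle device entries only when @do_devices is true and all other
 * settings only when it is false.
 */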
2265 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2266 if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2267 if (do_devices && (errno == EACCES || errno == EPERM)) {
2268 WARN("Failed to set \"%s\" to \"%s\"",
2269 cg->subsystem, cg->value);
2270 continue;
2271 }
2272 WARN("Failed to set \"%s\" to \"%s\"",
2273 cg->subsystem, cg->value);
2274 goto out;
2275 }
2276 DEBUG("Set controller \"%s\" set to \"%s\"",
2277 cg->subsystem, cg->value);
2278 }
2279 }
2280
2281 ret = true;
2282 INFO("Limits for the legacy cgroup hierarchies have been setup");
2283 out:
2284 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2285 lxc_list_del(iterator);
2286 free(iterator);
2287 }
2288 free(sorted_cgroup_settings);
2289 return ret;
2290 }
2291
2292 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2293 struct lxc_list *cgroup_settings)
2294 {
2295 struct lxc_list *iterator;
2296 struct hierarchy *h = ops->unified;
2297
2298 if (lxc_list_empty(cgroup_settings))
2299 return true;
2300
2301 if (!h)
2302 return false;
2303
2304 lxc_list_for_each(iterator, cgroup_settings) {
2305 int ret;
2306 char *fullpath;
2307 struct lxc_cgroup *cg = iterator->elem;
2308
2309 fullpath = must_make_path(h->container_full_path, cg->subsystem, NULL);
2310 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2311 free(fullpath);
2312 if (ret < 0) {
2313 SYSERROR("Failed to set \"%s\" to \"%s\"",
2314 cg->subsystem, cg->value);
2315 return false;
2316 }
2317 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2318 }
2319
2320 INFO("Limits for the unified cgroup hierarchy have been setup");
2321 return true;
2322 }
2323
2324 __cgfsng_ops static bool cgfsng_setup_limits(struct cgroup_ops *ops,
2325 struct lxc_conf *conf,
2326 bool do_devices)
2327 {
2328 bool bret;
2329
2330 bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2331 if (!bret)
2332 return false;
2333
2334 return __cg_unified_setup_limits(ops, &conf->cgroup2);
2335 }
2336
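/* Check whether every controller in @controllers is listed in lxc.cgroup.use.
 * If lxc.cgroup.use is not set, all controllers are accepted.
 */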
2337 static bool cgroup_use_wants_controllers(const struct cgroup_ops *ops,
2338 char **controllers)
2339 {
2340 char **cur_ctrl, **cur_use;
2341
2342 if (!ops->cgroup_use)
2343 return true;
2344
2345 for (cur_ctrl = controllers; cur_ctrl && *cur_ctrl; cur_ctrl++) {
2346 bool found = false;
2347
2348 for (cur_use = ops->cgroup_use; cur_use && *cur_use; cur_use++) {
2349 if (strcmp(*cur_use, *cur_ctrl) != 0)
2350 continue;
2351
2352 found = true;
2353 break;
2354 }
2355
2356 if (found)
2357 continue;
2358
2359 return false;
2360 }
2361
2362 return true;
2363 }
2364
2365 /* At startup, cg_hybrid_init() finds all the info we need about cgroup
2366 * mountpoints and current cgroups, and stores it in @ops.
2367 */
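/* For instance, on a hybrid system a legacy /proc/self/mountinfo entry looks
 * roughly like
 *
 *	... /sys/fs/cgroup/memory ... - cgroup cgroup rw,memory
 *
 * and the matching /proc/self/cgroup line like
 *
 *	9:memory:/user.slice
 *
 * while the unified hierarchy shows up as a cgroup2 mount and a "0::/..."
 * line.
 */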
2368 static bool cg_hybrid_init(struct cgroup_ops *ops, bool relative)
2369 {
2370 int ret;
2371 char *basecginfo;
2372 FILE *f;
2373 size_t len = 0;
2374 char *line = NULL;
2375 char **klist = NULL, **nlist = NULL;
2376
2377 /* Root-spawned containers escape the current cgroup, so use init's
2378 * cgroups as our base in that case.
2379 */
2380 if (!relative && (geteuid() == 0))
2381 basecginfo = read_file("/proc/1/cgroup");
2382 else
2383 basecginfo = read_file("/proc/self/cgroup");
2384 if (!basecginfo)
2385 return false;
2386
2387 ret = get_existing_subsystems(&klist, &nlist);
2388 if (ret < 0) {
2389 ERROR("Failed to retrieve available legacy cgroup controllers");
2390 free(basecginfo);
2391 return false;
2392 }
2393
2394 f = fopen("/proc/self/mountinfo", "r");
2395 if (!f) {
2396 ERROR("Failed to open \"/proc/self/mountinfo\"");
2397 free(basecginfo);
2398 return false;
2399 }
2400
2401 lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2402
2403 while (getline(&line, &len, f) != -1) {
2404 int type;
2405 bool writeable;
2406 struct hierarchy *new;
2407 char *base_cgroup = NULL, *mountpoint = NULL;
2408 char **controller_list = NULL;
2409
2410 type = get_cgroup_version(line);
2411 if (type == 0)
2412 continue;
2413
2414 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2415 continue;
2416
2417 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2418 if (type == CGROUP2_SUPER_MAGIC)
2419 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2420 else if (type == CGROUP_SUPER_MAGIC)
2421 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2422 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2423 if (type == CGROUP_SUPER_MAGIC)
2424 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2425 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2426 if (type == CGROUP2_SUPER_MAGIC)
2427 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2428 }
2429
2430 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2431 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2432 continue;
2433
2434 if (type == CGROUP_SUPER_MAGIC)
2435 if (controller_list_is_dup(ops->hierarchies, controller_list))
2436 goto next;
2437
2438 mountpoint = cg_hybrid_get_mountpoint(line);
2439 if (!mountpoint) {
2440 ERROR("Failed parsing mountpoint from \"%s\"", line);
2441 goto next;
2442 }
2443
2444 if (type == CGROUP_SUPER_MAGIC)
2445 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2446 else
2447 base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2448 if (!base_cgroup) {
2449 ERROR("Failed to find current cgroup");
2450 goto next;
2451 }
2452
2453 trim(base_cgroup);
2454 prune_init_scope(base_cgroup);
2455 if (type == CGROUP2_SUPER_MAGIC)
2456 writeable = test_writeable_v2(mountpoint, base_cgroup);
2457 else
2458 writeable = test_writeable_v1(mountpoint, base_cgroup);
2459 if (!writeable)
2460 goto next;
2461
2462 if (type == CGROUP2_SUPER_MAGIC) {
2463 char *cgv2_ctrl_path;
2464
2465 cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2466 "cgroup.controllers",
2467 NULL);
2468
2469 controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2470 free(cgv2_ctrl_path);
2471 if (!controller_list) {
2472 controller_list = cg_unified_make_empty_controller();
2473 TRACE("No controllers are enabled for "
2474 "delegation in the unified hierarchy");
2475 }
2476 }
2477
2478 /* Exclude all controllers that cgroup use does not want. */
2479 if (!cgroup_use_wants_controllers(ops, controller_list))
2480 goto next;
2481
2482 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2483 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
2484 ops->unified = new;
2485
2486 continue;
2487
2488 next:
2489 free_string_list(controller_list);
2490 free(mountpoint);
2491 free(base_cgroup);
2492 }
2493
2494 free_string_list(klist);
2495 free_string_list(nlist);
2496
2497 free(basecginfo);
2498
2499 fclose(f);
2500 free(line);
2501
2502 TRACE("Writable cgroup hierarchies:");
2503 lxc_cgfsng_print_hierarchies(ops);
2504
2505 /* Verify that all controllers listed in lxc.cgroup.use and all crucial
2506 * controllers are accounted for.
2507 */
2508 if (!all_controllers_found(ops))
2509 return false;
2510
2511 return true;
2512 }
2513
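/* Check whether /sys/fs/cgroup itself is a cgroup2 mount, i.e. whether the
 * system runs a pure unified (cgroup v2 only) layout. Returns
 * CGROUP2_SUPER_MAGIC if so, 0 if not, and -ENOMEDIUM if statfs() fails.
 */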
2514 static int cg_is_pure_unified(void)
2515 {
2516
2517 int ret;
2518 struct statfs fs;
2519
2520 ret = statfs("/sys/fs/cgroup", &fs);
2521 if (ret < 0)
2522 return -ENOMEDIUM;
2523
2524 if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2525 return CGROUP2_SUPER_MAGIC;
2526
2527 return 0;
2528 }
2529
2530 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
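/* For example, a /proc/self/cgroup entry of
 *
 *	0::/user.slice/session-1.scope
 *
 * yields "/user.slice/session-1.scope".
 */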
2531 static char *cg_unified_get_current_cgroup(bool relative)
2532 {
2533 char *basecginfo, *base_cgroup;
2534 char *copy = NULL;
2535
2536 if (!relative && (geteuid() == 0))
2537 basecginfo = read_file("/proc/1/cgroup");
2538 else
2539 basecginfo = read_file("/proc/self/cgroup");
2540 if (!basecginfo)
2541 return NULL;
2542
2543 base_cgroup = strstr(basecginfo, "0::/");
2544 if (!base_cgroup)
2545 goto cleanup_on_err;
2546
2547 base_cgroup = base_cgroup + 3;
2548 copy = copy_to_eol(base_cgroup);
2549 if (!copy)
2550 goto cleanup_on_err;
2551
2552 cleanup_on_err:
2553 free(basecginfo);
2554 if (copy)
2555 trim(copy);
2556
2557 return copy;
2558 }
2559
2560 static int cg_unified_init(struct cgroup_ops *ops, bool relative)
2561 {
2562 int ret;
2563 char *mountpoint, *subtree_path;
2564 char **delegatable;
2565 char *base_cgroup = NULL;
2566
2567 ret = cg_is_pure_unified();
2568 if (ret == -ENOMEDIUM)
2569 return -ENOMEDIUM;
2570
2571 if (ret != CGROUP2_SUPER_MAGIC)
2572 return 0;
2573
2574 base_cgroup = cg_unified_get_current_cgroup(relative);
2575 if (!base_cgroup)
2576 return -EINVAL;
2577 prune_init_scope(base_cgroup);
2578
2579 /* We assume that we have already been given controllers to delegate
2580 * further down the hierarchy. If not, it is up to the user to delegate
2581 * them to us.
2582 */
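/* For example, controllers can be enabled for delegation with something like
 *
 *	echo "+memory +pids" > /sys/fs/cgroup/<base cgroup>/cgroup.subtree_control
 *
 * (illustrative path), after which that file reads "memory pids" and both
 * controllers end up in @delegatable below.
 */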
2583 mountpoint = must_copy_string("/sys/fs/cgroup");
2584 subtree_path = must_make_path(mountpoint, base_cgroup,
2585 "cgroup.subtree_control", NULL);
2586 delegatable = cg_unified_get_controllers(subtree_path);
2587 free(subtree_path);
2588 if (!delegatable)
2589 delegatable = cg_unified_make_empty_controller();
2590 if (!delegatable[0])
2591 TRACE("No controllers are enabled for delegation");
2592
2593 /* TODO: If the user requested specific controllers via lxc.cgroup.use
2594 * we should verify that here. The reason I'm not doing it right now is that I'm
2595 * not convinced that lxc.cgroup.use will be the future since it is a
2596 * global property. I would much rather have an option that lets you request
2597 * controllers per container.
2598 */
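/* As parsed in cg_init() below, lxc.cgroup.use is a comma-separated list set
 * in the system-wide configuration, e.g.
 *
 *	lxc.cgroup.use = memory,freezer
 */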
2599
2600 add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2601
2602 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2603 return CGROUP2_SUPER_MAGIC;
2604 }
2605
2606 static bool cg_init(struct cgroup_ops *ops, struct lxc_conf *conf)
2607 {
2608 int ret;
2609 const char *tmp;
2610 bool relative = conf->cgroup_meta.relative;
2611
2612 tmp = lxc_global_config_value("lxc.cgroup.use");
2613 if (tmp) {
2614 char *chop, *cur, *pin;
2615
2616 pin = must_copy_string(tmp);
2617 chop = pin;
2618
2619 lxc_iterate_parts(cur, chop, ",") {
2620 must_append_string(&ops->cgroup_use, cur);
2621 }
2622
2623 free(pin);
2624 }
2625
2626 ret = cg_unified_init(ops, relative);
2627 if (ret < 0)
2628 return false;
2629
2630 if (ret == CGROUP2_SUPER_MAGIC)
2631 return true;
2632
2633 return cg_hybrid_init(ops, relative);
2634 }
2635
2636 __cgfsng_ops static bool cgfsng_data_init(struct cgroup_ops *ops)
2637 {
2638 const char *cgroup_pattern;
2639
2640 /* copy system-wide cgroup information */
2641 cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2642 if (!cgroup_pattern) {
2643 /* lxc.cgroup.pattern is only NULL on error. */
2644 ERROR("Failed to retrieve cgroup pattern");
2645 return false;
2646 }
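/* The pattern is a template in which "%n" is replaced by the container name,
 * so a pattern such as "lxc/%n" (illustrative; the built-in default may
 * differ) yields the cgroup "lxc/c1" for a container named "c1".
 */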
2647 ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2648 ops->monitor_pattern = must_copy_string("lxc.monitor");
2649
2650 return true;
2651 }
2652
2653 struct cgroup_ops *cgfsng_ops_init(struct lxc_conf *conf)
2654 {
2655 struct cgroup_ops *cgfsng_ops;
2656
2657 cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2658 if (!cgfsng_ops)
2659 return NULL;
2660
2661 memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2662 cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2663
2664 if (!cg_init(cgfsng_ops, conf)) {
2665 free(cgfsng_ops);
2666 return NULL;
2667 }
2668
2669 cgfsng_ops->data_init = cgfsng_data_init;
2670 cgfsng_ops->destroy = cgfsng_destroy;
2671 cgfsng_ops->monitor_create = cgfsng_monitor_create;
2672 cgfsng_ops->monitor_enter = cgfsng_monitor_enter;
2673 cgfsng_ops->payload_create = cgfsng_payload_create;
2674 cgfsng_ops->payload_enter = cgfsng_payload_enter;
2675 cgfsng_ops->escape = cgfsng_escape;
2676 cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2677 cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2678 cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2679 cgfsng_ops->get = cgfsng_get;
2680 cgfsng_ops->set = cgfsng_set;
2681 cgfsng_ops->unfreeze = cgfsng_unfreeze;
2682 cgfsng_ops->setup_limits = cgfsng_setup_limits;
2683 cgfsng_ops->driver = "cgfsng";
2684 cgfsng_ops->version = "1.0.0";
2685 cgfsng_ops->attach = cgfsng_attach;
2686 cgfsng_ops->chown = cgfsng_chown;
2687 cgfsng_ops->mount = cgfsng_mount;
2688 cgfsng_ops->nrtasks = cgfsng_nrtasks;
2689
2690 return cgfsng_ops;
2691 }