src/lxc/cgroups/cgfsng.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * Copyright © 2016 Canonical Ltd.
   5  *
   6  * Authors:
   7  * Serge Hallyn <serge.hallyn@ubuntu.com>
   8  * Christian Brauner <christian.brauner@ubuntu.com>
   9  *
  10  * This library is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * This library is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with this library; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /*
  26  * cgfs-ng.c: this is a new, simplified implementation of a filesystem
  27  * cgroup backend.  The original cgfs.c was designed to be as flexible
  28  * as possible.  It would try to find cgroup filesystems no matter where
  29  * or how you had them mounted, and deduce the most usable mount for
  30  * each controller.
  31  *
  32  * This new implementation assumes that cgroup filesystems are mounted
  33  * under /sys/fs/cgroup/clist where clist is either the controller, or
  34  * a comma-separated list of controllers.
  35  */
  36
  37 #include "config.h"
  38
  39 #include <ctype.h>
  40 #include <dirent.h>
  41 #include <errno.h>
  42 #include <grp.h>
  43 #include <stdint.h>
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include <string.h>
  47 #include <unistd.h>
  48 #include <linux/kdev_t.h>
  49 #include <linux/types.h>
  50 #include <sys/types.h>
  51
  52 #include "caps.h"
  53 #include "cgroup.h"
  54 #include "cgroup_utils.h"
  55 #include "commands.h"
  56 #include "conf.h"
  57 #include "log.h"
  58 #include "storage/storage.h"
  59 #include "utils.h"
  60
  61 #ifndef HAVE_STRLCPY
  62 #include "include/strlcpy.h"
  63 #endif
  64
  65 lxc_log_define(lxc_cgfsng, lxc);
  66
  67 static void free_string_list(char **clist)
  68 {
  69         int i;
  70
  71         if (!clist)
  72                 return;
  73
  74         for (i = 0; clist[i]; i++)
  75                 free(clist[i]);
  76
  77         free(clist);
  78 }
  79
  80 /* Allocate a pointer, do not fail. */
  81 static void *must_alloc(size_t sz)
  82 {
  83         return must_realloc(NULL, sz);
  84 }
  85
  86 /* Given a pointer to a null-terminated array of pointers, realloc to add one
  87  * entry, and point the new entry to NULL. Do not fail. Return the index to the
  88  * second-to-last entry - that is, the one which is now available for use
  89  * (keeping the list null-terminated).
  90  */
  91 static int append_null_to_list(void ***list)
  92 {
  93         int newentry = 0;
  94
  95         if (*list)
  96                 for (; (*list)[newentry]; newentry++)
  97                         ;
  98
  99         *list = must_realloc(*list, (newentry + 2) * sizeof(void **));
 100         (*list)[newentry + 1] = NULL;
 101         return newentry;
 102 }
 103
 104 /* Given a null-terminated array of strings, check whether @entry is one of the
 105  * strings.
 106  */
 107 static bool string_in_list(char **list, const char *entry)
 108 {
 109         int i;
 110
 111         if (!list)
 112                 return false;
 113
 114         for (i = 0; list[i]; i++)
 115                 if (strcmp(list[i], entry) == 0)
 116                         return true;
 117
 118         return false;
 119 }
 120
 121 /* Return a copy of @entry prepending "name=", i.e.  turn "systemd" into
 122  * "name=systemd". Do not fail.
 123  */
 124 static char *cg_legacy_must_prefix_named(char *entry)
 125 {
 126         size_t len;
 127         char *prefixed;
 128
 129         len = strlen(entry);
 130         prefixed = must_alloc(len + 6);
 131
 132         memcpy(prefixed, "name=", sizeof("name=") - 1);
 133         memcpy(prefixed + sizeof("name=") - 1, entry, len);
 134         prefixed[len + 5] = '\0';
 135         return prefixed;
 136 }
 137
 138 /* Append an entry to the clist. Do not fail. @clist must be NULL the first time
 139  * we are called.
 140  *
 141  * We also handle named subsystems here. Any controller which is not a kernel
 142  * subsystem, we prefix "name=". Any which is both a kernel and named subsystem,
 143  * we refuse to use because we're not sure which we have here.
 144  * (TODO: We could work around this in some cases by just remounting to be
 145  * unambiguous, or by comparing mountpoint contents with current cgroup.)
 146  *
 147  * The last entry will always be NULL.
 148  */
 149 static void must_append_controller(char **klist, char **nlist, char ***clist,
 150                                    char *entry)
 151 {
 152         int newentry;
 153         char *copy;
 154
 155         if (string_in_list(klist, entry) && string_in_list(nlist, entry)) {
 156                 ERROR("Refusing to use ambiguous controller \"%s\"", entry);
 157                 ERROR("It is both a named and kernel subsystem");
 158                 return;
 159         }
 160
 161         newentry = append_null_to_list((void ***)clist);
 162
 163         if (strncmp(entry, "name=", 5) == 0)
 164                 copy = must_copy_string(entry);
 165         else if (string_in_list(klist, entry))
 166                 copy = must_copy_string(entry);
 167         else
 168                 copy = cg_legacy_must_prefix_named(entry);
 169
 170         (*clist)[newentry] = copy;
 171 }
 172
 173 /* Given a handler's cgroup data, return the struct hierarchy for the controller
 174  * @c, or NULL if there is none.
 175  */
 176 struct hierarchy *get_hierarchy(struct cgroup_ops *ops, const char *c)
 177 {
 178         int i;
 179
 180         if (!ops->hierarchies)
 181                 return NULL;
 182
 183         for (i = 0; ops->hierarchies[i]; i++) {
 184                 if (!c) {
 185                         /* This is the empty unified hierarchy. */
 186                         if (ops->hierarchies[i]->controllers &&
 187                             !ops->hierarchies[i]->controllers[0])
 188                                 return ops->hierarchies[i];
 189
 190                         continue;
 191                 }
 192
 193                 if (string_in_list(ops->hierarchies[i]->controllers, c))
 194                         return ops->hierarchies[i];
 195         }
 196
 197         return NULL;
 198 }
 199
 200 #define BATCH_SIZE 50
 201 static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
 202 {
 203         int newbatches = (newlen / BATCH_SIZE) + 1;
 204         int oldbatches = (oldlen / BATCH_SIZE) + 1;
 205
 206         if (!*mem || newbatches > oldbatches) {
 207                 *mem = must_realloc(*mem, newbatches * BATCH_SIZE);
 208         }
 209 }
 210
 211 static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
 212 {
 213         size_t full = oldlen + newlen;
 214
 215         batch_realloc(dest, oldlen, full + 1);
 216
 217         memcpy(*dest + oldlen, new, newlen + 1);
 218 }
 219
 220 /* Slurp in a whole file */
 221 static char *read_file(const char *fnam)
 222 {
 223         FILE *f;
 224         char *line = NULL, *buf = NULL;
 225         size_t len = 0, fulllen = 0;
 226         int linelen;
 227
 228         f = fopen(fnam, "r");
 229         if (!f)
 230                 return NULL;
 231         while ((linelen = getline(&line, &len, f)) != -1) {
 232                 append_line(&buf, fulllen, line, linelen);
 233                 fulllen += linelen;
 234         }
 235         fclose(f);
 236         free(line);
 237         return buf;
 238 }
 239
 240 /* Taken over modified from the kernel sources. */
 241 #define NBITS 32 /* bits in uint32_t */
 242 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
 243 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
 244
 245 static void set_bit(unsigned bit, uint32_t *bitarr)
 246 {
 247         bitarr[bit / NBITS] |= (1 << (bit % NBITS));
 248 }
 249
 250 static void clear_bit(unsigned bit, uint32_t *bitarr)
 251 {
 252         bitarr[bit / NBITS] &= ~(1 << (bit % NBITS));
 253 }
 254
 255 static bool is_set(unsigned bit, uint32_t *bitarr)
 256 {
 257         return (bitarr[bit / NBITS] & (1 << (bit % NBITS))) != 0;
 258 }
 259
 260 /* Create cpumask from cpulist aka turn:
 261  *
 262  *      0,2-3
 263  *
 264  * into bit array
 265  *
 266  *      1 0 1 1
 267  */
 268 static uint32_t *lxc_cpumask(char *buf, size_t nbits)
 269 {
 270         char *token;
 271         size_t arrlen;
 272         uint32_t *bitarr;
 273         char *saveptr = NULL;
 274
 275         arrlen = BITS_TO_LONGS(nbits);
 276         bitarr = calloc(arrlen, sizeof(uint32_t));
 277         if (!bitarr)
 278                 return NULL;
 279
 280         for (; (token = strtok_r(buf, ",", &saveptr)); buf = NULL) {
 281                 errno = 0;
 282                 unsigned end, start;
 283                 char *range;
 284
 285                 start = strtoul(token, NULL, 0);
 286                 end = start;
 287                 range = strchr(token, '-');
 288                 if (range)
 289                         end = strtoul(range + 1, NULL, 0);
 290
 291                 if (!(start <= end)) {
 292                         free(bitarr);
 293                         return NULL;
 294                 }
 295
 296                 if (end >= nbits) {
 297                         free(bitarr);
 298                         return NULL;
 299                 }
 300
 301                 while (start <= end)
 302                         set_bit(start++, bitarr);
 303         }
 304
 305         return bitarr;
 306 }
 307
 308 /* Turn cpumask into simple, comma-separated cpulist. */
 309 static char *lxc_cpumask_to_cpulist(uint32_t *bitarr, size_t nbits)
 310 {
 311         int ret;
 312         size_t i;
 313         char **cpulist = NULL;
 314         char numstr[LXC_NUMSTRLEN64] = {0};
 315
 316         for (i = 0; i <= nbits; i++) {
 317                 if (!is_set(i, bitarr))
 318                         continue;
 319
 320                 ret = snprintf(numstr, LXC_NUMSTRLEN64, "%zu", i);
 321                 if (ret < 0 || (size_t)ret >= LXC_NUMSTRLEN64) {
 322                         lxc_free_array((void **)cpulist, free);
 323                         return NULL;
 324                 }
 325
 326                 ret = lxc_append_string(&cpulist, numstr);
 327                 if (ret < 0) {
 328                         lxc_free_array((void **)cpulist, free);
 329                         return NULL;
 330                 }
 331         }
 332
 333         if (!cpulist)
 334                 return NULL;
 335
 336         return lxc_string_join(",", (const char **)cpulist, false);
 337 }
 338
 339 static ssize_t get_max_cpus(char *cpulist)
 340 {
 341         char *c1, *c2;
 342         char *maxcpus = cpulist;
 343         size_t cpus = 0;
 344
 345         c1 = strrchr(maxcpus, ',');
 346         if (c1)
 347                 c1++;
 348
 349         c2 = strrchr(maxcpus, '-');
 350         if (c2)
 351                 c2++;
 352
 353         if (!c1 && !c2)
 354                 c1 = maxcpus;
 355         else if (c1 > c2)
 356                 c2 = c1;
 357         else if (c1 < c2)
 358                 c1 = c2;
 359         else if (!c1 && c2)
 360                 c1 = c2;
 361
 362         errno = 0;
 363         cpus = strtoul(c1, NULL, 0);
 364         if (errno != 0)
 365                 return -1;
 366
 367         return cpus;
 368 }
 369
 370 #define __ISOL_CPUS "/sys/devices/system/cpu/isolated"
 371 static bool cg_legacy_filter_and_set_cpus(char *path, bool am_initialized)
 372 {
 373         int ret;
 374         ssize_t i;
 375         char *lastslash, *fpath, oldv;
 376         ssize_t maxisol = 0, maxposs = 0;
 377         char *cpulist = NULL, *isolcpus = NULL, *posscpus = NULL;
 378         uint32_t *isolmask = NULL, *possmask = NULL;
 379         bool bret = false, flipped_bit = false;
 380
 381         lastslash = strrchr(path, '/');
 382         if (!lastslash) {
 383                 ERROR("Failed to detect \"/\" in \"%s\"", path);
 384                 return bret;
 385         }
 386         oldv = *lastslash;
 387         *lastslash = '\0';
 388         fpath = must_make_path(path, "cpuset.cpus", NULL);
 389         posscpus = read_file(fpath);
 390         if (!posscpus) {
 391                 SYSERROR("Failed to read file \"%s\"", fpath);
 392                 goto on_error;
 393         }
 394
 395         /* Get maximum number of cpus found in possible cpuset. */
 396         maxposs = get_max_cpus(posscpus);
 397         if (maxposs < 0)
 398                 goto on_error;
 399
 400         if (!file_exists(__ISOL_CPUS)) {
 401                 /* This system doesn't expose isolated cpus. */
 402                 DEBUG("The path \""__ISOL_CPUS"\" to read isolated cpus from does not exist");
 403                 cpulist = posscpus;
 404                 /* No isolated cpus but we weren't already initialized by
 405                  * someone. We should simply copy the parents cpuset.cpus
 406                  * values.
 407                  */
 408                 if (!am_initialized) {
 409                         DEBUG("Copying cpu settings of parent cgroup");
 410                         goto copy_parent;
 411                 }
 412                 /* No isolated cpus but we were already initialized by someone.
 413                  * Nothing more to do for us.
 414                  */
 415                 goto on_success;
 416         }
 417
 418         isolcpus = read_file(__ISOL_CPUS);
 419         if (!isolcpus) {
 420                 SYSERROR("Failed to read file \""__ISOL_CPUS"\"");
 421                 goto on_error;
 422         }
 423         if (!isdigit(isolcpus[0])) {
 424                 TRACE("No isolated cpus detected");
 425                 cpulist = posscpus;
 426                 /* No isolated cpus but we weren't already initialized by
 427                  * someone. We should simply copy the parents cpuset.cpus
 428                  * values.
 429                  */
 430                 if (!am_initialized) {
 431                         DEBUG("Copying cpu settings of parent cgroup");
 432                         goto copy_parent;
 433                 }
 434                 /* No isolated cpus but we were already initialized by someone.
 435                  * Nothing more to do for us.
 436                  */
 437                 goto on_success;
 438         }
 439
 440         /* Get maximum number of cpus found in isolated cpuset. */
 441         maxisol = get_max_cpus(isolcpus);
 442         if (maxisol < 0)
 443                 goto on_error;
 444
 445         if (maxposs < maxisol)
 446                 maxposs = maxisol;
 447         maxposs++;
 448
 449         possmask = lxc_cpumask(posscpus, maxposs);
 450         if (!possmask) {
 451                 ERROR("Failed to create cpumask for possible cpus");
 452                 goto on_error;
 453         }
 454
 455         isolmask = lxc_cpumask(isolcpus, maxposs);
 456         if (!isolmask) {
 457                 ERROR("Failed to create cpumask for isolated cpus");
 458                 goto on_error;
 459         }
 460
 461         for (i = 0; i <= maxposs; i++) {
 462                 if (!is_set(i, isolmask) || !is_set(i, possmask))
 463                         continue;
 464
 465                 flipped_bit = true;
 466                 clear_bit(i, possmask);
 467         }
 468
 469         if (!flipped_bit) {
 470                 DEBUG("No isolated cpus present in cpuset");
 471                 goto on_success;
 472         }
 473         DEBUG("Removed isolated cpus from cpuset");
 474
 475         cpulist = lxc_cpumask_to_cpulist(possmask, maxposs);
 476         if (!cpulist) {
 477                 ERROR("Failed to create cpu list");
 478                 goto on_error;
 479         }
 480
 481 copy_parent:
 482         *lastslash = oldv;
 483         free(fpath);
 484         fpath = must_make_path(path, "cpuset.cpus", NULL);
 485         ret = lxc_write_to_file(fpath, cpulist, strlen(cpulist), false, 0666);
 486         if (ret < 0) {
 487                 SYSERROR("Failed to write cpu list to \"%s\"", fpath);
 488                 goto on_error;
 489         }
 490
 491 on_success:
 492         bret = true;
 493
 494 on_error:
 495         free(fpath);
 496
 497         free(isolcpus);
 498         free(isolmask);
 499
 500         if (posscpus != cpulist)
 501                 free(posscpus);
 502         free(possmask);
 503
 504         free(cpulist);
 505         return bret;
 506 }
 507
 508 /* Copy contents of parent(@path)/@file to @path/@file */
 509 static bool copy_parent_file(char *path, char *file)
 510 {
 511         int ret;
 512         char *fpath, *lastslash, oldv;
 513         int len = 0;
 514         char *value = NULL;
 515
 516         lastslash = strrchr(path, '/');
 517         if (!lastslash) {
 518                 ERROR("Failed to detect \"/\" in \"%s\"", path);
 519                 return false;
 520         }
 521         oldv = *lastslash;
 522         *lastslash = '\0';
 523         fpath = must_make_path(path, file, NULL);
 524         len = lxc_read_from_file(fpath, NULL, 0);
 525         if (len <= 0)
 526                 goto on_error;
 527
 528         value = must_alloc(len + 1);
 529         ret = lxc_read_from_file(fpath, value, len);
 530         if (ret != len)
 531                 goto on_error;
 532         free(fpath);
 533
 534         *lastslash = oldv;
 535         fpath = must_make_path(path, file, NULL);
 536         ret = lxc_write_to_file(fpath, value, len, false, 0666);
 537         if (ret < 0)
 538                 SYSERROR("Failed to write \"%s\" to file \"%s\"", value, fpath);
 539         free(fpath);
 540         free(value);
 541         return ret >= 0;
 542
 543 on_error:
 544         SYSERROR("Failed to read file \"%s\"", fpath);
 545         free(fpath);
 546         free(value);
 547         return false;
 548 }
 549
 550 /* Initialize the cpuset hierarchy in first directory of @gname and set
 551  * cgroup.clone_children so that children inherit settings. Since the
 552  * h->base_path is populated by init or ourselves, we know it is already
 553  * initialized.
 554  */
 555 static bool cg_legacy_handle_cpuset_hierarchy(struct hierarchy *h, char *cgname)
 556 {
 557         int ret;
 558         char v;
 559         char *cgpath, *clonechildrenpath, *slash;
 560
 561         if (!string_in_list(h->controllers, "cpuset"))
 562                 return true;
 563
 564         if (*cgname == '/')
 565                 cgname++;
 566         slash = strchr(cgname, '/');
 567         if (slash)
 568                 *slash = '\0';
 569
 570         cgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
 571         if (slash)
 572                 *slash = '/';
 573
 574         ret = mkdir(cgpath, 0755);
 575         if (ret < 0) {
 576                 if (errno != EEXIST) {
 577                         SYSERROR("Failed to create directory \"%s\"", cgpath);
 578                         free(cgpath);
 579                         return false;
 580                 }
 581         }
 582
 583         clonechildrenpath =
 584             must_make_path(cgpath, "cgroup.clone_children", NULL);
 585         /* unified hierarchy doesn't have clone_children */
 586         if (!file_exists(clonechildrenpath)) {
 587                 free(clonechildrenpath);
 588                 free(cgpath);
 589                 return true;
 590         }
 591
 592         ret = lxc_read_from_file(clonechildrenpath, &v, 1);
 593         if (ret < 0) {
 594                 SYSERROR("Failed to read file \"%s\"", clonechildrenpath);
 595                 free(clonechildrenpath);
 596                 free(cgpath);
 597                 return false;
 598         }
 599
 600         /* Make sure any isolated cpus are removed from cpuset.cpus. */
 601         if (!cg_legacy_filter_and_set_cpus(cgpath, v == '1')) {
 602                 SYSERROR("Failed to remove isolated cpus");
 603                 free(clonechildrenpath);
 604                 free(cgpath);
 605                 return false;
 606         }
 607
 608         /* Already set for us by someone else. */
 609         if (v == '1') {
 610                 DEBUG("\"cgroup.clone_children\" was already set to \"1\"");
 611                 free(clonechildrenpath);
 612                 free(cgpath);
 613                 return true;
 614         }
 615
 616         /* copy parent's settings */
 617         if (!copy_parent_file(cgpath, "cpuset.mems")) {
 618                 SYSERROR("Failed to copy \"cpuset.mems\" settings");
 619                 free(cgpath);
 620                 free(clonechildrenpath);
 621                 return false;
 622         }
 623         free(cgpath);
 624
 625         ret = lxc_write_to_file(clonechildrenpath, "1", 1, false, 0666);
 626         if (ret < 0) {
 627                 /* Set clone_children so children inherit our settings */
 628                 SYSERROR("Failed to write 1 to \"%s\"", clonechildrenpath);
 629                 free(clonechildrenpath);
 630                 return false;
 631         }
 632         free(clonechildrenpath);
 633         return true;
 634 }
 635
 636 /* Given two null-terminated lists of strings, return true if any string is in
 637  * both.
 638  */
 639 static bool controller_lists_intersect(char **l1, char **l2)
 640 {
 641         int i;
 642
 643         if (!l1 || !l2)
 644                 return false;
 645
 646         for (i = 0; l1[i]; i++) {
 647                 if (string_in_list(l2, l1[i]))
 648                         return true;
 649         }
 650
 651         return false;
 652 }
 653
 654 /* For a null-terminated list of controllers @clist, return true if any of those
 655  * controllers is already listed the null-terminated list of hierarchies @hlist.
 656  * Realistically, if one is present, all must be present.
 657  */
 658 static bool controller_list_is_dup(struct hierarchy **hlist, char **clist)
 659 {
 660         int i;
 661
 662         if (!hlist)
 663                 return false;
 664
 665         for (i = 0; hlist[i]; i++)
 666                 if (controller_lists_intersect(hlist[i]->controllers, clist))
 667                         return true;
 668
 669         return false;
 670 }
 671
 672 /* Return true if the controller @entry is found in the null-terminated list of
 673  * hierarchies @hlist.
 674  */
 675 static bool controller_found(struct hierarchy **hlist, char *entry)
 676 {
 677         int i;
 678
 679         if (!hlist)
 680                 return false;
 681
 682         for (i = 0; hlist[i]; i++)
 683                 if (string_in_list(hlist[i]->controllers, entry))
 684                         return true;
 685
 686         return false;
 687 }
 688
 689 /* Return true if all of the controllers which we require have been found.  The
 690  * required list is  freezer and anything in lxc.cgroup.use.
 691  */
 692 static bool all_controllers_found(struct cgroup_ops *ops)
 693 {
 694         char *p;
 695         char *saveptr = NULL;
 696         struct hierarchy **hlist = ops->hierarchies;
 697
 698         if (!controller_found(hlist, "freezer")) {
 699                 ERROR("No freezer controller mountpoint found");
 700                 return false;
 701         }
 702
 703         if (!ops->cgroup_use)
 704                 return true;
 705
 706         for (; (p = strtok_r(ops->cgroup_use, ",", &saveptr)); ops->cgroup_use = NULL)
 707                 if (!controller_found(hlist, p)) {
 708                         ERROR("No %s controller mountpoint found", p);
 709                         return false;
 710                 }
 711
 712         return true;
 713 }
 714
 715 /* Get the controllers from a mountinfo line There are other ways we could get
 716  * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
 717  * could parse the mount options. But we simply assume that the mountpoint must
 718  * be /sys/fs/cgroup/controller-list
 719  */
 720 static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line,
 721                                         int type)
 722 {
 723         /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list
 724          * for legacy hierarchies.
 725          */
 726         int i;
 727         char *dup, *p2, *tok;
 728         char *p = line, *saveptr = NULL, *sep = ",";
 729         char **aret = NULL;
 730
 731         for (i = 0; i < 4; i++) {
 732                 p = strchr(p, ' ');
 733                 if (!p)
 734                         return NULL;
 735                 p++;
 736         }
 737
 738         /* Note, if we change how mountinfo works, then our caller will need to
 739          * verify /sys/fs/cgroup/ in this field.
 740          */
 741         if (strncmp(p, "/sys/fs/cgroup/", 15) != 0) {
 742                 ERROR("Found hierarchy not under /sys/fs/cgroup: \"%s\"", p);
 743                 return NULL;
 744         }
 745
 746         p += 15;
 747         p2 = strchr(p, ' ');
 748         if (!p2) {
 749                 ERROR("Corrupt mountinfo");
 750                 return NULL;
 751         }
 752         *p2 = '\0';
 753
 754         if (type == CGROUP_SUPER_MAGIC) {
 755                 /* strdup() here for v1 hierarchies. Otherwise strtok_r() will
 756                  * destroy mountpoints such as "/sys/fs/cgroup/cpu,cpuacct".
 757                  */
 758                 dup = strdup(p);
 759                 if (!dup)
 760                         return NULL;
 761
 762                 for (tok = strtok_r(dup, sep, &saveptr); tok;
 763                      tok = strtok_r(NULL, sep, &saveptr))
 764                         must_append_controller(klist, nlist, &aret, tok);
 765
 766                 free(dup);
 767         }
 768         *p2 = ' ';
 769
 770         return aret;
 771 }
 772
 773 static char **cg_unified_make_empty_controller(void)
 774 {
 775         int newentry;
 776         char **aret = NULL;
 777
 778         newentry = append_null_to_list((void ***)&aret);
 779         aret[newentry] = NULL;
 780         return aret;
 781 }
 782
 783 static char **cg_unified_get_controllers(const char *file)
 784 {
 785         char *buf, *tok;
 786         char *saveptr = NULL, *sep = " \t\n";
 787         char **aret = NULL;
 788
 789         buf = read_file(file);
 790         if (!buf)
 791                 return NULL;
 792
 793         for (tok = strtok_r(buf, sep, &saveptr); tok;
 794              tok = strtok_r(NULL, sep, &saveptr)) {
 795                 int newentry;
 796                 char *copy;
 797
 798                 newentry = append_null_to_list((void ***)&aret);
 799                 copy = must_copy_string(tok);
 800                 aret[newentry] = copy;
 801         }
 802
 803         free(buf);
 804         return aret;
 805 }
 806
 807 static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint,
 808                                        char *base_cgroup, int type)
 809 {
 810         struct hierarchy *new;
 811         int newentry;
 812
 813         new = must_alloc(sizeof(*new));
 814         new->controllers = clist;
 815         new->mountpoint = mountpoint;
 816         new->base_cgroup = base_cgroup;
 817         new->fullcgpath = NULL;
 818         new->version = type;
 819
 820         newentry = append_null_to_list((void ***)h);
 821         (*h)[newentry] = new;
 822         return new;
 823 }
 824
 825 /* Get a copy of the mountpoint from @line, which is a line from
 826  * /proc/self/mountinfo.
 827  */
 828 static char *cg_hybrid_get_mountpoint(char *line)
 829 {
 830         int i;
 831         size_t len;
 832         char *p2;
 833         char *p = line, *sret = NULL;
 834
 835         for (i = 0; i < 4; i++) {
 836                 p = strchr(p, ' ');
 837                 if (!p)
 838                         return NULL;
 839                 p++;
 840         }
 841
 842         if (strncmp(p, "/sys/fs/cgroup/", 15) != 0)
 843                 return NULL;
 844
 845         p2 = strchr(p + 15, ' ');
 846         if (!p2)
 847                 return NULL;
 848         *p2 = '\0';
 849
 850         len = strlen(p);
 851         sret = must_alloc(len + 1);
 852         memcpy(sret, p, len);
 853         sret[len] = '\0';
 854         return sret;
 855 }
 856
 857 /* Given a multi-line string, return a null-terminated copy of the current line. */
 858 static char *copy_to_eol(char *p)
 859 {
 860         char *p2 = strchr(p, '\n'), *sret;
 861         size_t len;
 862
 863         if (!p2)
 864                 return NULL;
 865
 866         len = p2 - p;
 867         sret = must_alloc(len + 1);
 868         memcpy(sret, p, len);
 869         sret[len] = '\0';
 870         return sret;
 871 }
 872
 873 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
 874  * /proc/self/cgroup file. Check whether controller c is present.
 875  */
 876 static bool controller_in_clist(char *cgline, char *c)
 877 {
 878         char *tok, *saveptr = NULL, *eol, *tmp;
 879         size_t len;
 880
 881         eol = strchr(cgline, ':');
 882         if (!eol)
 883                 return false;
 884
 885         len = eol - cgline;
 886         tmp = alloca(len + 1);
 887         memcpy(tmp, cgline, len);
 888         tmp[len] = '\0';
 889
 890         for (tok = strtok_r(tmp, ",", &saveptr); tok;
 891              tok = strtok_r(NULL, ",", &saveptr)) {
 892                 if (strcmp(tok, c) == 0)
 893                         return true;
 894         }
 895
 896         return false;
 897 }
 898
 899 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
 900  * @controller.
 901  */
 902 static char *cg_hybrid_get_current_cgroup(char *basecginfo, char *controller,
 903                                           int type)
 904 {
 905         char *p = basecginfo;
 906
 907         for (;;) {
 908                 bool is_cgv2_base_cgroup = false;
 909
 910                 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
 911                 if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0'))
 912                         is_cgv2_base_cgroup = true;
 913
 914                 p = strchr(p, ':');
 915                 if (!p)
 916                         return NULL;
 917                 p++;
 918
 919                 if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) {
 920                         p = strchr(p, ':');
 921                         if (!p)
 922                                 return NULL;
 923                         p++;
 924                         return copy_to_eol(p);
 925                 }
 926
 927                 p = strchr(p, '\n');
 928                 if (!p)
 929                         return NULL;
 930                 p++;
 931         }
 932 }
 933
 934 static void must_append_string(char ***list, char *entry)
 935 {
 936         int newentry;
 937         char *copy;
 938
 939         newentry = append_null_to_list((void ***)list);
 940         copy = must_copy_string(entry);
 941         (*list)[newentry] = copy;
 942 }
 943
 944 static int get_existing_subsystems(char ***klist, char ***nlist)
 945 {
 946         FILE *f;
 947         char *line = NULL;
 948         size_t len = 0;
 949
 950         f = fopen("/proc/self/cgroup", "r");
 951         if (!f)
 952                 return -1;
 953
 954         while (getline(&line, &len, f) != -1) {
 955                 char *p, *p2, *tok, *saveptr = NULL;
 956                 p = strchr(line, ':');
 957                 if (!p)
 958                         continue;
 959                 p++;
 960                 p2 = strchr(p, ':');
 961                 if (!p2)
 962                         continue;
 963                 *p2 = '\0';
 964
 965                 /* If the kernel has cgroup v2 support, then /proc/self/cgroup
 966                  * contains an entry of the form:
 967                  *
 968                  *      0::/some/path
 969                  *
 970                  * In this case we use "cgroup2" as controller name.
 971                  */
 972                 if ((p2 - p) == 0) {
 973                         must_append_string(klist, "cgroup2");
 974                         continue;
 975                 }
 976
 977                 for (tok = strtok_r(p, ",", &saveptr); tok;
 978                      tok = strtok_r(NULL, ",", &saveptr)) {
 979                         if (strncmp(tok, "name=", 5) == 0)
 980                                 must_append_string(nlist, tok);
 981                         else
 982                                 must_append_string(klist, tok);
 983                 }
 984         }
 985
 986         free(line);
 987         fclose(f);
 988         return 0;
 989 }
 990
 991 static void trim(char *s)
 992 {
 993         size_t len;
 994
 995         len = strlen(s);
 996         while ((len > 1) && (s[len - 1] == '\n'))
 997                 s[--len] = '\0';
 998 }
 999
1000 static void lxc_cgfsng_print_hierarchies(struct cgroup_ops *ops)
1001 {
1002         int i;
1003         struct hierarchy **it;
1004
1005         if (!ops->hierarchies) {
1006                 TRACE("  No hierarchies found");
1007                 return;
1008         }
1009
1010         TRACE("  Hierarchies:");
1011         for (i = 0, it = ops->hierarchies; it && *it; it++, i++) {
1012                 int j;
1013                 char **cit;
1014
1015                 TRACE("  %d: base_cgroup: %s", i, (*it)->base_cgroup ? (*it)->base_cgroup : "(null)");
1016                 TRACE("      mountpoint:  %s", (*it)->mountpoint ? (*it)->mountpoint : "(null)");
1017                 TRACE("      controllers:");
1018                 for (j = 0, cit = (*it)->controllers; cit && *cit; cit++, j++)
1019                         TRACE("      %d: %s", j, *cit);
1020         }
1021 }
1022
/* Trace-log @basecginfo followed by the numbered entries of the
 * NULL-terminated lists @klist (kernel subsystems) and @nlist (named
 * subsystems). Either list may be NULL.
 */
static void lxc_cgfsng_print_basecg_debuginfo(char *basecginfo, char **klist,
					      char **nlist)
{
	int idx;
	char **entry;

	TRACE("basecginfo is:");
	TRACE("%s", basecginfo);

	idx = 0;
	entry = klist;
	while (entry && *entry) {
		TRACE("kernel subsystem %d: %s", idx, *entry);
		entry++;
		idx++;
	}

	idx = 0;
	entry = nlist;
	while (entry && *entry) {
		TRACE("named subsystem %d: %s", idx, *entry);
		entry++;
		idx++;
	}
}
1038
/* Remove the directory tree rooted at @dirname, depth first.
 *
 * Only sub-directories are descended into and removed with rmdir(); entries
 * that are not directories are skipped (presumably because cgroup control
 * files cannot be unlinked and vanish with their directory - confirm).
 *
 * The walk continues after a failure so that as much as possible is removed.
 * Only the first failure seen at this level is logged.
 *
 * Returns 0 if everything was removed and -1 if @dirname could not be opened
 * or any stat, recursive removal, rmdir() or closedir() failed.
 */
static int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir)
		return -1;

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				WARN("Failed to stat \"%s\"", pathname);
			r = -1;
			goto next;
		}

		if (!S_ISDIR(mystat.st_mode))
			goto next;

		ret = recursive_destroy(pathname);
		if (ret < 0)
			r = -1;
	next:
		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			WARN("%s - Failed to delete \"%s\"", strerror(errno), dirname);
		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* Report the operation that actually failed; this used to
		 * claim a failed delete.
		 */
		if (!r)
			WARN("%s - Failed to close directory \"%s\"", strerror(errno), dirname);
		r = -1;
	}

	return r;
}
1094
1095 static int cgroup_rmdir(struct hierarchy **hierarchies,
1096                         const char *container_cgroup)
1097 {
1098         int i;
1099
1100         if (!container_cgroup || !hierarchies)
1101                 return 0;
1102
1103         for (i = 0; hierarchies[i]; i++) {
1104                 int ret;
1105                 struct hierarchy *h = hierarchies[i];
1106
1107                 if (!h->fullcgpath)
1108                         continue;
1109
1110                 ret = recursive_destroy(h->fullcgpath);
1111                 if (ret < 0)
1112                         WARN("Failed to destroy \"%s\"", h->fullcgpath);
1113
1114                 free(h->fullcgpath);
1115                 h->fullcgpath = NULL;
1116         }
1117
1118         return 0;
1119 }
1120
/* Argument bundle for callbacks that are run through userns_exec_1(). */
struct generic_userns_exec_data {
	/* NULL-terminated list of hierarchies the callback operates on */
	struct hierarchy **hierarchies;

	/* cgroup of the container; cgroup_rmdir() does nothing without it */
	const char *container_cgroup;

	/* container configuration; supplies the uid/gid the callback assumes */
	struct lxc_conf *conf;

	/* target uid in parent namespace */
	uid_t origuid;

	/* not read by the callbacks in this part of the file */
	char *path;
};
1128
1129 static int cgroup_rmdir_wrapper(void *data)
1130 {
1131         int ret;
1132         struct generic_userns_exec_data *arg = data;
1133         uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1134         gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1135
1136         ret = setresgid(nsgid, nsgid, nsgid);
1137         if (ret < 0) {
1138                 SYSERROR("Failed to setresgid(%d, %d, %d)", (int)nsgid,
1139                          (int)nsgid, (int)nsgid);
1140                 return -1;
1141         }
1142
1143         ret = setresuid(nsuid, nsuid, nsuid);
1144         if (ret < 0) {
1145                 SYSERROR("Failed to setresuid(%d, %d, %d)", (int)nsuid,
1146                          (int)nsuid, (int)nsuid);
1147                 return -1;
1148         }
1149
1150         ret = setgroups(0, NULL);
1151         if (ret < 0 && errno != EPERM) {
1152                 SYSERROR("Failed to setgroups(0, NULL)");
1153                 return -1;
1154         }
1155
1156         return cgroup_rmdir(arg->hierarchies, arg->container_cgroup);
1157 }
1158
1159 static void cgfsng_destroy(struct cgroup_ops *ops, struct lxc_handler *handler)
1160 {
1161         int ret;
1162         struct generic_userns_exec_data wrap;
1163
1164         wrap.origuid = 0;
1165         wrap.container_cgroup = ops->container_cgroup;
1166         wrap.hierarchies = ops->hierarchies;
1167         wrap.conf = handler->conf;
1168
1169         if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
1170                 ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
1171                                     "cgroup_rmdir_wrapper");
1172         else
1173                 ret = cgroup_rmdir(ops->hierarchies, ops->container_cgroup);
1174         if (ret < 0) {
1175                 WARN("Failed to destroy cgroups");
1176                 return;
1177         }
1178 }
1179
1180 static bool cg_unified_create_cgroup(struct hierarchy *h, char *cgname)
1181 {
1182         size_t i, parts_len;
1183         char **it;
1184         size_t full_len = 0;
1185         char *add_controllers = NULL, *cgroup = NULL;
1186         char **parts = NULL;
1187         bool bret = false;
1188
1189         if (h->version != CGROUP2_SUPER_MAGIC)
1190                 return true;
1191
1192         if (!h->controllers)
1193                 return true;
1194
1195         /* For now we simply enable all controllers that we have detected by
1196          * creating a string like "+memory +pids +cpu +io".
1197          * TODO: In the near future we might want to support "-<controller>"
1198          * etc. but whether supporting semantics like this make sense will need
1199          * some thinking.
1200          */
1201         for (it = h->controllers; it && *it; it++) {
1202                 full_len += strlen(*it) + 2;
1203                 add_controllers = must_realloc(add_controllers, full_len + 1);
1204
1205                 if (h->controllers[0] == *it)
1206                         add_controllers[0] = '\0';
1207
1208                 strncat(add_controllers, "+", 1);
1209                 strncat(add_controllers, *it, strlen(*it));
1210
1211                 if ((it + 1) && *(it + 1))
1212                         strncat(add_controllers, " ", 1);
1213         }
1214
1215         parts = lxc_string_split(cgname, '/');
1216         if (!parts)
1217                 goto on_error;
1218
1219         parts_len = lxc_array_len((void **)parts);
1220         if (parts_len > 0)
1221                 parts_len--;
1222
1223         cgroup = must_make_path(h->mountpoint, h->base_cgroup, NULL);
1224         for (i = 0; i < parts_len; i++) {
1225                 int ret;
1226                 char *target;
1227
1228                 cgroup = must_append_path(cgroup, parts[i], NULL);
1229                 target = must_make_path(cgroup, "cgroup.subtree_control", NULL);
1230                 ret = lxc_write_to_file(target, add_controllers, full_len, false, 0666);
1231                 free(target);
1232                 if (ret < 0) {
1233                         SYSERROR("Could not enable \"%s\" controllers in the "
1234                                  "unified cgroup \"%s\"", add_controllers, cgroup);
1235                         goto on_error;
1236                 }
1237         }
1238
1239         bret = true;
1240
1241 on_error:
1242         lxc_free_array((void **)parts, free);
1243         free(add_controllers);
1244         free(cgroup);
1245         return bret;
1246 }
1247
1248 static bool create_path_for_hierarchy(struct hierarchy *h, char *cgname)
1249 {
1250         int ret;
1251
1252         h->fullcgpath = must_make_path(h->mountpoint, h->base_cgroup, cgname, NULL);
1253         if (dir_exists(h->fullcgpath)) {
1254                 ERROR("The cgroup \"%s\" already existed", h->fullcgpath);
1255                 return false;
1256         }
1257
1258         if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
1259                 ERROR("Failed to handle legacy cpuset controller");
1260                 return false;
1261         }
1262
1263         ret = mkdir_p(h->fullcgpath, 0755);
1264         if (ret < 0) {
1265                 ERROR("Failed to create cgroup \"%s\"", h->fullcgpath);
1266                 return false;
1267         }
1268
1269         return cg_unified_create_cgroup(h, cgname);
1270 }
1271
1272 static void remove_path_for_hierarchy(struct hierarchy *h, char *cgname)
1273 {
1274         int ret;
1275
1276         ret = rmdir(h->fullcgpath);
1277         if (ret < 0)
1278                 SYSERROR("Failed to rmdir(\"%s\") from failed creation attempt", h->fullcgpath);
1279
1280         free(h->fullcgpath);
1281         h->fullcgpath = NULL;
1282 }
1283
1284 /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
1285  * next cgroup_pattern-1, -2, ..., -999.
1286  */
1287 static inline bool cgfsng_create(struct cgroup_ops *ops,
1288                                  struct lxc_handler *handler)
1289 {
1290         int i;
1291         size_t len;
1292         char *container_cgroup, *offset, *tmp;
1293         int idx = 0;
1294         struct lxc_conf *conf = handler->conf;
1295
1296         if (ops->container_cgroup) {
1297                 WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
1298                 return false;
1299         }
1300
1301         if (!conf)
1302                 return false;
1303
1304         if (conf->cgroup_meta.dir)
1305                 tmp = lxc_string_join("/", (const char *[]){conf->cgroup_meta.dir, handler->name, NULL}, false);
1306         else
1307                 tmp = lxc_string_replace("%n", handler->name, ops->cgroup_pattern);
1308         if (!tmp) {
1309                 ERROR("Failed expanding cgroup name pattern");
1310                 return false;
1311         }
1312
1313         len = strlen(tmp) + 5; /* leave room for -NNN\0 */
1314         container_cgroup = must_alloc(len);
1315         (void)strlcpy(container_cgroup, tmp, len);
1316         free(tmp);
1317         offset = container_cgroup + len - 5;
1318
1319 again:
1320         if (idx == 1000) {
1321                 ERROR("Too many conflicting cgroup names");
1322                 goto out_free;
1323         }
1324
1325         if (idx) {
1326                 int ret;
1327
1328                 ret = snprintf(offset, 5, "-%d", idx);
1329                 if (ret < 0 || (size_t)ret >= 5) {
1330                         FILE *f = fopen("/dev/null", "w");
1331                         if (f) {
1332                                 fprintf(f, "Workaround for GCC7 bug: "
1333                                            "https://gcc.gnu.org/bugzilla/"
1334                                            "show_bug.cgi?id=78969");
1335                                 fclose(f);
1336                         }
1337                 }
1338         }
1339
1340         for (i = 0; ops->hierarchies[i]; i++) {
1341                 if (!create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
1342                         int j;
1343                         ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->fullcgpath);
1344                         free(ops->hierarchies[i]->fullcgpath);
1345                         ops->hierarchies[i]->fullcgpath = NULL;
1346                         for (j = 0; j < i; j++)
1347                                 remove_path_for_hierarchy(ops->hierarchies[j], container_cgroup);
1348                         idx++;
1349                         goto again;
1350                 }
1351         }
1352
1353         ops->container_cgroup = container_cgroup;
1354
1355         return true;
1356
1357 out_free:
1358         free(container_cgroup);
1359
1360         return false;
1361 }
1362
1363 static bool cgfsng_enter(struct cgroup_ops *ops, pid_t pid)
1364 {
1365         int i, len;
1366         char pidstr[25];
1367
1368         len = snprintf(pidstr, 25, "%d", pid);
1369         if (len < 0 || len >= 25)
1370                 return false;
1371
1372         for (i = 0; ops->hierarchies[i]; i++) {
1373                 int ret;
1374                 char *fullpath;
1375
1376                 fullpath = must_make_path(ops->hierarchies[i]->fullcgpath,
1377                                           "cgroup.procs", NULL);
1378                 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
1379                 if (ret != 0) {
1380                         SYSERROR("Failed to enter cgroup \"%s\"", fullpath);
1381                         free(fullpath);
1382                         return false;
1383                 }
1384                 free(fullpath);
1385         }
1386
1387         return true;
1388 }
1389
/* chown() @path to @chown_uid:@chown_gid and then chmod() it to @chmod_mode.
 *
 * Returns 0 if both calls succeed. On the first failure a warning is logged
 * and -1 is returned; chmod() is not attempted after a failed chown().
 */
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
		   mode_t chmod_mode)
{
	if (chown(path, chown_uid, chown_gid) < 0) {
		WARN("%s - Failed to chown(%s, %d, %d)", strerror(errno), path,
		     (int)chown_uid, (int)chown_gid);
		return -1;
	}

	if (chmod(path, chmod_mode) < 0) {
		WARN("%s - Failed to chmod(%s, %d)", strerror(errno), path,
		     (int)chmod_mode);
		return -1;
	}

	return 0;
}
1411
1412 /* chgrp the container cgroups to container group.  We leave
1413  * the container owner as cgroup owner.  So we must make the
1414  * directories 775 so that the container can create sub-cgroups.
1415  *
1416  * Also chown the tasks and cgroup.procs files.  Those may not
1417  * exist depending on kernel version.
1418  */
1419 static int chown_cgroup_wrapper(void *data)
1420 {
1421         int i, ret;
1422         uid_t destuid;
1423         struct generic_userns_exec_data *arg = data;
1424         uid_t nsuid = (arg->conf->root_nsuid_map != NULL) ? 0 : arg->conf->init_uid;
1425         gid_t nsgid = (arg->conf->root_nsgid_map != NULL) ? 0 : arg->conf->init_gid;
1426
1427         ret = setresgid(nsgid, nsgid, nsgid);
1428         if (ret < 0) {
1429                 SYSERROR("Failed to setresgid(%d, %d, %d)",
1430                          (int)nsgid, (int)nsgid, (int)nsgid);
1431                 return -1;
1432         }
1433
1434         ret = setresuid(nsuid, nsuid, nsuid);
1435         if (ret < 0) {
1436                 SYSERROR("Failed to setresuid(%d, %d, %d)",
1437                          (int)nsuid, (int)nsuid, (int)nsuid);
1438                 return -1;
1439         }
1440
1441         ret = setgroups(0, NULL);
1442         if (ret < 0 && errno != EPERM) {
1443                 SYSERROR("Failed to setgroups(0, NULL)");
1444                 return -1;
1445         }
1446
1447         destuid = get_ns_uid(arg->origuid);
1448
1449         for (i = 0; arg->hierarchies[i]; i++) {
1450                 char *fullpath;
1451                 char *path = arg->hierarchies[i]->fullcgpath;
1452
1453                 ret = chowmod(path, destuid, nsgid, 0775);
1454                 if (ret < 0)
1455                         return -1;
1456
1457                 /* Failures to chown() these are inconvenient but not
1458                  * detrimental We leave these owned by the container launcher,
1459                  * so that container root can write to the files to attach.  We
1460                  * chmod() them 664 so that container systemd can write to the
1461                  * files (which systemd in wily insists on doing).
1462                  */
1463
1464                 if (arg->hierarchies[i]->version == CGROUP_SUPER_MAGIC) {
1465                         fullpath = must_make_path(path, "tasks", NULL);
1466                         (void)chowmod(fullpath, destuid, nsgid, 0664);
1467                         free(fullpath);
1468                 }
1469
1470                 fullpath = must_make_path(path, "cgroup.procs", NULL);
1471                 (void)chowmod(fullpath, destuid, nsgid, 0664);
1472                 free(fullpath);
1473
1474                 if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
1475                         continue;
1476
1477                 fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
1478                 (void)chowmod(fullpath, destuid, nsgid, 0664);
1479                 free(fullpath);
1480
1481                 fullpath = must_make_path(path, "cgroup.threads", NULL);
1482                 (void)chowmod(fullpath, destuid, nsgid, 0664);
1483                 free(fullpath);
1484         }
1485
1486         return 0;
1487 }
1488
1489 static bool cgfsng_chown(struct cgroup_ops *ops, struct lxc_conf *conf)
1490 {
1491         struct generic_userns_exec_data wrap;
1492
1493         if (lxc_list_empty(&conf->id_map))
1494                 return true;
1495
1496         wrap.origuid = geteuid();
1497         wrap.path = NULL;
1498         wrap.hierarchies = ops->hierarchies;
1499         wrap.conf = conf;
1500
1501         if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
1502                           "chown_cgroup_wrapper") < 0) {
1503                 ERROR("Error requesting cgroup chown in new user namespace");
1504                 return false;
1505         }
1506
1507         return true;
1508 }
1509
1510 /* cgroup-full:* is done, no need to create subdirs */
1511 static bool cg_mount_needs_subdirs(int type)
1512 {
1513         if (type >= LXC_AUTO_CGROUP_FULL_RO)
1514                 return false;
1515
1516         return true;
1517 }
1518
1519 /* After $rootfs/sys/fs/container/controller/the/cg/path has been created,
1520  * remount controller ro if needed and bindmount the cgroupfs onto
1521  * controll/the/cg/path.
1522  */
1523 static int cg_legacy_mount_controllers(int type, struct hierarchy *h,
1524                                        char *controllerpath, char *cgpath,
1525                                        const char *container_cgroup)
1526 {
1527         int ret, remount_flags;
1528         char *sourcepath;
1529         int flags = MS_BIND;
1530
1531         if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_MIXED) {
1532                 ret = mount(controllerpath, controllerpath, "cgroup", MS_BIND, NULL);
1533                 if (ret < 0) {
1534                         SYSERROR("Failed to bind mount \"%s\" onto \"%s\"",
1535                                  controllerpath, controllerpath);
1536                         return -1;
1537                 }
1538
1539                 remount_flags = add_required_remount_flags(controllerpath,
1540                                                            controllerpath,
1541                                                            flags | MS_REMOUNT);
1542                 ret = mount(controllerpath, controllerpath, "cgroup",
1543                             remount_flags | MS_REMOUNT | MS_BIND | MS_RDONLY,
1544                             NULL);
1545                 if (ret < 0) {
1546                         SYSERROR("Failed to remount \"%s\" ro", controllerpath);
1547                         return -1;
1548                 }
1549
1550                 INFO("Remounted %s read-only", controllerpath);
1551         }
1552
1553         sourcepath = must_make_path(h->mountpoint, h->base_cgroup,
1554                                     container_cgroup, NULL);
1555         if (type == LXC_AUTO_CGROUP_RO)
1556                 flags |= MS_RDONLY;
1557
1558         ret = mount(sourcepath, cgpath, "cgroup", flags, NULL);
1559         if (ret < 0) {
1560                 SYSERROR("Failed to mount \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1561                 free(sourcepath);
1562                 return -1;
1563         }
1564         INFO("Mounted \"%s\" onto \"%s\"", h->controllers[0], cgpath);
1565
1566         if (flags & MS_RDONLY) {
1567                 remount_flags = add_required_remount_flags(sourcepath, cgpath,
1568                                                            flags | MS_REMOUNT);
1569                 ret = mount(sourcepath, cgpath, "cgroup", remount_flags, NULL);
1570                 if (ret < 0) {
1571                         SYSERROR("Failed to remount \"%s\" ro", cgpath);
1572                         free(sourcepath);
1573                         return -1;
1574                 }
1575                 INFO("Remounted %s read-only", cgpath);
1576         }
1577
1578         free(sourcepath);
1579         INFO("Completed second stage cgroup automounts for \"%s\"", cgpath);
1580         return 0;
1581 }
1582
1583 /* __cg_mount_direct
1584  *
1585  * Mount cgroup hierarchies directly without using bind-mounts. The main
1586  * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting
1587  * cgroups for the LXC_AUTO_CGROUP_FULL option.
1588  */
1589 static int __cg_mount_direct(int type, struct hierarchy *h,
1590                              const char *controllerpath)
1591 {
1592          int ret;
1593          char *controllers = NULL;
1594          char *fstype = "cgroup2";
1595          unsigned long flags = 0;
1596
1597          flags |= MS_NOSUID;
1598          flags |= MS_NOEXEC;
1599          flags |= MS_NODEV;
1600          flags |= MS_RELATIME;
1601
1602          if (type == LXC_AUTO_CGROUP_RO || type == LXC_AUTO_CGROUP_FULL_RO)
1603                  flags |= MS_RDONLY;
1604
1605          if (h->version != CGROUP2_SUPER_MAGIC) {
1606                  controllers = lxc_string_join(",", (const char **)h->controllers, false);
1607                  if (!controllers)
1608                          return -ENOMEM;
1609                  fstype = "cgroup";
1610         }
1611
1612         ret = mount("cgroup", controllerpath, fstype, flags, controllers);
1613         free(controllers);
1614         if (ret < 0) {
1615                 SYSERROR("Failed to mount \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1616                 return -1;
1617         }
1618
1619         DEBUG("Mounted \"%s\" with cgroup filesystem type %s", controllerpath, fstype);
1620         return 0;
1621 }
1622
/* Mount hierarchy @h at @controllerpath for a container that uses a cgroup
 * namespace. This is a plain __cg_mount_direct() call and returns its result.
 */
static inline int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h,
					       const char *controllerpath)
{
	int ret = __cg_mount_direct(type, h, controllerpath);

	return ret;
}
1628
1629 static inline int cg_mount_cgroup_full(int type, struct hierarchy *h,
1630                                        const char *controllerpath)
1631 {
1632         if (type < LXC_AUTO_CGROUP_FULL_RO || type > LXC_AUTO_CGROUP_FULL_MIXED)
1633                 return 0;
1634
1635         return __cg_mount_direct(type, h, controllerpath);
1636 }
1637
1638 static bool cgfsng_mount(struct cgroup_ops *ops, struct lxc_handler *handler,
1639                          const char *root, int type)
1640 {
1641         int i, ret;
1642         char *tmpfspath = NULL;
1643         bool has_cgns = false, retval = false, wants_force_mount = false;
1644
1645         if ((type & LXC_AUTO_CGROUP_MASK) == 0)
1646                 return true;
1647
1648         if (type & LXC_AUTO_CGROUP_FORCE) {
1649                 type &= ~LXC_AUTO_CGROUP_FORCE;
1650                 wants_force_mount = true;
1651         }
1652
1653         if (!wants_force_mount){
1654                 if (!lxc_list_empty(&handler->conf->keepcaps))
1655                         wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps);
1656                 else
1657                         wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps);
1658         }
1659
1660         has_cgns = cgns_supported();
1661         if (has_cgns && !wants_force_mount)
1662                 return true;
1663
1664         if (type == LXC_AUTO_CGROUP_NOSPEC)
1665                 type = LXC_AUTO_CGROUP_MIXED;
1666         else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1667                 type = LXC_AUTO_CGROUP_FULL_MIXED;
1668
1669         /* Mount tmpfs */
1670         tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL);
1671         ret = safe_mount(NULL, tmpfspath, "tmpfs",
1672                          MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME,
1673                          "size=10240k,mode=755", root);
1674         if (ret < 0)
1675                 goto on_error;
1676
1677         for (i = 0; ops->hierarchies[i]; i++) {
1678                 char *controllerpath, *path2;
1679                 struct hierarchy *h = ops->hierarchies[i];
1680                 char *controller = strrchr(h->mountpoint, '/');
1681
1682                 if (!controller)
1683                         continue;
1684                 controller++;
1685
1686                 controllerpath = must_make_path(tmpfspath, controller, NULL);
1687                 if (dir_exists(controllerpath)) {
1688                         free(controllerpath);
1689                         continue;
1690                 }
1691
1692                 ret = mkdir(controllerpath, 0755);
1693                 if (ret < 0) {
1694                         SYSERROR("Error creating cgroup path: %s", controllerpath);
1695                         free(controllerpath);
1696                         goto on_error;
1697                 }
1698
1699                 if (has_cgns && wants_force_mount) {
1700                         /* If cgroup namespaces are supported but the container
1701                          * will not have CAP_SYS_ADMIN after it has started we
1702                          * need to mount the cgroups manually.
1703                          */
1704                         ret = cg_mount_in_cgroup_namespace(type, h, controllerpath);
1705                         free(controllerpath);
1706                         if (ret < 0)
1707                                 goto on_error;
1708
1709                         continue;
1710                 }
1711
1712                 ret = cg_mount_cgroup_full(type, h, controllerpath);
1713                 if (ret < 0) {
1714                         free(controllerpath);
1715                         goto on_error;
1716                 }
1717
1718                 if (!cg_mount_needs_subdirs(type)) {
1719                         free(controllerpath);
1720                         continue;
1721                 }
1722
1723                 path2 = must_make_path(controllerpath, h->base_cgroup,
1724                                        ops->container_cgroup, NULL);
1725                 ret = mkdir_p(path2, 0755);
1726                 if (ret < 0) {
1727                         free(controllerpath);
1728                         free(path2);
1729                         goto on_error;
1730                 }
1731
1732                 ret = cg_legacy_mount_controllers(type, h, controllerpath,
1733                                                   path2, ops->container_cgroup);
1734                 free(controllerpath);
1735                 free(path2);
1736                 if (ret < 0)
1737                         goto on_error;
1738         }
1739         retval = true;
1740
1741 on_error:
1742         free(tmpfspath);
1743         return retval;
1744 }
1745
/* Count the lines of every "cgroup.procs" file in the directory tree rooted
 * at @dirname.
 *
 * Sub-directories are visited recursively; entries that cannot be stat'ed or
 * are not directories are skipped. A directory that cannot be opened
 * contributes 0, as does a cgroup.procs file for which
 * lxc_count_file_lines() returns -1.
 */
static int recursive_count_nrtasks(char *dirname)
{
	struct dirent *ent;
	DIR *dp;
	int total = 0, nlines;
	char *procs;

	dp = opendir(dirname);
	if (!dp)
		return 0;

	while ((ent = readdir(dp))) {
		struct stat sb;
		char *child;

		if (strcmp(ent->d_name, ".") == 0 ||
		    strcmp(ent->d_name, "..") == 0)
			continue;

		child = must_make_path(dirname, ent->d_name, NULL);
		if (lstat(child, &sb) == 0 && S_ISDIR(sb.st_mode))
			total += recursive_count_nrtasks(child);
		free(child);
	}

	procs = must_make_path(dirname, "cgroup.procs", NULL);
	nlines = lxc_count_file_lines(procs);
	if (nlines != -1)
		total += nlines;
	free(procs);

	(void)closedir(dp);

	return total;
}
1787
1788 static int cgfsng_nrtasks(struct cgroup_ops *ops)
1789 {
1790         int count;
1791         char *path;
1792
1793         if (!ops->container_cgroup || !ops->hierarchies)
1794                 return -1;
1795
1796         path = must_make_path(ops->hierarchies[0]->fullcgpath, NULL);
1797         count = recursive_count_nrtasks(path);
1798         free(path);
1799         return count;
1800 }
1801
1802 /* Only root needs to escape to the cgroup of its init. */
1803 static bool cgfsng_escape(const struct cgroup_ops *ops)
1804 {
1805         int i;
1806
1807         if (geteuid())
1808                 return true;
1809
1810         for (i = 0; ops->hierarchies[i]; i++) {
1811                 int ret;
1812                 char *fullpath;
1813
1814                 fullpath = must_make_path(ops->hierarchies[i]->mountpoint,
1815                                           ops->hierarchies[i]->base_cgroup,
1816                                           "cgroup.procs", NULL);
1817                 ret = lxc_write_to_file(fullpath, "0", 2, false, 0666);
1818                 if (ret != 0) {
1819                         SYSERROR("Failed to escape to cgroup \"%s\"", fullpath);
1820                         free(fullpath);
1821                         return false;
1822                 }
1823                 free(fullpath);
1824         }
1825
1826         return true;
1827 }
1828
1829 static int cgfsng_num_hierarchies(struct cgroup_ops *ops)
1830 {
1831         int i;
1832
1833         for (i = 0; ops->hierarchies[i]; i++)
1834                 ;
1835
1836         return i;
1837 }
1838
1839 static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out)
1840 {
1841         int i;
1842
1843         /* sanity check n */
1844         for (i = 0; i < n; i++)
1845                 if (!ops->hierarchies[i])
1846                         return false;
1847
1848         *out = ops->hierarchies[i]->controllers;
1849
1850         return true;
1851 }
1852
1853 #define THAWED "THAWED"
1854 #define THAWED_LEN (strlen(THAWED))
1855
1856 /* TODO: If the unified cgroup hierarchy grows a freezer controller this needs
1857  * to be adapted.
1858  */
1859 static bool cgfsng_unfreeze(struct cgroup_ops *ops)
1860 {
1861         int ret;
1862         char *fullpath;
1863         struct hierarchy *h;
1864
1865         h = get_hierarchy(ops, "freezer");
1866         if (!h)
1867                 return false;
1868
1869         fullpath = must_make_path(h->fullcgpath, "freezer.state", NULL);
1870         ret = lxc_write_to_file(fullpath, THAWED, THAWED_LEN, false, 0666);
1871         free(fullpath);
1872         if (ret < 0)
1873                 return false;
1874
1875         return true;
1876 }
1877
1878 static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
1879                                      const char *controller)
1880 {
1881         struct hierarchy *h;
1882
1883         h = get_hierarchy(ops, controller);
1884         if (!h) {
1885                 WARN("Failed to find hierarchy for controller \"%s\"",
1886                      controller ? controller : "(null)");
1887                 return NULL;
1888         }
1889
1890         return h->fullcgpath ? h->fullcgpath + strlen(h->mountpoint) : NULL;
1891 }
1892
1893 /* Given a cgroup path returned from lxc_cmd_get_cgroup_path, build a full path,
1894  * which must be freed by the caller.
1895  */
1896 static inline char *build_full_cgpath_from_monitorpath(struct hierarchy *h,
1897                                                        const char *inpath,
1898                                                        const char *filename)
1899 {
1900         return must_make_path(h->mountpoint, inpath, filename, NULL);
1901 }
1902
1903 /* Technically, we're always at a delegation boundary here (This is especially
1904  * true when cgroup namespaces are available.). The reasoning is that in order
1905  * for us to have been able to start a container in the first place the root
1906  * cgroup must have been a leaf node. Now, either the container's init system
1907  * has populated the cgroup and kept it as a leaf node or it has created
1908  * subtrees. In the former case we will simply attach to the leaf node we
1909  * created when we started the container in the latter case we create our own
1910  * cgroup for the attaching process.
1911  */
1912 static int __cg_unified_attach(const struct hierarchy *h, const char *name,
1913                                const char *lxcpath, const char *pidstr,
1914                                size_t pidstr_len, const char *controller)
1915 {
1916         int ret;
1917         size_t len;
1918         int fret = -1, idx = 0;
1919         char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
1920
1921         container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
1922         /* not running */
1923         if (!container_cgroup)
1924                 return 0;
1925
1926         base_path = must_make_path(h->mountpoint, container_cgroup, NULL);
1927         full_path = must_make_path(base_path, "cgroup.procs", NULL);
1928         /* cgroup is populated */
1929         ret = lxc_write_to_file(full_path, pidstr, pidstr_len, false, 0666);
1930         if (ret < 0 && errno != EBUSY)
1931                 goto on_error;
1932
1933         if (ret == 0)
1934                 goto on_success;
1935
1936         free(full_path);
1937
1938         len = strlen(base_path) + sizeof("/lxc-1000") - 1 +
1939               sizeof("/cgroup-procs") - 1;
1940         full_path = must_alloc(len + 1);
1941         do {
1942                 if (idx)
1943                         ret = snprintf(full_path, len + 1, "%s/lxc-%d",
1944                                        base_path, idx);
1945                 else
1946                         ret = snprintf(full_path, len + 1, "%s/lxc", base_path);
1947                 if (ret < 0 || (size_t)ret >= len + 1)
1948                         goto on_error;
1949
1950                 ret = mkdir_p(full_path, 0755);
1951                 if (ret < 0 && errno != EEXIST)
1952                         goto on_error;
1953
1954                 strncat(full_path, "/cgroup.procs", strlen("/cgroup.procs"));
1955                 ret = lxc_write_to_file(full_path, pidstr, len, false, 0666);
1956                 if (ret == 0)
1957                         goto on_success;
1958
1959                 /* this is a non-leaf node */
1960                 if (errno != EBUSY)
1961                         goto on_error;
1962
1963         } while (++idx > 0 && idx < 1000);
1964
1965 on_success:
1966         if (idx < 1000)
1967                 fret = 0;
1968
1969 on_error:
1970         free(base_path);
1971         free(container_cgroup);
1972         free(full_path);
1973
1974         return fret;
1975 }
1976
1977 static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
1978                           const char *lxcpath, pid_t pid)
1979 {
1980         int i, len, ret;
1981         char pidstr[25];
1982
1983         len = snprintf(pidstr, 25, "%d", pid);
1984         if (len < 0 || len >= 25)
1985                 return false;
1986
1987         for (i = 0; ops->hierarchies[i]; i++) {
1988                 char *path;
1989                 char *fullpath = NULL;
1990                 struct hierarchy *h = ops->hierarchies[i];
1991
1992                 if (h->version == CGROUP2_SUPER_MAGIC) {
1993                         ret = __cg_unified_attach(h, name, lxcpath, pidstr, len,
1994                                                   h->controllers[0]);
1995                         if (ret < 0)
1996                                 return false;
1997
1998                         continue;
1999                 }
2000
2001                 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
2002                 /* not running */
2003                 if (!path)
2004                         continue;
2005
2006                 fullpath = build_full_cgpath_from_monitorpath(h, path, "cgroup.procs");
2007                 free(path);
2008                 ret = lxc_write_to_file(fullpath, pidstr, len, false, 0666);
2009                 if (ret < 0) {
2010                         SYSERROR("Failed to attach %d to %s", (int)pid, fullpath);
2011                         free(fullpath);
2012                         return false;
2013                 }
2014                 free(fullpath);
2015         }
2016
2017         return true;
2018 }
2019
/* Called externally (i.e. from 'lxc-cgroup') to query cgroup limits.  Here we
 * don't have a cgroup_data set up, so we ask the running container through the
 * commands API for the cgroup path.
 *
 * The controller name is the part of @filename before the first '.'. The
 * contents of @filename in the container's cgroup are read into @value (at
 * most @len bytes, as handled by lxc_read_from_file()). Returns the result of
 * lxc_read_from_file(), or -1 when the cgroup path or the hierarchy cannot be
 * determined.
 */
static int cgfsng_get(struct cgroup_ops *ops, const char *filename, char *value,
                      size_t len, const char *name, const char *lxcpath)
{
        int ret;
        size_t controller_len;
        char *controller, *cgpath, *fullpath;
        struct hierarchy *h;

        /* Copy @filename and cut it off at the first '.', if any. */
        controller_len = strlen(filename);
        controller = alloca(controller_len + 1);
        memcpy(controller, filename, controller_len + 1);
        controller[strcspn(controller, ".")] = '\0';

        cgpath = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
        /* not running */
        if (!cgpath)
                return -1;

        h = get_hierarchy(ops, controller);
        if (!h) {
                free(cgpath);
                return -1;
        }

        fullpath = build_full_cgpath_from_monitorpath(h, cgpath, filename);
        ret = lxc_read_from_file(fullpath, value, len);
        free(fullpath);
        free(cgpath);

        return ret;
}
2057
2058 /* Called externally (i.e. from 'lxc-cgroup') to set new cgroup limits.  Here we
2059  * don't have a cgroup_data set up, so we ask the running container through the
2060  * commands API for the cgroup path.
2061  */
2062 static int cgfsng_set(struct cgroup_ops *ops, const char *filename,
2063                       const char *value, const char *name, const char *lxcpath)
2064 {
2065         int ret = -1;
2066         size_t controller_len;
2067         char *controller, *p, *path;
2068         struct hierarchy *h;
2069
2070         controller_len = strlen(filename);
2071         controller = alloca(controller_len + 1);
2072         (void)strlcpy(controller, filename, controller_len + 1);
2073
2074         p = strchr(controller, '.');
2075         if (p)
2076                 *p = '\0';
2077
2078         path = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
2079         /* not running */
2080         if (!path)
2081                 return -1;
2082
2083         h = get_hierarchy(ops, controller);
2084         if (h) {
2085                 char *fullpath;
2086
2087                 fullpath = build_full_cgpath_from_monitorpath(h, path, filename);
2088                 ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2089                 free(fullpath);
2090         }
2091         free(path);
2092
2093         return ret;
2094 }
2095
2096 /* take devices cgroup line
2097  *    /dev/foo rwx
2098  * and convert it to a valid
2099  *    type major:minor mode
2100  * line. Return <0 on error. Dest is a preallocated buffer long enough to hold
2101  * the output.
2102  */
2103 static int convert_devpath(const char *invalue, char *dest)
2104 {
2105         int n_parts;
2106         char *p, *path, type;
2107         unsigned long minor, major;
2108         struct stat sb;
2109         int ret = -EINVAL;
2110         char *mode = NULL;
2111
2112         path = must_copy_string(invalue);
2113
2114         /* Read path followed by mode. Ignore any trailing text.
2115          * A '    # comment' would be legal. Technically other text is not
2116          * legal, we could check for that if we cared to.
2117          */
2118         for (n_parts = 1, p = path; *p && n_parts < 3; p++) {
2119                 if (*p != ' ')
2120                         continue;
2121                 *p = '\0';
2122
2123                 if (n_parts != 1)
2124                         break;
2125                 p++;
2126                 n_parts++;
2127
2128                 while (*p == ' ')
2129                         p++;
2130
2131                 mode = p;
2132
2133                 if (*p == '\0')
2134                         goto out;
2135         }
2136
2137         if (n_parts == 1)
2138                 goto out;
2139
2140         ret = stat(path, &sb);
2141         if (ret < 0)
2142                 goto out;
2143
2144         mode_t m = sb.st_mode & S_IFMT;
2145         switch (m) {
2146         case S_IFBLK:
2147                 type = 'b';
2148                 break;
2149         case S_IFCHR:
2150                 type = 'c';
2151                 break;
2152         default:
2153                 ERROR("Unsupported device type %i for \"%s\"", m, path);
2154                 ret = -EINVAL;
2155                 goto out;
2156         }
2157
2158         major = MAJOR(sb.st_rdev);
2159         minor = MINOR(sb.st_rdev);
2160         ret = snprintf(dest, 50, "%c %lu:%lu %s", type, major, minor, mode);
2161         if (ret < 0 || ret >= 50) {
2162                 ERROR("Error on configuration value \"%c %lu:%lu %s\" (max 50 "
2163                       "chars)", type, major, minor, mode);
2164                 ret = -ENAMETOOLONG;
2165                 goto out;
2166         }
2167         ret = 0;
2168
2169 out:
2170         free(path);
2171         return ret;
2172 }
2173
2174 /* Called from setup_limits - here we have the container's cgroup_data because
2175  * we created the cgroups.
2176  */
2177 static int cg_legacy_set_data(struct cgroup_ops *ops, const char *filename,
2178                               const char *value)
2179 {
2180         size_t len;
2181         char *fullpath, *p;
2182         /* "b|c <2^64-1>:<2^64-1> r|w|m" = 47 chars max */
2183         char converted_value[50];
2184         struct hierarchy *h;
2185         int ret = 0;
2186         char *controller = NULL;
2187
2188         len = strlen(filename);
2189         controller = alloca(len + 1);
2190         (void)strlcpy(controller, filename, len + 1);
2191
2192         p = strchr(controller, '.');
2193         if (p)
2194                 *p = '\0';
2195
2196         if (strcmp("devices.allow", filename) == 0 && value[0] == '/') {
2197                 ret = convert_devpath(value, converted_value);
2198                 if (ret < 0)
2199                         return ret;
2200                 value = converted_value;
2201         }
2202
2203         h = get_hierarchy(ops, controller);
2204         if (!h) {
2205                 ERROR("Failed to setup limits for the \"%s\" controller. "
2206                       "The controller seems to be unused by \"cgfsng\" cgroup "
2207                       "driver or not enabled on the cgroup hierarchy",
2208                       controller);
2209                 errno = ENOENT;
2210                 return -ENOENT;
2211         }
2212
2213         fullpath = must_make_path(h->fullcgpath, filename, NULL);
2214         ret = lxc_write_to_file(fullpath, value, strlen(value), false, 0666);
2215         free(fullpath);
2216         return ret;
2217 }
2218
2219 static bool __cg_legacy_setup_limits(struct cgroup_ops *ops,
2220                                      struct lxc_list *cgroup_settings,
2221                                      bool do_devices)
2222 {
2223         struct lxc_list *iterator, *next, *sorted_cgroup_settings;
2224         struct lxc_cgroup *cg;
2225         bool ret = false;
2226
2227         if (lxc_list_empty(cgroup_settings))
2228                 return true;
2229
2230         sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2231         if (!sorted_cgroup_settings)
2232                 return false;
2233
2234         lxc_list_for_each(iterator, sorted_cgroup_settings) {
2235                 cg = iterator->elem;
2236
2237                 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2238                         if (cg_legacy_set_data(ops, cg->subsystem, cg->value)) {
2239                                 if (do_devices && (errno == EACCES || errno == EPERM)) {
2240                                         WARN("Failed to set \"%s\" to \"%s\"",
2241                                              cg->subsystem, cg->value);
2242                                         continue;
2243                                 }
2244                                 WARN("Failed to set \"%s\" to \"%s\"",
2245                                      cg->subsystem, cg->value);
2246                                 goto out;
2247                         }
2248                         DEBUG("Set controller \"%s\" set to \"%s\"",
2249                               cg->subsystem, cg->value);
2250                 }
2251         }
2252
2253         ret = true;
2254         INFO("Limits for the legacy cgroup hierarchies have been setup");
2255 out:
2256         lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2257                 lxc_list_del(iterator);
2258                 free(iterator);
2259         }
2260         free(sorted_cgroup_settings);
2261         return ret;
2262 }
2263
2264 static bool __cg_unified_setup_limits(struct cgroup_ops *ops,
2265                                       struct lxc_list *cgroup_settings)
2266 {
2267         struct lxc_list *iterator;
2268         struct hierarchy *h = ops->unified;
2269
2270         if (lxc_list_empty(cgroup_settings))
2271                 return true;
2272
2273         if (!h)
2274                 return false;
2275
2276         lxc_list_for_each(iterator, cgroup_settings) {
2277                 int ret;
2278                 char *fullpath;
2279                 struct lxc_cgroup *cg = iterator->elem;
2280
2281                 fullpath = must_make_path(h->fullcgpath, cg->subsystem, NULL);
2282                 ret = lxc_write_to_file(fullpath, cg->value, strlen(cg->value), false, 0666);
2283                 free(fullpath);
2284                 if (ret < 0) {
2285                         SYSERROR("Failed to set \"%s\" to \"%s\"",
2286                                  cg->subsystem, cg->value);
2287                         return false;
2288                 }
2289                 TRACE("Set \"%s\" to \"%s\"", cg->subsystem, cg->value);
2290         }
2291
2292         INFO("Limits for the unified cgroup hierarchy have been setup");
2293         return true;
2294 }
2295
2296 static bool cgfsng_setup_limits(struct cgroup_ops *ops, struct lxc_conf *conf,
2297                                 bool do_devices)
2298 {
2299         bool bret;
2300
2301         bret = __cg_legacy_setup_limits(ops, &conf->cgroup, do_devices);
2302         if (!bret)
2303                 return false;
2304
2305         return __cg_unified_setup_limits(ops, &conf->cgroup2);
2306 }
2307
2308 /* At startup, parse_hierarchies finds all the info we need about cgroup
2309  * mountpoints and current cgroups, and stores it in @d.
2310  */
2311 static bool cg_hybrid_init(struct cgroup_ops *ops)
2312 {
2313         int ret;
2314         char *basecginfo;
2315         bool will_escape;
2316         FILE *f;
2317         size_t len = 0;
2318         char *line = NULL;
2319         char **klist = NULL, **nlist = NULL;
2320
2321         /* Root spawned containers escape the current cgroup, so use init's
2322          * cgroups as our base in that case.
2323          */
2324         will_escape = (geteuid() == 0);
2325         if (will_escape)
2326                 basecginfo = read_file("/proc/1/cgroup");
2327         else
2328                 basecginfo = read_file("/proc/self/cgroup");
2329         if (!basecginfo)
2330                 return false;
2331
2332         ret = get_existing_subsystems(&klist, &nlist);
2333         if (ret < 0) {
2334                 ERROR("Failed to retrieve available legacy cgroup controllers");
2335                 free(basecginfo);
2336                 return false;
2337         }
2338
2339         f = fopen("/proc/self/mountinfo", "r");
2340         if (!f) {
2341                 ERROR("Failed to open \"/proc/self/mountinfo\"");
2342                 free(basecginfo);
2343                 return false;
2344         }
2345
2346         lxc_cgfsng_print_basecg_debuginfo(basecginfo, klist, nlist);
2347
2348         while (getline(&line, &len, f) != -1) {
2349                 int type;
2350                 bool writeable;
2351                 struct hierarchy *new;
2352                 char *base_cgroup = NULL, *mountpoint = NULL;
2353                 char **controller_list = NULL;
2354
2355                 type = get_cgroup_version(line);
2356                 if (type == 0)
2357                         continue;
2358
2359                 if (type == CGROUP2_SUPER_MAGIC && ops->unified)
2360                         continue;
2361
2362                 if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) {
2363                         if (type == CGROUP2_SUPER_MAGIC)
2364                                 ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2365                         else if (type == CGROUP_SUPER_MAGIC)
2366                                 ops->cgroup_layout = CGROUP_LAYOUT_LEGACY;
2367                 } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) {
2368                         if (type == CGROUP_SUPER_MAGIC)
2369                                 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2370                 } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) {
2371                         if (type == CGROUP2_SUPER_MAGIC)
2372                                 ops->cgroup_layout = CGROUP_LAYOUT_HYBRID;
2373                 }
2374
2375                 controller_list = cg_hybrid_get_controllers(klist, nlist, line, type);
2376                 if (!controller_list && type == CGROUP_SUPER_MAGIC)
2377                         continue;
2378
2379                 if (type == CGROUP_SUPER_MAGIC)
2380                         if (controller_list_is_dup(ops->hierarchies, controller_list))
2381                                 goto next;
2382
2383                 mountpoint = cg_hybrid_get_mountpoint(line);
2384                 if (!mountpoint) {
2385                         ERROR("Failed parsing mountpoint from \"%s\"", line);
2386                         goto next;
2387                 }
2388
2389                 if (type == CGROUP_SUPER_MAGIC)
2390                         base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC);
2391                 else
2392                         base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC);
2393                 if (!base_cgroup) {
2394                         ERROR("Failed to find current cgroup");
2395                         goto next;
2396                 }
2397
2398                 trim(base_cgroup);
2399                 prune_init_scope(base_cgroup);
2400                 if (type == CGROUP2_SUPER_MAGIC)
2401                         writeable = test_writeable_v2(mountpoint, base_cgroup);
2402                 else
2403                         writeable = test_writeable_v1(mountpoint, base_cgroup);
2404                 if (!writeable)
2405                         goto next;
2406
2407                 if (type == CGROUP2_SUPER_MAGIC) {
2408                         char *cgv2_ctrl_path;
2409
2410                         cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup,
2411                                                         "cgroup.controllers",
2412                                                         NULL);
2413
2414                         controller_list = cg_unified_get_controllers(cgv2_ctrl_path);
2415                         free(cgv2_ctrl_path);
2416                         if (!controller_list) {
2417                                 controller_list = cg_unified_make_empty_controller();
2418                                 TRACE("No controllers are enabled for "
2419                                       "delegation in the unified hierarchy");
2420                         }
2421                 }
2422
2423                 new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type);
2424                 if (type == CGROUP2_SUPER_MAGIC && !ops->unified)
2425                         ops->unified = new;
2426
2427                 continue;
2428
2429         next:
2430                 free_string_list(controller_list);
2431                 free(mountpoint);
2432                 free(base_cgroup);
2433         }
2434
2435         free_string_list(klist);
2436         free_string_list(nlist);
2437
2438         free(basecginfo);
2439
2440         fclose(f);
2441         free(line);
2442
2443         TRACE("Writable cgroup hierarchies:");
2444         lxc_cgfsng_print_hierarchies(ops);
2445
2446         /* verify that all controllers in cgroup.use and all crucial
2447          * controllers are accounted for
2448          */
2449         if (!all_controllers_found(ops))
2450                 return false;
2451
2452         return true;
2453 }
2454
2455 static int cg_is_pure_unified(void)
2456 {
2457
2458         int ret;
2459         struct statfs fs;
2460
2461         ret = statfs("/sys/fs/cgroup", &fs);
2462         if (ret < 0)
2463                 return -ENOMEDIUM;
2464
2465         if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC))
2466                 return CGROUP2_SUPER_MAGIC;
2467
2468         return 0;
2469 }
2470
/* Get current cgroup for the cgroupfs v2 hierarchy.
 *
 * The "0::/" entry is looked up in /proc/1/cgroup when running as root and in
 * /proc/self/cgroup otherwise. The path following "0::" is copied up to the
 * end of the line with copy_to_eol() and passed through trim(). Returns NULL
 * when the file cannot be read, has no such entry or the copy fails; otherwise
 * the copy made by copy_to_eol().
 */
static char *cg_unified_get_current_cgroup(void)
{
        char *cginfo, *entry;
        char *cgroup = NULL;

        cginfo = read_file(geteuid() == 0 ? "/proc/1/cgroup"
                                          : "/proc/self/cgroup");
        if (!cginfo)
                return NULL;

        entry = strstr(cginfo, "0::/");
        if (entry)
                /* Skip "0::" so the copy starts at the leading '/'. */
                cgroup = copy_to_eol(entry + 3);

        free(cginfo);

        if (cgroup)
                trim(cgroup);

        return cgroup;
}
2502
2503 static int cg_unified_init(struct cgroup_ops *ops)
2504 {
2505         int ret;
2506         char *mountpoint, *subtree_path;
2507         char **delegatable;
2508         char *base_cgroup = NULL;
2509
2510         ret = cg_is_pure_unified();
2511         if (ret == -ENOMEDIUM)
2512                 return -ENOMEDIUM;
2513
2514         if (ret != CGROUP2_SUPER_MAGIC)
2515                 return 0;
2516
2517         base_cgroup = cg_unified_get_current_cgroup();
2518         if (!base_cgroup)
2519                 return -EINVAL;
2520         prune_init_scope(base_cgroup);
2521
2522         /* We assume that we have already been given controllers to delegate
2523          * further down the hierarchy. If not it is up to the user to delegate
2524          * them to us.
2525          */
2526         mountpoint = must_copy_string("/sys/fs/cgroup");
2527         subtree_path = must_make_path(mountpoint, base_cgroup,
2528                                       "cgroup.subtree_control", NULL);
2529         delegatable = cg_unified_get_controllers(subtree_path);
2530         free(subtree_path);
2531         if (!delegatable)
2532                 delegatable = cg_unified_make_empty_controller();
2533         if (!delegatable[0])
2534                 TRACE("No controllers are enabled for delegation");
2535
2536         /* TODO: If the user requested specific controllers via lxc.cgroup.use
2537          * we should verify here. The reason I'm not doing it right is that I'm
2538          * not convinced that lxc.cgroup.use will be the future since it is a
2539          * global property. I much rather have an option that lets you request
2540          * controllers per container.
2541          */
2542
2543         add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC);
2544
2545         ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED;
2546         return CGROUP2_SUPER_MAGIC;
2547 }
2548
2549 static bool cg_init(struct cgroup_ops *ops)
2550 {
2551         int ret;
2552         const char *tmp;
2553
2554         tmp = lxc_global_config_value("lxc.cgroup.use");
2555         if (tmp)
2556                 ops->cgroup_use = must_copy_string(tmp);
2557
2558         ret = cg_unified_init(ops);
2559         if (ret < 0)
2560                 return false;
2561
2562         if (ret == CGROUP2_SUPER_MAGIC)
2563                 return true;
2564
2565         return cg_hybrid_init(ops);
2566 }
2567
2568 static bool cgfsng_data_init(struct cgroup_ops *ops)
2569 {
2570         const char *cgroup_pattern;
2571
2572         /* copy system-wide cgroup information */
2573         cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2574         if (!cgroup_pattern) {
2575                 /* lxc.cgroup.pattern is only NULL on error. */
2576                 ERROR("Failed to retrieve cgroup pattern");
2577                 return false;
2578         }
2579         ops->cgroup_pattern = must_copy_string(cgroup_pattern);
2580
2581         return true;
2582 }
2583
2584 struct cgroup_ops *cgfsng_ops_init(void)
2585 {
2586         struct cgroup_ops *cgfsng_ops;
2587
2588         cgfsng_ops = malloc(sizeof(struct cgroup_ops));
2589         if (!cgfsng_ops)
2590                 return NULL;
2591
2592         memset(cgfsng_ops, 0, sizeof(struct cgroup_ops));
2593         cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN;
2594
2595         if (!cg_init(cgfsng_ops)) {
2596                 free(cgfsng_ops);
2597                 return NULL;
2598         }
2599
2600         cgfsng_ops->data_init = cgfsng_data_init;
2601         cgfsng_ops->destroy = cgfsng_destroy;
2602         cgfsng_ops->create = cgfsng_create;
2603         cgfsng_ops->enter = cgfsng_enter;
2604         cgfsng_ops->escape = cgfsng_escape;
2605         cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies;
2606         cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies;
2607         cgfsng_ops->get_cgroup = cgfsng_get_cgroup;
2608         cgfsng_ops->get = cgfsng_get;
2609         cgfsng_ops->set = cgfsng_set;
2610         cgfsng_ops->unfreeze = cgfsng_unfreeze;
2611         cgfsng_ops->setup_limits = cgfsng_setup_limits;
2612         cgfsng_ops->driver = "cgfsng";
2613         cgfsng_ops->version = "1.0.0";
2614         cgfsng_ops->attach = cgfsng_attach;
2615         cgfsng_ops->chown = cgfsng_chown;
2616         cgfsng_ops->mount = cgfsng_mount;
2617         cgfsng_ops->nrtasks = cgfsng_nrtasks;
2618
2619         return cgfsng_ops;
2620 }