src/lxc/cgroups/cgfs.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23 #include "config.h"
  24
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <errno.h>
  28 #include <unistd.h>
  29 #include <string.h>
  30 #include <dirent.h>
  31 #include <fcntl.h>
  32 #include <grp.h>
  33 #include <ctype.h>
  34 #include <sys/types.h>
  35 #include <sys/stat.h>
  36 #include <sys/param.h>
  37 #include <sys/inotify.h>
  38 #include <sys/mount.h>
  39 #include <netinet/in.h>
  40 #include <net/if.h>
  41
  42 #include "bdev.h"
  43 #include "error.h"
  44 #include "commands.h"
  45 #include "list.h"
  46 #include "conf.h"
  47 #include "utils.h"
  48 #include "log.h"
  49 #include "cgroup.h"
  50 #include "start.h"
  51 #include "state.h"
  52
  53 #if IS_BIONIC
  54 #include <../include/lxcmntent.h>
  55 #else
  56 #include <mntent.h>
  57 #endif
  58
/* Forward declarations: the three structures below hold pointers to one another. */
struct cgroup_hierarchy;
struct cgroup_meta_data;
struct cgroup_mount_point;
  62
/*
 * cgroup_meta_data: the metadata about the cgroup infrastructure on this
 *                   host
 *
 * Objects are reference counted: lxc_cgroup_get_meta() takes a reference and
 * lxc_cgroup_put_meta() drops one, freeing the object (including every
 * hierarchy and mount point it holds) once the count is no longer positive.
 */
struct cgroup_meta_data {
        ptrdiff_t ref; /* simple refcount */
        struct cgroup_hierarchy **hierarchies; /* indexed by hierarchy number; slots may be NULL */
        struct cgroup_mount_point **mount_points; /* mount points recorded while parsing mountinfo */
        int maximum_hierarchy; /* highest hierarchy number seen, i.e. last index walked in hierarchies */
};
  73
/*
 * cgroup_hierarchy: describes a single cgroup hierarchy
 *                   (may have multiple mount points)
 */
struct cgroup_hierarchy {
        int index; /* hierarchy number parsed from /proc/self/cgroup */
        bool used; /* false if the hierarchy should be ignored by lxc */
        char **subsystems; /* names split from the comma-separated subsystem field */
        struct cgroup_mount_point *rw_absolute_mount_point; /* first writable mount found whose prefix is "/" */
        struct cgroup_mount_point *ro_absolute_mount_point; /* first read-only mount found whose prefix is "/" */
        struct cgroup_mount_point **all_mount_points; /* every mount point attached to this hierarchy */
        size_t all_mount_point_capacity; /* capacity bookkeeping passed to lxc_grow_array() */
};
  87
/*
 * cgroup_mount_point: a mount point to where a hierarchy
 *                     is mounted to
 */
struct cgroup_mount_point {
        struct cgroup_hierarchy *hierarchy; /* hierarchy matched to this mount */
        char *mount_point; /* mount point field of the mountinfo line */
        char *mount_prefix; /* mount prefix field of mountinfo, or "/" for lxcfs mounts / when cgroup namespaces are supported */
        bool read_only; /* true when "rw" is absent from the per-mount options */
        bool need_cpuset_init; /* NOTE(review): presumably consumed by init_cpuset_if_needed() -- confirm */
};
  99
/*
 * cgroup_process_info: describes the membership of a
 *                      process to the different cgroup
 *                      hierarchies
 *
 * Note this is the per-process info tracked by the cgfs_ops.
 * This is not used with cgmanager.
 *
 * NOTE(review): the per-field remarks below are inferred from the field
 * names and types only -- confirm against the functions that fill them in.
 */
struct cgroup_process_info {
        struct cgroup_process_info *next; /* presumably links the entries of the other hierarchies */
        struct cgroup_meta_data *meta_ref; /* presumably the metadata object this entry was built from */
        struct cgroup_hierarchy *hierarchy; /* presumably the hierarchy this entry describes */
        char *cgroup_path;
        char *cgroup_path_sub;
        char **created_paths;
        size_t created_paths_capacity; /* presumably capacity bookkeeping for created_paths */
        size_t created_paths_count; /* presumably number of valid entries in created_paths */
        struct cgroup_mount_point *designated_mount_point;
};
 119
/*
 * cgfs_data: bundles a name, a cgroup pattern, the host cgroup metadata and
 *            the per-process cgroup information.
 *
 * NOTE(review): presumably the private per-container state of the cgfs
 * backend -- confirm against the cgfs_ops callbacks.
 */
struct cgfs_data {
        char *name;
        const char *cgroup_pattern;
        struct cgroup_meta_data *meta;
        struct cgroup_process_info *info;
};
 126
 127 lxc_log_define(lxc_cgfs, lxc);
 128
/* prototypes of file-local (static) helpers */
static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
static bool is_valid_cgroup(const char *name);
static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
                                struct lxc_conf *conf);
static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
static int cgroup_recursive_task_count(const char *cgroup_path);
static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);

/* loading and reference counting of the host cgroup metadata */
static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);

/* free process membership information */
static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
                                struct lxc_conf *conf);

/* tentative definition of the operations table of this backend */
static struct cgroup_ops cgfs_ops;
 157
 158 static int cgroup_rmdir(char *dirname)
 159 {
 160         struct dirent *direntp;
 161         int saved_errno = 0;
 162         DIR *dir;
 163         int ret, failed=0;
 164         char pathname[MAXPATHLEN];
 165
 166         dir = opendir(dirname);
 167         if (!dir) {
 168                 ERROR("%s: failed to open %s", __func__, dirname);
 169                 return -1;
 170         }
 171
 172         while ((direntp = readdir(dir))) {
 173                 struct stat mystat;
 174                 int rc;
 175
 176                 if (!direntp)
 177                         break;
 178
 179                 if (!strcmp(direntp->d_name, ".") ||
 180                     !strcmp(direntp->d_name, ".."))
 181                         continue;
 182
 183                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 184                 if (rc < 0 || rc >= MAXPATHLEN) {
 185                         ERROR("pathname too long");
 186                         failed=1;
 187                         if (!saved_errno)
 188                                 saved_errno = -ENOMEM;
 189                         continue;
 190                 }
 191                 ret = lstat(pathname, &mystat);
 192                 if (ret) {
 193                         SYSERROR("%s: failed to stat %s", __func__, pathname);
 194                         failed=1;
 195                         if (!saved_errno)
 196                                 saved_errno = errno;
 197                         continue;
 198                 }
 199                 if (S_ISDIR(mystat.st_mode)) {
 200                         if (cgroup_rmdir(pathname) < 0) {
 201                                 if (!saved_errno)
 202                                         saved_errno = errno;
 203                                 failed=1;
 204                         }
 205                 }
 206         }
 207
 208         if (rmdir(dirname) < 0) {
 209                 SYSERROR("%s: failed to delete %s", __func__, dirname);
 210                 if (!saved_errno)
 211                         saved_errno = errno;
 212                 failed=1;
 213         }
 214
 215         ret = closedir(dir);
 216         if (ret) {
 217                 SYSERROR("%s: failed to close directory %s", __func__, dirname);
 218                 if (!saved_errno)
 219                         saved_errno = errno;
 220                 failed=1;
 221         }
 222
 223         errno = saved_errno;
 224         return failed ? -1 : 0;
 225 }
 226
 227 static int rmdir_wrapper(void *data)
 228 {
 229         char *path = data;
 230
 231         if (setresgid(0,0,0) < 0)
 232                 SYSERROR("Failed to setgid to 0");
 233         if (setresuid(0,0,0) < 0)
 234                 SYSERROR("Failed to setuid to 0");
 235         if (setgroups(0, NULL) < 0)
 236                 SYSERROR("Failed to clear groups");
 237
 238         return cgroup_rmdir(path);
 239 }
 240
 241 static struct cgroup_meta_data *lxc_cgroup_load_meta()
 242 {
 243         const char *cgroup_use = NULL;
 244         char **cgroup_use_list = NULL;
 245         struct cgroup_meta_data *md = NULL;
 246         int saved_errno;
 247
 248         errno = 0;
 249         cgroup_use = lxc_global_config_value("lxc.cgroup.use");
 250         if (!cgroup_use && errno != 0)
 251                 return NULL;
 252         if (cgroup_use) {
 253                 cgroup_use_list = lxc_string_split_and_trim(cgroup_use, ',');
 254                 if (!cgroup_use_list)
 255                         return NULL;
 256         }
 257
 258         md = lxc_cgroup_load_meta2((const char **)cgroup_use_list);
 259         saved_errno = errno;
 260         lxc_free_array((void **)cgroup_use_list, free);
 261         errno = saved_errno;
 262         return md;
 263 }
 264
 265 /* Step 1: determine all kernel subsystems */
 266 static bool find_cgroup_subsystems(char ***kernel_subsystems)
 267 {
 268         FILE *proc_cgroups;
 269         bool bret = false;
 270         char *line = NULL;
 271         size_t sz = 0;
 272         size_t kernel_subsystems_count = 0;
 273         size_t kernel_subsystems_capacity = 0;
 274         int r;
 275
 276         proc_cgroups = fopen_cloexec("/proc/cgroups", "r");
 277         if (!proc_cgroups)
 278                 return false;
 279
 280         while (getline(&line, &sz, proc_cgroups) != -1) {
 281                 char *tab1;
 282                 char *tab2;
 283                 int hierarchy_number;
 284
 285                 if (line[0] == '#')
 286                         continue;
 287                 if (!line[0])
 288                         continue;
 289
 290                 tab1 = strchr(line, '\t');
 291                 if (!tab1)
 292                         continue;
 293                 *tab1++ = '\0';
 294                 tab2 = strchr(tab1, '\t');
 295                 if (!tab2)
 296                         continue;
 297                 *tab2 = '\0';
 298
 299                 tab2 = NULL;
 300                 hierarchy_number = strtoul(tab1, &tab2, 10);
 301                 if (!tab2 || *tab2)
 302                         continue;
 303                 (void)hierarchy_number;
 304
 305                 r = lxc_grow_array((void ***)kernel_subsystems, &kernel_subsystems_capacity, kernel_subsystems_count + 1, 12);
 306                 if (r < 0)
 307                         goto out;
 308                 (*kernel_subsystems)[kernel_subsystems_count] = strdup(line);
 309                 if (!(*kernel_subsystems)[kernel_subsystems_count])
 310                         goto out;
 311                 kernel_subsystems_count++;
 312         }
 313         bret = true;
 314
 315 out:
 316         fclose(proc_cgroups);
 317         free(line);
 318         return bret;
 319 }
 320
 321 /* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
 322  *         since mount points don't specify hierarchy number and
 323  *         /proc/cgroups does not contain named hierarchies
 324  */
 325 static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
 326         bool all_kernel_subsystems, bool all_named_subsystems,
 327         const char **subsystem_whitelist)
 328 {
 329         FILE *proc_self_cgroup;
 330         char *line = NULL;
 331         size_t sz = 0;
 332         int r;
 333         bool bret = false;
 334         size_t hierarchy_capacity = 0;
 335
 336         proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
 337         /* if for some reason (because of setns() and pid namespace for example),
 338          * /proc/self is not valid, we try /proc/1/cgroup... */
 339         if (!proc_self_cgroup)
 340                 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
 341         if (!proc_self_cgroup)
 342                 return false;
 343
 344         while (getline(&line, &sz, proc_self_cgroup) != -1) {
 345                 /* file format: hierarchy:subsystems:group,
 346                  * we only extract hierarchy and subsystems
 347                  * here */
 348                 char *colon1;
 349                 char *colon2;
 350                 int hierarchy_number;
 351                 struct cgroup_hierarchy *h = NULL;
 352                 char **p;
 353
 354                 if (!line[0])
 355                         continue;
 356
 357                 colon1 = strchr(line, ':');
 358                 if (!colon1)
 359                         continue;
 360                 *colon1++ = '\0';
 361                 colon2 = strchr(colon1, ':');
 362                 if (!colon2)
 363                         continue;
 364                 *colon2 = '\0';
 365
 366                 colon2 = NULL;
 367
 368                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
 369                  * form: 0::/
 370                  * These entries need to be skipped.
 371                  */
 372                 if (!strcmp(colon1, ""))
 373                         continue;
 374
 375                 hierarchy_number = strtoul(line, &colon2, 10);
 376                 if (!colon2 || *colon2)
 377                         continue;
 378
 379                 if (hierarchy_number > meta_data->maximum_hierarchy) {
 380                         /* lxc_grow_array will never shrink, so even if we find a lower
 381                         * hierarchy number here, the array will never be smaller
 382                         */
 383                         r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
 384                         if (r < 0)
 385                                 goto out;
 386
 387                         meta_data->maximum_hierarchy = hierarchy_number;
 388                 }
 389
 390                 /* this shouldn't happen, we had this already */
 391                 if (meta_data->hierarchies[hierarchy_number])
 392                         goto out;
 393
 394                 h = calloc(1, sizeof(struct cgroup_hierarchy));
 395                 if (!h)
 396                         goto out;
 397
 398                 meta_data->hierarchies[hierarchy_number] = h;
 399
 400                 h->index = hierarchy_number;
 401                 h->subsystems = lxc_string_split_and_trim(colon1, ',');
 402                 if (!h->subsystems)
 403                         goto out;
 404                 /* see if this hierarchy should be considered */
 405                 if (!all_kernel_subsystems || !all_named_subsystems) {
 406                         for (p = h->subsystems; *p; p++) {
 407                                 if (!strncmp(*p, "name=", 5)) {
 408                                         if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
 409                                                 h->used = true;
 410                                                 break;
 411                                         }
 412                                 } else {
 413                                         if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
 414                                                 h->used = true;
 415                                                 break;
 416                                         }
 417                                 }
 418                         }
 419                 } else {
 420                         /* we want all hierarchy anyway */
 421                         h->used = true;
 422                 }
 423         }
 424         bret = true;
 425
 426 out:
 427         fclose(proc_self_cgroup);
 428         free(line);
 429         return bret;
 430 }
 431
 432 /* Step 3: determine all mount points of each hierarchy */
 433 static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
 434 {
 435         bool bret = false;
 436         FILE *proc_self_mountinfo;
 437         char *line = NULL;
 438         size_t sz = 0;
 439         char **tokens = NULL;
 440         size_t mount_point_count = 0;
 441         size_t mount_point_capacity = 0;
 442         size_t token_capacity = 0;
 443         int r;
 444         bool is_cgns = cgns_supported();
 445
 446         proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
 447         /* if for some reason (because of setns() and pid namespace for example),
 448          * /proc/self is not valid, we try /proc/1/cgroup... */
 449         if (!proc_self_mountinfo)
 450                 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
 451         if (!proc_self_mountinfo)
 452                 return false;
 453
 454         while (getline(&line, &sz, proc_self_mountinfo) != -1) {
 455                 char *token, *line_tok, *saveptr = NULL;
 456                 size_t i, j, k;
 457                 struct cgroup_mount_point *mount_point;
 458                 struct cgroup_hierarchy *h;
 459                 char **subsystems;
 460                 bool is_lxcfs = false;
 461
 462                 if (line[0] && line[strlen(line) - 1] == '\n')
 463                         line[strlen(line) - 1] = '\0';
 464
 465                 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
 466                         r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
 467                         if (r < 0)
 468                                 goto out;
 469                         tokens[i++] = token;
 470                 }
 471
 472                 /* layout of /proc/self/mountinfo:
 473                  *      0: id
 474                  *      1: parent id
 475                  *      2: device major:minor
 476                  *      3: mount prefix
 477                  *      4: mount point
 478                  *      5: per-mount options
 479                  *    [optional X]: additional data
 480                  *    X+7: "-"
 481                  *    X+8: type
 482                  *    X+9: source
 483                  *    X+10: per-superblock options
 484                  */
 485                 for (j = 6; j < i && tokens[j]; j++)
 486                         if (!strcmp(tokens[j], "-"))
 487                                 break;
 488
 489                 /* could not find separator */
 490                 if (j >= i || !tokens[j])
 491                         continue;
 492                 /* there should be exactly three fields after
 493                  * the separator
 494                  */
 495                 if (i != j + 4)
 496                         continue;
 497
 498                 /* not a cgroup filesystem */
 499                 if (strcmp(tokens[j + 1], "cgroup") != 0) {
 500                         if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
 501                                 continue;
 502                         if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
 503                                 continue;
 504                         is_lxcfs = true;
 505                         char *curtok = tokens[4] + 15;
 506                         subsystems = subsystems_from_mount_options(curtok,
 507                                                          kernel_subsystems);
 508                 } else
 509                         subsystems = subsystems_from_mount_options(tokens[j + 3],
 510                                                          kernel_subsystems);
 511                 if (!subsystems)
 512                         goto out;
 513
 514                 h = NULL;
 515                 for (k = 0; k <= meta_data->maximum_hierarchy; k++) {
 516                         if (meta_data->hierarchies[k] &&
 517                             meta_data->hierarchies[k]->subsystems[0] &&
 518                             lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
 519                                 /* TODO: we could also check if the lists really match completely,
 520                                  *       just to have an additional sanity check */
 521                                 h = meta_data->hierarchies[k];
 522                                 break;
 523                         }
 524                 }
 525                 lxc_free_array((void **)subsystems, free);
 526
 527                 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
 528                 if (r < 0)
 529                         goto out;
 530
 531                 /* create mount point object */
 532                 mount_point = calloc(1, sizeof(*mount_point));
 533                 if (!mount_point)
 534                         goto out;
 535
 536                 meta_data->mount_points[mount_point_count++] = mount_point;
 537
 538                 mount_point->hierarchy = h;
 539                 if (is_lxcfs || is_cgns)
 540                         mount_point->mount_prefix = strdup("/");
 541                 else
 542                         mount_point->mount_prefix = strdup(tokens[3]);
 543                 mount_point->mount_point = strdup(tokens[4]);
 544                 if (!mount_point->mount_point || !mount_point->mount_prefix)
 545                         goto out;
 546                 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
 547
 548                 if (!strcmp(mount_point->mount_prefix, "/")) {
 549                         if (mount_point->read_only) {
 550                                 if (!h->ro_absolute_mount_point)
 551                                         h->ro_absolute_mount_point = mount_point;
 552                         } else {
 553                                 if (!h->rw_absolute_mount_point)
 554                                         h->rw_absolute_mount_point = mount_point;
 555                         }
 556                 }
 557
 558                 k = lxc_array_len((void **)h->all_mount_points);
 559                 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
 560                 if (r < 0)
 561                         goto out;
 562                 h->all_mount_points[k] = mount_point;
 563         }
 564         bret = true;
 565
 566 out:
 567         fclose(proc_self_mountinfo);
 568         free(tokens);
 569         free(line);
 570         return bret;
 571 }
 572
 573 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
 574 {
 575         bool all_kernel_subsystems = true;
 576         bool all_named_subsystems = false;
 577         struct cgroup_meta_data *meta_data = NULL;
 578         char **kernel_subsystems = NULL;
 579         int saved_errno = 0;
 580
 581         /* if the subsystem whitelist is not specified, include all
 582          * hierarchies that contain kernel subsystems by default but
 583          * no hierarchies that only contain named subsystems
 584          *
 585          * if it is specified, the specifier @all will select all
 586          * hierarchies, @kernel will select all hierarchies with
 587          * kernel subsystems and @named will select all named
 588          * hierarchies
 589          */
 590         all_kernel_subsystems = subsystem_whitelist ?
 591                 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
 592                 true;
 593         all_named_subsystems = subsystem_whitelist ?
 594                 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
 595                 true;
 596
 597         meta_data = calloc(1, sizeof(struct cgroup_meta_data));
 598         if (!meta_data)
 599                 return NULL;
 600         meta_data->ref = 1;
 601
 602         if (!find_cgroup_subsystems(&kernel_subsystems))
 603                 goto out_error;
 604
 605         if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
 606                                 all_named_subsystems, subsystem_whitelist))
 607                 goto out_error;
 608
 609         if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
 610                 goto out_error;
 611
 612         /* oops, we couldn't find anything */
 613         if (!meta_data->hierarchies || !meta_data->mount_points) {
 614                 errno = EINVAL;
 615                 goto out_error;
 616         }
 617
 618         lxc_free_array((void **)kernel_subsystems, free);
 619         return meta_data;
 620
 621 out_error:
 622         saved_errno = errno;
 623         lxc_free_array((void **)kernel_subsystems, free);
 624         lxc_cgroup_put_meta(meta_data);
 625         errno = saved_errno;
 626         return NULL;
 627 }
 628
 629 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
 630 {
 631         meta_data->ref++;
 632         return meta_data;
 633 }
 634
 635 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
 636 {
 637         size_t i;
 638         if (!meta_data)
 639                 return NULL;
 640         if (--meta_data->ref > 0)
 641                 return meta_data;
 642         lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
 643         if (meta_data->hierarchies)
 644                 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
 645                         if (meta_data->hierarchies[i])
 646                                 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
 647         free(meta_data->hierarchies);
 648         free(meta_data);
 649         return NULL;
 650 }
 651
 652 static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
 653 {
 654         size_t i;
 655         for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
 656                 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
 657                 if (!h)
 658                         continue;
 659                 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
 660                         return h;
 661         }
 662         return NULL;
 663 }
 664
 665 static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
 666 {
 667         return mp && access(mp->mount_point, F_OK) == 0;
 668 }
 669
 670 static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
 671 {
 672         struct cgroup_mount_point **mps;
 673         struct cgroup_mount_point *current_result = NULL;
 674         ssize_t quality = -1;
 675
 676         /* trivial case */
 677         if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
 678                 return hierarchy->rw_absolute_mount_point;
 679         if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
 680                 return hierarchy->ro_absolute_mount_point;
 681
 682         for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
 683                 struct cgroup_mount_point *mp = *mps;
 684                 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
 685
 686                 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
 687                         prefix_len = 0;
 688
 689                 if (!mountpoint_is_accessible(mp))
 690                         continue;
 691
 692                 if (should_be_writable && mp->read_only)
 693                         continue;
 694
 695                 if (!prefix_len ||
 696                     (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
 697                      (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
 698                         /* search for the best quality match, i.e. the match with the
 699                          * shortest prefix where this group is still contained
 700                          */
 701                         if (quality == -1 || prefix_len < quality) {
 702                                 current_result = mp;
 703                                 quality = prefix_len;
 704                         }
 705                 }
 706         }
 707
 708         if (!current_result)
 709                 errno = ENOENT;
 710         return current_result;
 711 }
 712
 713 static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
 714 {
 715         struct cgroup_meta_data *meta_data;
 716         struct cgroup_hierarchy *h;
 717         struct cgroup_mount_point *mp;
 718         char *result;
 719         int saved_errno;
 720
 721         meta_data = lxc_cgroup_load_meta();
 722         if (!meta_data)
 723                 return NULL;
 724
 725         h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
 726         if (!h)
 727                 goto out_error;
 728
 729         mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
 730         if (!mp)
 731                 goto out_error;
 732
 733         result = cgroup_to_absolute_path(mp, group, suffix);
 734         if (!result)
 735                 goto out_error;
 736
 737         lxc_cgroup_put_meta(meta_data);
 738         return result;
 739
 740 out_error:
 741         saved_errno = errno;
 742         lxc_cgroup_put_meta(meta_data);
 743         errno = saved_errno;
 744         return NULL;
 745 }
 746
 747 static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
 748 {
 749         char pid_buf[32];
 750         snprintf(pid_buf, 32, "/proc/%lu/cgroup", (unsigned long)pid);
 751         return lxc_cgroup_process_info_getx(pid_buf, meta);
 752 }
 753
 754 static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
 755 {
 756         return lxc_cgroup_process_info_get(1, meta);
 757 }
 758
 759 static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
 760 {
 761         struct cgroup_process_info *i;
 762         i = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
 763         if (!i)
 764                 i = lxc_cgroup_process_info_get(getpid(), meta);
 765         return i;
 766 }
 767
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid.  'mnt' is passed in as
 * the full current cgroup.  Say that is /sys/fs/cgroup/lxc/2975 and the container
 * name is c1.  We want to rename the cgroup directory to /sys/fs/cgroup/lxc/c1,
 * and return the string /sys/fs/cgroup/lxc/c1.
 */
 775 static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
 776 {
 777         char *dir, *fulloldpath;
 778         char *newname, *fullnewpath;
 779         int len, newlen, ret;
 780
 781         /*
 782          * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
 783          * name is c1,
 784          * dir: /ab
 785          * fulloldpath = /cgroup/ab/2375
 786          * fullnewpath = /cgroup/ab/c1
 787          * newname = /ab/c1
 788          */
 789         dir = alloca(strlen(oldname) + 1);
 790         strcpy(dir, oldname);
 791
 792         len = strlen(oldname) + strlen(mountpath) + 22;
 793         fulloldpath = alloca(len);
 794         ret = snprintf(fulloldpath, len, "%s/%s/%ld", mountpath, oldname, (unsigned long)pid);
 795         if (ret < 0 || ret >= len)
 796                 return NULL;
 797
 798         len = strlen(dir) + strlen(name) + 2;
 799         newname = malloc(len);
 800         if (!newname) {
 801                 SYSERROR("Out of memory");
 802                 return NULL;
 803         }
 804         ret = snprintf(newname, len, "%s/%s", dir, name);
 805         if (ret < 0 || ret >= len) {
 806                 free(newname);
 807                 return NULL;
 808         }
 809
 810         newlen = strlen(mountpath) + len + 2;
 811         fullnewpath = alloca(newlen);
 812         ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
 813         if (ret < 0 || ret >= newlen) {
 814                 free(newname);
 815                 return NULL;
 816         }
 817
 818         if (access(fullnewpath, F_OK) == 0) {
 819                 if (rmdir(fullnewpath) != 0) {
 820                         SYSERROR("container cgroup %s already exists.", fullnewpath);
 821                         free(newname);
 822                         return NULL;
 823                 }
 824         }
 825         if (rename(fulloldpath, fullnewpath)) {
 826                 SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
 827                 free(newname);
 828                 return NULL;
 829         }
 830
 831         DEBUG("'%s' renamed to '%s'", oldname, newname);
 832
 833         return newname;
 834 }
 835
 836 static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
 837 {
 838         char **p;
 839
 840         for (p = h->subsystems; *p; p++) {
 841                 if (is_crucial_cgroup_subsystem(*p))
 842                         return true;
 843         }
 844         return false;
 845 }
 846
 847 /* create a new cgroup */
 848 static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
 849 {
 850         char **cgroup_path_components = NULL;
 851         char **p = NULL;
 852         char *path_so_far = NULL;
 853         char **new_cgroup_paths = NULL;
 854         char **new_cgroup_paths_sub = NULL;
 855         struct cgroup_mount_point *mp;
 856         struct cgroup_hierarchy *h;
 857         struct cgroup_process_info *base_info = NULL;
 858         struct cgroup_process_info *info_ptr;
 859         int saved_errno;
 860         int r;
 861         unsigned suffix = 0;
 862         bool had_sub_pattern = false;
 863         size_t i;
 864
 865         if (!is_valid_cgroup(name)) {
 866                 ERROR("Invalid cgroup name: '%s'", name);
 867                 errno = EINVAL;
 868                 return NULL;
 869         }
 870
 871         if (!strstr(path_pattern, "%n")) {
 872                 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
 873                 errno = EINVAL;
 874                 return NULL;
 875         }
 876
 877         /* we will modify the result of this operation directly,
 878          * so we don't have to copy the data structure
 879          */
 880         base_info = (path_pattern[0] == '/') ?
 881                 lxc_cgroup_process_info_get_init(meta_data) :
 882                 lxc_cgroup_process_info_get_self(meta_data);
 883         if (!base_info)
 884                 return NULL;
 885
 886         new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
 887         if (!new_cgroup_paths)
 888                 goto out_initial_error;
 889
 890         new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
 891         if (!new_cgroup_paths_sub)
 892                 goto out_initial_error;
 893
 894         /* find mount points we can use */
 895         for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
 896                 h = info_ptr->hierarchy;
 897                 if (!h)
 898                         continue;
 899                 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
 900                 if (!mp) {
 901                         ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
 902                         goto out_initial_error;
 903                 }
 904                 info_ptr->designated_mount_point = mp;
 905
 906                 if (lxc_string_in_array("ns", (const char **)h->subsystems))
 907                         continue;
 908                 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
 909                         ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
 910                         goto out_initial_error;
 911                 }
 912         }
 913
 914         /* normalize the path */
 915         cgroup_path_components = lxc_normalize_path(path_pattern);
 916         if (!cgroup_path_components)
 917                 goto out_initial_error;
 918
 919         /* go through the path components to see if we can create them */
 920         for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
 921                 /* we only want to create the same component with -1, -2, etc.
 922                  * if the component contains the container name itself, otherwise
 923                  * it's not an error if it already exists
 924                  */
 925                 char *p_eff = *p ? *p : (char *)sub_pattern;
 926                 bool contains_name = strstr(p_eff, "%n");
 927                 char *current_component = NULL;
 928                 char *current_subpath = NULL;
 929                 char *current_entire_path = NULL;
 930                 char *parts[3];
 931                 size_t j = 0;
 932                 i = 0;
 933
 934                 /* if we are processing the subpattern, we want to make sure
 935                  * loop is ended the next time around
 936                  */
 937                 if (!*p) {
 938                         had_sub_pattern = true;
 939                         p--;
 940                 }
 941
 942                 goto find_name_on_this_level;
 943
 944         cleanup_name_on_this_level:
 945                 /* This is reached if we found a name clash.
 946                  * In that case, remove the cgroup from all previous hierarchies
 947                  */
 948                 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
 949                         if (info_ptr->created_paths_count < 1)
 950                                 continue;
 951                         r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
 952                         if (r < 0)
 953                                 WARN("could not clean up cgroup we created when trying to create container");
 954                         free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
 955                         info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
 956                 }
 957                 if (current_component != current_subpath)
 958                         free(current_subpath);
 959                 if (current_component != p_eff)
 960                         free(current_component);
 961                 current_component = current_subpath = NULL;
 962                 /* try again with another suffix */
 963                 ++suffix;
 964
 965         find_name_on_this_level:
 966                 /* determine name of the path component we should create */
 967                 if (contains_name && suffix > 0) {
 968                         char *buf = calloc(strlen(name) + 32, 1);
 969                         if (!buf)
 970                                 goto out_initial_error;
 971                         snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
 972                         current_component = lxc_string_replace("%n", buf, p_eff);
 973                         free(buf);
 974                 } else {
 975                         current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
 976                 }
 977                 parts[0] = path_so_far;
 978                 parts[1] = current_component;
 979                 parts[2] = NULL;
 980                 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
 981
 982                 /* Now go through each hierarchy and try to create the
 983                  * corresponding cgroup
 984                  */
 985                 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
 986                         char *parts2[3];
 987
 988                         if (!info_ptr->hierarchy)
 989                                 continue;
 990
 991                         if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
 992                                 continue;
 993                         current_entire_path = NULL;
 994
 995                         parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
 996                         parts2[1] = current_subpath;
 997                         parts2[2] = NULL;
 998                         current_entire_path = lxc_string_join("/", (const char **)parts2, false);
 999
1000                         if (!*p) {
1001                                 /* we are processing the subpath, so only update that one */
1002                                 free(new_cgroup_paths_sub[i]);
1003                                 new_cgroup_paths_sub[i] = strdup(current_entire_path);
1004                                 if (!new_cgroup_paths_sub[i])
1005                                         goto cleanup_from_error;
1006                         } else {
1007                                 /* remember which path was used on this controller */
1008                                 free(new_cgroup_paths[i]);
1009                                 new_cgroup_paths[i] = strdup(current_entire_path);
1010                                 if (!new_cgroup_paths[i])
1011                                         goto cleanup_from_error;
1012                         }
1013
1014                         r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1015                         if (r < 0 && errno == EEXIST && contains_name) {
1016                                 /* name clash => try new name with new suffix */
1017                                 free(current_entire_path);
1018                                 current_entire_path = NULL;
1019                                 goto cleanup_name_on_this_level;
1020                         } else if (r < 0 && errno != EEXIST) {
1021                                 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1022                                         SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1023                                         goto cleanup_from_error;
1024                                 }
1025                                 goto skip;
1026                         } else if (r == 0) {
1027                                 /* successfully created */
1028                                 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1029                                 if (r < 0)
1030                                         goto cleanup_from_error;
1031                                 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
1032                                         ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1033                                         goto cleanup_from_error;
1034                                 }
1035                                 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1036                         } else {
1037                                 /* if we didn't create the cgroup, then we have to make sure that
1038                                  * further cgroups will be created properly
1039                                  */
1040                                 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
1041                                         ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
1042                                         goto cleanup_from_error;
1043                                 }
1044                                 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1045                                         ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1046                                         goto cleanup_from_error;
1047                                 }
1048
1049 skip:
1050                                 /* already existed but path component of pattern didn't contain '%n',
1051                                  * so this is not an error; but then we don't need current_entire_path
1052                                  * anymore...
1053                                  */
1054                                 free(current_entire_path);
1055                                 current_entire_path = NULL;
1056                         }
1057                 }
1058
1059                 /* save path so far */
1060                 free(path_so_far);
1061                 path_so_far = strdup(current_subpath);
1062                 if (!path_so_far)
1063                         goto cleanup_from_error;
1064
1065                 /* cleanup */
1066                 if (current_component != current_subpath)
1067                         free(current_subpath);
1068                 if (current_component != p_eff)
1069                         free(current_component);
1070                 current_component = current_subpath = NULL;
1071                 continue;
1072
1073         cleanup_from_error:
1074                 /* called if an error occurred in the loop, so we
1075                  * do some additional cleanup here
1076                  */
1077                 saved_errno = errno;
1078                 if (current_component != current_subpath)
1079                         free(current_subpath);
1080                 if (current_component != p_eff)
1081                         free(current_component);
1082                 free(current_entire_path);
1083                 errno = saved_errno;
1084                 goto out_initial_error;
1085         }
1086
1087         /* we're done, now update the paths */
1088         for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
1089                 if (!info_ptr->hierarchy)
1090                         continue;
1091                 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1092                  * will take care of it
1093                  * Since we do a continue in above loop, new_cgroup_paths[i] is
1094                  * unset anyway, as is new_cgroup_paths_sub[i]
1095                  */
1096                 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1097                         continue;
1098                 free(info_ptr->cgroup_path);
1099                 info_ptr->cgroup_path = new_cgroup_paths[i];
1100                 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
1101         }
1102         /* don't use lxc_free_array since we used the array members
1103          * to store them in our result...
1104          */
1105         free(new_cgroup_paths);
1106         free(new_cgroup_paths_sub);
1107         free(path_so_far);
1108         lxc_free_array((void **)cgroup_path_components, free);
1109         return base_info;
1110
1111 out_initial_error:
1112         saved_errno = errno;
1113         free(path_so_far);
1114         lxc_cgroup_process_info_free_and_remove(base_info, NULL);
1115         lxc_free_array((void **)new_cgroup_paths, free);
1116         lxc_free_array((void **)new_cgroup_paths_sub, free);
1117         lxc_free_array((void **)cgroup_path_components, free);
1118         errno = saved_errno;
1119         return NULL;
1120 }
1121
1122 static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
1123 {
1124         struct cgroup_process_info *info_ptr;
1125         int r;
1126
1127         for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1128                 if (!info_ptr->hierarchy)
1129                         continue;
1130
1131                 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1132                         continue;
1133                 /*
1134                  * For any path which has ns cgroup mounted, handler->pid is already
1135                  * moved into a container called '%d % (handler->pid)'.  Rename it to
1136                  * the cgroup name and record that.
1137                  */
1138                 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1139                                 info_ptr->cgroup_path, pid, name);
1140                 if (!tmp)
1141                         return -1;
1142                 free(info_ptr->cgroup_path);
1143                 info_ptr->cgroup_path = tmp;
1144                 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1145                 if (r < 0)
1146                         return -1;
1147                 tmp = strdup(tmp);
1148                 if (!tmp)
1149                         return -1;
1150                 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1151         }
1152         return 0;
1153 }
1154
1155 /* get the cgroup membership of a given container */
1156 static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
1157 {
1158         struct cgroup_process_info *result = NULL;
1159         int saved_errno = 0;
1160         size_t i;
1161         struct cgroup_process_info **cptr = &result;
1162         struct cgroup_process_info *entry = NULL;
1163         char *path = NULL;
1164
1165         for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1166                 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1167                 if (!h || !h->used)
1168                         continue;
1169
1170                 /* use the command interface to look for the cgroup */
1171                 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
1172                 if (!path) {
1173                         h->used = false;
1174                         continue;
1175                 }
1176
1177                 entry = calloc(1, sizeof(struct cgroup_process_info));
1178                 if (!entry)
1179                         goto out_error;
1180                 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1181                 entry->hierarchy = h;
1182                 entry->cgroup_path = path;
1183                 path = NULL;
1184
1185                 /* it is not an error if we don't find anything here,
1186                  * it is up to the caller to decide what to do in that
1187                  * case */
1188                 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1189
1190                 *cptr = entry;
1191                 cptr = &entry->next;
1192                 entry = NULL;
1193         }
1194
1195         return result;
1196 out_error:
1197         saved_errno = errno;
1198         free(path);
1199         lxc_cgroup_process_info_free(result);
1200         lxc_cgroup_process_info_free(entry);
1201         errno = saved_errno;
1202         return NULL;
1203 }
1204
1205 /* move a processs to the cgroups specified by the membership */
1206 static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
1207 {
1208         char pid_buf[32];
1209         char *cgroup_tasks_fn;
1210         int r;
1211         struct cgroup_process_info *info_ptr;
1212
1213         snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1214         for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1215                 if (!info_ptr->hierarchy)
1216                         continue;
1217
1218                 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1219                         info_ptr->cgroup_path_sub :
1220                         info_ptr->cgroup_path;
1221
1222                 if (!info_ptr->designated_mount_point) {
1223                         info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1224                         if (!info_ptr->designated_mount_point) {
1225                                 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1226                                 return -1;
1227                         }
1228                 }
1229
1230                 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1231                 if (!cgroup_tasks_fn) {
1232                         SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1233                         return -1;
1234                 }
1235
1236                 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
1237                 free(cgroup_tasks_fn);
1238                 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
1239                         SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1240                         return -1;
1241                 }
1242         }
1243
1244         return 0;
1245 }
1246
1247 /* free process membership information */
1248 void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
1249 {
1250         struct cgroup_process_info *next;
1251         if (!info)
1252                 return;
1253         next = info->next;
1254         lxc_cgroup_put_meta(info->meta_ref);
1255         free(info->cgroup_path);
1256         free(info->cgroup_path_sub);
1257         lxc_free_array((void **)info->created_paths, free);
1258         free(info);
1259         lxc_cgroup_process_info_free(next);
1260 }
1261
1262 /* free process membership information and remove cgroups that were created */
1263 void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
1264 {
1265         struct cgroup_process_info *next;
1266         char **pp;
1267         if (!info)
1268                 return;
1269         next = info->next;
1270         {
1271                 struct cgroup_mount_point *mp = info->designated_mount_point;
1272                 if (!mp)
1273                         mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1274                 if (mp)
1275                         /* ignore return value here, perhaps we created the
1276                          * '/lxc' cgroup in this container but another container
1277                          * is still running (for example)
1278                          */
1279                         (void)remove_cgroup(mp, info->cgroup_path, true, conf);
1280         }
1281         for (pp = info->created_paths; pp && *pp; pp++);
1282         for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
1283                 free(*pp);
1284         }
1285         free(info->created_paths);
1286         lxc_cgroup_put_meta(info->meta_ref);
1287         free(info->cgroup_path);
1288         free(info->cgroup_path_sub);
1289         free(info);
1290         lxc_cgroup_process_info_free_and_remove(next, conf);
1291 }
1292
1293 static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
1294 {
1295         struct cgroup_process_info *info = d->info;
1296         info = find_info_for_subsystem(info, subsystem);
1297         if (!info)
1298                 return NULL;
1299         prune_init_scope(info->cgroup_path);
1300         return info->cgroup_path;
1301 }
1302
1303 static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
1304 {
1305         struct cgroup_process_info *info = d->info;
1306         struct cgroup_mount_point *mp = NULL;
1307
1308         info = find_info_for_subsystem(info, subsystem);
1309         if (!info)
1310                 return NULL;
1311         if (info->designated_mount_point) {
1312                 mp = info->designated_mount_point;
1313         } else {
1314                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1315                 if (!mp)
1316                         return NULL;
1317         }
1318         return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1319 }
1320
1321 static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
1322 {
1323         struct cgroup_meta_data *meta;
1324         struct cgroup_process_info *base_info, *info;
1325         struct cgroup_mount_point *mp;
1326         char *result = NULL;
1327
1328         meta = lxc_cgroup_load_meta();
1329         if (!meta)
1330                 return NULL;
1331         base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1332         if (!base_info)
1333                 goto out1;
1334         info = find_info_for_subsystem(base_info, subsystem);
1335         if (!info)
1336                 goto out2;
1337         if (info->designated_mount_point) {
1338                 mp = info->designated_mount_point;
1339         } else {
1340                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1341                 if (!mp)
1342                         goto out3;
1343         }
1344         result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1345 out3:
1346 out2:
1347         lxc_cgroup_process_info_free(base_info);
1348 out1:
1349         lxc_cgroup_put_meta(meta);
1350         return result;
1351 }
1352
/*
 * Set the cgroup setting 'filename' to 'value' using the membership
 * information stored in 'd'.
 *
 * The subsystem is the part of 'filename' before the first '.', or the
 * whole name if it contains no '.'.  errno is preset to ENOENT before the
 * path lookup, and the errno left by do_cgroup_set() is preserved across
 * the cleanup.
 *
 * Returns the result of do_cgroup_set(), or -1 if no path was found.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
        size_t prefix_len = strcspn(filename, ".");
        char *controller = alloca(prefix_len + 1);
        char *abs_path;
        int rc, saved_errno;

        /* controller = filename truncated at the first '.' */
        memcpy(controller, filename, prefix_len);
        controller[prefix_len] = '\0';

        errno = ENOENT;
        abs_path = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
        if (!abs_path)
                return -1;

        rc = do_cgroup_set(abs_path, filename, value);
        saved_errno = errno;
        free(abs_path);
        errno = saved_errno;

        return rc;
}
1373
/*
 * Set the cgroup setting 'filename' to 'value' for the container 'name'
 * in 'lxcpath'.
 *
 * The subsystem is the part of 'filename' before the first '.', or the
 * whole name if it contains no '.'.
 *
 * Returns the result of do_cgroup_set(), or -1 if the container's cgroup
 * path for that subsystem could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
        size_t prefix_len = strcspn(filename, ".");
        char *controller = alloca(prefix_len + 1);
        char *abs_path;
        int rc;

        /* controller = filename truncated at the first '.' */
        memcpy(controller, filename, prefix_len);
        controller[prefix_len] = '\0';

        abs_path = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
        if (!abs_path)
                return -1;

        rc = do_cgroup_set(abs_path, filename, value);
        free(abs_path);

        return rc;
}
1391
/*
 * Read the cgroup setting 'filename' of the container 'name' in 'lxcpath'
 * into 'value' (a buffer of 'len' bytes).
 *
 * The subsystem is the part of 'filename' before the first '.', or the
 * whole name if it contains no '.'.
 *
 * Returns the result of do_cgroup_get(), or -1 if the container's cgroup
 * path for that subsystem could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
        size_t prefix_len = strcspn(filename, ".");
        char *controller = alloca(prefix_len + 1);
        char *abs_path;
        int rc;

        /* controller = filename truncated at the first '.' */
        memcpy(controller, filename, prefix_len);
        controller[prefix_len] = '\0';

        abs_path = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
        if (!abs_path)
                return -1;

        rc = do_cgroup_get(abs_path, filename, value, len);
        free(abs_path);

        return rc;
}
1409
1410 static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
1411 {
1412         size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1413         char *path = NULL;
1414         char **parts = NULL;
1415         char *dirname = NULL;
1416         char *abs_path = NULL;
1417         char *abs_path2 = NULL;
1418         struct cgfs_data *cgfs_d;
1419         struct cgroup_process_info *info, *base_info;
1420         int r, saved_errno = 0;
1421
1422         if (cgns_supported())
1423                 return true;
1424
1425         cgfs_d = hdata;
1426         if (!cgfs_d)
1427                 return false;
1428         base_info = cgfs_d->info;
1429
1430         /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1431          * have access to the lxc_conf object at this point. It really should be up
1432          * to the caller to fix this, but this doesn't really hurt.
1433          */
1434         if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1435                 type = LXC_AUTO_CGROUP_FULL_MIXED;
1436         else if (type == LXC_AUTO_CGROUP_NOSPEC)
1437                 type = LXC_AUTO_CGROUP_MIXED;
1438
1439         if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1440                 ERROR("could not mount cgroups into container: invalid type specified internally");
1441                 errno = EINVAL;
1442                 return false;
1443         }
1444
1445         path = calloc(1, bufsz);
1446         if (!path)
1447                 return false;
1448         snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
1449         r = safe_mount("cgroup_root", path, "tmpfs",
1450                         MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1451                         "size=10240k,mode=755",
1452                         root);
1453         if (r < 0) {
1454                 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
1455                 return false;
1456         }
1457
1458         /* now mount all the hierarchies we care about */
1459         for (info = base_info; info; info = info->next) {
1460                 size_t subsystem_count, i;
1461                 struct cgroup_mount_point *mp = info->designated_mount_point;
1462
1463                 if (!info->hierarchy)
1464                         continue;
1465
1466                 if (!mountpoint_is_accessible(mp))
1467                         mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1468
1469                 if (!mp) {
1470                         SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1471                         goto out_error;
1472                 }
1473
1474                 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1475                 parts = calloc(subsystem_count + 1, sizeof(char *));
1476                 if (!parts)
1477                         goto out_error;
1478
1479                 for (i = 0; i < subsystem_count; i++) {
1480                         if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1481                                 parts[i] = info->hierarchy->subsystems[i] + 5;
1482                         else
1483                                 parts[i] = info->hierarchy->subsystems[i];
1484                 }
1485                 dirname = lxc_string_join(",", (const char **)parts, false);
1486                 if (!dirname)
1487                         goto out_error;
1488
1489                 /* create subsystem directory */
1490                 abs_path = lxc_append_paths(path, dirname);
1491                 if (!abs_path)
1492                         goto out_error;
1493                 r = mkdir_p(abs_path, 0755);
1494                 if (r < 0 && errno != EEXIST) {
1495                         SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1496                         goto out_error;
1497                 }
1498
1499                 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1500                 if (!abs_path2)
1501                         goto out_error;
1502
1503                 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1504                         /* bind-mount the cgroup entire filesystem there */
1505                         if (strcmp(mp->mount_prefix, "/") != 0) {
1506                                 /* FIXME: maybe we should just try to remount the entire hierarchy
1507                                  *        with a regular mount command? may that works? */
1508                                 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1509                                 goto out_error;
1510                         }
1511                         r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1512                         if (r < 0) {
1513                                 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1514                                 goto out_error;
1515                         }
1516                         /* main cgroup path should be read-only */
1517                         if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1518                                 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1519                                 if (r < 0) {
1520                                         SYSERROR("error re-mounting %s readonly", abs_path);
1521                                         goto out_error;
1522                                 }
1523                         }
1524                         /* own cgroup should be read-write */
1525                         if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1526                                 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1527                                 if (r < 0) {
1528                                         SYSERROR("error bind-mounting %s onto itself", abs_path2);
1529                                         goto out_error;
1530                                 }
1531                                 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1532                                 if (r < 0) {
1533                                         SYSERROR("error re-mounting %s readwrite", abs_path2);
1534                                         goto out_error;
1535                                 }
1536                         }
1537                 } else {
1538                         /* create path for container's cgroup */
1539                         r = mkdir_p(abs_path2, 0755);
1540                         if (r < 0 && errno != EEXIST) {
1541                                 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1542                                 goto out_error;
1543                         }
1544
1545                         /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1546                          * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1547                          * itself and then bind-mount it read-only, since we keep the tmpfs itself
1548                          * read-write (see comment below)
1549                          */
1550                         if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1551                                 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1552                                 if (r < 0) {
1553                                         SYSERROR("error bind-mounting %s onto itself", abs_path);
1554                                         goto out_error;
1555                                 }
1556                                 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1557                                 if (r < 0) {
1558                                         SYSERROR("error re-mounting %s readonly", abs_path);
1559                                         goto out_error;
1560                                 }
1561                         }
1562
1563                         free(abs_path);
1564                         abs_path = NULL;
1565
1566                         /* bind-mount container's cgroup to that directory */
1567                         abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1568                         if (!abs_path)
1569                                 goto out_error;
1570                         r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
1571                         if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
1572                                 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1573                                 goto out_error;
1574                         }
1575                         if (type == LXC_AUTO_CGROUP_RO) {
1576                                 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1577                                 if (r < 0) {
1578                                         SYSERROR("error re-mounting %s readonly", abs_path2);
1579                                         goto out_error;
1580                                 }
1581                         }
1582                 }
1583
1584                 free(abs_path);
1585                 free(abs_path2);
1586                 abs_path = NULL;
1587                 abs_path2 = NULL;
1588
1589                 /* add symlinks for every single subsystem */
1590                 if (subsystem_count > 1) {
1591                         for (i = 0; i < subsystem_count; i++) {
1592                                 abs_path = lxc_append_paths(path, parts[i]);
1593                                 if (!abs_path)
1594                                         goto out_error;
1595                                 r = symlink(dirname, abs_path);
1596                                 if (r < 0)
1597                                         WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1598                                 free(abs_path);
1599                                 abs_path = NULL;
1600                         }
1601                 }
1602                 free(dirname);
1603                 free(parts);
1604                 dirname = NULL;
1605                 parts = NULL;
1606         }
1607
1608         /* We used to remount the entire tmpfs readonly if any :ro or
1609          * :mixed mode was specified. However, Ubuntu's mountall has the
1610          * unfortunate behavior to block bootup if /sys/fs/cgroup is
1611          * mounted read-only and cannot be remounted read-write.
1612          * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1613          * these if they are not already mounted with the right options;
1614          * it contains an entry for /sys/fs/cgroup. In case it can't do
1615          * that, it prompts for the user to either manually fix it or
1616          * boot anyway. But without user input, booting of the container
1617          * hangs.)
1618          *
1619          * Instead of remounting the entire tmpfs readonly, we only
1620          * remount the paths readonly that are part of the cgroup
1621          * hierarchy.
1622          */
1623
1624         free(path);
1625
1626         return true;
1627
1628 out_error:
1629         saved_errno = errno;
1630         free(path);
1631         free(dirname);
1632         free(parts);
1633         free(abs_path);
1634         free(abs_path2);
1635         errno = saved_errno;
1636         return false;
1637 }
1638
1639 static int cgfs_nrtasks(void *hdata)
1640 {
1641         struct cgfs_data *d = hdata;
1642         struct cgroup_process_info *info;
1643         struct cgroup_mount_point *mp = NULL;
1644         char *abs_path = NULL;
1645         int ret;
1646
1647         if (!d) {
1648                 errno = ENOENT;
1649                 return -1;
1650         }
1651
1652         info = d->info;
1653         if (!info) {
1654                 errno = ENOENT;
1655                 return -1;
1656         }
1657
1658         if (info->designated_mount_point) {
1659                 mp = info->designated_mount_point;
1660         } else {
1661                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1662                 if (!mp)
1663                         return -1;
1664         }
1665
1666         abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1667         if (!abs_path)
1668                 return -1;
1669
1670         ret = cgroup_recursive_task_count(abs_path);
1671         free(abs_path);
1672         return ret;
1673 }
1674
1675 static struct cgroup_process_info *
1676 lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1677                              struct cgroup_meta_data *meta)
1678 {
1679         struct cgroup_process_info *result = NULL;
1680         FILE *proc_pid_cgroup = NULL;
1681         char *line = NULL;
1682         size_t sz = 0;
1683         int saved_errno = 0;
1684         struct cgroup_process_info **cptr = &result;
1685         struct cgroup_process_info *entry = NULL;
1686
1687         proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1688         if (!proc_pid_cgroup)
1689                 return NULL;
1690
1691         while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1692                 /* file format: hierarchy:subsystems:group */
1693                 char *colon1;
1694                 char *colon2;
1695                 char *endptr;
1696                 int hierarchy_number;
1697                 struct cgroup_hierarchy *h = NULL;
1698
1699                 if (!line[0])
1700                         continue;
1701
1702                 if (line[strlen(line) - 1] == '\n')
1703                         line[strlen(line) - 1] = '\0';
1704
1705                 colon1 = strchr(line, ':');
1706                 if (!colon1)
1707                         continue;
1708                 *colon1++ = '\0';
1709                 colon2 = strchr(colon1, ':');
1710                 if (!colon2)
1711                         continue;
1712                 *colon2++ = '\0';
1713
1714                 endptr = NULL;
1715
1716                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
1717                  * form: 0::/
1718                  * These entries need to be skipped.
1719                  */
1720                 if (!strcmp(colon1, ""))
1721                         continue;
1722
1723                 hierarchy_number = strtoul(line, &endptr, 10);
1724                 if (!endptr || *endptr)
1725                         continue;
1726
1727                 if (hierarchy_number > meta->maximum_hierarchy) {
1728                         /* we encountered a hierarchy we didn't have before,
1729                          * so probably somebody remounted some stuff in the
1730                          * mean time...
1731                          */
1732                         errno = EAGAIN;
1733                         goto out_error;
1734                 }
1735
1736                 h = meta->hierarchies[hierarchy_number];
1737                 if (!h) {
1738                         /* we encountered a hierarchy that was thought to be
1739                          * dead before, so probably somebody remounted some
1740                          * stuff in the mean time...
1741                          */
1742                         errno = EAGAIN;
1743                         goto out_error;
1744                 }
1745
1746                 /* we are told that we should ignore this hierarchy */
1747                 if (!h->used)
1748                         continue;
1749
1750                 entry = calloc(1, sizeof(struct cgroup_process_info));
1751                 if (!entry)
1752                         goto out_error;
1753
1754                 entry->meta_ref = lxc_cgroup_get_meta(meta);
1755                 entry->hierarchy = h;
1756                 entry->cgroup_path = strdup(colon2);
1757                 if (!entry->cgroup_path)
1758                         goto out_error;
1759                 prune_init_scope(entry->cgroup_path);
1760
1761                 *cptr = entry;
1762                 cptr = &entry->next;
1763                 entry = NULL;
1764         }
1765
1766         fclose(proc_pid_cgroup);
1767         free(line);
1768         return result;
1769
1770 out_error:
1771         saved_errno = errno;
1772         if (proc_pid_cgroup)
1773                 fclose(proc_pid_cgroup);
1774         lxc_cgroup_process_info_free(result);
1775         lxc_cgroup_process_info_free(entry);
1776         free(line);
1777         errno = saved_errno;
1778         return NULL;
1779 }
1780
/*
 * subsystems_from_mount_options: turn a comma separated mount option string
 * into a NULL-terminated array of subsystem names
 *
 * @mount_options: comma separated option string; it is not modified
 * @kernel_list:   NULL-terminated list of subsystems known to the kernel
 *
 * A token that already starts with "name=" or that appears in @kernel_list is
 * copied as is. Every other token is taken to be a named hierarchy and gets
 * the "name=" prefix added (e.g. "systemd" becomes "name=systemd").
 *
 * Returns a newly allocated array the caller must release, or NULL on
 * allocation failure. NULL is also returned when @mount_options contains no
 * token at all.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char *token, *str, *copy, *saveptr = NULL;
	char **result = NULL;
	size_t result_capacity = 0;
	size_t result_count = 0;
	int saved_errno;
	int r;

	/* strtok_r() writes into its input, so work on a private heap copy
	 * (the length is not bounded, which rules out the stack).
	 */
	copy = strdup(mount_options);
	if (!copy)
		return NULL;

	for (str = copy; (token = strtok_r(str, ",", &saveptr)); str = NULL) {
		r = lxc_grow_array((void ***)&result, &result_capacity, result_count + 1, 12);
		if (r < 0)
			goto out_free;
		/* keep the array terminated behind the slot filled below */
		result[result_count + 1] = NULL;
		if (strncmp(token, "name=", 5) && !lxc_string_in_array(token, (const char **)kernel_list)) {
			/* this is eg 'systemd' but the mount will be 'name=systemd' */
			size_t need = strlen(token) + sizeof("name=");

			result[result_count] = malloc(need);
			if (result[result_count])
				snprintf(result[result_count], need, "name=%s", token);
		} else {
			result[result_count] = strdup(token);
		}
		if (!result[result_count])
			goto out_free;
		result_count++;
	}

	free(copy);
	return result;

out_free:
	saved_errno = errno;
	lxc_free_array((void **)result, free);
	free(copy);
	errno = saved_errno;
	return NULL;
}
1822
1823 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
1824 {
1825         if (!mp)
1826                 return;
1827         free(mp->mount_point);
1828         free(mp->mount_prefix);
1829         free(mp);
1830 }
1831
1832 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
1833 {
1834         if (!h)
1835                 return;
1836         if (h->subsystems) {
1837                 lxc_free_array((void **)h->subsystems, free);
1838                 h->subsystems = NULL;
1839         }
1840         if (h->all_mount_points) {
1841                 free(h->all_mount_points);
1842                 h->all_mount_points = NULL;
1843         }
1844         free(h);
1845         h = NULL;
1846 }
1847
/*
 * is_valid_cgroup: check whether a string may be used as a cgroup name
 *
 * The names "." and ".." are refused. Apart from that, every character has to
 * lie in the ASCII range 33..126 and must not be '/'. SPACE (32) is refused
 * as well because it would break legacy lxc-ls.
 *
 * Returns true if @name is acceptable, false otherwise.
 */
static bool is_valid_cgroup(const char *name)
{
	const char *cur = name;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*cur) {
		if (*cur == '/' || *cur <= 32 || *cur >= 127)
			return false;
		cur++;
	}

	return true;
}
1861
1862 static int create_or_remove_cgroup(bool do_remove,
1863                 struct cgroup_mount_point *mp, const char *path, int recurse,
1864                 struct lxc_conf *conf)
1865 {
1866         int r, saved_errno = 0;
1867         char *buf = cgroup_to_absolute_path(mp, path, NULL);
1868         if (!buf)
1869                 return -1;
1870
1871         /* create or remove directory */
1872         if (do_remove) {
1873                 if (!dir_exists(buf))
1874                         return 0;
1875                 if (recurse) {
1876                         if (conf && !lxc_list_empty(&conf->id_map))
1877                                 r = userns_exec_1(conf, rmdir_wrapper, buf);
1878                         else
1879                                 r = cgroup_rmdir(buf);
1880                 } else
1881                         r = rmdir(buf);
1882         } else
1883                 r = mkdir_p(buf, 0777);
1884         saved_errno = errno;
1885         free(buf);
1886         errno = saved_errno;
1887         return r;
1888 }
1889
/*
 * create_cgroup: create the directory for cgroup @path below mount point @mp.
 * Thin wrapper around create_or_remove_cgroup() in "create" mode; returns its
 * result unchanged.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool removing = false;
	const int recursive = false;

	return create_or_remove_cgroup(removing, mp, path, recursive, NULL);
}
1894
/*
 * remove_cgroup: remove the directory of cgroup @path below mount point @mp.
 * @recurse and @conf are handed through to create_or_remove_cgroup(), which
 * is called in "remove" mode; its result is returned unchanged.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool removing = true;

	return create_or_remove_cgroup(removing, mp, path, recurse, conf);
}
1900
1901 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1902                                      const char *path, const char *suffix)
1903 {
1904         /* first we have to make sure we subtract the mount point's prefix */
1905         char *prefix = mp->mount_prefix;
1906         char *buf;
1907         ssize_t len, rv;
1908
1909         /* we want to make sure only absolute paths to cgroups are passed to us */
1910         if (path[0] != '/') {
1911                 errno = EINVAL;
1912                 return NULL;
1913         }
1914
1915         if (prefix && !strcmp(prefix, "/"))
1916                 prefix = NULL;
1917
1918         /* prefix doesn't match */
1919         if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1920                 errno = EINVAL;
1921                 return NULL;
1922         }
1923         /* if prefix is /foo and path is /foobar */
1924         if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1925                 errno = EINVAL;
1926                 return NULL;
1927         }
1928
1929         /* remove prefix from path */
1930         path += prefix ? strlen(prefix) : 0;
1931
1932         len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1933         buf = calloc(len + 1, 1);
1934         if (!buf)
1935                 return NULL;
1936         rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
1937         if (rv > len) {
1938                 free(buf);
1939                 errno = ENOMEM;
1940                 return NULL;
1941         }
1942
1943         return buf;
1944 }
1945
1946 static struct cgroup_process_info *
1947 find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
1948 {
1949         struct cgroup_process_info *info_ptr;
1950         for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1951                 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1952                 if (!h)
1953                         continue;
1954                 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1955                         return info_ptr;
1956         }
1957         errno = ENOENT;
1958         return NULL;
1959 }
1960
/*
 * do_cgroup_get: read the file @sub_filename inside the directory
 * @cgroup_path into @value (at most @len bytes).
 *
 * Returns the result of lxc_read_from_file(), or -1 if the file name could
 * not be built. errno from the read is preserved across the cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *pieces[] = { cgroup_path, sub_filename, NULL };
	char *target;
	int nread, err;

	target = lxc_string_join("/", pieces, false);
	if (!target)
		return -1;

	nread = lxc_read_from_file(target, value, len);
	err = errno;
	free(target);
	errno = err;

	return nread;
}
1982
/*
 * do_cgroup_set: write the string @value (without its terminating NUL) to
 * the file @sub_filename inside the directory @cgroup_path.
 *
 * Returns the result of lxc_write_to_file(), or -1 if the file name could
 * not be built. errno from the write is preserved across the cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *pieces[] = { cgroup_path, sub_filename, NULL };
	char *target;
	int status, err;

	target = lxc_string_join("/", pieces, false);
	if (!target)
		return -1;

	status = lxc_write_to_file(target, value, strlen(value), false);
	err = errno;
	free(target);
	errno = err;

	return status;
}
2004
2005 static int do_setup_cgroup_limits(struct cgfs_data *d,
2006                            struct lxc_list *cgroup_settings, bool do_devices)
2007 {
2008         struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2009         struct lxc_cgroup *cg;
2010         int ret = -1;
2011
2012         if (lxc_list_empty(cgroup_settings))
2013                 return 0;
2014
2015         sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2016         if (!sorted_cgroup_settings) {
2017                 return -1;
2018         }
2019
2020         lxc_list_for_each(iterator, sorted_cgroup_settings) {
2021                 cg = iterator->elem;
2022
2023                 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2024                         if (strcmp(cg->subsystem, "devices.deny") == 0 &&
2025                                         cgroup_devices_has_allow_or_deny(d, cg->value, false))
2026                                 continue;
2027                         if (strcmp(cg->subsystem, "devices.allow") == 0 &&
2028                                         cgroup_devices_has_allow_or_deny(d, cg->value, true))
2029                                 continue;
2030                         if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2031                                 if (do_devices && (errno == EACCES || errno == EPERM)) {
2032                                         WARN("Error setting %s to %s for %s",
2033                                               cg->subsystem, cg->value, d->name);
2034                                         continue;
2035                                 }
2036                                 SYSERROR("Error setting %s to %s for %s",
2037                                       cg->subsystem, cg->value, d->name);
2038                                 goto out;
2039                         }
2040                 }
2041
2042                 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
2043         }
2044
2045         ret = 0;
2046         INFO("cgroup has been setup");
2047 out:
2048         lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2049                 lxc_list_del(iterator);
2050                 free(iterator);
2051         }
2052         free(sorted_cgroup_settings);
2053         return ret;
2054 }
2055
/*
 * cgroup_devices_has_allow_or_deny: check devices.list to see whether a
 * devices.allow / devices.deny setting still needs to be written
 *
 * @d:         cgfs handle of the container
 * @v:         the value that is about to be written
 * @for_allow: true when checking a devices.allow value, false for
 *             devices.deny
 *
 * Allow: returns true if devices.list contains "a *:* rwm" or a line equal
 * to @v.
 * Deny: only the values "a" and "a *:* rwm" are examined (anything else
 * returns false); for those, returns true unless devices.list still contains
 * "a *:* rwm".
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	// not sure they ever do, but they *could*
	// right now, I'm assuming they do NOT
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;
	path = lxc_string_join("/", parts, false);
	/* the hierarchy path is only needed to build the file name */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);
		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';
		if (strcmp(line, "a *:* rwm") == 0) {
			/* everything is allowed: allow is satisfied, deny is not */
			ret = for_allow;
			break;
		} else if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2110
/*
 * cgroup_recursive_task_count: add up the line counts of all "tasks" files
 * found in @cgroup_path and in every directory below it
 *
 * Sub-directories or tasks files whose count comes back negative are left
 * out of the sum.
 *
 * Returns the sum, 0 if @cgroup_path cannot be opened as a directory, or -1
 * if a path could not be built or an entry could not be stat()ed.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	for (entry = readdir(dir); entry; entry = readdir(dir)) {
		const char *name = entry->d_name;
		const char *pieces[3];
		struct stat sb;
		char *child;
		int sub = -1;

		if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
			continue;

		pieces[0] = cgroup_path;
		pieces[1] = name;
		pieces[2] = NULL;
		child = lxc_string_join("/", pieces, false);
		if (!child) {
			closedir(dir);
			return -1;
		}

		if (stat(child, &sb) < 0) {
			closedir(dir);
			free(child);
			return -1;
		}

		/* descend into directories, count lines of "tasks" files */
		if (S_ISDIR(sb.st_mode))
			sub = cgroup_recursive_task_count(child);
		else if (strcmp(name, "tasks") == 0)
			sub = lxc_count_file_lines(child);

		if (sub >= 0)
			total += sub;

		free(child);
	}
	closedir(dir);

	return total;
}
2158
2159 static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2160                                   char *cgroup_path)
2161 {
2162         int r, saved_errno = 0;
2163         char buf[2];
2164
2165         mp->need_cpuset_init = false;
2166
2167         /* If this is the memory cgroup, we want to enforce hierarchy.
2168          * But don't fail if for some reason we can't.
2169          */
2170         if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2171                 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2172                 if (cc_path) {
2173                         r = lxc_read_from_file(cc_path, buf, 1);
2174                         if (r < 1 || buf[0] != '1') {
2175                                 r = lxc_write_to_file(cc_path, "1", 1, false);
2176                                 if (r < 0)
2177                                         SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2178                         }
2179                         free(cc_path);
2180                 }
2181         }
2182
2183         /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2184          * the base cgroup, otherwise containers will start with an empty cpuset.mems
2185          * and cpuset.cpus and then
2186          */
2187         if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2188                 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
2189                 struct stat sb;
2190
2191                 if (!cc_path)
2192                         return -1;
2193                 /* cgroup.clone_children is not available when running under
2194                  * older kernel versions; in this case, we'll initialize
2195                  * cpuset.cpus and cpuset.mems later, after the new cgroup
2196                  * was created
2197                  */
2198                 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
2199                         mp->need_cpuset_init = true;
2200                         free(cc_path);
2201                         return 0;
2202                 }
2203                 r = lxc_read_from_file(cc_path, buf, 1);
2204                 if (r == 1 && buf[0] == '1') {
2205                         free(cc_path);
2206                         return 0;
2207                 }
2208                 r = lxc_write_to_file(cc_path, "1", 1, false);
2209                 saved_errno = errno;
2210                 free(cc_path);
2211                 errno = saved_errno;
2212                 return r < 0 ? -1 : 0;
2213         }
2214         return 0;
2215 }
2216
/*
 * cgroup_read_from_file: read file @fn into @buf and NUL-terminate the data
 *
 * @fn:      file to read
 * @buf:     destination buffer
 * @bufsize: size of @buf in bytes
 *
 * If the read filled the whole buffer, the last byte is overwritten with the
 * terminator.
 *
 * Returns the value reported by lxc_read_from_file() (negative on read
 * failure), or -1 if @bufsize is 0.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: there is room left for the terminator */
	if ((size_t)nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	if (bufsize == 0) {
		/* Callers don't do this, but regression/sanity check */
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: sacrifice the last byte */
	buf[bufsize - 1] = '\0';
	return nread;
}
2237
/*
 * Give the cpuset file `name` inside cgroup `path` a value by copying it
 * from the file of the same name in the parent cgroup.
 *
 * A file that already holds a value is left untouched.  Returns true when
 * the file already had a value or the parent's value was written to it,
 * false on any failure along the way.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
                                const char *path, const char *name)
{
        char value[1024];
        char *childfile, *parentfile = NULL, *parent_cgroup = NULL;
        char *last_slash;
        int len;
        bool ok = false;

        childfile = cgroup_to_absolute_path(mp, path, name);
        if (!childfile)
                return false;

        /* don't overwrite a non-empty value in the file */
        len = cgroup_read_from_file(childfile, value, sizeof(value));
        if (len < 0)
                goto out;
        if (value[0] != '\0' && value[0] != '\n') {
                ok = true;
                goto out;
        }

        /* derive the parent cgroup by cutting off the last path component */
        parent_cgroup = strdup(path);
        if (!parent_cgroup)
                goto out;

        last_slash = strrchr(parent_cgroup, '/');
        if (!last_slash)
                goto out;
        if (last_slash == parent_cgroup)
                last_slash++; /* keep the '/' at the start */
        *last_slash = '\0';

        /* path to the same name in the parent cgroup */
        parentfile = cgroup_to_absolute_path(mp, parent_cgroup, name);
        if (!parentfile)
                goto out;

        /* copy from parent to child cgroup */
        len = cgroup_read_from_file(parentfile, value, sizeof(value));
        if (len < 0)
                goto out;
        if (len == sizeof(value)) {
                /* If anyone actually sees this error, we can address it */
                ERROR("parent cpuset value too long");
                goto out;
        }

        if (lxc_write_to_file(childfile, value, strlen(value), false) >= 0)
                ok = true;
        else
                SYSERROR("failed writing %s", childfile);

out:
        free(parent_cgroup);
        free(parentfile);
        free(childfile);
        return ok;
}
2294
2295 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2296                                   const char *path)
2297 {
2298         /* the files we have to handle here are only in cpuset hierarchies */
2299         if (!lxc_string_in_array("cpuset",
2300                                  (const char **)mp->hierarchy->subsystems))
2301                 return true;
2302
2303         if (!mp->need_cpuset_init)
2304                 return true;
2305
2306         return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2307                 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2308 }
2309
2310 static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2311 {
2312         int i;
2313
2314         if (!getenv("LXC_DEBUG_CGFS"))
2315                 return;
2316
2317         DEBUG("Cgroup information:");
2318         DEBUG("  container name: %s", d->name);
2319         if (!d->meta || !d->meta->hierarchies) {
2320                 DEBUG("  No hierarchies found.");
2321                 return;
2322         }
2323         DEBUG("  Controllers:");
2324         for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2325                 char **p;
2326                 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2327                 if (!h) {
2328                         DEBUG("     Empty hierarchy number %d.", i);
2329                         continue;
2330                 }
2331                 for (p = h->subsystems; p && *p; p++) {
2332                         DEBUG("     %2d: %s", i, *p);
2333                 }
2334         }
2335 }
2336
2337 struct cgroup_ops *cgfs_ops_init(void)
2338 {
2339         return &cgfs_ops;
2340 }
2341
2342 static void *cgfs_init(const char *name)
2343 {
2344         struct cgfs_data *d;
2345
2346         d = malloc(sizeof(*d));
2347         if (!d)
2348                 return NULL;
2349
2350         memset(d, 0, sizeof(*d));
2351         d->name = strdup(name);
2352         if (!d->name)
2353                 goto err1;
2354
2355         d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2356
2357         d->meta = lxc_cgroup_load_meta();
2358         if (!d->meta) {
2359                 ERROR("cgroupfs failed to detect cgroup metadata");
2360                 goto err2;
2361         }
2362
2363         print_cgfs_init_debuginfo(d);
2364
2365         return d;
2366
2367 err2:
2368         free(d->name);
2369 err1:
2370         free(d);
2371         return NULL;
2372 }
2373
2374 static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
2375 {
2376         struct cgfs_data *d = hdata;
2377
2378         if (!d)
2379                 return;
2380         free(d->name);
2381         lxc_cgroup_process_info_free_and_remove(d->info, conf);
2382         lxc_cgroup_put_meta(d->meta);
2383         free(d);
2384 }
2385
2386 static inline bool cgfs_create(void *hdata)
2387 {
2388         struct cgfs_data *d = hdata;
2389         struct cgroup_process_info *i;
2390         struct cgroup_meta_data *md;
2391
2392         if (!d)
2393                 return false;
2394         md = d->meta;
2395         i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
2396         if (!i)
2397                 return false;
2398         d->info = i;
2399         return true;
2400 }
2401
2402 static inline bool cgfs_enter(void *hdata, pid_t pid)
2403 {
2404         struct cgfs_data *d = hdata;
2405         struct cgroup_process_info *i;
2406         int ret;
2407
2408         if (!d)
2409                 return false;
2410         i = d->info;
2411         ret = lxc_cgroupfs_enter(i, pid, false);
2412
2413         return ret == 0;
2414 }
2415
2416 static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
2417 {
2418         struct cgfs_data *d = hdata;
2419         struct cgroup_process_info *i;
2420
2421         if (!d)
2422                 return false;
2423         i = d->info;
2424         if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2425                 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
2426                 return false;
2427         }
2428         return true;
2429 }
2430
/*
 * Look up the container's cgroup path for `subsystem` by delegating to
 * lxc_cgroup_get_hierarchy_path_data().  Returns NULL when hdata is NULL;
 * otherwise whatever the lookup returns.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
        struct cgfs_data *d = hdata;

        return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2439
2440 static bool cgfs_escape(void *hdata)
2441 {
2442         struct cgroup_meta_data *md;
2443         int i;
2444         bool ret = false;
2445
2446         md = lxc_cgroup_load_meta();
2447         if (!md)
2448                 return false;
2449
2450         for (i = 0; i <= md->maximum_hierarchy; i++) {
2451                 struct cgroup_hierarchy *h = md->hierarchies[i];
2452                 struct cgroup_mount_point *mp;
2453                 char *tasks;
2454                 FILE *f;
2455                 int written;
2456
2457                 if (!h) {
2458                         WARN("not escaping hierarchy %d", i);
2459                         continue;
2460                 }
2461
2462                 mp = lxc_cgroup_find_mount_point(h, "/", true);
2463                 if (!mp)
2464                         goto out;
2465
2466                 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2467                 if (!tasks)
2468                         goto out;
2469
2470                 f = fopen(tasks, "a");
2471                 free(tasks);
2472                 if (!f)
2473                         goto out;
2474
2475                 written = fprintf(f, "%d\n", getpid());
2476                 fclose(f);
2477                 if (written < 0) {
2478                         SYSERROR("writing tasks failed\n");
2479                         goto out;
2480                 }
2481         }
2482
2483         ret = true;
2484 out:
2485         lxc_cgroup_put_meta(md);
2486         return ret;
2487 }
2488
/*
 * Counting hierarchies is not implemented for the cgroupfs driver;
 * always returns -1.
 */
static int cgfs_num_hierarchies(void)
{
        const int not_implemented = -1;

        return not_implemented;
}
2494
/*
 * Listing the controllers of hierarchy `i` is not implemented for the
 * cgroupfs driver; always returns false and never touches `out`.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
        /* not implemented: both arguments are deliberately ignored */
        (void)i;
        (void)out;

        return false;
}
2500
/*
 * Thaw the container by setting "freezer.state" to "THAWED" in its
 * freezer cgroup.
 *
 * Returns true when do_cgroup_set() reports 0; false when hdata is NULL,
 * the absolute cgroup path cannot be determined, or the set fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
        struct cgfs_data *d = hdata;
        char *relpath, *abspath;
        bool thawed;

        if (!d)
                return false;

        relpath = lxc_cgroup_get_hierarchy_path_data("freezer", d);
        abspath = lxc_cgroup_find_abs_path("freezer", relpath, true, NULL);
        if (!abspath)
                return false;

        thawed = (do_cgroup_set(abspath, "freezer.state", "THAWED") == 0);
        free(abspath);

        return thawed;
}
2519
/*
 * Apply the cgroup settings in cgroup_conf through
 * do_setup_cgroup_limits(); with_devices is forwarded unchanged.
 * Returns true when that call reports 0, false otherwise or when hdata
 * is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
                                  bool with_devices)
{
        struct cgfs_data *d = hdata;
        int err;

        if (!d)
                return false;

        err = do_setup_cgroup_limits(d, cgroup_conf, with_devices);
        return err == 0;
}
2529
/*
 * Move the attached process `pid` into the cgroups of the container
 * identified by `name` and `lxcpath`.
 *
 * The meta data reference is dropped as soon as the container info has
 * been obtained, and the container info is freed once the enter attempt
 * has been made.  Every failure is reported with the same log message and
 * a false return value.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
        struct cgroup_meta_data *meta;
        struct cgroup_process_info *info;
        int entered;

        meta = lxc_cgroup_load_meta();
        if (!meta)
                goto fail;

        info = lxc_cgroup_get_container_info(name, lxcpath, meta);
        lxc_cgroup_put_meta(meta);
        if (!info)
                goto fail;

        entered = lxc_cgroupfs_enter(info, pid, false);
        lxc_cgroup_process_info_free(info);
        if (entered < 0)
                goto fail;

        return true;

fail:
        ERROR("could not move attached process %d to cgroup of container", pid);
        return false;
}
2557
/*
 * Argument bundle handed to chown_cgroup_wrapper() through the void *
 * parameter of userns_exec_1().
 */
struct chown_data {
        /* absolute path of the cgroup directory to be chowned */
        const char *cgroup_path;
        /* effective uid of the caller, taken before entering the namespace */
        uid_t origuid;
};
2562
2563 /*
2564  * TODO - someone should refactor this to unshare once passing all the paths
2565  * to be chowned in one go
2566  */
2567 static int chown_cgroup_wrapper(void *data)
2568 {
2569         struct chown_data *arg = data;
2570         uid_t destuid;
2571         char *fpath;
2572
2573         if (setresgid(0,0,0) < 0)
2574                 SYSERROR("Failed to setgid to 0");
2575         if (setresuid(0,0,0) < 0)
2576                 SYSERROR("Failed to setuid to 0");
2577         if (setgroups(0, NULL) < 0)
2578                 SYSERROR("Failed to clear groups");
2579         destuid = get_ns_uid(arg->origuid);
2580
2581         if (chown(arg->cgroup_path, destuid, 0) < 0)
2582                 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2583
2584         fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2585         if (!fpath)
2586                 return -1;
2587         if (chown(fpath, destuid, 0) < 0)
2588                 SYSERROR("Error chowning %s\n", fpath);
2589         free(fpath);
2590
2591         fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2592         if (!fpath)
2593                 return -1;
2594         if (chown(fpath, destuid, 0) < 0)
2595                 SYSERROR("Error chowning %s", fpath);
2596         free(fpath);
2597
2598         return 0;
2599 }
2600
2601 static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2602 {
2603         struct chown_data data;
2604         char *fpath;
2605
2606         if (!dir_exists(cgroup_path))
2607                 return true;
2608
2609         if (lxc_list_empty(&conf->id_map))
2610                 /* If there's no mapping then we don't need to chown */
2611                 return true;
2612
2613         data.cgroup_path = cgroup_path;
2614         data.origuid = geteuid();
2615
2616         /* Unpriv users can't chown it themselves, so chown from
2617          * a child namespace mapping both our own and the target uid
2618          */
2619         if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2620                 ERROR("Error requesting cgroup chown in new namespace");
2621                 return false;
2622         }
2623
2624         /*
2625          * Now chmod 775 the directory else the container cannot create cgroups.
2626          * This can't be done in the child namespace because it only group-owns
2627          * the cgroup
2628          */
2629         if (chmod(cgroup_path, 0775) < 0) {
2630                 SYSERROR("Error chmoding %s\n", cgroup_path);
2631                 return false;
2632         }
2633         fpath = lxc_append_paths(cgroup_path, "tasks");
2634         if (!fpath)
2635                 return false;
2636         if (chmod(fpath, 0664) < 0)
2637                 SYSERROR("Error chmoding %s\n", fpath);
2638         free(fpath);
2639         fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2640         if (!fpath)
2641                 return false;
2642         if (chmod(fpath, 0664) < 0)
2643                 SYSERROR("Error chmoding %s\n", fpath);
2644         free(fpath);
2645
2646         return true;
2647 }
2648
2649 static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2650 {
2651         struct cgfs_data *d = hdata;
2652         struct cgroup_process_info *info_ptr;
2653         char *cgpath;
2654         bool r = true;
2655
2656         if (!d)
2657                 return false;
2658
2659         for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2660                 if (!info_ptr->hierarchy)
2661                         continue;
2662
2663                 if (!info_ptr->designated_mount_point) {
2664                         info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2665                         if (!info_ptr->designated_mount_point) {
2666                                 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2667                                 return false;
2668                         }
2669                 }
2670
2671                 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2672                 if (!cgpath) {
2673                         SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2674                         continue;
2675                 }
2676                 r = do_cgfs_chown(cgpath, conf);
2677                 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
2678                         ERROR("Failed chowning %s\n", cgpath);
2679                         free(cgpath);
2680                         return false;
2681                 }
2682                 free(cgpath);
2683         }
2684
2685         return true;
2686 }
2687
2688 static struct cgroup_ops cgfs_ops = {
2689         .init = cgfs_init,
2690         .destroy = cgfs_destroy,
2691         .create = cgfs_create,
2692         .enter = cgfs_enter,
2693         .create_legacy = cgfs_create_legacy,
2694         .get_cgroup = cgfs_get_cgroup,
2695         .escape = cgfs_escape,
2696         .num_hierarchies = cgfs_num_hierarchies,
2697         .get_hierarchies = cgfs_get_hierarchies,
2698         .get = lxc_cgroupfs_get,
2699         .set = lxc_cgroupfs_set,
2700         .unfreeze = cgfs_unfreeze,
2701         .setup_limits = cgroupfs_setup_limits,
2702         .name = "cgroupfs",
2703         .attach = lxc_cgroupfs_attach,
2704         .chown = cgfs_chown,
2705         .mount_cgroup = cgroupfs_mount_cgroup,
2706         .nrtasks = cgfs_nrtasks,
2707         .driver = CGFS,
2708 };