src/lxc/cgroups/cgfs.c

   1 /*
   2  * lxc: linux Container library
   3  *
   4  * (C) Copyright IBM Corp. 2007, 2008
   5  *
   6  * Authors:
   7  * Daniel Lezcano <daniel.lezcano at free.fr>
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with this library; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22  */
  23 #include "config.h"
  24
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <errno.h>
  28 #include <unistd.h>
  29 #include <string.h>
  30 #include <dirent.h>
  31 #include <fcntl.h>
  32 #include <grp.h>
  33 #include <ctype.h>
  34 #include <sys/types.h>
  35 #include <sys/stat.h>
  36 #include <sys/param.h>
  37 #include <sys/inotify.h>
  38 #include <sys/mount.h>
  39 #include <netinet/in.h>
  40 #include <net/if.h>
  41
  42 #include "error.h"
  43 #include "commands.h"
  44 #include "list.h"
  45 #include "conf.h"
  46 #include "utils.h"
  47 #include "log.h"
  48 #include "cgroup.h"
  49 #include "start.h"
  50 #include "state.h"
  51 #include "storage.h"
  52
  53 #if IS_BIONIC
  54 #include <../include/lxcmntent.h>
  55 #else
  56 #include <mntent.h>
  57 #endif
  58
  59 struct cgroup_hierarchy;
  60 struct cgroup_meta_data;
  61 struct cgroup_mount_point;
  62
  63 /*
  64  * cgroup_meta_data: the metadata about the cgroup infrastructure on this
  65  *                   host
  66  */
  67 struct cgroup_meta_data {
  68         ptrdiff_t ref; /* simple refcount */
  69         struct cgroup_hierarchy **hierarchies; /* indexed by hierarchy number, slots 0..maximum_hierarchy; unused slots are NULL */
  70         struct cgroup_mount_point **mount_points; /* every cgroup mount point recorded from mountinfo; released with lxc_free_array() */
  71         int maximum_hierarchy; /* highest hierarchy number stored in hierarchies[] so far */
  72 };
  73
  74 /*
  75  * cgroup_hierarchy: describes a single cgroup hierarchy
  76  *                   (may have multiple mount points)
  77  */
  78 struct cgroup_hierarchy {
  79         int index; /* hierarchy number parsed from the first field of /proc/self/cgroup */
  80         bool used; /* false if the hierarchy should be ignored by lxc */
  81         char **subsystems; /* controller names split from the comma-separated second field */
  82         struct cgroup_mount_point *rw_absolute_mount_point; /* first writable mount found whose mount_prefix is "/" */
  83         struct cgroup_mount_point *ro_absolute_mount_point; /* first read-only mount found whose mount_prefix is "/" */
  84         struct cgroup_mount_point **all_mount_points; /* every mount point attached to this hierarchy; the objects are also listed in cgroup_meta_data.mount_points */
  85         size_t all_mount_point_capacity; /* capacity bookkeeping handed to lxc_grow_array() */
  86 };
  87
  88 /*
  89  * cgroup_mount_point: a mount point to where a hierarchy
  90  *                     is mounted to
  91  */
  92 struct cgroup_mount_point {
  93         struct cgroup_hierarchy *hierarchy; /* hierarchy this mount belongs to */
  94         char *mount_point; /* strdup() of mountinfo field 4 (the mount point path) */
  95         char *mount_prefix; /* strdup() of mountinfo field 3, or "/" for lxcfs mounts and when cgns_supported() is true */
  96         bool read_only; /* true unless "rw" appears in the per-mount options (mountinfo field 5) */
  97         bool need_cpuset_init; /* NOTE(review): not written in the part of the file reviewed here - confirm where it is set */
  98 };
  99
 100 /*
 101  * cgroup_process_info: describes the membership of a
 102  *                      process to the different cgroup
 103  *                      hierarchies
 104  *
 105  * Note this is the per-process info tracked by the cgfs_ops.
 106  * This is not used with cgmanager.
 107  */
 108 struct cgroup_process_info {
 109         struct cgroup_process_info *next; /* presumably links one entry per hierarchy into a list - confirm against lxc_cgroup_process_info_getx() */
 110         struct cgroup_meta_data *meta_ref; /* presumably a counted reference to the host metadata - TODO confirm */
 111         struct cgroup_hierarchy *hierarchy;
 112         char *cgroup_path;
 113         char *cgroup_path_sub;
 114         char **created_paths;
 115         size_t created_paths_capacity; /* NOTE(review): capacity/count pair looks like lxc_grow_array() bookkeeping for created_paths - confirm */
 116         size_t created_paths_count;
 117         struct cgroup_mount_point *designated_mount_point;
 118 };
 119
 120 struct cgfs_data { /* NOTE(review): looks like the per-container state of the cgfs backend - confirm against the cgfs_ops callbacks */
 121         char *name; /* presumably the container name - TODO confirm */
 122         const char *cgroup_pattern;
 123         struct cgroup_meta_data *meta; /* host cgroup metadata (see struct cgroup_meta_data) */
 124         struct cgroup_process_info *info; /* per-hierarchy membership info (see struct cgroup_process_info) */
 125 };
 126
 127 lxc_log_define(lxc_cgfs, lxc);
 128
 129 static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
 130 static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
 131 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
 132 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
 133 static bool is_valid_cgroup(const char *name);
 134 static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
 135 static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
 136                                 struct lxc_conf *conf);
 137 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
 138 static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
 139 static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
 140 static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
 141 static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
 142 static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
 143 static int cgroup_recursive_task_count(const char *cgroup_path);
 144 static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
 145 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
 146
 147 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
 148 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
 149 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
 150
 151 /* free process membership information */
 152 static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
 153 static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
 154                                 struct lxc_conf *conf);
 155
 156 static struct cgroup_ops cgfs_ops;
 157
 158 static int cgroup_rmdir(char *dirname)
 159 {
 160         struct dirent *direntp;
 161         int saved_errno = 0;
 162         DIR *dir;
 163         int ret, failed=0;
 164         char pathname[MAXPATHLEN];
 165
 166         dir = opendir(dirname);
 167         if (!dir) {
 168                 ERROR("Failed to open %s", dirname);
 169                 return -1;
 170         }
 171
 172         while ((direntp = readdir(dir))) {
 173                 struct stat mystat;
 174                 int rc;
 175
 176                 if (!direntp)
 177                         break;
 178
 179                 if (!strcmp(direntp->d_name, ".") ||
 180                     !strcmp(direntp->d_name, ".."))
 181                         continue;
 182
 183                 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
 184                 if (rc < 0 || rc >= MAXPATHLEN) {
 185                         ERROR("pathname too long");
 186                         failed=1;
 187                         if (!saved_errno)
 188                                 saved_errno = -ENOMEM;
 189                         continue;
 190                 }
 191                 ret = lstat(pathname, &mystat);
 192                 if (ret) {
 193                         SYSERROR("Failed to stat %s", pathname);
 194                         failed=1;
 195                         if (!saved_errno)
 196                                 saved_errno = errno;
 197                         continue;
 198                 }
 199                 if (S_ISDIR(mystat.st_mode)) {
 200                         if (cgroup_rmdir(pathname) < 0) {
 201                                 if (!saved_errno)
 202                                         saved_errno = errno;
 203                                 failed=1;
 204                         }
 205                 }
 206         }
 207
 208         if (rmdir(dirname) < 0) {
 209                 SYSERROR("Failed to delete %s", dirname);
 210                 if (!saved_errno)
 211                         saved_errno = errno;
 212                 failed=1;
 213         }
 214
 215         ret = closedir(dir);
 216         if (ret) {
 217                 SYSERROR("Failed to close directory %s", dirname);
 218                 if (!saved_errno)
 219                         saved_errno = errno;
 220                 failed=1;
 221         }
 222
 223         errno = saved_errno;
 224         return failed ? -1 : 0;
 225 }
 226
 227 static int rmdir_wrapper(void *data)
 228 {
 229         char *path = data;
 230
 231         if (setresgid(0,0,0) < 0)
 232                 SYSERROR("Failed to setgid to 0");
 233         if (setresuid(0,0,0) < 0)
 234                 SYSERROR("Failed to setuid to 0");
 235         if (setgroups(0, NULL) < 0)
 236                 SYSERROR("Failed to clear groups");
 237
 238         return cgroup_rmdir(path);
 239 }
 240
 241 static struct cgroup_meta_data *lxc_cgroup_load_meta()
 242 {
 243         const char *cgroup_use = NULL;
 244         char **cgroup_use_list = NULL;
 245         struct cgroup_meta_data *md = NULL;
 246         int saved_errno;
 247
 248         errno = 0;
 249         cgroup_use = lxc_global_config_value("lxc.cgroup.use");
 250         if (!cgroup_use && errno != 0)
 251                 return NULL;
 252         if (cgroup_use) {
 253                 cgroup_use_list = lxc_string_split_and_trim(cgroup_use, ',');
 254                 if (!cgroup_use_list)
 255                         return NULL;
 256         }
 257
 258         md = lxc_cgroup_load_meta2((const char **)cgroup_use_list);
 259         saved_errno = errno;
 260         lxc_free_array((void **)cgroup_use_list, free);
 261         errno = saved_errno;
 262         return md;
 263 }
 264
 265 /* Step 1: determine all kernel subsystems */
 266 static bool find_cgroup_subsystems(char ***kernel_subsystems)
 267 {
 268         FILE *proc_cgroups;
 269         bool bret = false;
 270         char *line = NULL;
 271         size_t sz = 0;
 272         size_t kernel_subsystems_count = 0;
 273         size_t kernel_subsystems_capacity = 0;
 274         int r;
 275
 276         proc_cgroups = fopen_cloexec("/proc/cgroups", "r");
 277         if (!proc_cgroups)
 278                 return false;
 279
 280         while (getline(&line, &sz, proc_cgroups) != -1) {
 281                 char *tab1;
 282                 char *tab2;
 283                 int hierarchy_number;
 284
 285                 if (line[0] == '#')
 286                         continue;
 287                 if (!line[0])
 288                         continue;
 289
 290                 tab1 = strchr(line, '\t');
 291                 if (!tab1)
 292                         continue;
 293                 *tab1++ = '\0';
 294                 tab2 = strchr(tab1, '\t');
 295                 if (!tab2)
 296                         continue;
 297                 *tab2 = '\0';
 298
 299                 tab2 = NULL;
 300                 hierarchy_number = strtoul(tab1, &tab2, 10);
 301                 if (!tab2 || *tab2)
 302                         continue;
 303                 (void)hierarchy_number;
 304
 305                 r = lxc_grow_array((void ***)kernel_subsystems, &kernel_subsystems_capacity, kernel_subsystems_count + 1, 12);
 306                 if (r < 0)
 307                         goto out;
 308                 (*kernel_subsystems)[kernel_subsystems_count] = strdup(line);
 309                 if (!(*kernel_subsystems)[kernel_subsystems_count])
 310                         goto out;
 311                 kernel_subsystems_count++;
 312         }
 313         bret = true;
 314
 315 out:
 316         fclose(proc_cgroups);
 317         free(line);
 318         return bret;
 319 }
 320
 321 /* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
 322  *         since mount points don't specify hierarchy number and
 323  *         /proc/cgroups does not contain named hierarchies
 324  */
 325 static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
 326         bool all_kernel_subsystems, bool all_named_subsystems,
 327         const char **subsystem_whitelist)
 328 {
 329         FILE *proc_self_cgroup;
 330         char *line = NULL;
 331         size_t sz = 0;
 332         int r;
 333         bool bret = false;
 334         size_t hierarchy_capacity = 0;
 335
 336         proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
 337         /* if for some reason (because of setns() and pid namespace for example),
 338          * /proc/self is not valid, we try /proc/1/cgroup... */
 339         if (!proc_self_cgroup)
 340                 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
 341         if (!proc_self_cgroup)
 342                 return false;
 343
 344         while (getline(&line, &sz, proc_self_cgroup) != -1) {
 345                 /* file format: hierarchy:subsystems:group,
 346                  * we only extract hierarchy and subsystems
 347                  * here */
 348                 char *colon1;
 349                 char *colon2;
 350                 int hierarchy_number;
 351                 struct cgroup_hierarchy *h = NULL;
 352                 char **p;
 353
 354                 if (!line[0])
 355                         continue;
 356
 357                 colon1 = strchr(line, ':');
 358                 if (!colon1)
 359                         continue;
 360                 *colon1++ = '\0';
 361                 colon2 = strchr(colon1, ':');
 362                 if (!colon2)
 363                         continue;
 364                 *colon2 = '\0';
 365
 366                 colon2 = NULL;
 367
 368                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
 369                  * form: 0::/
 370                  * These entries need to be skipped.
 371                  */
 372                 if (!strcmp(colon1, ""))
 373                         continue;
 374
 375                 hierarchy_number = strtoul(line, &colon2, 10);
 376                 if (!colon2 || *colon2)
 377                         continue;
 378
 379                 if (hierarchy_number > meta_data->maximum_hierarchy) {
 380                         /* lxc_grow_array will never shrink, so even if we find a lower
 381                         * hierarchy number here, the array will never be smaller
 382                         */
 383                         r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
 384                         if (r < 0)
 385                                 goto out;
 386
 387                         meta_data->maximum_hierarchy = hierarchy_number;
 388                 }
 389
 390                 /* this shouldn't happen, we had this already */
 391                 if (meta_data->hierarchies[hierarchy_number])
 392                         goto out;
 393
 394                 h = calloc(1, sizeof(struct cgroup_hierarchy));
 395                 if (!h)
 396                         goto out;
 397
 398                 meta_data->hierarchies[hierarchy_number] = h;
 399
 400                 h->index = hierarchy_number;
 401                 h->subsystems = lxc_string_split_and_trim(colon1, ',');
 402                 if (!h->subsystems)
 403                         goto out;
 404                 /* see if this hierarchy should be considered */
 405                 if (!all_kernel_subsystems || !all_named_subsystems) {
 406                         for (p = h->subsystems; *p; p++) {
 407                                 if (!strncmp(*p, "name=", 5)) {
 408                                         if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
 409                                                 h->used = true;
 410                                                 break;
 411                                         }
 412                                 } else {
 413                                         if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
 414                                                 h->used = true;
 415                                                 break;
 416                                         }
 417                                 }
 418                         }
 419                 } else {
 420                         /* we want all hierarchy anyway */
 421                         h->used = true;
 422                 }
 423         }
 424         bret = true;
 425
 426 out:
 427         fclose(proc_self_cgroup);
 428         free(line);
 429         return bret;
 430 }
 431
 432 /* Step 3: determine all mount points of each hierarchy */
 433 static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
 434 {
 435         bool bret = false;
 436         FILE *proc_self_mountinfo;
 437         char *line = NULL;
 438         size_t sz = 0;
 439         char **tokens = NULL;
 440         size_t mount_point_count = 0;
 441         size_t mount_point_capacity = 0;
 442         size_t token_capacity = 0;
 443         int r;
 444         bool is_cgns = cgns_supported();
 445
 446         proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
 447         /* if for some reason (because of setns() and pid namespace for example),
 448          * /proc/self is not valid, we try /proc/1/cgroup... */
 449         if (!proc_self_mountinfo)
 450                 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
 451         if (!proc_self_mountinfo)
 452                 return false;
 453
 454         while (getline(&line, &sz, proc_self_mountinfo) != -1) {
 455                 char *token, *line_tok, *saveptr = NULL;
 456                 size_t i, j, k;
 457                 struct cgroup_mount_point *mount_point;
 458                 struct cgroup_hierarchy *h;
 459                 char **subsystems;
 460                 bool is_lxcfs = false;
 461
 462                 if (line[0] && line[strlen(line) - 1] == '\n')
 463                         line[strlen(line) - 1] = '\0';
 464
 465                 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
 466                         r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
 467                         if (r < 0)
 468                                 goto out;
 469                         tokens[i++] = token;
 470                 }
 471
 472                 /* layout of /proc/self/mountinfo:
 473                  *      0: id
 474                  *      1: parent id
 475                  *      2: device major:minor
 476                  *      3: mount prefix
 477                  *      4: mount point
 478                  *      5: per-mount options
 479                  *    [optional X]: additional data
 480                  *    X+7: "-"
 481                  *    X+8: type
 482                  *    X+9: source
 483                  *    X+10: per-superblock options
 484                  */
 485                 for (j = 6; j < i && tokens[j]; j++)
 486                         if (!strcmp(tokens[j], "-"))
 487                                 break;
 488
 489                 /* could not find separator */
 490                 if (j >= i || !tokens[j])
 491                         continue;
 492                 /* there should be exactly three fields after
 493                  * the separator
 494                  */
 495                 if (i != j + 4)
 496                         continue;
 497
 498                 /* not a cgroup filesystem */
 499                 if (strcmp(tokens[j + 1], "cgroup") != 0) {
 500                         if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
 501                                 continue;
 502                         if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
 503                                 continue;
 504                         is_lxcfs = true;
 505                         char *curtok = tokens[4] + 15;
 506                         subsystems = subsystems_from_mount_options(curtok,
 507                                                          kernel_subsystems);
 508                 } else
 509                         subsystems = subsystems_from_mount_options(tokens[j + 3],
 510                                                          kernel_subsystems);
 511                 if (!subsystems)
 512                         goto out;
 513
 514                 h = NULL;
 515                 for (k = 0; k <= meta_data->maximum_hierarchy; k++) {
 516                         if (meta_data->hierarchies[k] &&
 517                             meta_data->hierarchies[k]->subsystems[0] &&
 518                             lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
 519                                 /* TODO: we could also check if the lists really match completely,
 520                                  *       just to have an additional sanity check */
 521                                 h = meta_data->hierarchies[k];
 522                                 break;
 523                         }
 524                 }
 525                 lxc_free_array((void **)subsystems, free);
 526
 527                 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
 528                 if (r < 0)
 529                         goto out;
 530
 531                 /* create mount point object */
 532                 mount_point = calloc(1, sizeof(*mount_point));
 533                 if (!mount_point)
 534                         goto out;
 535
 536                 meta_data->mount_points[mount_point_count++] = mount_point;
 537
 538                 mount_point->hierarchy = h;
 539                 if (is_lxcfs || is_cgns)
 540                         mount_point->mount_prefix = strdup("/");
 541                 else
 542                         mount_point->mount_prefix = strdup(tokens[3]);
 543                 mount_point->mount_point = strdup(tokens[4]);
 544                 if (!mount_point->mount_point || !mount_point->mount_prefix)
 545                         goto out;
 546                 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
 547
 548                 if (!strcmp(mount_point->mount_prefix, "/")) {
 549                         if (mount_point->read_only) {
 550                                 if (!h->ro_absolute_mount_point)
 551                                         h->ro_absolute_mount_point = mount_point;
 552                         } else {
 553                                 if (!h->rw_absolute_mount_point)
 554                                         h->rw_absolute_mount_point = mount_point;
 555                         }
 556                 }
 557
 558                 if (h)
 559                         k = lxc_array_len((void **)h->all_mount_points);
 560                 else
 561                         k = 0;
 562                 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
 563                 if (r < 0)
 564                         goto out;
 565                 h->all_mount_points[k] = mount_point;
 566         }
 567         bret = true;
 568
 569 out:
 570         fclose(proc_self_mountinfo);
 571         free(tokens);
 572         free(line);
 573         return bret;
 574 }
 575
 576 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
 577 {
 578         bool all_kernel_subsystems = true;
 579         bool all_named_subsystems = false;
 580         struct cgroup_meta_data *meta_data = NULL;
 581         char **kernel_subsystems = NULL;
 582         int saved_errno = 0;
 583
 584         /* if the subsystem whitelist is not specified, include all
 585          * hierarchies that contain kernel subsystems by default but
 586          * no hierarchies that only contain named subsystems
 587          *
 588          * if it is specified, the specifier @all will select all
 589          * hierarchies, @kernel will select all hierarchies with
 590          * kernel subsystems and @named will select all named
 591          * hierarchies
 592          */
 593         all_kernel_subsystems = subsystem_whitelist ?
 594                 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
 595                 true;
 596         all_named_subsystems = subsystem_whitelist ?
 597                 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
 598                 true;
 599
 600         meta_data = calloc(1, sizeof(struct cgroup_meta_data));
 601         if (!meta_data)
 602                 return NULL;
 603         meta_data->ref = 1;
 604
 605         if (!find_cgroup_subsystems(&kernel_subsystems))
 606                 goto out_error;
 607
 608         if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
 609                                 all_named_subsystems, subsystem_whitelist))
 610                 goto out_error;
 611
 612         if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
 613                 goto out_error;
 614
 615         /* oops, we couldn't find anything */
 616         if (!meta_data->hierarchies || !meta_data->mount_points) {
 617                 errno = EINVAL;
 618                 goto out_error;
 619         }
 620
 621         lxc_free_array((void **)kernel_subsystems, free);
 622         return meta_data;
 623
 624 out_error:
 625         saved_errno = errno;
 626         lxc_free_array((void **)kernel_subsystems, free);
 627         lxc_cgroup_put_meta(meta_data);
 628         errno = saved_errno;
 629         return NULL;
 630 }
 631
 632 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
 633 {
 634         meta_data->ref++;
 635         return meta_data;
 636 }
 637
 638 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
 639 {
 640         size_t i;
 641         if (!meta_data)
 642                 return NULL;
 643         if (--meta_data->ref > 0)
 644                 return meta_data;
 645         lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
 646         if (meta_data->hierarchies)
 647                 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
 648                         if (meta_data->hierarchies[i])
 649                                 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
 650         free(meta_data->hierarchies);
 651         free(meta_data);
 652         return NULL;
 653 }
 654
 655 static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
 656 {
 657         size_t i;
 658         for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
 659                 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
 660                 if (!h)
 661                         continue;
 662                 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
 663                         return h;
 664         }
 665         return NULL;
 666 }
 667
 668 static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
 669 {
 670         return mp && access(mp->mount_point, F_OK) == 0;
 671 }
 672
 673 static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
 674 {
 675         struct cgroup_mount_point **mps;
 676         struct cgroup_mount_point *current_result = NULL;
 677         ssize_t quality = -1;
 678
 679         /* trivial case */
 680         if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
 681                 return hierarchy->rw_absolute_mount_point;
 682         if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
 683                 return hierarchy->ro_absolute_mount_point;
 684
 685         for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
 686                 struct cgroup_mount_point *mp = *mps;
 687                 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
 688
 689                 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
 690                         prefix_len = 0;
 691
 692                 if (!mountpoint_is_accessible(mp))
 693                         continue;
 694
 695                 if (should_be_writable && mp->read_only)
 696                         continue;
 697
 698                 if (!prefix_len ||
 699                     (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
 700                      (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
 701                         /* search for the best quality match, i.e. the match with the
 702                          * shortest prefix where this group is still contained
 703                          */
 704                         if (quality == -1 || prefix_len < quality) {
 705                                 current_result = mp;
 706                                 quality = prefix_len;
 707                         }
 708                 }
 709         }
 710
 711         if (!current_result)
 712                 errno = ENOENT;
 713         return current_result;
 714 }
 715
 716 static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
 717 {
 718         struct cgroup_meta_data *meta_data;
 719         struct cgroup_hierarchy *h;
 720         struct cgroup_mount_point *mp;
 721         char *result;
 722         int saved_errno;
 723
 724         meta_data = lxc_cgroup_load_meta();
 725         if (!meta_data)
 726                 return NULL;
 727
 728         h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
 729         if (!h)
 730                 goto out_error;
 731
 732         mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
 733         if (!mp)
 734                 goto out_error;
 735
 736         result = cgroup_to_absolute_path(mp, group, suffix);
 737         if (!result)
 738                 goto out_error;
 739
 740         lxc_cgroup_put_meta(meta_data);
 741         return result;
 742
 743 out_error:
 744         saved_errno = errno;
 745         lxc_cgroup_put_meta(meta_data);
 746         errno = saved_errno;
 747         return NULL;
 748 }
 749
 750 static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
 751 {
 752         char pid_buf[32];
 753         snprintf(pid_buf, 32, "/proc/%lu/cgroup", (unsigned long)pid);
 754         return lxc_cgroup_process_info_getx(pid_buf, meta);
 755 }
 756
 757 static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
 758 {
 759         return lxc_cgroup_process_info_get(1, meta);
 760 }
 761
 762 static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
 763 {
 764         struct cgroup_process_info *i;
 765         i = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
 766         if (!i)
 767                 i = lxc_cgroup_process_info_get(lxc_raw_getpid(), meta);
 768         return i;
 769 }
 770
 771 /*
 772  * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 773  * is already in a new cgroup named after the pid.  'mnt' is passed in as
 774  * the full current cgroup.  Say that is /sys/fs/cgroup/lxc/2975 and the container
 775  * name is c1. .  We want to rename the cgroup directory to /sys/fs/cgroup/lxc/c1,
 776  * and return the string /sys/fs/cgroup/lxc/c1.
 777  */
 778 static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
 779 {
 780         char *dir, *fulloldpath;
 781         char *newname, *fullnewpath;
 782         int len, newlen, ret;
 783
 784         /*
 785          * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
 786          * name is c1,
 787          * dir: /ab
 788          * fulloldpath = /cgroup/ab/2375
 789          * fullnewpath = /cgroup/ab/c1
 790          * newname = /ab/c1
 791          */
 792         dir = alloca(strlen(oldname) + 1);
 793         strcpy(dir, oldname);
 794
 795         len = strlen(oldname) + strlen(mountpath) + 22;
 796         fulloldpath = alloca(len);
 797         ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
 798         if (ret < 0 || ret >= len)
 799                 return NULL;
 800
 801         len = strlen(dir) + strlen(name) + 2;
 802         newname = malloc(len);
 803         if (!newname) {
 804                 SYSERROR("Out of memory");
 805                 return NULL;
 806         }
 807         ret = snprintf(newname, len, "%s/%s", dir, name);
 808         if (ret < 0 || ret >= len) {
 809                 free(newname);
 810                 return NULL;
 811         }
 812
 813         newlen = strlen(mountpath) + len + 2;
 814         fullnewpath = alloca(newlen);
 815         ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
 816         if (ret < 0 || ret >= newlen) {
 817                 free(newname);
 818                 return NULL;
 819         }
 820
 821         if (access(fullnewpath, F_OK) == 0) {
 822                 if (rmdir(fullnewpath) != 0) {
 823                         SYSERROR("container cgroup %s already exists.", fullnewpath);
 824                         free(newname);
 825                         return NULL;
 826                 }
 827         }
 828         if (rename(fulloldpath, fullnewpath)) {
 829                 SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
 830                 free(newname);
 831                 return NULL;
 832         }
 833
 834         DEBUG("'%s' renamed to '%s'", oldname, newname);
 835
 836         return newname;
 837 }
 838
 839 static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
 840 {
 841         char **p;
 842
 843         for (p = h->subsystems; *p; p++) {
 844                 if (is_crucial_cgroup_subsystem(*p))
 845                         return true;
 846         }
 847         return false;
 848 }
 849
 850 /* create a new cgroup */
 851 static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
 852 {
 853         char **cgroup_path_components = NULL;
 854         char **p = NULL;
 855         char *path_so_far = NULL;
 856         char **new_cgroup_paths = NULL;
 857         char **new_cgroup_paths_sub = NULL;
 858         struct cgroup_mount_point *mp;
 859         struct cgroup_hierarchy *h;
 860         struct cgroup_process_info *base_info = NULL;
 861         struct cgroup_process_info *info_ptr;
 862         int saved_errno;
 863         int r;
 864         unsigned suffix = 0;
 865         bool had_sub_pattern = false;
 866         size_t i;
 867
 868         if (!is_valid_cgroup(name)) {
 869                 ERROR("Invalid cgroup name: '%s'", name);
 870                 errno = EINVAL;
 871                 return NULL;
 872         }
 873
 874         if (!strstr(path_pattern, "%n")) {
 875                 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
 876                 errno = EINVAL;
 877                 return NULL;
 878         }
 879
 880         /* we will modify the result of this operation directly,
 881          * so we don't have to copy the data structure
 882          */
 883         base_info = (path_pattern[0] == '/') ?
 884                 lxc_cgroup_process_info_get_init(meta_data) :
 885                 lxc_cgroup_process_info_get_self(meta_data);
 886         if (!base_info)
 887                 return NULL;
 888
 889         new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
 890         if (!new_cgroup_paths)
 891                 goto out_initial_error;
 892
 893         new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
 894         if (!new_cgroup_paths_sub)
 895                 goto out_initial_error;
 896
 897         /* find mount points we can use */
 898         for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
 899                 h = info_ptr->hierarchy;
 900                 if (!h)
 901                         continue;
 902                 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
 903                 if (!mp) {
 904                         ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
 905                         goto out_initial_error;
 906                 }
 907                 info_ptr->designated_mount_point = mp;
 908
 909                 if (lxc_string_in_array("ns", (const char **)h->subsystems))
 910                         continue;
 911                 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
 912                         ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
 913                         goto out_initial_error;
 914                 }
 915         }
 916
 917         /* normalize the path */
 918         cgroup_path_components = lxc_normalize_path(path_pattern);
 919         if (!cgroup_path_components)
 920                 goto out_initial_error;
 921
 922         /* go through the path components to see if we can create them */
 923         for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
 924                 /* we only want to create the same component with -1, -2, etc.
 925                  * if the component contains the container name itself, otherwise
 926                  * it's not an error if it already exists
 927                  */
 928                 char *p_eff = *p ? *p : (char *)sub_pattern;
 929                 bool contains_name = strstr(p_eff, "%n");
 930                 char *current_component = NULL;
 931                 char *current_subpath = NULL;
 932                 char *current_entire_path = NULL;
 933                 char *parts[3];
 934                 size_t j = 0;
 935                 i = 0;
 936
 937                 /* if we are processing the subpattern, we want to make sure
 938                  * loop is ended the next time around
 939                  */
 940                 if (!*p) {
 941                         had_sub_pattern = true;
 942                         p--;
 943                 }
 944
 945                 goto find_name_on_this_level;
 946
 947         cleanup_name_on_this_level:
 948                 /* This is reached if we found a name clash.
 949                  * In that case, remove the cgroup from all previous hierarchies
 950                  */
 951                 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
 952                         if (info_ptr->created_paths_count < 1)
 953                                 continue;
 954                         r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
 955                         if (r < 0)
 956                                 WARN("could not clean up cgroup we created when trying to create container");
 957                         free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
 958                         info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
 959                 }
 960                 if (current_component != current_subpath)
 961                         free(current_subpath);
 962                 if (current_component != p_eff)
 963                         free(current_component);
 964                 current_component = current_subpath = NULL;
 965                 /* try again with another suffix */
 966                 ++suffix;
 967
 968         find_name_on_this_level:
 969                 /* determine name of the path component we should create */
 970                 if (contains_name && suffix > 0) {
 971                         char *buf = calloc(strlen(name) + 32, 1);
 972                         if (!buf)
 973                                 goto out_initial_error;
 974                         snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
 975                         current_component = lxc_string_replace("%n", buf, p_eff);
 976                         free(buf);
 977                 } else {
 978                         current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
 979                 }
 980                 parts[0] = path_so_far;
 981                 parts[1] = current_component;
 982                 parts[2] = NULL;
 983                 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
 984
 985                 /* Now go through each hierarchy and try to create the
 986                  * corresponding cgroup
 987                  */
 988                 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
 989                         char *parts2[3];
 990
 991                         if (!info_ptr->hierarchy)
 992                                 continue;
 993
 994                         if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
 995                                 continue;
 996                         current_entire_path = NULL;
 997
 998                         parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
 999                         parts2[1] = current_subpath;
1000                         parts2[2] = NULL;
1001                         current_entire_path = lxc_string_join("/", (const char **)parts2, false);
1002
1003                         if (!*p) {
1004                                 /* we are processing the subpath, so only update that one */
1005                                 free(new_cgroup_paths_sub[i]);
1006                                 new_cgroup_paths_sub[i] = strdup(current_entire_path);
1007                                 if (!new_cgroup_paths_sub[i])
1008                                         goto cleanup_from_error;
1009                         } else {
1010                                 /* remember which path was used on this controller */
1011                                 free(new_cgroup_paths[i]);
1012                                 new_cgroup_paths[i] = strdup(current_entire_path);
1013                                 if (!new_cgroup_paths[i])
1014                                         goto cleanup_from_error;
1015                         }
1016
1017                         r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1018                         if (r < 0 && errno == EEXIST && contains_name) {
1019                                 /* name clash => try new name with new suffix */
1020                                 free(current_entire_path);
1021                                 current_entire_path = NULL;
1022                                 goto cleanup_name_on_this_level;
1023                         } else if (r < 0 && errno != EEXIST) {
1024                                 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1025                                         SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1026                                         goto cleanup_from_error;
1027                                 }
1028                                 goto skip;
1029                         } else if (r == 0) {
1030                                 /* successfully created */
1031                                 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1032                                 if (r < 0)
1033                                         goto cleanup_from_error;
1034                                 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
1035                                         ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1036                                         goto cleanup_from_error;
1037                                 }
1038                                 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1039                         } else {
1040                                 /* if we didn't create the cgroup, then we have to make sure that
1041                                  * further cgroups will be created properly
1042                                  */
1043                                 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
1044                                         ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
1045                                         goto cleanup_from_error;
1046                                 }
1047                                 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1048                                         ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1049                                         goto cleanup_from_error;
1050                                 }
1051
1052 skip:
1053                                 /* already existed but path component of pattern didn't contain '%n',
1054                                  * so this is not an error; but then we don't need current_entire_path
1055                                  * anymore...
1056                                  */
1057                                 free(current_entire_path);
1058                                 current_entire_path = NULL;
1059                         }
1060                 }
1061
1062                 /* save path so far */
1063                 free(path_so_far);
1064                 path_so_far = strdup(current_subpath);
1065                 if (!path_so_far)
1066                         goto cleanup_from_error;
1067
1068                 /* cleanup */
1069                 if (current_component != current_subpath)
1070                         free(current_subpath);
1071                 if (current_component != p_eff)
1072                         free(current_component);
1073                 current_component = current_subpath = NULL;
1074                 continue;
1075
1076         cleanup_from_error:
1077                 /* called if an error occurred in the loop, so we
1078                  * do some additional cleanup here
1079                  */
1080                 saved_errno = errno;
1081                 if (current_component != current_subpath)
1082                         free(current_subpath);
1083                 if (current_component != p_eff)
1084                         free(current_component);
1085                 free(current_entire_path);
1086                 errno = saved_errno;
1087                 goto out_initial_error;
1088         }
1089
1090         /* we're done, now update the paths */
1091         for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
1092                 if (!info_ptr->hierarchy)
1093                         continue;
1094                 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1095                  * will take care of it
1096                  * Since we do a continue in above loop, new_cgroup_paths[i] is
1097                  * unset anyway, as is new_cgroup_paths_sub[i]
1098                  */
1099                 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1100                         continue;
1101                 free(info_ptr->cgroup_path);
1102                 info_ptr->cgroup_path = new_cgroup_paths[i];
1103                 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
1104         }
1105         /* don't use lxc_free_array since we used the array members
1106          * to store them in our result...
1107          */
1108         free(new_cgroup_paths);
1109         free(new_cgroup_paths_sub);
1110         free(path_so_far);
1111         lxc_free_array((void **)cgroup_path_components, free);
1112         return base_info;
1113
1114 out_initial_error:
1115         saved_errno = errno;
1116         free(path_so_far);
1117         lxc_cgroup_process_info_free_and_remove(base_info, NULL);
1118         lxc_free_array((void **)new_cgroup_paths, free);
1119         lxc_free_array((void **)new_cgroup_paths_sub, free);
1120         lxc_free_array((void **)cgroup_path_components, free);
1121         errno = saved_errno;
1122         return NULL;
1123 }
1124
1125 static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
1126 {
1127         struct cgroup_process_info *info_ptr;
1128         int r;
1129
1130         for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1131                 if (!info_ptr->hierarchy)
1132                         continue;
1133
1134                 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1135                         continue;
1136                 /*
1137                  * For any path which has ns cgroup mounted, handler->pid is already
1138                  * moved into a container called '%d % (handler->pid)'.  Rename it to
1139                  * the cgroup name and record that.
1140                  */
1141                 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1142                                 info_ptr->cgroup_path, pid, name);
1143                 if (!tmp)
1144                         return -1;
1145                 free(info_ptr->cgroup_path);
1146                 info_ptr->cgroup_path = tmp;
1147                 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1148                 if (r < 0)
1149                         return -1;
1150                 tmp = strdup(tmp);
1151                 if (!tmp)
1152                         return -1;
1153                 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1154         }
1155         return 0;
1156 }
1157
1158 /* get the cgroup membership of a given container */
1159 static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
1160 {
1161         struct cgroup_process_info *result = NULL;
1162         int saved_errno = 0;
1163         size_t i;
1164         struct cgroup_process_info **cptr = &result;
1165         struct cgroup_process_info *entry = NULL;
1166         char *path = NULL;
1167
1168         for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1169                 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1170                 if (!h || !h->used)
1171                         continue;
1172
1173                 /* use the command interface to look for the cgroup */
1174                 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
1175                 if (!path) {
1176                         h->used = false;
1177                         continue;
1178                 }
1179
1180                 entry = calloc(1, sizeof(struct cgroup_process_info));
1181                 if (!entry)
1182                         goto out_error;
1183                 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1184                 entry->hierarchy = h;
1185                 entry->cgroup_path = path;
1186                 path = NULL;
1187
1188                 /* it is not an error if we don't find anything here,
1189                  * it is up to the caller to decide what to do in that
1190                  * case */
1191                 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1192
1193                 *cptr = entry;
1194                 cptr = &entry->next;
1195                 entry = NULL;
1196         }
1197
1198         return result;
1199 out_error:
1200         saved_errno = errno;
1201         free(path);
1202         lxc_cgroup_process_info_free(result);
1203         lxc_cgroup_process_info_free(entry);
1204         errno = saved_errno;
1205         return NULL;
1206 }
1207
1208 /* move a processs to the cgroups specified by the membership */
1209 static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
1210 {
1211         char pid_buf[32];
1212         char *cgroup_tasks_fn;
1213         int r;
1214         struct cgroup_process_info *info_ptr;
1215
1216         snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1217         for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1218                 if (!info_ptr->hierarchy)
1219                         continue;
1220
1221                 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1222                         info_ptr->cgroup_path_sub :
1223                         info_ptr->cgroup_path;
1224
1225                 if (!info_ptr->designated_mount_point) {
1226                         info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1227                         if (!info_ptr->designated_mount_point) {
1228                                 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1229                                 return -1;
1230                         }
1231                 }
1232
1233                 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1234                 if (!cgroup_tasks_fn) {
1235                         SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1236                         return -1;
1237                 }
1238
1239                 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
1240                 free(cgroup_tasks_fn);
1241                 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
1242                         SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1243                         return -1;
1244                 }
1245         }
1246
1247         return 0;
1248 }
1249
1250 /* free process membership information */
1251 void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
1252 {
1253         struct cgroup_process_info *next;
1254         if (!info)
1255                 return;
1256         next = info->next;
1257         lxc_cgroup_put_meta(info->meta_ref);
1258         free(info->cgroup_path);
1259         free(info->cgroup_path_sub);
1260         lxc_free_array((void **)info->created_paths, free);
1261         free(info);
1262         lxc_cgroup_process_info_free(next);
1263 }
1264
1265 /* free process membership information and remove cgroups that were created */
1266 void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
1267 {
1268         struct cgroup_process_info *next;
1269         char **pp;
1270         if (!info)
1271                 return;
1272         next = info->next;
1273         {
1274                 struct cgroup_mount_point *mp = info->designated_mount_point;
1275                 if (!mp)
1276                         mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1277                 if (mp)
1278                         /* ignore return value here, perhaps we created the
1279                          * '/lxc' cgroup in this container but another container
1280                          * is still running (for example)
1281                          */
1282                         (void)remove_cgroup(mp, info->cgroup_path, true, conf);
1283         }
1284         for (pp = info->created_paths; pp && *pp; pp++);
1285         for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
1286                 free(*pp);
1287         }
1288         free(info->created_paths);
1289         lxc_cgroup_put_meta(info->meta_ref);
1290         free(info->cgroup_path);
1291         free(info->cgroup_path_sub);
1292         free(info);
1293         lxc_cgroup_process_info_free_and_remove(next, conf);
1294 }
1295
1296 static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
1297 {
1298         struct cgroup_process_info *info = d->info;
1299         info = find_info_for_subsystem(info, subsystem);
1300         if (!info)
1301                 return NULL;
1302         prune_init_scope(info->cgroup_path);
1303         return info->cgroup_path;
1304 }
1305
1306 static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
1307 {
1308         struct cgroup_process_info *info = d->info;
1309         struct cgroup_mount_point *mp = NULL;
1310
1311         info = find_info_for_subsystem(info, subsystem);
1312         if (!info)
1313                 return NULL;
1314         if (info->designated_mount_point) {
1315                 mp = info->designated_mount_point;
1316         } else {
1317                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1318                 if (!mp)
1319                         return NULL;
1320         }
1321         return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1322 }
1323
1324 static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
1325 {
1326         struct cgroup_meta_data *meta;
1327         struct cgroup_process_info *base_info, *info;
1328         struct cgroup_mount_point *mp;
1329         char *result = NULL;
1330
1331         meta = lxc_cgroup_load_meta();
1332         if (!meta)
1333                 return NULL;
1334         base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1335         if (!base_info)
1336                 goto out1;
1337         info = find_info_for_subsystem(base_info, subsystem);
1338         if (!info)
1339                 goto out2;
1340         if (info->designated_mount_point) {
1341                 mp = info->designated_mount_point;
1342         } else {
1343                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1344                 if (!mp)
1345                         goto out3;
1346         }
1347         result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1348 out3:
1349 out2:
1350         lxc_cgroup_process_info_free(base_info);
1351 out1:
1352         lxc_cgroup_put_meta(meta);
1353         return result;
1354 }
1355
/*
 * Write 'value' to cgroup file 'filename' (e.g. "<subsystem>.<key>") using
 * the cgroup information stored in 'd'.
 *
 * The subsystem is the part of 'filename' before the first '.'. Returns
 * the result of do_cgroup_set(), or -1 when no cgroup path is available
 * for the subsystem; errno is preset to ENOENT before that lookup. errno
 * as left by do_cgroup_set() is preserved across the cleanup.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	char *controller, *dot, *cgroup_dir;
	int rc, err;

	/* private copy of the name, cut off at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	errno = ENOENT;
	cgroup_dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (!cgroup_dir)
		return -1;

	rc = do_cgroup_set(cgroup_dir, filename, value);
	err = errno;
	free(cgroup_dir);
	errno = err;

	return rc;
}
1376
/*
 * Write 'value' to cgroup file 'filename' of container 'name' (under
 * 'lxcpath').
 *
 * The subsystem is the part of 'filename' before the first '.'. Returns
 * the result of do_cgroup_set(), or -1 when the container's cgroup path
 * for that subsystem cannot be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *controller, *dot, *cgroup_dir;
	int rc;

	/* private copy of the name, cut off at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgroup_dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!cgroup_dir)
		return -1;

	rc = do_cgroup_set(cgroup_dir, filename, value);
	free(cgroup_dir);

	return rc;
}
1394
/*
 * Read cgroup file 'filename' of container 'name' (under 'lxcpath') into
 * 'value', a buffer of 'len' bytes.
 *
 * The subsystem is the part of 'filename' before the first '.'. Returns
 * the result of do_cgroup_get(), or -1 when the container's cgroup path
 * for that subsystem cannot be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *controller, *dot, *cgroup_dir;
	int rc;

	/* private copy of the name, cut off at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	cgroup_dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!cgroup_dir)
		return -1;

	rc = do_cgroup_get(cgroup_dir, filename, value, len);
	free(cgroup_dir);

	return rc;
}
1412
1413 static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
1414 {
1415         size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1416         char *path = NULL;
1417         char **parts = NULL;
1418         char *dirname = NULL;
1419         char *abs_path = NULL;
1420         char *abs_path2 = NULL;
1421         struct cgfs_data *cgfs_d;
1422         struct cgroup_process_info *info, *base_info;
1423         int r, saved_errno = 0;
1424         struct lxc_handler *handler = hdata;
1425
1426         if (cgns_supported())
1427                 return true;
1428
1429         cgfs_d = handler->cgroup_data;
1430         if (!cgfs_d)
1431                 return false;
1432         base_info = cgfs_d->info;
1433
1434         /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1435          * have access to the lxc_conf object at this point. It really should be up
1436          * to the caller to fix this, but this doesn't really hurt.
1437          */
1438         if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1439                 type = LXC_AUTO_CGROUP_FULL_MIXED;
1440         else if (type == LXC_AUTO_CGROUP_NOSPEC)
1441                 type = LXC_AUTO_CGROUP_MIXED;
1442
1443         if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1444                 ERROR("could not mount cgroups into container: invalid type specified internally");
1445                 errno = EINVAL;
1446                 return false;
1447         }
1448
1449         path = calloc(1, bufsz);
1450         if (!path)
1451                 return false;
1452         snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
1453         r = safe_mount("cgroup_root", path, "tmpfs",
1454                         MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1455                         "size=10240k,mode=755",
1456                         root);
1457         if (r < 0) {
1458                 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
1459                 return false;
1460         }
1461
1462         /* now mount all the hierarchies we care about */
1463         for (info = base_info; info; info = info->next) {
1464                 size_t subsystem_count, i;
1465                 struct cgroup_mount_point *mp = info->designated_mount_point;
1466
1467                 if (!info->hierarchy)
1468                         continue;
1469
1470                 if (!mountpoint_is_accessible(mp))
1471                         mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1472
1473                 if (!mp) {
1474                         SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1475                         goto out_error;
1476                 }
1477
1478                 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1479                 parts = calloc(subsystem_count + 1, sizeof(char *));
1480                 if (!parts)
1481                         goto out_error;
1482
1483                 for (i = 0; i < subsystem_count; i++) {
1484                         if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1485                                 parts[i] = info->hierarchy->subsystems[i] + 5;
1486                         else
1487                                 parts[i] = info->hierarchy->subsystems[i];
1488                 }
1489                 dirname = lxc_string_join(",", (const char **)parts, false);
1490                 if (!dirname)
1491                         goto out_error;
1492
1493                 /* create subsystem directory */
1494                 abs_path = lxc_append_paths(path, dirname);
1495                 if (!abs_path)
1496                         goto out_error;
1497                 r = mkdir_p(abs_path, 0755);
1498                 if (r < 0 && errno != EEXIST) {
1499                         SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1500                         goto out_error;
1501                 }
1502
1503                 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1504                 if (!abs_path2)
1505                         goto out_error;
1506
1507                 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1508                         /* bind-mount the cgroup entire filesystem there */
1509                         if (strcmp(mp->mount_prefix, "/") != 0) {
1510                                 /* FIXME: maybe we should just try to remount the entire hierarchy
1511                                  *        with a regular mount command? may that works? */
1512                                 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1513                                 goto out_error;
1514                         }
1515                         r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1516                         if (r < 0) {
1517                                 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1518                                 goto out_error;
1519                         }
1520                         /* main cgroup path should be read-only */
1521                         if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1522                                 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1523                                 if (r < 0) {
1524                                         SYSERROR("error re-mounting %s readonly", abs_path);
1525                                         goto out_error;
1526                                 }
1527                         }
1528                         /* own cgroup should be read-write */
1529                         if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1530                                 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1531                                 if (r < 0) {
1532                                         SYSERROR("error bind-mounting %s onto itself", abs_path2);
1533                                         goto out_error;
1534                                 }
1535                                 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1536                                 if (r < 0) {
1537                                         SYSERROR("error re-mounting %s readwrite", abs_path2);
1538                                         goto out_error;
1539                                 }
1540                         }
1541                 } else {
1542                         /* create path for container's cgroup */
1543                         r = mkdir_p(abs_path2, 0755);
1544                         if (r < 0 && errno != EEXIST) {
1545                                 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1546                                 goto out_error;
1547                         }
1548
1549                         /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1550                          * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1551                          * itself and then bind-mount it read-only, since we keep the tmpfs itself
1552                          * read-write (see comment below)
1553                          */
1554                         if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1555                                 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1556                                 if (r < 0) {
1557                                         SYSERROR("error bind-mounting %s onto itself", abs_path);
1558                                         goto out_error;
1559                                 }
1560                                 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1561                                 if (r < 0) {
1562                                         SYSERROR("error re-mounting %s readonly", abs_path);
1563                                         goto out_error;
1564                                 }
1565                         }
1566
1567                         free(abs_path);
1568                         abs_path = NULL;
1569
1570                         /* bind-mount container's cgroup to that directory */
1571                         abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1572                         if (!abs_path)
1573                                 goto out_error;
1574                         r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
1575                         if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
1576                                 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1577                                 goto out_error;
1578                         }
1579                         if (type == LXC_AUTO_CGROUP_RO) {
1580                                 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1581                                 if (r < 0) {
1582                                         SYSERROR("error re-mounting %s readonly", abs_path2);
1583                                         goto out_error;
1584                                 }
1585                         }
1586                 }
1587
1588                 free(abs_path);
1589                 free(abs_path2);
1590                 abs_path = NULL;
1591                 abs_path2 = NULL;
1592
1593                 /* add symlinks for every single subsystem */
1594                 if (subsystem_count > 1) {
1595                         for (i = 0; i < subsystem_count; i++) {
1596                                 abs_path = lxc_append_paths(path, parts[i]);
1597                                 if (!abs_path)
1598                                         goto out_error;
1599                                 r = symlink(dirname, abs_path);
1600                                 if (r < 0)
1601                                         WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1602                                 free(abs_path);
1603                                 abs_path = NULL;
1604                         }
1605                 }
1606                 free(dirname);
1607                 free(parts);
1608                 dirname = NULL;
1609                 parts = NULL;
1610         }
1611
1612         /* We used to remount the entire tmpfs readonly if any :ro or
1613          * :mixed mode was specified. However, Ubuntu's mountall has the
1614          * unfortunate behavior to block bootup if /sys/fs/cgroup is
1615          * mounted read-only and cannot be remounted read-write.
1616          * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1617          * these if they are not already mounted with the right options;
1618          * it contains an entry for /sys/fs/cgroup. In case it can't do
1619          * that, it prompts for the user to either manually fix it or
1620          * boot anyway. But without user input, booting of the container
1621          * hangs.)
1622          *
1623          * Instead of remounting the entire tmpfs readonly, we only
1624          * remount the paths readonly that are part of the cgroup
1625          * hierarchy.
1626          */
1627
1628         free(path);
1629
1630         return true;
1631
1632 out_error:
1633         saved_errno = errno;
1634         free(path);
1635         free(dirname);
1636         free(parts);
1637         free(abs_path);
1638         free(abs_path2);
1639         errno = saved_errno;
1640         return false;
1641 }
1642
1643 static int cgfs_nrtasks(void *hdata)
1644 {
1645         struct cgfs_data *d = hdata;
1646         struct cgroup_process_info *info;
1647         struct cgroup_mount_point *mp = NULL;
1648         char *abs_path = NULL;
1649         int ret;
1650
1651         if (!d) {
1652                 errno = ENOENT;
1653                 return -1;
1654         }
1655
1656         info = d->info;
1657         if (!info) {
1658                 errno = ENOENT;
1659                 return -1;
1660         }
1661
1662         if (info->designated_mount_point) {
1663                 mp = info->designated_mount_point;
1664         } else {
1665                 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1666                 if (!mp)
1667                         return -1;
1668         }
1669
1670         abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1671         if (!abs_path)
1672                 return -1;
1673
1674         ret = cgroup_recursive_task_count(abs_path);
1675         free(abs_path);
1676         return ret;
1677 }
1678
1679 static struct cgroup_process_info *
1680 lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1681                              struct cgroup_meta_data *meta)
1682 {
1683         struct cgroup_process_info *result = NULL;
1684         FILE *proc_pid_cgroup = NULL;
1685         char *line = NULL;
1686         size_t sz = 0;
1687         int saved_errno = 0;
1688         struct cgroup_process_info **cptr = &result;
1689         struct cgroup_process_info *entry = NULL;
1690
1691         proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1692         if (!proc_pid_cgroup)
1693                 return NULL;
1694
1695         while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1696                 /* file format: hierarchy:subsystems:group */
1697                 char *colon1;
1698                 char *colon2;
1699                 char *endptr;
1700                 int hierarchy_number;
1701                 struct cgroup_hierarchy *h = NULL;
1702
1703                 if (!line[0])
1704                         continue;
1705
1706                 if (line[strlen(line) - 1] == '\n')
1707                         line[strlen(line) - 1] = '\0';
1708
1709                 colon1 = strchr(line, ':');
1710                 if (!colon1)
1711                         continue;
1712                 *colon1++ = '\0';
1713                 colon2 = strchr(colon1, ':');
1714                 if (!colon2)
1715                         continue;
1716                 *colon2++ = '\0';
1717
1718                 endptr = NULL;
1719
1720                 /* With cgroupv2 /proc/self/cgroup can contain entries of the
1721                  * form: 0::/
1722                  * These entries need to be skipped.
1723                  */
1724                 if (!strcmp(colon1, ""))
1725                         continue;
1726
1727                 hierarchy_number = strtoul(line, &endptr, 10);
1728                 if (!endptr || *endptr)
1729                         continue;
1730
1731                 if (hierarchy_number > meta->maximum_hierarchy) {
1732                         /* we encountered a hierarchy we didn't have before,
1733                          * so probably somebody remounted some stuff in the
1734                          * mean time...
1735                          */
1736                         errno = EAGAIN;
1737                         goto out_error;
1738                 }
1739
1740                 h = meta->hierarchies[hierarchy_number];
1741                 if (!h) {
1742                         /* we encountered a hierarchy that was thought to be
1743                          * dead before, so probably somebody remounted some
1744                          * stuff in the mean time...
1745                          */
1746                         errno = EAGAIN;
1747                         goto out_error;
1748                 }
1749
1750                 /* we are told that we should ignore this hierarchy */
1751                 if (!h->used)
1752                         continue;
1753
1754                 entry = calloc(1, sizeof(struct cgroup_process_info));
1755                 if (!entry)
1756                         goto out_error;
1757
1758                 entry->meta_ref = lxc_cgroup_get_meta(meta);
1759                 entry->hierarchy = h;
1760                 entry->cgroup_path = strdup(colon2);
1761                 if (!entry->cgroup_path)
1762                         goto out_error;
1763                 prune_init_scope(entry->cgroup_path);
1764
1765                 *cptr = entry;
1766                 cptr = &entry->next;
1767                 entry = NULL;
1768         }
1769
1770         fclose(proc_pid_cgroup);
1771         free(line);
1772         return result;
1773
1774 out_error:
1775         saved_errno = errno;
1776         if (proc_pid_cgroup)
1777                 fclose(proc_pid_cgroup);
1778         lxc_cgroup_process_info_free(result);
1779         lxc_cgroup_process_info_free(entry);
1780         free(line);
1781         errno = saved_errno;
1782         return NULL;
1783 }
1784
/*
 * subsystems_from_mount_options: turn a comma separated mount option
 * string into a NULL-terminated array of subsystem names
 *
 * @mount_options: comma separated option string
 * @kernel_list: NULL-terminated list of subsystem names to compare against
 *
 * Every token is copied into the result. A token that neither starts
 * with "name=" nor appears in @kernel_list is stored with a "name="
 * prefix (e.g. "systemd" becomes "name=systemd").
 *
 * Returns a newly allocated array (NULL when the string holds no
 * tokens), or NULL with errno preserved on allocation failure.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char *opts, *tok, *state = NULL;
	char **list = NULL;
	size_t capacity = 0;
	size_t count = 0;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a private copy */
	opts = alloca(strlen(mount_options) + 1);
	strcpy(opts, mount_options);

	tok = strtok_r(opts, ",", &state);
	while (tok) {
		bool needs_prefix;

		if (lxc_grow_array((void ***)&list, &capacity, count + 1, 12) < 0)
			goto out_free;
		/* keep the array terminated behind the slot being filled */
		list[count + 1] = NULL;

		needs_prefix = strncmp(tok, "name=", 5) &&
			       !lxc_string_in_array(tok, (const char **)kernel_list);
		if (needs_prefix) {
			/* 5 bytes for "name=" plus the terminating NUL */
			list[count] = malloc(strlen(tok) + 6);
			if (list[count])
				sprintf(list[count], "name=%s", tok);
		} else {
			list[count] = strdup(tok);
		}
		if (!list[count])
			goto out_free;
		count++;

		tok = strtok_r(NULL, ",", &state);
	}

	return list;

out_free:
	saved_errno = errno;
	lxc_free_array((void**)list, free);
	errno = saved_errno;
	return NULL;
}
1828
1829 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
1830 {
1831         if (!mp)
1832                 return;
1833         free(mp->mount_point);
1834         free(mp->mount_prefix);
1835         free(mp);
1836 }
1837
1838 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
1839 {
1840         if (!h)
1841                 return;
1842         if (h->subsystems) {
1843                 lxc_free_array((void **)h->subsystems, free);
1844                 h->subsystems = NULL;
1845         }
1846         if (h->all_mount_points) {
1847                 free(h->all_mount_points);
1848                 h->all_mount_points = NULL;
1849         }
1850         free(h);
1851         h = NULL;
1852 }
1853
/*
 * is_valid_cgroup: check whether @name may be used as a cgroup name
 *
 * The names "." and ".." are rejected. Every character must lie in the
 * printable ASCII range 33..126; SPACE (32) is excluded because it
 * would break legacy lxc-ls, and '/' is rejected as well.
 *
 * Returns true if @name passes all checks, false otherwise.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t i;

	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
		return false;

	for (i = 0; name[i] != '\0'; i++) {
		char c = name[i];
		bool printable = c > 32 && c < 127;

		if (!printable || c == '/')
			return false;
	}

	return true;
}
1867
1868 static int create_or_remove_cgroup(bool do_remove,
1869                 struct cgroup_mount_point *mp, const char *path, int recurse,
1870                 struct lxc_conf *conf)
1871 {
1872         int r, saved_errno = 0;
1873         char *buf = cgroup_to_absolute_path(mp, path, NULL);
1874         if (!buf)
1875                 return -1;
1876
1877         /* create or remove directory */
1878         if (do_remove) {
1879                 if (!dir_exists(buf))
1880                         return 0;
1881                 if (recurse) {
1882                         if (conf && !lxc_list_empty(&conf->id_map))
1883                                 r = userns_exec_1(conf, rmdir_wrapper, buf,
1884                                                   "rmdir_wrapper");
1885                         else
1886                                 r = cgroup_rmdir(buf);
1887                 } else
1888                         r = rmdir(buf);
1889         } else
1890                 r = mkdir_p(buf, 0777);
1891         saved_errno = errno;
1892         free(buf);
1893         errno = saved_errno;
1894         return r;
1895 }
1896
/*
 * create_cgroup: create the cgroup @path below mount point @mp
 *
 * Convenience wrapper that calls create_or_remove_cgroup() in create
 * mode and returns its result.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const bool recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, NULL);
}
1901
/*
 * remove_cgroup: remove the cgroup @path below mount point @mp
 *
 * Convenience wrapper that calls create_or_remove_cgroup() in remove
 * mode, forwarding @recurse and @conf, and returns its result.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, conf);
}
1907
1908 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1909                                      const char *path, const char *suffix)
1910 {
1911         /* first we have to make sure we subtract the mount point's prefix */
1912         char *prefix = mp->mount_prefix;
1913         char *buf;
1914         ssize_t len, rv;
1915
1916         /* we want to make sure only absolute paths to cgroups are passed to us */
1917         if (path[0] != '/') {
1918                 errno = EINVAL;
1919                 return NULL;
1920         }
1921
1922         if (prefix && !strcmp(prefix, "/"))
1923                 prefix = NULL;
1924
1925         /* prefix doesn't match */
1926         if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1927                 errno = EINVAL;
1928                 return NULL;
1929         }
1930         /* if prefix is /foo and path is /foobar */
1931         if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1932                 errno = EINVAL;
1933                 return NULL;
1934         }
1935
1936         /* remove prefix from path */
1937         path += prefix ? strlen(prefix) : 0;
1938
1939         len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1940         buf = calloc(len + 1, 1);
1941         if (!buf)
1942                 return NULL;
1943         rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
1944         if (rv > len) {
1945                 free(buf);
1946                 errno = ENOMEM;
1947                 return NULL;
1948         }
1949
1950         return buf;
1951 }
1952
1953 static struct cgroup_process_info *
1954 find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
1955 {
1956         struct cgroup_process_info *info_ptr;
1957         for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1958                 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1959                 if (!h)
1960                         continue;
1961                 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1962                         return info_ptr;
1963         }
1964         errno = ENOENT;
1965         return NULL;
1966 }
1967
/*
 * do_cgroup_get: read a file inside a cgroup directory
 *
 * @cgroup_path: directory of the cgroup
 * @sub_filename: name of the file inside that directory
 * @value: buffer receiving the data
 * @len: size passed on to lxc_read_from_file()
 *
 * Returns the result of lxc_read_from_file() with its errno preserved,
 * or -1 if the file name could not be assembled.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int nread, keep_errno;

	file_path = lxc_string_join("/", components, false);
	if (!file_path)
		return -1;

	nread = lxc_read_from_file(file_path, value, len);

	/* do not let free() disturb the errno of the read */
	keep_errno = errno;
	free(file_path);
	errno = keep_errno;

	return nread;
}
1989
/*
 * do_cgroup_set: write a string to a file inside a cgroup directory
 *
 * @cgroup_path: directory of the cgroup
 * @sub_filename: name of the file inside that directory
 * @value: NUL-terminated string to write (written without the NUL)
 *
 * Returns the result of lxc_write_to_file() with its errno preserved,
 * or -1 if the file name could not be assembled.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int rc, keep_errno;

	file_path = lxc_string_join("/", components, false);
	if (!file_path)
		return -1;

	rc = lxc_write_to_file(file_path, value, strlen(value), false);

	/* do not let free() disturb the errno of the write */
	keep_errno = errno;
	free(file_path);
	errno = keep_errno;

	return rc;
}
2011
2012 static int do_setup_cgroup_limits(struct cgfs_data *d,
2013                            struct lxc_list *cgroup_settings, bool do_devices)
2014 {
2015         struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2016         struct lxc_cgroup *cg;
2017         int ret = -1;
2018
2019         if (lxc_list_empty(cgroup_settings))
2020                 return 0;
2021
2022         sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2023         if (!sorted_cgroup_settings) {
2024                 return -1;
2025         }
2026
2027         lxc_list_for_each(iterator, sorted_cgroup_settings) {
2028                 cg = iterator->elem;
2029
2030                 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2031                         if (strcmp(cg->subsystem, "devices.deny") == 0 &&
2032                                         cgroup_devices_has_allow_or_deny(d, cg->value, false))
2033                                 continue;
2034                         if (strcmp(cg->subsystem, "devices.allow") == 0 &&
2035                                         cgroup_devices_has_allow_or_deny(d, cg->value, true))
2036                                 continue;
2037                         if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2038                                 if (do_devices && (errno == EACCES || errno == EPERM)) {
2039                                         WARN("Error setting %s to %s for %s",
2040                                               cg->subsystem, cg->value, d->name);
2041                                         continue;
2042                                 }
2043                                 SYSERROR("Error setting %s to %s for %s",
2044                                       cg->subsystem, cg->value, d->name);
2045                                 goto out;
2046                         }
2047                 }
2048
2049                 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
2050         }
2051
2052         ret = 0;
2053         INFO("cgroup has been setup");
2054 out:
2055         lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2056                 lxc_list_del(iterator);
2057                 free(iterator);
2058         }
2059         free(sorted_cgroup_settings);
2060         return ret;
2061 }
2062
/*
 * cgroup_devices_has_allow_or_deny: check the container's devices.list
 * to see whether a devices.allow/devices.deny setting is redundant
 *
 * @d: cgroup data of the container
 * @v: value of the setting
 * @for_allow: true when checking a devices.allow value, false for a
 *             devices.deny value
 *
 * For deny, only the values "a" and "a *:* rwm" are considered; the
 * result is true unless devices.list contains the line "a *:* rwm".
 * For allow, the result is true when devices.list contains either
 * "a *:* rwm" or a line equal to @v.
 *
 * Returns false when devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	/* XXX FIXME if users could use something other than 'lxc.devices.deny =
	 * a'.  not sure they ever do, but they *could* right now, I'm assuming
	 * they do NOT
	 */
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;
	path = lxc_string_join("/", parts, false);

	/* The hierarchy path is only needed to build the file name;
	 * release it here so that no later exit path can leak it.
	 */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2118
/*
 * cgroup_recursive_task_count: count the lines of all "tasks" files in
 * a cgroup directory and its subdirectories
 *
 * @cgroup_path: directory to start from
 *
 * A directory that cannot be opened contributes 0. Negative results
 * from subdirectories or from counting a tasks file are ignored.
 *
 * Returns the accumulated count, or -1 if a path could not be built or
 * a directory entry could not be stat()ed.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	for (entry = readdir(dir); entry; entry = readdir(dir)) {
		const char *components[] = { cgroup_path, entry->d_name, NULL };
		struct stat sb;
		char *child;
		int sub = 0;

		if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
			continue;

		child = lxc_string_join("/", components, false);
		if (!child) {
			closedir(dir);
			return -1;
		}

		if (stat(child, &sb) < 0) {
			closedir(dir);
			free(child);
			return -1;
		}

		/* descend into directories, count lines of "tasks" files */
		if (S_ISDIR(sb.st_mode))
			sub = cgroup_recursive_task_count(child);
		else if (strcmp(entry->d_name, "tasks") == 0)
			sub = lxc_count_file_lines(child);

		if (sub >= 0)
			total += sub;

		free(child);
	}
	closedir(dir);

	return total;
}
2166
2167 static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2168                                   char *cgroup_path)
2169 {
2170         int r, saved_errno = 0;
2171         char buf[2];
2172
2173         mp->need_cpuset_init = false;
2174
2175         /* If this is the memory cgroup, we want to enforce hierarchy.
2176          * But don't fail if for some reason we can't.
2177          */
2178         if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2179                 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2180                 if (cc_path) {
2181                         r = lxc_read_from_file(cc_path, buf, 1);
2182                         if (r < 1 || buf[0] != '1') {
2183                                 r = lxc_write_to_file(cc_path, "1", 1, false);
2184                                 if (r < 0)
2185                                         SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2186                         }
2187                         free(cc_path);
2188                 }
2189         }
2190
2191         /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2192          * the base cgroup, otherwise containers will start with an empty cpuset.mems
2193          * and cpuset.cpus and then
2194          */
2195         if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2196                 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
2197                 struct stat sb;
2198
2199                 if (!cc_path)
2200                         return -1;
2201                 /* cgroup.clone_children is not available when running under
2202                  * older kernel versions; in this case, we'll initialize
2203                  * cpuset.cpus and cpuset.mems later, after the new cgroup
2204                  * was created
2205                  */
2206                 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
2207                         mp->need_cpuset_init = true;
2208                         free(cc_path);
2209                         return 0;
2210                 }
2211                 r = lxc_read_from_file(cc_path, buf, 1);
2212                 if (r == 1 && buf[0] == '1') {
2213                         free(cc_path);
2214                         return 0;
2215                 }
2216                 r = lxc_write_to_file(cc_path, "1", 1, false);
2217                 saved_errno = errno;
2218                 free(cc_path);
2219                 errno = saved_errno;
2220                 return r < 0 ? -1 : 0;
2221         }
2222         return 0;
2223 }
2224
/*
 * cgroup_read_from_file: read a file into buf and NUL-terminate the result
 *
 * @fn      : path of the file to read
 * @buf     : destination buffer
 * @bufsize : size of @buf in bytes; must not be 0
 *
 * Returns the value reported by lxc_read_from_file() on success and a
 * negative value on failure (including a zero @bufsize). When the return
 * value equals @bufsize the buffer was filled completely and its last byte
 * has been replaced by the terminating '\0', i.e. the content may be
 * truncated; callers can use this to detect values that are too long.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
        int ret;

        /* Callers don't do this, but regression/sanity check. It has to
         * happen before anything is stored, otherwise the terminator below
         * would be written into a zero-sized buffer.
         */
        if (bufsize == 0) {
                ERROR("was not expecting 0 bufsize");
                return -1;
        }

        ret = lxc_read_from_file(fn, buf, bufsize);
        if (ret < 0) {
                SYSERROR("failed to read %s", fn);
                return ret;
        }

        /* buffer completely filled: sacrifice the last byte for the
         * terminator (obviously the file wasn't empty)
         */
        if ((size_t)ret >= bufsize) {
                buf[bufsize - 1] = '\0';
                return ret;
        }

        buf[ret] = '\0';
        return ret;
}
2245
/*
 * do_init_cpuset_file: seed one cpuset file of a cgroup from its parent
 *
 * @mp   : mount point used to turn cgroup paths into absolute paths
 * @path : cgroup path of the child cgroup
 * @name : file name (with leading '/') to initialize
 *
 * If the child's file already holds a value it is left untouched. Otherwise
 * the content of the file with the same name in the parent cgroup is copied
 * into it.
 *
 * Returns true if the child file already had a value or was written
 * successfully, false otherwise.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
                                const char *path, const char *name)
{
        char contents[1024];
        char *child = NULL, *parent = NULL, *parent_cgroup = NULL, *slash;
        bool success = false;
        int len;

        child = cgroup_to_absolute_path(mp, path, name);
        if (!child)
                return false;

        len = cgroup_read_from_file(child, contents, sizeof(contents));
        if (len < 0)
                goto cleanup;

        /* anything but an empty file (or a lone newline) is an existing
         * value which must not be overwritten
         */
        if (contents[0] != '\0' && contents[0] != '\n') {
                success = true;
                goto cleanup;
        }

        /* derive the parent cgroup by cutting off the last path component */
        parent_cgroup = strdup(path);
        if (!parent_cgroup)
                goto cleanup;

        slash = strrchr(parent_cgroup, '/');
        if (!slash)
                goto cleanup;
        if (slash == parent_cgroup)
                slash++; /* the only '/' is the leading one: keep it */
        *slash = '\0';

        parent = cgroup_to_absolute_path(mp, parent_cgroup, name);
        if (!parent)
                goto cleanup;

        /* copy the parent's value into the child */
        len = cgroup_read_from_file(parent, contents, sizeof(contents));
        if (len < 0)
                goto cleanup;
        if (len == sizeof(contents)) {
                /* If anyone actually sees this error, we can address it */
                ERROR("parent cpuset value too long");
                goto cleanup;
        }

        if (lxc_write_to_file(child, contents, strlen(contents), false) >= 0)
                success = true;
        else
                SYSERROR("failed writing %s", child);

cleanup:
        free(parent_cgroup);
        free(parent);
        free(child);
        return success;
}
2302
2303 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2304                                   const char *path)
2305 {
2306         /* the files we have to handle here are only in cpuset hierarchies */
2307         if (!lxc_string_in_array("cpuset",
2308                                  (const char **)mp->hierarchy->subsystems))
2309                 return true;
2310
2311         if (!mp->need_cpuset_init)
2312                 return true;
2313
2314         return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2315                 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2316 }
2317
2318 static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2319 {
2320         int i;
2321
2322         if (!getenv("LXC_DEBUG_CGFS"))
2323                 return;
2324
2325         DEBUG("Cgroup information:");
2326         DEBUG("  container name: %s", d->name);
2327         if (!d->meta || !d->meta->hierarchies) {
2328                 DEBUG("  No hierarchies found.");
2329                 return;
2330         }
2331         DEBUG("  Controllers:");
2332         for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2333                 char **p;
2334                 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2335                 if (!h) {
2336                         DEBUG("     Empty hierarchy number %d.", i);
2337                         continue;
2338                 }
2339                 for (p = h->subsystems; p && *p; p++) {
2340                         DEBUG("     %2d: %s", i, *p);
2341                 }
2342         }
2343 }
2344
2345 struct cgroup_ops *cgfs_ops_init(void)
2346 {
2347         return &cgfs_ops;
2348 }
2349
2350 static void *cgfs_init(struct lxc_handler *handler)
2351 {
2352         struct cgfs_data *d;
2353
2354         d = malloc(sizeof(*d));
2355         if (!d)
2356                 return NULL;
2357
2358         memset(d, 0, sizeof(*d));
2359         d->name = strdup(handler->name);
2360         if (!d->name)
2361                 goto err1;
2362
2363         d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2364
2365         d->meta = lxc_cgroup_load_meta();
2366         if (!d->meta) {
2367                 ERROR("cgroupfs failed to detect cgroup metadata");
2368                 goto err2;
2369         }
2370
2371         print_cgfs_init_debuginfo(d);
2372
2373         return d;
2374
2375 err2:
2376         free(d->name);
2377 err1:
2378         free(d);
2379         return NULL;
2380 }
2381
2382 static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
2383 {
2384         struct cgfs_data *d = hdata;
2385
2386         if (!d)
2387                 return;
2388         free(d->name);
2389         lxc_cgroup_process_info_free_and_remove(d->info, conf);
2390         lxc_cgroup_put_meta(d->meta);
2391         free(d);
2392 }
2393
2394 static inline bool cgfs_create(void *hdata)
2395 {
2396         struct cgfs_data *d = hdata;
2397         struct cgroup_process_info *i;
2398         struct cgroup_meta_data *md;
2399
2400         if (!d)
2401                 return false;
2402         md = d->meta;
2403         i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
2404         if (!i)
2405                 return false;
2406         d->info = i;
2407         return true;
2408 }
2409
2410 static inline bool cgfs_enter(void *hdata, pid_t pid)
2411 {
2412         struct cgfs_data *d = hdata;
2413         struct cgroup_process_info *i;
2414         int ret;
2415
2416         if (!d)
2417                 return false;
2418         i = d->info;
2419         ret = lxc_cgroupfs_enter(i, pid, false);
2420
2421         return ret == 0;
2422 }
2423
2424 static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
2425 {
2426         struct cgfs_data *d = hdata;
2427         struct cgroup_process_info *i;
2428
2429         if (!d)
2430                 return false;
2431         i = d->info;
2432         if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2433                 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
2434                 return false;
2435         }
2436         return true;
2437 }
2438
/*
 * cgfs_get_cgroup: look up the container's cgroup path for a subsystem
 *
 * @hdata     : struct cgfs_data created by cgfs_init()
 * @subsystem : subsystem to look up
 *
 * Returns whatever lxc_cgroup_get_hierarchy_path_data() yields for
 * @subsystem, or NULL if @hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
        struct cgfs_data *data = hdata;

        return data ? lxc_cgroup_get_hierarchy_path_data(subsystem, data) : NULL;
}
2447
2448 static bool cgfs_escape(void *hdata)
2449 {
2450         struct cgroup_meta_data *md;
2451         int i;
2452         bool ret = false;
2453
2454         md = lxc_cgroup_load_meta();
2455         if (!md)
2456                 return false;
2457
2458         for (i = 0; i <= md->maximum_hierarchy; i++) {
2459                 struct cgroup_hierarchy *h = md->hierarchies[i];
2460                 struct cgroup_mount_point *mp;
2461                 char *tasks;
2462                 FILE *f;
2463                 int written;
2464
2465                 if (!h) {
2466                         WARN("not escaping hierarchy %d", i);
2467                         continue;
2468                 }
2469
2470                 mp = lxc_cgroup_find_mount_point(h, "/", true);
2471                 if (!mp)
2472                         goto out;
2473
2474                 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2475                 if (!tasks)
2476                         goto out;
2477
2478                 f = fopen(tasks, "a");
2479                 free(tasks);
2480                 if (!f)
2481                         goto out;
2482
2483                 written = fprintf(f, "%d\n", lxc_raw_getpid());
2484                 fclose(f);
2485                 if (written < 0) {
2486                         SYSERROR("writing tasks failed\n");
2487                         goto out;
2488                 }
2489         }
2490
2491         ret = true;
2492 out:
2493         lxc_cgroup_put_meta(md);
2494         return ret;
2495 }
2496
/*
 * cgfs_num_hierarchies: number of hierarchies known to the driver
 *
 * Not implemented for the cgroupfs driver; always returns -1.
 */
static int cgfs_num_hierarchies(void)
{
        const int not_implemented = -1;

        return not_implemented;
}
2502
/*
 * cgfs_get_hierarchies: fetch the controllers of hierarchy number @i
 *
 * @i   : hierarchy index (ignored)
 * @out : result pointer (never written)
 *
 * Not implemented for the cgroupfs driver; always returns false.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
        (void)i;
        (void)out;

        return false;
}
2508
/*
 * cgfs_unfreeze: thaw the container's freezer cgroup
 *
 * @hdata : struct cgfs_data created by cgfs_init()
 *
 * Resolves the absolute path of the container's cgroup in the "freezer"
 * hierarchy and sets its freezer.state to "THAWED".
 *
 * Returns true if do_cgroup_set() reported 0, false if @hdata is NULL, the
 * path could not be resolved or the write failed.
 */
static bool cgfs_unfreeze(void *hdata)
{
        struct cgfs_data *data = hdata;
        char *relpath, *abspath;
        bool thawed;

        if (!data)
                return false;

        relpath = lxc_cgroup_get_hierarchy_path_data("freezer", data);
        abspath = lxc_cgroup_find_abs_path("freezer", relpath, true, NULL);
        if (!abspath)
                return false;

        thawed = (do_cgroup_set(abspath, "freezer.state", "THAWED") == 0);
        free(abspath);

        return thawed;
}
2527
/*
 * cgroupfs_setup_limits: apply the configured cgroup settings
 *
 * @hdata        : struct cgfs_data created by cgfs_init()
 * @cgroup_conf  : list of cgroup settings, forwarded unchanged
 * @with_devices : forwarded unchanged to do_setup_cgroup_limits()
 *
 * Returns true if do_setup_cgroup_limits() reported 0, false otherwise or
 * if @hdata is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
                                  bool with_devices)
{
        struct cgfs_data *data = hdata;

        if (!data)
                return false;

        if (do_setup_cgroup_limits(data, cgroup_conf, with_devices) != 0)
                return false;

        return true;
}
2537
/*
 * lxc_cgroupfs_attach: move an attached process into a container's cgroups
 *
 * @name    : container name
 * @lxcpath : lxcpath of the container
 * @pid     : process to move
 *
 * Loads the cgroup metadata, uses it to look up the container's cgroup
 * info, releases the metadata again and then enters @pid into those
 * cgroups. Every failure is logged with the same error message.
 *
 * Returns true on success, false otherwise.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
        struct cgroup_meta_data *md;
        struct cgroup_process_info *info = NULL;
        bool attached = false;

        md = lxc_cgroup_load_meta();
        if (md) {
                info = lxc_cgroup_get_container_info(name, lxcpath, md);
                /* the metadata is only needed for the lookup */
                lxc_cgroup_put_meta(md);
        }

        if (info) {
                attached = (lxc_cgroupfs_enter(info, pid, false) >= 0);
                lxc_cgroup_process_info_free(info);
        }

        if (!attached)
                ERROR("could not move attached process %d to cgroup of container", pid);

        return attached;
}
2565
/*
 * chown_data: argument bundle handed to chown_cgroup_wrapper()
 */
struct chown_data {
        /* absolute path of the cgroup directory to chown */
        const char *cgroup_path;
        /* effective uid of the caller, translated with get_ns_uid() by the
         * wrapper
         */
        uid_t origuid;
};
2570
2571 /*
2572  * TODO - someone should refactor this to unshare once passing all the paths
2573  * to be chowned in one go
2574  */
2575 static int chown_cgroup_wrapper(void *data)
2576 {
2577         struct chown_data *arg = data;
2578         uid_t destuid;
2579         char *fpath;
2580
2581         if (setresgid(0,0,0) < 0)
2582                 SYSERROR("Failed to setgid to 0");
2583         if (setresuid(0,0,0) < 0)
2584                 SYSERROR("Failed to setuid to 0");
2585         if (setgroups(0, NULL) < 0)
2586                 SYSERROR("Failed to clear groups");
2587         destuid = get_ns_uid(arg->origuid);
2588
2589         if (chown(arg->cgroup_path, destuid, 0) < 0)
2590                 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2591
2592         fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2593         if (!fpath)
2594                 return -1;
2595         if (chown(fpath, destuid, 0) < 0)
2596                 SYSERROR("Error chowning %s\n", fpath);
2597         free(fpath);
2598
2599         fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2600         if (!fpath)
2601                 return -1;
2602         if (chown(fpath, destuid, 0) < 0)
2603                 SYSERROR("Error chowning %s", fpath);
2604         free(fpath);
2605
2606         return 0;
2607 }
2608
2609 static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2610 {
2611         struct chown_data data;
2612         char *fpath;
2613
2614         if (!dir_exists(cgroup_path))
2615                 return true;
2616
2617         if (lxc_list_empty(&conf->id_map))
2618                 /* If there's no mapping then we don't need to chown */
2619                 return true;
2620
2621         data.cgroup_path = cgroup_path;
2622         data.origuid = geteuid();
2623
2624         /* Unpriv users can't chown it themselves, so chown from
2625          * a child namespace mapping both our own and the target uid
2626          */
2627         if (userns_exec_1(conf, chown_cgroup_wrapper, &data,
2628                           "chown_cgroup_wrapper") < 0) {
2629                 ERROR("Error requesting cgroup chown in new namespace");
2630                 return false;
2631         }
2632
2633         /*
2634          * Now chmod 775 the directory else the container cannot create cgroups.
2635          * This can't be done in the child namespace because it only group-owns
2636          * the cgroup
2637          */
2638         if (chmod(cgroup_path, 0775) < 0) {
2639                 SYSERROR("Error chmoding %s\n", cgroup_path);
2640                 return false;
2641         }
2642         fpath = lxc_append_paths(cgroup_path, "tasks");
2643         if (!fpath)
2644                 return false;
2645         if (chmod(fpath, 0664) < 0)
2646                 SYSERROR("Error chmoding %s\n", fpath);
2647         free(fpath);
2648         fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2649         if (!fpath)
2650                 return false;
2651         if (chmod(fpath, 0664) < 0)
2652                 SYSERROR("Error chmoding %s\n", fpath);
2653         free(fpath);
2654
2655         return true;
2656 }
2657
2658 static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2659 {
2660         struct cgfs_data *d = hdata;
2661         struct cgroup_process_info *info_ptr;
2662         char *cgpath;
2663         bool r = true;
2664
2665         if (!d)
2666                 return false;
2667
2668         for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2669                 if (!info_ptr->hierarchy)
2670                         continue;
2671
2672                 if (!info_ptr->designated_mount_point) {
2673                         info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2674                         if (!info_ptr->designated_mount_point) {
2675                                 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2676                                 return false;
2677                         }
2678                 }
2679
2680                 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2681                 if (!cgpath) {
2682                         SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2683                         continue;
2684                 }
2685                 r = do_cgfs_chown(cgpath, conf);
2686                 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
2687                         ERROR("Failed chowning %s\n", cgpath);
2688                         free(cgpath);
2689                         return false;
2690                 }
2691                 free(cgpath);
2692         }
2693
2694         return true;
2695 }
2696
2697 static struct cgroup_ops cgfs_ops = {
2698         .init = cgfs_init,
2699         .destroy = cgfs_destroy,
2700         .create = cgfs_create,
2701         .enter = cgfs_enter,
2702         .create_legacy = cgfs_create_legacy,
2703         .get_cgroup = cgfs_get_cgroup,
2704         .escape = cgfs_escape,
2705         .num_hierarchies = cgfs_num_hierarchies,
2706         .get_hierarchies = cgfs_get_hierarchies,
2707         .get = lxc_cgroupfs_get,
2708         .set = lxc_cgroupfs_set,
2709         .unfreeze = cgfs_unfreeze,
2710         .setup_limits = cgroupfs_setup_limits,
2711         .name = "cgroupfs",
2712         .attach = lxc_cgroupfs_attach,
2713         .chown = cgfs_chown,
2714         .mount_cgroup = cgroupfs_mount_cgroup,
2715         .nrtasks = cgfs_nrtasks,
2716         .driver = CGFS,
2717 };