]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgfs.c
Merge pull request #871 from ysbnim/master
[mirror_lxc.git] / src / lxc / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
e2bcd7db 42#include "error.h"
ae5c8b8e 43#include "commands.h"
b98f7d6e
SH
44#include "list.h"
45#include "conf.h"
33ad9f1a 46#include "utils.h"
4ec31c52 47#include "bdev/bdev.h"
f2363e38
ÇO
48#include "log.h"
49#include "cgroup.h"
50#include "start.h"
484ed030 51#include "state.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
59struct cgroup_hierarchy;
60struct cgroup_meta_data;
61struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: everything lxc knows about the cgroup setup of this
 * host. Instances are reference counted; see lxc_cgroup_get_meta() and
 * lxc_cgroup_put_meta().
 */
struct cgroup_meta_data {
	/* reference count; the object is released when it drops to zero */
	ptrdiff_t ref;
	/* hierarchies indexed by hierarchy number; unknown numbers are NULL */
	struct cgroup_hierarchy **hierarchies;
	/* all cgroup mount points discovered in mountinfo */
	struct cgroup_mount_point **mount_points;
	/* largest hierarchy number that has been stored in hierarchies */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: one cgroup hierarchy as listed in /proc/self/cgroup.
 * A hierarchy can be mounted in several places.
 */
struct cgroup_hierarchy {
	/* hierarchy number taken from /proc/self/cgroup */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* subsystem names attached to this hierarchy */
	char **subsystems;
	/* first writable mount point whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* every mount point found for this hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocation bookkeeping for all_mount_points (used by lxc_grow_array) */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: a single place in the filesystem where a cgroup
 * hierarchy is mounted.
 */
struct cgroup_mount_point {
	/* hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* where the hierarchy is mounted (mountinfo field "mount point") */
	char *mount_point;
	/* cgroup that forms the root of this mount; "/" for a full mount */
	char *mount_prefix;
	/* true when the per-mount options do not contain "rw" */
	bool read_only;
	/* NOTE(review): not written in the visible part of the file; presumably
	 * tells init_cpuset_if_needed() that cpuset files still need seeding */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: the cgroup membership of one process in one
 * hierarchy. Entries for the different hierarchies are chained through
 * the next pointer.
 *
 * This is the per-process information tracked by the cgfs_ops; it is
 * not used with cgmanager.
 */
struct cgroup_process_info {
	/* entry for the next hierarchy, NULL at the end of the list */
	struct cgroup_process_info *next;
	/* NOTE(review): presumably a counted reference to the meta data the
	 * hierarchy pointer below belongs to - confirm against the getter */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy this entry describes */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup path of the process inside that hierarchy */
	char *cgroup_path;
	/* NOTE(review): looks like the optional sub-cgroup path - confirm */
	char *cgroup_path_sub;
	/* cgroups created by lxc in this hierarchy, in creation order */
	char **created_paths;
	/* allocation bookkeeping for created_paths */
	size_t created_paths_capacity;
	/* number of valid entries in created_paths */
	size_t created_paths_count;
	/* mount point chosen for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/*
 * cgfs_data: per-container state of the cgfs cgroup driver.
 * NOTE(review): field meanings below are inferred from their types and
 * names only; the code that fills them is outside this excerpt.
 */
struct cgfs_data {
	/* presumably the container name */
	char *name;
	/* presumably the cgroup path pattern (containing %n) */
	const char *cgroup_pattern;
	/* cgroup meta data of the host */
	struct cgroup_meta_data *meta;
	/* per-hierarchy membership list */
	struct cgroup_process_info *info;
};
126
127lxc_log_define(lxc_cgfs, lxc);
576f946d 128
33ad9f1a
CS
129static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133static bool is_valid_cgroup(const char *name);
33ad9f1a 134static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
6a9e0f26
SH
135static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
136 struct lxc_conf *conf);
33ad9f1a
CS
137static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
138static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
139static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
140static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
141static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
142static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a
CS
143static int cgroup_recursive_task_count(const char *cgroup_path);
144static int count_lines(const char *fn);
1ea59ad2 145static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 146static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 147
4fb3cba5
DE
148static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
149static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
150static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
151
152/* free process membership information */
153static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
6a9e0f26
SH
154static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
155 struct lxc_conf *conf);
4fb3cba5 156
d4ef7c50 157static struct cgroup_ops cgfs_ops;
d4ef7c50 158
603c64c2
SH
/*
 * cgroup_rmdir: remove the cgroup directory @dirname together with all
 * directories below it.
 *
 * Only sub-directories are descended into and removed; other entries are
 * left alone. The function keeps going after a failure so that as much as
 * possible is removed.
 *
 * Returns 0 on success and -1 if anything failed. On failure after the
 * directory could be opened, errno is set to the first error that was
 * recorded (a positive errno value).
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("%s: failed to open %s", __func__, dirname);
		return -1;
	}

	/* readdir() replaces the deprecated readdir_r(); each call here works
	 * on its own DIR stream, so no shared state is involved. */
	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; the original stored a
			 * negated code here which ended up in errno */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}

		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}

		if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
227
6a9e0f26
SH
/*
 * rmdir_wrapper: callback that switches to uid/gid 0, drops all
 * supplementary groups and then removes the cgroup directory passed in
 * @data (a path string) with cgroup_rmdir().
 *
 * Failures to change credentials are only logged; the removal is
 * attempted regardless. Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	char *cgroup_dir = data;
	int rc;

	rc = setresgid(0,0,0);
	if (rc < 0)
		SYSERROR("Failed to setgid to 0");

	rc = setresuid(0,0,0);
	if (rc < 0)
		SYSERROR("Failed to setuid to 0");

	rc = setgroups(0, NULL);
	if (rc < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(cgroup_dir);
}
241
/*
 * lxc_cgroup_load_meta: load the cgroup meta data of the host, restricted
 * to the subsystems named in the global "lxc.cgroup.use" setting when that
 * setting is present.
 *
 * Returns the new meta data object or NULL on error. errno is whatever
 * lxc_cgroup_load_meta2() left behind.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *meta;
	char **whitelist = NULL;
	const char *configured;
	int err;

	/* a NULL value with errno still 0 means "not configured" */
	errno = 0;
	configured = lxc_global_config_value("lxc.cgroup.use");
	if (!configured) {
		if (errno != 0)
			return NULL;
	} else {
		whitelist = lxc_string_split_and_trim(configured, ',');
		if (!whitelist)
			return NULL;
	}

	meta = lxc_cgroup_load_meta2((const char **)whitelist);

	/* freeing the list must not disturb the errno of the loader */
	err = errno;
	lxc_free_array((void **)whitelist, free);
	errno = err;

	return meta;
}
fd37327f 265
/*
 * Step 1: determine all kernel subsystems.
 *
 * Reads /proc/cgroups and appends a copy of the first column (the
 * subsystem name) of every well-formed line to *kernel_subsystems.
 * Comment lines, empty lines and lines whose second column is not a
 * plain decimal number are skipped.
 *
 * Returns true on success. On failure the entries collected so far stay
 * in *kernel_subsystems; releasing them is left to the caller.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *f;
	char *buf = NULL;
	size_t buflen = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	f = fopen_cloexec("/proc/cgroups", "r");
	if (!f)
		return false;

	while (getline(&buf, &buflen, f) != -1) {
		char *number;
		char *number_end;
		char *parse_end = NULL;

		if (buf[0] == '#' || buf[0] == '\0')
			continue;

		/* cut the line into "<name>" and "<hierarchy number>" */
		number = strchr(buf, '\t');
		if (!number)
			continue;
		*number++ = '\0';

		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the number itself is not needed, it only has to parse */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(buf);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(f);
	free(buf);
	return ok;
}
321
322/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
323 * since mount points don't specify hierarchy number and
324 * /proc/cgroups does not contain named hierarchies
325 */
326static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
327 bool all_kernel_subsystems, bool all_named_subsystems,
328 const char **subsystem_whitelist)
329{
330 FILE *proc_self_cgroup;
331 char *line = NULL;
332 size_t sz = 0;
333 int r;
334 bool bret = false;
335 size_t hierarchy_capacity = 0;
ef6e34ee 336
33ad9f1a
CS
337 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
338 /* if for some reason (because of setns() and pid namespace for example),
339 * /proc/self is not valid, we try /proc/1/cgroup... */
340 if (!proc_self_cgroup)
341 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
342 if (!proc_self_cgroup)
b653309a 343 return false;
33ad9f1a
CS
344
345 while (getline(&line, &sz, proc_self_cgroup) != -1) {
346 /* file format: hierarchy:subsystems:group,
347 * we only extract hierarchy and subsystems
348 * here */
349 char *colon1;
350 char *colon2;
351 int hierarchy_number;
352 struct cgroup_hierarchy *h = NULL;
353 char **p;
354
355 if (!line[0])
356 continue;
ad08bbb7 357
33ad9f1a
CS
358 colon1 = strchr(line, ':');
359 if (!colon1)
8900b9eb 360 continue;
33ad9f1a
CS
361 *colon1++ = '\0';
362 colon2 = strchr(colon1, ':');
363 if (!colon2)
364 continue;
365 *colon2 = '\0';
ad08bbb7 366
33ad9f1a
CS
367 colon2 = NULL;
368 hierarchy_number = strtoul(line, &colon2, 10);
369 if (!colon2 || *colon2)
370 continue;
576f946d 371
33ad9f1a
CS
372 if (hierarchy_number > meta_data->maximum_hierarchy) {
373 /* lxc_grow_array will never shrink, so even if we find a lower
374 * hierarchy number here, the array will never be smaller
375 */
376 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
377 if (r < 0)
b653309a 378 goto out;
5193cc3d 379
33ad9f1a
CS
380 meta_data->maximum_hierarchy = hierarchy_number;
381 }
fd37327f 382
33ad9f1a
CS
383 /* this shouldn't happen, we had this already */
384 if (meta_data->hierarchies[hierarchy_number])
b653309a 385 goto out;
33ad9f1a
CS
386
387 h = calloc(1, sizeof(struct cgroup_hierarchy));
388 if (!h)
b653309a 389 goto out;
33ad9f1a
CS
390
391 meta_data->hierarchies[hierarchy_number] = h;
392
393 h->index = hierarchy_number;
394 h->subsystems = lxc_string_split_and_trim(colon1, ',');
395 if (!h->subsystems)
b653309a 396 goto out;
33ad9f1a
CS
397 /* see if this hierarchy should be considered */
398 if (!all_kernel_subsystems || !all_named_subsystems) {
399 for (p = h->subsystems; *p; p++) {
400 if (!strncmp(*p, "name=", 5)) {
401 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
402 h->used = true;
403 break;
404 }
405 } else {
406 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
407 h->used = true;
408 break;
409 }
410 }
411 }
412 } else {
413 /* we want all hierarchy anyway */
414 h->used = true;
ae5c8b8e 415 }
ae5c8b8e 416 }
b653309a 417 bret = true;
0b9c21ab 418
b653309a 419out:
33ad9f1a 420 fclose(proc_self_cgroup);
0ccf7c2a 421 free(line);
b653309a
SH
422 return bret;
423}
424
425/* Step 3: determine all mount points of each hierarchy */
426static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
427{
428 bool bret = false;
429 FILE *proc_self_mountinfo;
430 char *line = NULL;
431 size_t sz = 0;
432 char **tokens = NULL;
433 size_t mount_point_count = 0;
434 size_t mount_point_capacity = 0;
435 size_t token_capacity = 0;
436 int r;
fcca16bc 437 bool is_cgns = cgns_supported();
b653309a 438
33ad9f1a
CS
439 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
440 /* if for some reason (because of setns() and pid namespace for example),
441 * /proc/self is not valid, we try /proc/1/cgroup... */
442 if (!proc_self_mountinfo)
443 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
444 if (!proc_self_mountinfo)
b653309a 445 return false;
33ad9f1a
CS
446
447 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 448 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
449 size_t i, j, k;
450 struct cgroup_mount_point *mount_point;
451 struct cgroup_hierarchy *h;
452 char **subsystems;
836514a8 453 bool is_lxcfs = false;
33ad9f1a
CS
454
455 if (line[0] && line[strlen(line) - 1] == '\n')
456 line[strlen(line) - 1] = '\0';
457
178938fe 458 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
459 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
460 if (r < 0)
b653309a 461 goto out;
33ad9f1a
CS
462 tokens[i++] = token;
463 }
b98f7d6e 464
33ad9f1a
CS
465 /* layout of /proc/self/mountinfo:
466 * 0: id
467 * 1: parent id
468 * 2: device major:minor
469 * 3: mount prefix
8900b9eb 470 * 4: mount point
33ad9f1a
CS
471 * 5: per-mount options
472 * [optional X]: additional data
473 * X+7: "-"
474 * X+8: type
475 * X+9: source
476 * X+10: per-superblock options
477 */
478 for (j = 6; j < i && tokens[j]; j++)
479 if (!strcmp(tokens[j], "-"))
480 break;
fd4f5a56 481
33ad9f1a
CS
482 /* could not find separator */
483 if (j >= i || !tokens[j])
484 continue;
485 /* there should be exactly three fields after
486 * the separator
487 */
488 if (i != j + 4)
489 continue;
fd4f5a56 490
33ad9f1a 491 /* not a cgroup filesystem */
836514a8
U
492 if (strcmp(tokens[j + 1], "cgroup") != 0) {
493 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
494 continue;
495 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
496 continue;
497 is_lxcfs = true;
498 char *curtok = tokens[4] + 15;
499 subsystems = subsystems_from_mount_options(curtok,
500 kernel_subsystems);
501 } else
502 subsystems = subsystems_from_mount_options(tokens[j + 3],
503 kernel_subsystems);
33ad9f1a 504 if (!subsystems)
b653309a 505 goto out;
33ad9f1a
CS
506
507 h = NULL;
508 for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
509 if (meta_data->hierarchies[k] &&
510 meta_data->hierarchies[k]->subsystems[0] &&
511 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
512 /* TODO: we could also check if the lists really match completely,
513 * just to have an additional sanity check */
514 h = meta_data->hierarchies[k];
b98f7d6e 515 break;
33ad9f1a 516 }
b98f7d6e 517 }
33ad9f1a
CS
518 lxc_free_array((void **)subsystems, free);
519
520 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
521 if (r < 0)
b653309a 522 goto out;
33ad9f1a
CS
523
524 /* create mount point object */
525 mount_point = calloc(1, sizeof(*mount_point));
526 if (!mount_point)
b653309a 527 goto out;
33ad9f1a
CS
528
529 meta_data->mount_points[mount_point_count++] = mount_point;
530
531 mount_point->hierarchy = h;
fcca16bc 532 if (is_lxcfs || is_cgns)
836514a8
U
533 mount_point->mount_prefix = strdup("/");
534 else
535 mount_point->mount_prefix = strdup(tokens[3]);
33ad9f1a 536 mount_point->mount_point = strdup(tokens[4]);
33ad9f1a 537 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 538 goto out;
33ad9f1a
CS
539 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
540
541 if (!strcmp(mount_point->mount_prefix, "/")) {
542 if (mount_point->read_only) {
543 if (!h->ro_absolute_mount_point)
544 h->ro_absolute_mount_point = mount_point;
545 } else {
546 if (!h->rw_absolute_mount_point)
547 h->rw_absolute_mount_point = mount_point;
548 }
b98f7d6e 549 }
ae5c8b8e 550
33ad9f1a
CS
551 k = lxc_array_len((void **)h->all_mount_points);
552 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
553 if (r < 0)
b653309a 554 goto out;
33ad9f1a 555 h->all_mount_points[k] = mount_point;
fd4f5a56 556 }
b653309a
SH
557 bret = true;
558
559out:
b653309a 560 fclose(proc_self_mountinfo);
b653309a 561 free(tokens);
2cdafc54 562 free(line);
b653309a
SH
563 return bret;
564}
565
4fb3cba5 566static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
567{
568 bool all_kernel_subsystems = true;
569 bool all_named_subsystems = false;
570 struct cgroup_meta_data *meta_data = NULL;
571 char **kernel_subsystems = NULL;
572 int saved_errno = 0;
573
574 /* if the subsystem whitelist is not specified, include all
575 * hierarchies that contain kernel subsystems by default but
576 * no hierarchies that only contain named subsystems
577 *
578 * if it is specified, the specifier @all will select all
579 * hierarchies, @kernel will select all hierarchies with
580 * kernel subsystems and @named will select all named
581 * hierarchies
582 */
583 all_kernel_subsystems = subsystem_whitelist ?
584 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
585 true;
586 all_named_subsystems = subsystem_whitelist ?
587 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 588 true;
b653309a
SH
589
590 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
591 if (!meta_data)
592 return NULL;
593 meta_data->ref = 1;
594
595 if (!find_cgroup_subsystems(&kernel_subsystems))
596 goto out_error;
597
598 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
599 all_named_subsystems, subsystem_whitelist))
600 goto out_error;
601
602 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
603 goto out_error;
fd4f5a56 604
33ad9f1a
CS
605 /* oops, we couldn't find anything */
606 if (!meta_data->hierarchies || !meta_data->mount_points) {
607 errno = EINVAL;
608 goto out_error;
ae5c8b8e 609 }
fd4f5a56 610
3a0abb3a 611 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
612 return meta_data;
613
614out_error:
615 saved_errno = errno;
33ad9f1a
CS
616 lxc_free_array((void **)kernel_subsystems, free);
617 lxc_cgroup_put_meta(meta_data);
618 errno = saved_errno;
619 return NULL;
fd4f5a56
DL
620}
621
4fb3cba5 622static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 623{
33ad9f1a
CS
624 meta_data->ref++;
625 return meta_data;
626}
e14f67a7 627
4fb3cba5 628static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
629{
630 size_t i;
631 if (!meta_data)
632 return NULL;
633 if (--meta_data->ref > 0)
634 return meta_data;
635 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
636 if (meta_data->hierarchies) {
637 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
638 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
e14f67a7 639 }
33ad9f1a 640 free(meta_data->hierarchies);
178938fe 641 free(meta_data);
33ad9f1a 642 return NULL;
e14f67a7
U
643}
644
4fb3cba5 645static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 646{
33ad9f1a
CS
647 size_t i;
648 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
649 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
650 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
651 return h;
e14f67a7 652 }
e14f67a7
U
653 return NULL;
654}
655
d3f99e96
SH
656static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
657{
658 return mp && access(mp->mount_point, F_OK) == 0;
659}
660
4fb3cba5 661static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 662{
33ad9f1a
CS
663 struct cgroup_mount_point **mps;
664 struct cgroup_mount_point *current_result = NULL;
665 ssize_t quality = -1;
b98f7d6e 666
33ad9f1a 667 /* trivial case */
d3f99e96 668 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
33ad9f1a 669 return hierarchy->rw_absolute_mount_point;
d3f99e96 670 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
33ad9f1a 671 return hierarchy->ro_absolute_mount_point;
b98f7d6e 672
33ad9f1a
CS
673 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
674 struct cgroup_mount_point *mp = *mps;
675 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 676
33ad9f1a
CS
677 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
678 prefix_len = 0;
b98f7d6e 679
d3f99e96
SH
680 if (!mountpoint_is_accessible(mp))
681 continue;
682
33ad9f1a
CS
683 if (should_be_writable && mp->read_only)
684 continue;
685
686 if (!prefix_len ||
687 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
688 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
689 /* search for the best quality match, i.e. the match with the
690 * shortest prefix where this group is still contained
691 */
692 if (quality == -1 || prefix_len < quality) {
693 current_result = mp;
694 quality = prefix_len;
695 }
b98f7d6e
SH
696 }
697 }
698
33ad9f1a
CS
699 if (!current_result)
700 errno = ENOENT;
701 return current_result;
b98f7d6e
SH
702}
703
/*
 * lxc_cgroup_find_abs_path: compute the absolute filesystem path of the
 * cgroup @group in the hierarchy that carries @subsystem.
 *
 * @should_be_writable: only consider writable mount points
 * @suffix:             passed through to cgroup_to_absolute_path()
 *
 * The meta data is loaded freshly for the lookup and released again before
 * returning. Returns the path produced by cgroup_to_absolute_path(), or
 * NULL on failure with errno preserved from the failing step.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_hierarchy *h;
	struct cgroup_mount_point *mp;
	char *abs_path = NULL;
	int saved_errno;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		return NULL;

	h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
	if (h) {
		mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
		if (mp)
			abs_path = cgroup_to_absolute_path(mp, group, suffix);
	}

	if (abs_path) {
		lxc_cgroup_put_meta(meta_data);
		return abs_path;
	}

	/* releasing the meta data must not hide the reason for the failure */
	saved_errno = errno;
	lxc_cgroup_put_meta(meta_data);
	errno = saved_errno;
	return NULL;
}
737
/*
 * lxc_cgroup_process_info_get: read the cgroup membership of process @pid
 * from /proc/<pid>/cgroup, resolved against @meta.
 * Returns whatever lxc_cgroup_process_info_getx() returns for that file.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
744
/*
 * lxc_cgroup_process_info_get_init: cgroup membership of pid 1.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 749
/*
 * lxc_cgroup_process_info_get_self: cgroup membership of the calling
 * process. /proc/self/cgroup is tried first; if that yields nothing the
 * lookup is repeated through /proc/<getpid()>/cgroup.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *self_info;

	self_info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (self_info)
		return self_info;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
ae5c8b8e 758
692ba18f
SH
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid. 'oldname' is passed in as
 * the current cgroup of the task, relative to the mount at 'mountpath'. Say
 * the pid's cgroup directory is /sys/fs/cgroup/lxc/2975 and the container
 * name is c1. We want to rename the cgroup directory to /sys/fs/cgroup/lxc/c1,
 * and return the new cgroup name relative to the mount (lxc/c1).
 *
 * An already existing target directory is removed first; if that fails the
 * rename is not attempted.
 *
 * Returns a malloc()ed string "<oldname>/<name>" that the caller must free,
 * or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = /ab/c1
	 */

	/* 22 bytes cover two '/', the decimal pid and the terminating NUL */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* the argument is cast to unsigned long, so the conversion must be %lu */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	/* a leftover directory of the same name can only be replaced when
	 * rmdir() succeeds on it */
	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
826
ea439aac
SH
827static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
828{
829 char **p;
830
831 for (p = h->subsystems; *p; p++) {
832 if (is_crucial_cgroup_subsystem(*p))
833 return true;
834 }
835 return false;
836}
837
33ad9f1a 838/* create a new cgroup */
4fb3cba5 839static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 840{
001b026e 841 char **cgroup_path_components = NULL;
33ad9f1a
CS
842 char **p = NULL;
843 char *path_so_far = NULL;
844 char **new_cgroup_paths = NULL;
845 char **new_cgroup_paths_sub = NULL;
846 struct cgroup_mount_point *mp;
847 struct cgroup_hierarchy *h;
848 struct cgroup_process_info *base_info = NULL;
849 struct cgroup_process_info *info_ptr;
850 int saved_errno;
851 int r;
852 unsigned suffix = 0;
853 bool had_sub_pattern = false;
854 size_t i;
ae5c8b8e 855
33ad9f1a
CS
856 if (!is_valid_cgroup(name)) {
857 ERROR("Invalid cgroup name: '%s'", name);
858 errno = EINVAL;
859 return NULL;
ae5c8b8e
SH
860 }
861
33ad9f1a
CS
862 if (!strstr(path_pattern, "%n")) {
863 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
864 errno = EINVAL;
865 return NULL;
866 }
fd37327f 867
33ad9f1a
CS
868 /* we will modify the result of this operation directly,
869 * so we don't have to copy the data structure
870 */
871 base_info = (path_pattern[0] == '/') ?
872 lxc_cgroup_process_info_get_init(meta_data) :
873 lxc_cgroup_process_info_get_self(meta_data);
874 if (!base_info)
875 return NULL;
c8f7c563 876
33ad9f1a
CS
877 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
878 if (!new_cgroup_paths)
879 goto out_initial_error;
880
881 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
882 if (!new_cgroup_paths_sub)
883 goto out_initial_error;
884
885 /* find mount points we can use */
886 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
887 h = info_ptr->hierarchy;
888 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
889 if (!mp) {
890 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
891 goto out_initial_error;
892 }
893 info_ptr->designated_mount_point = mp;
460a1cf0 894
692ba18f
SH
895 if (lxc_string_in_array("ns", (const char **)h->subsystems))
896 continue;
2edb53c7
SH
897 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
898 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 899 goto out_initial_error;
2edb53c7 900 }
33ad9f1a 901 }
b98f7d6e 902
33ad9f1a
CS
903 /* normalize the path */
904 cgroup_path_components = lxc_normalize_path(path_pattern);
905 if (!cgroup_path_components)
906 goto out_initial_error;
907
908 /* go through the path components to see if we can create them */
909 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
910 /* we only want to create the same component with -1, -2, etc.
911 * if the component contains the container name itself, otherwise
912 * it's not an error if it already exists
913 */
914 char *p_eff = *p ? *p : (char *)sub_pattern;
915 bool contains_name = strstr(p_eff, "%n");
916 char *current_component = NULL;
917 char *current_subpath = NULL;
918 char *current_entire_path = NULL;
919 char *parts[3];
920 size_t j = 0;
921 i = 0;
922
923 /* if we are processing the subpattern, we want to make sure
924 * loop is ended the next time around
925 */
926 if (!*p) {
927 had_sub_pattern = true;
928 p--;
929 }
b98f7d6e 930
33ad9f1a 931 goto find_name_on_this_level;
4fb3cba5 932
33ad9f1a
CS
933 cleanup_name_on_this_level:
934 /* This is reached if we found a name clash.
935 * In that case, remove the cgroup from all previous hierarchies
936 */
937 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
77afbedf
SH
938 if (info_ptr->created_paths_count < 1)
939 continue;
6a9e0f26 940 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
33ad9f1a
CS
941 if (r < 0)
942 WARN("could not clean up cgroup we created when trying to create container");
943 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
944 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
945 }
946 if (current_component != current_subpath)
947 free(current_subpath);
948 if (current_component != p_eff)
949 free(current_component);
950 current_component = current_subpath = NULL;
951 /* try again with another suffix */
952 ++suffix;
4fb3cba5 953
33ad9f1a
CS
954 find_name_on_this_level:
955 /* determine name of the path component we should create */
956 if (contains_name && suffix > 0) {
957 char *buf = calloc(strlen(name) + 32, 1);
958 if (!buf)
959 goto out_initial_error;
960 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
961 current_component = lxc_string_replace("%n", buf, p_eff);
962 free(buf);
963 } else {
964 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
965 }
966 parts[0] = path_so_far;
967 parts[1] = current_component;
968 parts[2] = NULL;
969 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
970
971 /* Now go through each hierarchy and try to create the
972 * corresponding cgroup
973 */
974 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
975 char *parts2[3];
692ba18f
SH
976
977 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
978 continue;
33ad9f1a
CS
979 current_entire_path = NULL;
980
981 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
982 parts2[1] = current_subpath;
983 parts2[2] = NULL;
984 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
985
986 if (!*p) {
987 /* we are processing the subpath, so only update that one */
988 free(new_cgroup_paths_sub[i]);
989 new_cgroup_paths_sub[i] = strdup(current_entire_path);
990 if (!new_cgroup_paths_sub[i])
991 goto cleanup_from_error;
992 } else {
993 /* remember which path was used on this controller */
994 free(new_cgroup_paths[i]);
995 new_cgroup_paths[i] = strdup(current_entire_path);
996 if (!new_cgroup_paths[i])
997 goto cleanup_from_error;
998 }
fd4f5a56 999
33ad9f1a
CS
1000 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1001 if (r < 0 && errno == EEXIST && contains_name) {
1002 /* name clash => try new name with new suffix */
1003 free(current_entire_path);
1004 current_entire_path = NULL;
1005 goto cleanup_name_on_this_level;
1006 } else if (r < 0 && errno != EEXIST) {
ea439aac
SH
1007 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1008 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1009 goto cleanup_from_error;
1010 }
1011 goto skip;
33ad9f1a
CS
1012 } else if (r == 0) {
1013 /* successfully created */
1014 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1015 if (r < 0)
1016 goto cleanup_from_error;
d703c2b1 1017 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 1018 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
1019 goto cleanup_from_error;
1020 }
33ad9f1a
CS
1021 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1022 } else {
1023 /* if we didn't create the cgroup, then we have to make sure that
1024 * further cgroups will be created properly
1025 */
d703c2b1 1026 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 1027 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 1028 goto cleanup_from_error;
f6ac3b9e 1029 }
d703c2b1
RV
1030 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1031 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1032 goto cleanup_from_error;
1033 }
33ad9f1a 1034
ea439aac 1035skip:
33ad9f1a
CS
1036 /* already existed but path component of pattern didn't contain '%n',
1037 * so this is not an error; but then we don't need current_entire_path
1038 * anymore...
1039 */
1040 free(current_entire_path);
1041 current_entire_path = NULL;
1042 }
1043 }
fd4f5a56 1044
33ad9f1a
CS
1045 /* save path so far */
1046 free(path_so_far);
1047 path_so_far = strdup(current_subpath);
1048 if (!path_so_far)
1049 goto cleanup_from_error;
1050
1051 /* cleanup */
1052 if (current_component != current_subpath)
1053 free(current_subpath);
1054 if (current_component != p_eff)
1055 free(current_component);
1056 current_component = current_subpath = NULL;
1057 continue;
4fb3cba5 1058
33ad9f1a 1059 cleanup_from_error:
ec64264d 1060 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1061 * do some additional cleanup here
1062 */
1063 saved_errno = errno;
1064 if (current_component != current_subpath)
1065 free(current_subpath);
1066 if (current_component != p_eff)
1067 free(current_component);
1068 free(current_entire_path);
1069 errno = saved_errno;
1070 goto out_initial_error;
fd4f5a56
DL
1071 }
1072
33ad9f1a
CS
1073 /* we're done, now update the paths */
1074 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
47d8fb3b
CS
1075 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1076 * will take care of it
1077 * Since we do a continue in above loop, new_cgroup_paths[i] is
1078 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1079 */
47d8fb3b
CS
1080 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1081 continue;
1082 free(info_ptr->cgroup_path);
1083 info_ptr->cgroup_path = new_cgroup_paths[i];
1084 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1085 }
33ad9f1a
CS
1086 /* don't use lxc_free_array since we used the array members
1087 * to store them in our result...
1088 */
1089 free(new_cgroup_paths);
1090 free(new_cgroup_paths_sub);
1091 free(path_so_far);
1092 lxc_free_array((void **)cgroup_path_components, free);
1093 return base_info;
1094
1095out_initial_error:
1096 saved_errno = errno;
1097 free(path_so_far);
6a9e0f26 1098 lxc_cgroup_process_info_free_and_remove(base_info, NULL);
33ad9f1a
CS
1099 lxc_free_array((void **)new_cgroup_paths, free);
1100 lxc_free_array((void **)new_cgroup_paths_sub, free);
1101 lxc_free_array((void **)cgroup_path_components, free);
1102 errno = saved_errno;
1103 return NULL;
c8f7c563
CS
1104}
1105
4fb3cba5 1106static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1107{
1108 struct cgroup_process_info *info_ptr;
1109 int r;
1110
1111 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1112 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1113 continue;
1114 /*
1115 * For any path which has ns cgroup mounted, handler->pid is already
1116 * moved into a container called '%d % (handler->pid)'. Rename it to
1117 * the cgroup name and record that.
1118 */
1119 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1120 info_ptr->cgroup_path, pid, name);
1121 if (!tmp)
1122 return -1;
1123 free(info_ptr->cgroup_path);
1124 info_ptr->cgroup_path = tmp;
1125 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1126 if (r < 0)
1127 return -1;
1128 tmp = strdup(tmp);
1129 if (!tmp)
1130 return -1;
1131 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1132 }
1133 return 0;
1134}
1135
33ad9f1a 1136/* get the cgroup membership of a given container */
4fb3cba5 1137static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1138{
33ad9f1a
CS
1139 struct cgroup_process_info *result = NULL;
1140 int saved_errno = 0;
1141 size_t i;
1142 struct cgroup_process_info **cptr = &result;
1143 struct cgroup_process_info *entry = NULL;
1144 char *path = NULL;
1145
1146 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1147 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1148 if (!h || !h->used)
1149 continue;
c8f7c563 1150
33ad9f1a
CS
1151 /* use the command interface to look for the cgroup */
1152 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1153 if (!path) {
1154 h->used = false;
c661b0a8
DE
1155 continue;
1156 }
33ad9f1a
CS
1157
1158 entry = calloc(1, sizeof(struct cgroup_process_info));
1159 if (!entry)
1160 goto out_error;
1161 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1162 entry->hierarchy = h;
1163 entry->cgroup_path = path;
1164 path = NULL;
1165
1166 /* it is not an error if we don't find anything here,
1167 * it is up to the caller to decide what to do in that
1168 * case */
1169 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1170
1171 *cptr = entry;
1172 cptr = &entry->next;
1173 entry = NULL;
c8f7c563
CS
1174 }
1175
33ad9f1a
CS
1176 return result;
1177out_error:
1178 saved_errno = errno;
1179 free(path);
1180 lxc_cgroup_process_info_free(result);
1181 lxc_cgroup_process_info_free(entry);
1182 errno = saved_errno;
1183 return NULL;
fd4f5a56
DL
1184}
1185
33ad9f1a 1186/* move a processs to the cgroups specified by the membership */
4fb3cba5 1187static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1188{
33ad9f1a
CS
1189 char pid_buf[32];
1190 char *cgroup_tasks_fn;
1191 int r;
1192 struct cgroup_process_info *info_ptr;
1193
1194 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1195 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1196 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1197 info_ptr->cgroup_path_sub :
1198 info_ptr->cgroup_path;
1199
1200 if (!info_ptr->designated_mount_point) {
1201 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1202 if (!info_ptr->designated_mount_point) {
1203 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1204 return -1;
1205 }
1206 }
4f17323e 1207
33ad9f1a
CS
1208 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1209 if (!cgroup_tasks_fn) {
1210 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1211 return -1;
1212 }
4f17323e 1213
33ad9f1a 1214 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1215 free(cgroup_tasks_fn);
ea439aac 1216 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
33ad9f1a
CS
1217 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1218 return -1;
1219 }
4f17323e
CS
1220 }
1221
33ad9f1a 1222 return 0;
4f17323e
CS
1223}
1224
33ad9f1a
CS
1225/* free process membership information */
1226void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1227{
33ad9f1a
CS
1228 struct cgroup_process_info *next;
1229 if (!info)
b98f7d6e 1230 return;
33ad9f1a
CS
1231 next = info->next;
1232 lxc_cgroup_put_meta(info->meta_ref);
1233 free(info->cgroup_path);
1234 free(info->cgroup_path_sub);
1235 lxc_free_array((void **)info->created_paths, free);
1236 free(info);
1237 lxc_cgroup_process_info_free(next);
fc7de561
SH
1238}
1239
33ad9f1a 1240/* free process membership information and remove cgroups that were created */
6a9e0f26 1241void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
b98f7d6e 1242{
33ad9f1a
CS
1243 struct cgroup_process_info *next;
1244 char **pp;
1245 if (!info)
1246 return;
1247 next = info->next;
603c64c2 1248 {
33ad9f1a
CS
1249 struct cgroup_mount_point *mp = info->designated_mount_point;
1250 if (!mp)
1251 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1252 if (mp)
1253 /* ignore return value here, perhaps we created the
1254 * '/lxc' cgroup in this container but another container
1255 * is still running (for example)
1256 */
6a9e0f26 1257 (void)remove_cgroup(mp, info->cgroup_path, true, conf);
603c64c2
SH
1258 }
1259 for (pp = info->created_paths; pp && *pp; pp++);
1260 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1261 free(*pp);
b98f7d6e 1262 }
33ad9f1a
CS
1263 free(info->created_paths);
1264 lxc_cgroup_put_meta(info->meta_ref);
1265 free(info->cgroup_path);
1266 free(info->cgroup_path_sub);
1267 free(info);
6a9e0f26 1268 lxc_cgroup_process_info_free_and_remove(next, conf);
33ad9f1a 1269}
b98f7d6e 1270
4fb3cba5 1271static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1272{
d4ef7c50
SH
1273 struct cgroup_process_info *info = d->info;
1274 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1275 if (!info)
1276 return NULL;
f348e47c 1277 prune_init_scope(info->cgroup_path);
33ad9f1a 1278 return info->cgroup_path;
b98f7d6e
SH
1279}
1280
4fb3cba5 1281static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1282{
d4ef7c50 1283 struct cgroup_process_info *info = d->info;
33ad9f1a 1284 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1285
1286 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1287 if (!info)
1288 return NULL;
1289 if (info->designated_mount_point) {
8900b9eb 1290 mp = info->designated_mount_point;
33ad9f1a
CS
1291 } else {
1292 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1293 if (!mp)
1294 return NULL;
b98f7d6e 1295 }
33ad9f1a 1296 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1297}
55c76589 1298
4fb3cba5 1299static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1300{
33ad9f1a
CS
1301 struct cgroup_meta_data *meta;
1302 struct cgroup_process_info *base_info, *info;
1303 struct cgroup_mount_point *mp;
1304 char *result = NULL;
33ad9f1a
CS
1305
1306 meta = lxc_cgroup_load_meta();
1307 if (!meta)
9a93d992 1308 return NULL;
33ad9f1a
CS
1309 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1310 if (!base_info)
178938fe 1311 goto out1;
33ad9f1a
CS
1312 info = find_info_for_subsystem(base_info, subsystem);
1313 if (!info)
178938fe 1314 goto out2;
33ad9f1a 1315 if (info->designated_mount_point) {
8900b9eb 1316 mp = info->designated_mount_point;
33ad9f1a
CS
1317 } else {
1318 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1319 if (!mp)
178938fe 1320 goto out3;
33ad9f1a
CS
1321 }
1322 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1323out3:
178938fe 1324out2:
33ad9f1a 1325 lxc_cgroup_process_info_free(base_info);
178938fe 1326out1:
33ad9f1a 1327 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1328 return result;
1329}
9a93d992 1330
/*
 * Write value to the cgroup control file `filename`, using the membership
 * data in d.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot). errno is preset to ENOENT before the cgroup
 * directory is looked up; when the directory is found, the errno left by
 * do_cgroup_set() is preserved across the cleanup.
 *
 * Returns the result of do_cgroup_set(), or -1 if the cgroup directory for
 * the subsystem could not be determined.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	char *controller = alloca(strlen(filename) + 1);
	char *dot;
	char *dir;
	int rc;
	int saved_errno;

	/* cut the file name at the first dot to obtain the subsystem name */
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	errno = ENOENT;
	dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	saved_errno = errno;
	free(dir);
	errno = saved_errno;

	return rc;
}
9a93d992 1351
/*
 * Write value to the cgroup control file `filename` of the container
 * identified by name and lxcpath.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot).
 *
 * Returns the result of do_cgroup_set(), or -1 if the container's cgroup
 * directory for that subsystem could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *controller = alloca(strlen(filename) + 1);
	char *dot;
	char *dir;
	int rc;

	/* cut the file name at the first dot to obtain the subsystem name */
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	free(dir);
	return rc;
}
1369
/*
 * Read the cgroup control file `filename` of the container identified by
 * name and lxcpath into value (a buffer of len bytes).
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot).
 *
 * Returns the result of do_cgroup_get(), or -1 if the container's cgroup
 * directory for that subsystem could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *controller = alloca(strlen(filename) + 1);
	char *dot;
	char *dir;
	int rc;

	/* cut the file name at the first dot to obtain the subsystem name */
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_get(dir, filename, value, len);
	free(dir);
	return rc;
}
1387
4fb3cba5 1388static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1389{
1390 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1391 char *path = NULL;
1392 char **parts = NULL;
1393 char *dirname = NULL;
1394 char *abs_path = NULL;
1395 char *abs_path2 = NULL;
d4ef7c50
SH
1396 struct cgfs_data *cgfs_d;
1397 struct cgroup_process_info *info, *base_info;
aae1f3c4
CS
1398 int r, saved_errno = 0;
1399
4608594e
SH
1400 if (cgns_supported())
1401 return true;
1402
4fb3cba5
DE
1403 cgfs_d = hdata;
1404 if (!cgfs_d)
1405 return false;
d4ef7c50
SH
1406 base_info = cgfs_d->info;
1407
0769b82a
CS
1408 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1409 * have access to the lxc_conf object at this point. It really should be up
1410 * to the caller to fix this, but this doesn't really hurt.
1411 */
1412 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1413 type = LXC_AUTO_CGROUP_FULL_MIXED;
1414 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1415 type = LXC_AUTO_CGROUP_MIXED;
1416
7997d7da
CS
1417 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1418 ERROR("could not mount cgroups into container: invalid type specified internally");
1419 errno = EINVAL;
c476bdce 1420 return false;
7997d7da
CS
1421 }
1422
aae1f3c4
CS
1423 path = calloc(1, bufsz);
1424 if (!path)
c476bdce 1425 return false;
aae1f3c4 1426 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1427 r = safe_mount("cgroup_root", path, "tmpfs",
1428 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1429 "size=10240k,mode=755",
1430 root);
aae1f3c4
CS
1431 if (r < 0) {
1432 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1433 return false;
aae1f3c4
CS
1434 }
1435
1436 /* now mount all the hierarchies we care about */
1437 for (info = base_info; info; info = info->next) {
1438 size_t subsystem_count, i;
1439 struct cgroup_mount_point *mp = info->designated_mount_point;
d3f99e96 1440 if (!mountpoint_is_accessible(mp))
aae1f3c4 1441 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
d3f99e96 1442
aae1f3c4
CS
1443 if (!mp) {
1444 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1445 goto out_error;
1446 }
1447
1448 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1449 parts = calloc(subsystem_count + 1, sizeof(char *));
1450 if (!parts)
1451 goto out_error;
1452
1453 for (i = 0; i < subsystem_count; i++) {
1454 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1455 parts[i] = info->hierarchy->subsystems[i] + 5;
1456 else
1457 parts[i] = info->hierarchy->subsystems[i];
1458 }
1459 dirname = lxc_string_join(",", (const char **)parts, false);
1460 if (!dirname)
1461 goto out_error;
1462
1463 /* create subsystem directory */
1464 abs_path = lxc_append_paths(path, dirname);
1465 if (!abs_path)
1466 goto out_error;
1467 r = mkdir_p(abs_path, 0755);
1468 if (r < 0 && errno != EEXIST) {
1469 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1470 goto out_error;
1471 }
1472
aae1f3c4
CS
1473 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1474 if (!abs_path2)
1475 goto out_error;
aae1f3c4 1476
7997d7da
CS
1477 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1478 /* bind-mount the cgroup entire filesystem there */
1479 if (strcmp(mp->mount_prefix, "/") != 0) {
1480 /* FIXME: maybe we should just try to remount the entire hierarchy
1481 * with a regular mount command? may that works? */
1482 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1483 goto out_error;
1484 }
1485 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1486 if (r < 0) {
1487 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1488 goto out_error;
1489 }
f8f3c3c0
SG
1490 /* main cgroup path should be read-only */
1491 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1492 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1493 if (r < 0) {
1494 SYSERROR("error re-mounting %s readonly", abs_path);
1495 goto out_error;
1496 }
1497 }
7997d7da
CS
1498 /* own cgroup should be read-write */
1499 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1500 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1501 if (r < 0) {
1502 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1503 goto out_error;
1504 }
1505 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1506 if (r < 0) {
1507 SYSERROR("error re-mounting %s readwrite", abs_path2);
1508 goto out_error;
1509 }
1510 }
1511 } else {
1512 /* create path for container's cgroup */
1513 r = mkdir_p(abs_path2, 0755);
1514 if (r < 0 && errno != EEXIST) {
1515 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1516 goto out_error;
1517 }
aae1f3c4 1518
b46f0553
CS
1519 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1520 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1521 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1522 * read-write (see comment below)
1523 */
1524 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1525 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1526 if (r < 0) {
1527 SYSERROR("error bind-mounting %s onto itself", abs_path);
1528 goto out_error;
1529 }
1530 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1531 if (r < 0) {
1532 SYSERROR("error re-mounting %s readonly", abs_path);
1533 goto out_error;
1534 }
1535 }
1536
7997d7da
CS
1537 free(abs_path);
1538 abs_path = NULL;
1539
1540 /* bind-mount container's cgroup to that directory */
1541 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1542 if (!abs_path)
1543 goto out_error;
1544 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
ea439aac 1545 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
7997d7da
CS
1546 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1547 goto out_error;
1548 }
1549 if (type == LXC_AUTO_CGROUP_RO) {
1550 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1551 if (r < 0) {
1552 SYSERROR("error re-mounting %s readonly", abs_path2);
1553 goto out_error;
1554 }
1555 }
aae1f3c4
CS
1556 }
1557
1558 free(abs_path);
1559 free(abs_path2);
1560 abs_path = NULL;
1561 abs_path2 = NULL;
1562
1563 /* add symlinks for every single subsystem */
1564 if (subsystem_count > 1) {
1565 for (i = 0; i < subsystem_count; i++) {
1566 abs_path = lxc_append_paths(path, parts[i]);
1567 if (!abs_path)
1568 goto out_error;
1569 r = symlink(dirname, abs_path);
1570 if (r < 0)
1571 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1572 free(abs_path);
1573 abs_path = NULL;
1574 }
1575 }
1576 free(dirname);
1577 free(parts);
1578 dirname = NULL;
1579 parts = NULL;
1580 }
1581
b46f0553
CS
1582 /* We used to remount the entire tmpfs readonly if any :ro or
1583 * :mixed mode was specified. However, Ubuntu's mountall has the
1584 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1585 * mounted read-only and cannot be remounted read-write.
1586 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1587 * these if they are not already mounted with the right options;
1588 * it contains an entry for /sys/fs/cgroup. In case it can't do
1589 * that, it prompts for the user to either manually fix it or
1590 * boot anyway. But without user input, booting of the container
1591 * hangs.)
1592 *
1593 * Instead of remounting the entire tmpfs readonly, we only
1594 * remount the paths readonly that are part of the cgroup
1595 * hierarchy.
f8f3c3c0 1596 */
f8f3c3c0 1597
aae1f3c4
CS
1598 free(path);
1599
c476bdce 1600 return true;
aae1f3c4
CS
1601
1602out_error:
1603 saved_errno = errno;
1604 free(path);
1605 free(dirname);
1606 free(parts);
1607 free(abs_path);
1608 free(abs_path2);
1609 errno = saved_errno;
c476bdce 1610 return false;
aae1f3c4
CS
1611}
1612
4fb3cba5 1613static int cgfs_nrtasks(void *hdata)
33ad9f1a 1614{
4fb3cba5
DE
1615 struct cgfs_data *d = hdata;
1616 struct cgroup_process_info *info;
33ad9f1a
CS
1617 struct cgroup_mount_point *mp = NULL;
1618 char *abs_path = NULL;
1619 int ret;
460a1cf0 1620
4fb3cba5
DE
1621 if (!d) {
1622 errno = ENOENT;
1623 return -1;
1624 }
1625
1626 info = d->info;
33ad9f1a
CS
1627 if (!info) {
1628 errno = ENOENT;
1629 return -1;
b98f7d6e 1630 }
c8f7c563 1631
33ad9f1a 1632 if (info->designated_mount_point) {
8900b9eb 1633 mp = info->designated_mount_point;
33ad9f1a
CS
1634 } else {
1635 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1636 if (!mp)
1637 return -1;
c8f7c563
CS
1638 }
1639
33ad9f1a
CS
1640 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1641 if (!abs_path)
1642 return -1;
1643
1644 ret = cgroup_recursive_task_count(abs_path);
1645 free(abs_path);
1646 return ret;
c8f7c563
CS
1647}
1648
574c4428
QH
1649static struct cgroup_process_info *
1650lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1651 struct cgroup_meta_data *meta)
d08ba6ec 1652{
33ad9f1a
CS
1653 struct cgroup_process_info *result = NULL;
1654 FILE *proc_pid_cgroup = NULL;
1655 char *line = NULL;
1656 size_t sz = 0;
1657 int saved_errno = 0;
1658 struct cgroup_process_info **cptr = &result;
1659 struct cgroup_process_info *entry = NULL;
1660
1661 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1662 if (!proc_pid_cgroup)
b98f7d6e 1663 return NULL;
1ac470c0 1664
33ad9f1a
CS
1665 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1666 /* file format: hierarchy:subsystems:group */
1667 char *colon1;
1668 char *colon2;
1669 char *endptr;
1670 int hierarchy_number;
1671 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1672
33ad9f1a 1673 if (!line[0])
ae5c8b8e 1674 continue;
b98f7d6e 1675
33ad9f1a
CS
1676 if (line[strlen(line) - 1] == '\n')
1677 line[strlen(line) - 1] = '\0';
1678
1679 colon1 = strchr(line, ':');
1680 if (!colon1)
8900b9eb 1681 continue;
33ad9f1a
CS
1682 *colon1++ = '\0';
1683 colon2 = strchr(colon1, ':');
1684 if (!colon2)
ae5c8b8e 1685 continue;
33ad9f1a 1686 *colon2++ = '\0';
e4659536 1687
33ad9f1a
CS
1688 endptr = NULL;
1689 hierarchy_number = strtoul(line, &endptr, 10);
1690 if (!endptr || *endptr)
9a93d992 1691 continue;
9a93d992 1692
33ad9f1a
CS
1693 if (hierarchy_number > meta->maximum_hierarchy) {
1694 /* we encountered a hierarchy we didn't have before,
1695 * so probably somebody remounted some stuff in the
1696 * mean time...
1697 */
1698 errno = EAGAIN;
1699 goto out_error;
b98f7d6e 1700 }
33ad9f1a
CS
1701
1702 h = meta->hierarchies[hierarchy_number];
1703 if (!h) {
1704 /* we encountered a hierarchy that was thought to be
1705 * dead before, so probably somebody remounted some
1706 * stuff in the mean time...
1707 */
1708 errno = EAGAIN;
1709 goto out_error;
b98f7d6e 1710 }
33ad9f1a
CS
1711
1712 /* we are told that we should ignore this hierarchy */
1713 if (!h->used)
b98f7d6e 1714 continue;
5193cc3d 1715
33ad9f1a
CS
1716 entry = calloc(1, sizeof(struct cgroup_process_info));
1717 if (!entry)
1718 goto out_error;
fd4f5a56 1719
33ad9f1a
CS
1720 entry->meta_ref = lxc_cgroup_get_meta(meta);
1721 entry->hierarchy = h;
1722 entry->cgroup_path = strdup(colon2);
1723 if (!entry->cgroup_path)
1724 goto out_error;
3939a22a 1725 prune_init_scope(entry->cgroup_path);
d08ba6ec 1726
33ad9f1a
CS
1727 *cptr = entry;
1728 cptr = &entry->next;
1729 entry = NULL;
b98f7d6e 1730 }
b98f7d6e 1731
33ad9f1a
CS
1732 fclose(proc_pid_cgroup);
1733 free(line);
1734 return result;
1735
1736out_error:
1737 saved_errno = errno;
1738 if (proc_pid_cgroup)
1739 fclose(proc_pid_cgroup);
1740 lxc_cgroup_process_info_free(result);
1741 lxc_cgroup_process_info_free(entry);
1742 free(line);
1743 errno = saved_errno;
ae5c8b8e 1744 return NULL;
36b86299
DL
1745}
1746
574c4428
QH
/*
 * Turn a comma-separated mount option string into a NULL-terminated array
 * of subsystem names.
 *
 * Every option becomes one array element. An option that neither starts
 * with "name=" nor appears in kernel_list is stored with a "name=" prefix
 * added (e.g. "systemd" becomes "name=systemd"); all other options are
 * copied as they are.
 *
 * Returns the newly allocated array (NULL if mount_options holds no option),
 * or NULL with errno preserved when an allocation fails.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **list = NULL;
	size_t capacity = 0;
	size_t used = 0;
	char *state = NULL;
	char *cursor;
	char *tok;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a stack copy */
	cursor = alloca(strlen(mount_options)+1);
	strcpy(cursor, mount_options);

	while ((tok = strtok_r(cursor, ",", &state)) != NULL) {
		char *item;

		cursor = NULL;

		if (lxc_grow_array((void ***)&list, &capacity, used + 1, 12) < 0)
			goto fail;
		list[used + 1] = NULL;

		if (strncmp(tok, "name=", 5) != 0 && !lxc_string_in_array(tok, (const char **)kernel_list)) {
			/* this is eg 'systemd' but the mount will be 'name=systemd' */
			item = malloc(strlen(tok) + 6);
			if (item)
				sprintf(item, "name=%s", tok);
		} else {
			item = strdup(tok);
		}

		list[used] = item;
		if (!item)
			goto fail;
		used++;
	}

	return list;

fail:
	saved_errno = errno;
	lxc_free_array((void**)list, free);
	errno = saved_errno;
	return NULL;
}
1788
574c4428 1789static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1790{
33ad9f1a
CS
1791 if (!mp)
1792 return;
1793 free(mp->mount_point);
1794 free(mp->mount_prefix);
1795 free(mp);
bcbd102c
SH
1796}
1797
574c4428 1798static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1799{
33ad9f1a
CS
1800 if (!h)
1801 return;
1802 lxc_free_array((void **)h->subsystems, free);
8bfcb981 1803 free(h->all_mount_points);
33ad9f1a
CS
1804 free(h);
1805}
341a9bd8 1806
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * A name is rejected if it is "." or "..", or if it contains a '/' or any
 * character outside the range 33..126. That is printable ASCII without the
 * space, which is excluded because it would break the legacy lxc-ls.
 * The empty string is accepted.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t idx;

	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
		return false;

	for (idx = 0; name[idx] != '\0'; idx++) {
		char ch = name[idx];
		bool in_range = ch > 32 && ch < 127;

		if (!in_range || ch == '/')
			return false;
	}

	return true;
}
341a9bd8 1820
574c4428 1821static int create_or_remove_cgroup(bool do_remove,
6a9e0f26
SH
1822 struct cgroup_mount_point *mp, const char *path, int recurse,
1823 struct lxc_conf *conf)
33ad9f1a
CS
1824{
1825 int r, saved_errno = 0;
1826 char *buf = cgroup_to_absolute_path(mp, path, NULL);
1827 if (!buf)
1828 return -1;
341a9bd8 1829
33ad9f1a 1830 /* create or remove directory */
603c64c2 1831 if (do_remove) {
01d59fe5
CB
1832 if (!dir_exists(buf))
1833 return 0;
6a9e0f26
SH
1834 if (recurse) {
1835 if (conf && !lxc_list_empty(&conf->id_map))
1836 r = userns_exec_1(conf, rmdir_wrapper, buf);
1837 else
1838 r = cgroup_rmdir(buf);
1839 } else
603c64c2
SH
1840 r = rmdir(buf);
1841 } else
1842 r = mkdir(buf, 0777);
33ad9f1a
CS
1843 saved_errno = errno;
1844 free(buf);
1845 errno = saved_errno;
1846 return r;
341a9bd8 1847}
bcbd102c 1848
/*
 * Create the directory for cgroup `path` below mount point `mp`.
 * Returns whatever create_or_remove_cgroup() returns in create mode.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const int recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, NULL);
}
1853
/*
 * Remove the cgroup directory for path under mount point mp.
 * Thin wrapper around create_or_remove_cgroup() in "remove" mode; recurse
 * and conf are forwarded as given and the return value is passed through
 * unchanged.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, conf);
}
576f946d 1859
574c4428
QH
1860static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1861 const char *path, const char *suffix)
33ad9f1a
CS
1862{
1863 /* first we have to make sure we subtract the mount point's prefix */
1864 char *prefix = mp->mount_prefix;
1865 char *buf;
1866 ssize_t len, rv;
1867
1868 /* we want to make sure only absolute paths to cgroups are passed to us */
1869 if (path[0] != '/') {
1870 errno = EINVAL;
1871 return NULL;
1872 }
b98f7d6e 1873
33ad9f1a
CS
1874 if (prefix && !strcmp(prefix, "/"))
1875 prefix = NULL;
b98f7d6e 1876
33ad9f1a
CS
1877 /* prefix doesn't match */
1878 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1879 errno = EINVAL;
1880 return NULL;
1881 }
1882 /* if prefix is /foo and path is /foobar */
1883 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1884 errno = EINVAL;
1885 return NULL;
1886 }
b98f7d6e 1887
33ad9f1a
CS
1888 /* remove prefix from path */
1889 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1890
33ad9f1a
CS
1891 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1892 buf = calloc(len + 1, 1);
50266dc6
DE
1893 if (!buf)
1894 return NULL;
33ad9f1a 1895 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1896 if (rv > len) {
33ad9f1a
CS
1897 free(buf);
1898 errno = ENOMEM;
8900b9eb 1899 return NULL;
8b92dc3a 1900 }
576f946d 1901
33ad9f1a 1902 return buf;
e0f888d9 1903}
283678ed 1904
574c4428
QH
1905static struct cgroup_process_info *
1906find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1907{
33ad9f1a
CS
1908 struct cgroup_process_info *info_ptr;
1909 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1910 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1911 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1912 return info_ptr;
b98f7d6e 1913 }
33ad9f1a
CS
1914 errno = ENOENT;
1915 return NULL;
1916}
283678ed 1917
574c4428
QH
/*
 * Read the file sub_filename inside the directory cgroup_path into value
 * (at most len bytes) using lxc_read_from_file().
 *
 * Returns whatever lxc_read_from_file() returns, or -1 when the file name
 * could not be assembled. errno from the read is preserved across the
 * internal cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[3] = { cgroup_path, sub_filename, NULL };
	char *fn;
	int rc, err;

	fn = lxc_string_join("/", components, false);
	if (!fn)
		return -1;

	rc = lxc_read_from_file(fn, value, len);

	err = errno;
	free(fn);
	errno = err;

	return rc;
}
b113383b 1939
574c4428
QH
/*
 * Write the string value (without its terminating NUL) to the file
 * sub_filename inside the directory cgroup_path using lxc_write_to_file().
 *
 * Returns whatever lxc_write_to_file() returns, or -1 when the file name
 * could not be assembled. errno from the write is preserved across the
 * internal cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[3] = { cgroup_path, sub_filename, NULL };
	char *fn;
	int rc, err;

	fn = lxc_string_join("/", components, false);
	if (!fn)
		return -1;

	rc = lxc_write_to_file(fn, value, strlen(value), false);

	err = errno;
	free(fn);
	errno = err;

	return rc;
}
1961
4fb3cba5 1962static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 1963 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 1964{
365d180a 1965 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
1966 struct lxc_cgroup *cg;
1967 int ret = -1;
1968
33ad9f1a 1969 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
1970 return 0;
1971
aaf26830 1972 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
1973 if (!sorted_cgroup_settings) {
1974 return -1;
1975 }
aaf26830
KT
1976
1977 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
1978 cg = iterator->elem;
1979
33ad9f1a 1980 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 1981 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 1982 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
1983 continue;
1984 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 1985 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 1986 continue;
4fb3cba5 1987 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 1988 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
1989 WARN("Error setting %s to %s for %s",
1990 cg->subsystem, cg->value, d->name);
1991 continue;
1992 }
dddf7c5b 1993 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 1994 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
1995 goto out;
1996 }
b113383b 1997 }
b98f7d6e
SH
1998
1999 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
2000 }
2001
b98f7d6e
SH
2002 ret = 0;
2003 INFO("cgroup has been setup");
2004out:
365d180a 2005 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
2006 lxc_list_del(iterator);
2007 free(iterator);
2008 }
365d180a 2009 free(sorted_cgroup_settings);
b113383b
SH
2010 return ret;
2011}
b98f7d6e 2012
/*
 * Check the container's devices.list to see whether a devices.allow or
 * devices.deny entry would be redundant.
 *
 * d:         container cgroup data used to locate the "devices" hierarchy.
 * v:         the value of the configuration entry.
 * for_allow: true when v is a devices.allow entry, false for devices.deny.
 *
 * For allow: returns true when devices.list contains "a *:* rwm" or a line
 * equal to v.
 * For deny: only the values "a" and "a *:* rwm" are considered; returns
 * true when devices.list does NOT contain "a *:* rwm".
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
		char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	// not sure they ever do, but they *could*
	// right now, I'm assuming they do NOT
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;
	path = lxc_string_join("/", parts, false);
	/* the hierarchy path is only needed to build path; release it on
	 * every route, not just when the join fails */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			/* everything is allowed: an allow is redundant, a deny is not */
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2067
/*
 * Count the tasks in the cgroup directory cgroup_path and all of its
 * sub-directories by adding up the line counts of every file named "tasks".
 *
 * Returns the total, 0 when cgroup_path cannot be opened as a directory, or
 * -1 when an entry's path cannot be built or stat()ed. Sub-directories and
 * tasks files whose own count fails are skipped rather than treated as an
 * error.
 *
 * readdir() is used instead of the deprecated readdir_r(): each call works
 * on its own directory stream, and no caller-sized dirent buffer is needed.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *d;
	struct dirent *dent;
	int n = 0, r;

	d = opendir(cgroup_path);
	if (!d)
		return 0;

	while ((dent = readdir(d)) != NULL) {
		const char *parts[3] = {
			cgroup_path,
			dent->d_name,
			NULL
		};
		char *sub_path;
		struct stat st;

		if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
			continue;

		sub_path = lxc_string_join("/", parts, false);
		if (!sub_path) {
			closedir(d);
			return -1;
		}

		r = stat(sub_path, &st);
		if (r < 0) {
			closedir(d);
			free(sub_path);
			return -1;
		}

		if (S_ISDIR(st.st_mode)) {
			r = cgroup_recursive_task_count(sub_path);
			if (r >= 0)
				n += r;
		} else if (!strcmp(dent->d_name, "tasks")) {
			r = count_lines(sub_path);
			if (r >= 0)
				n += r;
		}
		free(sub_path);
	}
	closedir(d);

	return n;
}
2130
/*
 * Count how many lines getline() can read from the file fn.
 *
 * Returns the number of lines, or -1 when the file cannot be opened.
 */
static int count_lines(const char *fn)
{
	char *buf = NULL;
	size_t cap = 0;
	int count;
	FILE *fp = fopen_cloexec(fn, "r");

	if (!fp)
		return -1;

	for (count = 0; getline(&buf, &cap, fp) != -1; count++)
		;

	free(buf);
	fclose(fp);
	return count;
}
2149
574c4428
QH
2150static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2151 char *cgroup_path)
b98f7d6e 2152{
33ad9f1a 2153 int r, saved_errno = 0;
7e7243e1 2154 char buf[2];
1ea59ad2 2155
934b1673
SH
2156 mp->need_cpuset_init = false;
2157
1ea59ad2
SH
2158 /* If this is the memory cgroup, we want to enforce hierarchy.
2159 * But don't fail if for some reason we can't.
2160 */
2edb53c7
SH
2161 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2162 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2163 if (cc_path) {
2164 r = lxc_read_from_file(cc_path, buf, 1);
2165 if (r < 1 || buf[0] != '1') {
2166 r = lxc_write_to_file(cc_path, "1", 1, false);
2167 if (r < 0)
a8916143 2168 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2169 }
1ea59ad2
SH
2170 free(cc_path);
2171 }
2edb53c7 2172 }
1ea59ad2 2173
33ad9f1a
CS
2174 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2175 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2176 * and cpuset.cpus and then
2177 */
2edb53c7
SH
2178 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2179 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2180 struct stat sb;
2181
33ad9f1a 2182 if (!cc_path)
2edb53c7 2183 return -1;
d703c2b1
RV
2184 /* cgroup.clone_children is not available when running under
2185 * older kernel versions; in this case, we'll initialize
2186 * cpuset.cpus and cpuset.mems later, after the new cgroup
2187 * was created
2188 */
2189 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2190 mp->need_cpuset_init = true;
d703c2b1
RV
2191 free(cc_path);
2192 return 0;
2193 }
7e7243e1
SH
2194 r = lxc_read_from_file(cc_path, buf, 1);
2195 if (r == 1 && buf[0] == '1') {
2196 free(cc_path);
2edb53c7 2197 return 0;
7e7243e1 2198 }
33ad9f1a 2199 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2200 saved_errno = errno;
2201 free(cc_path);
2202 errno = saved_errno;
2203 return r < 0 ? -1 : 0;
33ad9f1a
CS
2204 }
2205 return 0;
b98f7d6e 2206}
484ed030 2207
/*
 * Read the file fn into buf through lxc_read_from_file() and make sure the
 * result is NUL-terminated.
 *
 * When the read fills the whole buffer, the last byte is overwritten with
 * the terminator and the return value equals bufsize, which lets callers
 * detect that case.
 *
 * Returns the value reported by lxc_read_from_file(); a negative value on
 * read failure; -1 when called with a bufsize of 0.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if ((size_t)nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	if (bufsize == 0) {
		/* Callers don't do this, but regression/sanity check */
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: obviously this wasn't empty */
	buf[bufsize - 1] = '\0';
	return nread;
}
2228
/*
 * Initialize one cpuset file of the cgroup path on mount point mp by
 * copying the value of the same file in the parent cgroup.
 *
 * name is the file name including its leading '/', e.g. "/cpuset.cpus".
 * A file that already holds a non-empty value is left untouched.
 *
 * Returns true when the file already had a value or the copy succeeded,
 * false on any failure (including a parent value that does not fit the
 * internal buffer).
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char value[1024];
	char *childfile, *parent_cgroup = NULL, *parentfile = NULL, *slash;
	int nread;
	bool ok = false;

	childfile = cgroup_to_absolute_path(mp, path, name);
	if (!childfile)
		return false;

	/* don't overwrite a non-empty value in the file */
	nread = cgroup_read_from_file(childfile, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting path at its last '/' */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto out;

	slash = strrchr(parent_cgroup, '/');
	if (!slash)
		goto out;
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	/* path to the same name in the parent cgroup */
	parentfile = cgroup_to_absolute_path(mp, parent_cgroup, name);
	if (!parentfile)
		goto out;

	/* copy from parent to child cgroup */
	nread = cgroup_read_from_file(parentfile, value, sizeof(value));
	if (nread < 0)
		goto out;
	if ((size_t)nread == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	if (lxc_write_to_file(childfile, value, strlen(value), false) >= 0)
		ok = true;
	else
		SYSERROR("failed writing %s", childfile);

out:
	free(parent_cgroup);
	free(parentfile);
	free(childfile);
	return ok;
}
2285
2286static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2287 const char *path)
2288{
2289 /* the files we have to handle here are only in cpuset hierarchies */
2290 if (!lxc_string_in_array("cpuset",
2291 (const char **)mp->hierarchy->subsystems))
2292 return true;
2293
b1dad6f6
RV
2294 if (!mp->need_cpuset_init)
2295 return true;
2296
d703c2b1
RV
2297 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2298 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2299}
2300
4fb3cba5 2301struct cgroup_ops *cgfs_ops_init(void)
484ed030 2302{
4fb3cba5 2303 return &cgfs_ops;
d4ef7c50 2304}
484ed030 2305
4fb3cba5 2306static void *cgfs_init(const char *name)
d4ef7c50 2307{
4fb3cba5 2308 struct cgfs_data *d;
484ed030 2309
4fb3cba5
DE
2310 d = malloc(sizeof(*d));
2311 if (!d)
2312 return NULL;
484ed030 2313
4fb3cba5
DE
2314 memset(d, 0, sizeof(*d));
2315 d->name = strdup(name);
2316 if (!d->name)
2317 goto err1;
2318
5e1c5795 2319 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2320
2321 d->meta = lxc_cgroup_load_meta();
2322 if (!d->meta) {
2323 ERROR("cgroupfs failed to detect cgroup metadata");
2324 goto err2;
2325 }
2326 return d;
2327
2328err2:
2329 free(d->name);
2330err1:
2331 free(d);
2332 return NULL;
d4ef7c50 2333}
484ed030 2334
6a9e0f26 2335static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
d4ef7c50 2336{
4fb3cba5
DE
2337 struct cgfs_data *d = hdata;
2338
d4ef7c50
SH
2339 if (!d)
2340 return;
f10fad2f 2341 free(d->name);
6a9e0f26 2342 lxc_cgroup_process_info_free_and_remove(d->info, conf);
c55d4505 2343 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2344 free(d);
d4ef7c50 2345}
484ed030 2346
4fb3cba5 2347static inline bool cgfs_create(void *hdata)
d4ef7c50 2348{
4fb3cba5
DE
2349 struct cgfs_data *d = hdata;
2350 struct cgroup_process_info *i;
2351 struct cgroup_meta_data *md;
484ed030 2352
4fb3cba5 2353 if (!d)
d4ef7c50 2354 return false;
4fb3cba5
DE
2355 md = d->meta;
2356 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2357 if (!i)
2358 return false;
2359 d->info = i;
2360 return true;
2361}
484ed030 2362
4fb3cba5 2363static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2364{
4fb3cba5
DE
2365 struct cgfs_data *d = hdata;
2366 struct cgroup_process_info *i;
d4ef7c50 2367 int ret;
4fb3cba5
DE
2368
2369 if (!d)
2370 return false;
2371 i = d->info;
2372 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2373
d4ef7c50
SH
2374 return ret == 0;
2375}
2376
4fb3cba5 2377static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2378{
4fb3cba5
DE
2379 struct cgfs_data *d = hdata;
2380 struct cgroup_process_info *i;
2381
2382 if (!d)
2383 return false;
2384 i = d->info;
2385 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2386 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2387 return false;
484ed030 2388 }
d4ef7c50
SH
2389 return true;
2390}
484ed030 2391
/*
 * Look up the container's cgroup path for subsystem via
 * lxc_cgroup_get_hierarchy_path_data().
 *
 * Returns that function's result, or NULL when hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *d = hdata;

	return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2400
2ba7a429
TA
2401static const char *cgfs_canonical_path(void *hdata)
2402{
2403 struct cgfs_data *d = hdata;
2404 struct cgroup_process_info *info_ptr;
2405 char *path = NULL;
2406
2407 if (!d)
2408 return NULL;
2409
2410 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2411 if (!path)
2412 path = info_ptr->cgroup_path;
2413 else if (strcmp(path, info_ptr->cgroup_path) != 0) {
2414 ERROR("not all paths match %s, %s has path %s", path,
2415 info_ptr->hierarchy->subsystems[0], info_ptr->cgroup_path);
2416 return NULL;
2417 }
2418 }
2419
2420 return path;
2421}
2422
06078509
TA
2423static bool cgfs_escape(void)
2424{
2425 struct cgroup_meta_data *md;
2426 int i;
2427 bool ret = false;
2428
2429 md = lxc_cgroup_load_meta();
2430 if (!md)
2431 return false;
2432
2433 for (i = 1; i <= md->maximum_hierarchy; i++) {
2434 struct cgroup_hierarchy *h = md->hierarchies[i];
2435 struct cgroup_mount_point *mp;
2436 char *tasks;
2437 FILE *f;
2438 int written;
2439
2440 if (!h) {
2441 WARN("not escaping hierarchy %d", i);
2442 continue;
2443 }
2444
2445 mp = lxc_cgroup_find_mount_point(h, "/", true);
2446 if (!mp)
2447 goto out;
2448
2449 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2450 if (!tasks)
2451 goto out;
2452
2453 f = fopen(tasks, "a");
2454 free(tasks);
2455 if (!f)
2456 goto out;
2457
2458 written = fprintf(f, "%d\n", getpid());
2459 fclose(f);
2460 if (written < 0) {
2461 SYSERROR("writing tasks failed\n");
2462 goto out;
2463 }
2464 }
2465
2466 ret = true;
2467out:
2468 lxc_cgroup_put_meta(md);
2469 return ret;
2470}
2471
/*
 * Thaw the container by writing "THAWED" to freezer.state in its freezer
 * cgroup.
 *
 * Returns true when the write returns 0; false when hdata is NULL, the
 * absolute freezer path cannot be determined, or the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *d = hdata;
	char *rel, *abs;
	int rc;

	if (!d)
		return false;

	rel = lxc_cgroup_get_hierarchy_path_data("freezer", d);
	abs = lxc_cgroup_find_abs_path("freezer", rel, true, NULL);
	if (!abs)
		return false;

	rc = do_cgroup_set(abs, "freezer.state", "THAWED");
	free(abs);

	return rc == 0;
}
2490
4fb3cba5
DE
/*
 * Apply the cgroup settings in cgroup_conf to the container described by
 * hdata; with_devices is forwarded to do_setup_cgroup_limits() to select
 * device or non-device entries.
 *
 * Returns true when do_setup_cgroup_limits() returns 0; false otherwise or
 * when hdata is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
				  bool with_devices)
{
	struct cgfs_data *d = hdata;
	int rc;

	if (!d)
		return false;

	rc = do_setup_cgroup_limits(d, cgroup_conf, with_devices);
	return rc == 0;
}
2500
/*
 * Move process pid into the cgroups of the running container name located
 * under lxcpath.
 *
 * The container's cgroup info is looked up with freshly loaded metadata,
 * the process is entered into it, and the temporary info is released.
 *
 * Returns true on success; every failure is logged and yields false.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta;
	struct cgroup_process_info *info;
	int rc;

	meta = lxc_cgroup_load_meta();
	if (!meta)
		goto fail;

	info = lxc_cgroup_get_container_info(name, lxcpath, meta);
	lxc_cgroup_put_meta(meta);
	if (!info)
		goto fail;

	rc = lxc_cgroupfs_enter(info, pid, false);
	lxc_cgroup_process_info_free(info);
	if (rc < 0)
		goto fail;

	return true;

fail:
	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2528
8b276860
SH
/* Argument block handed to chown_cgroup_wrapper(). */
struct chown_data {
	/* absolute path of the cgroup directory to chown */
	const char *cgroup_path;
	/* uid passed to get_ns_uid() to obtain the chown target (set from geteuid()) */
	uid_t origuid;
};
2533
2534/*
2535 * TODO - someone should refactor this to unshare once passing all the paths
2536 * to be chowned in one go
2537 */
2538static int chown_cgroup_wrapper(void *data)
2539{
2540 struct chown_data *arg = data;
2541 uid_t destuid;
2542 char *fpath;
2543
8b276860
SH
2544 if (setresgid(0,0,0) < 0)
2545 SYSERROR("Failed to setgid to 0");
2546 if (setresuid(0,0,0) < 0)
2547 SYSERROR("Failed to setuid to 0");
2548 if (setgroups(0, NULL) < 0)
2549 SYSERROR("Failed to clear groups");
2550 destuid = get_ns_uid(arg->origuid);
2551
2552 if (chown(arg->cgroup_path, destuid, 0) < 0)
2553 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2554
2555 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2556 if (!fpath)
2557 return -1;
2558 if (chown(fpath, destuid, 0) < 0)
2559 SYSERROR("Error chowning %s\n", fpath);
2560 free(fpath);
01d59fe5 2561
8b276860
SH
2562 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2563 if (!fpath)
2564 return -1;
2565 if (chown(fpath, destuid, 0) < 0)
2566 SYSERROR("Error chowning %s", fpath);
2567 free(fpath);
2568
2569 return 0;
2570}
2571
2572static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2573{
2574 struct chown_data data;
2575 char *fpath;
2576
01d59fe5
CB
2577 if (!dir_exists(cgroup_path))
2578 return true;
2579
8b276860
SH
2580 if (lxc_list_empty(&conf->id_map))
2581 /* If there's no mapping then we don't need to chown */
2582 return true;
2583
2584 data.cgroup_path = cgroup_path;
2585 data.origuid = geteuid();
2586
2587 /* Unpriv users can't chown it themselves, so chown from
2588 * a child namespace mapping both our own and the target uid
2589 */
2590 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2591 ERROR("Error requesting cgroup chown in new namespace");
2592 return false;
2593 }
2594
2595 /*
2596 * Now chmod 775 the directory else the container cannot create cgroups.
2597 * This can't be done in the child namespace because it only group-owns
2598 * the cgroup
2599 */
2600 if (chmod(cgroup_path, 0775) < 0) {
2601 SYSERROR("Error chmoding %s\n", cgroup_path);
2602 return false;
2603 }
2604 fpath = lxc_append_paths(cgroup_path, "tasks");
2605 if (!fpath)
2606 return false;
2607 if (chmod(fpath, 0664) < 0)
2608 SYSERROR("Error chmoding %s\n", fpath);
2609 free(fpath);
2610 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2611 if (!fpath)
2612 return false;
2613 if (chmod(fpath, 0664) < 0)
2614 SYSERROR("Error chmoding %s\n", fpath);
2615 free(fpath);
2616
2617 return true;
2618}
2619
2620static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2621{
2622 struct cgfs_data *d = hdata;
2623 struct cgroup_process_info *info_ptr;
2624 char *cgpath;
2625 bool r = true;
2626
2627 if (!d)
2628 return false;
2629
2630 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2631 if (!info_ptr->designated_mount_point) {
2632 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2633 if (!info_ptr->designated_mount_point) {
2634 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2635 return false;
2636 }
2637 }
2638
2639 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2640 if (!cgpath) {
2641 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2642 continue;
2643 }
2644 r = do_cgfs_chown(cgpath, conf);
ea439aac 2645 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
8b276860
SH
2646 ERROR("Failed chowning %s\n", cgpath);
2647 free(cgpath);
2648 return false;
2649 }
2650 free(cgpath);
2651 }
2652
2653 return true;
2654}
2655
d4ef7c50 2656static struct cgroup_ops cgfs_ops = {
d4ef7c50 2657 .init = cgfs_init,
4fb3cba5 2658 .destroy = cgfs_destroy,
d4ef7c50
SH
2659 .create = cgfs_create,
2660 .enter = cgfs_enter,
2661 .create_legacy = cgfs_create_legacy,
2662 .get_cgroup = cgfs_get_cgroup,
2ba7a429 2663 .canonical_path = cgfs_canonical_path,
06078509 2664 .escape = cgfs_escape,
d4ef7c50
SH
2665 .get = lxc_cgroupfs_get,
2666 .set = lxc_cgroupfs_set,
4fb3cba5 2667 .unfreeze = cgfs_unfreeze,
9daf6f5d 2668 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2669 .name = "cgroupfs",
5d897655 2670 .attach = lxc_cgroupfs_attach,
8b276860 2671 .chown = cgfs_chown,
c476bdce 2672 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2673 .nrtasks = cgfs_nrtasks,
23befb18 2674 .driver = CGFS,
d4ef7c50 2675};