]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfs.c
cgfs: add print_cgfs_init_debuginfo()
[mirror_lxc.git] / src / lxc / cgroups / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
d8e48992 42#include "bdev.h"
e2bcd7db 43#include "error.h"
ae5c8b8e 44#include "commands.h"
b98f7d6e
SH
45#include "list.h"
46#include "conf.h"
33ad9f1a 47#include "utils.h"
f2363e38
ÇO
48#include "log.h"
49#include "cgroup.h"
50#include "start.h"
484ed030 51#include "state.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
/* Forward declarations: the three structures below refer to each other. */
struct cgroup_hierarchy;
struct cgroup_meta_data;
struct cgroup_mount_point;

/*
 * cgroup_meta_data: the metadata about the cgroup infrastructure on this
 *                   host
 */
struct cgroup_meta_data {
	/* simple refcount; the object is released when it drops to zero */
	ptrdiff_t ref;
	/* hierarchies, indexed by hierarchy number (entries may be NULL) */
	struct cgroup_hierarchy **hierarchies;
	/* every cgroup mount point that was discovered */
	struct cgroup_mount_point **mount_points;
	/* highest hierarchy number stored in 'hierarchies' */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: describes a single cgroup hierarchy
 *                   (may have multiple mount points)
 */
struct cgroup_hierarchy {
	/* hierarchy number */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* names of the subsystems attached to this hierarchy */
	char **subsystems;
	/* first writable mount point found whose mount prefix is "/" */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point found whose mount prefix is "/" */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* all mount points of this hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocated capacity of 'all_mount_points' */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: a mount point to where a hierarchy
 *                     is mounted to
 */
struct cgroup_mount_point {
	/* hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* path at which the hierarchy is mounted */
	char *mount_point;
	/* cgroup path that forms the root of this mount ("/" if absolute) */
	char *mount_prefix;
	/* true if the mount is not mounted read-write */
	bool read_only;
	/* NOTE(review): presumably set when cpuset needs initialising - confirm */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: describes the membership of a
 *                      process to the different cgroup
 *                      hierarchies
 *
 * Note this is the per-process info tracked by the cgfs_ops.
 * This is not used with cgmanager.
 */
struct cgroup_process_info {
	/* next entry of the singly linked list (one entry per hierarchy) */
	struct cgroup_process_info *next;
	/* metadata object this entry refers to */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy this entry describes */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup path of the process inside that hierarchy */
	char *cgroup_path;
	/* NOTE(review): presumably the sub-cgroup path - confirm with users */
	char *cgroup_path_sub;
	/* cgroup paths that were created on behalf of this entry */
	char **created_paths;
	/* allocated capacity of 'created_paths' */
	size_t created_paths_capacity;
	/* number of valid entries in 'created_paths' */
	size_t created_paths_count;
	/* mount point selected for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/*
 * cgfs_data: state bundle handed to the cgfs backend helpers
 */
struct cgfs_data {
	/* NOTE(review): presumably the container name - confirm */
	char *name;
	/* pattern used to derive the cgroup path */
	const char *cgroup_pattern;
	/* metadata about the cgroup setup on this host */
	struct cgroup_meta_data *meta;
	/* per-hierarchy process membership list */
	struct cgroup_process_info *info;
};
126
127lxc_log_define(lxc_cgfs, lxc);
576f946d 128
33ad9f1a
CS
129static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133static bool is_valid_cgroup(const char *name);
33ad9f1a 134static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
6a9e0f26
SH
135static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
136 struct lxc_conf *conf);
33ad9f1a
CS
137static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
138static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
139static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
140static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
141static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
142static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a 143static int cgroup_recursive_task_count(const char *cgroup_path);
1ea59ad2 144static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 145static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 146
4fb3cba5
DE
147static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151/* free process membership information */
152static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
6a9e0f26
SH
153static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
154 struct lxc_conf *conf);
4fb3cba5 155
d4ef7c50 156static struct cgroup_ops cgfs_ops;
d4ef7c50 157
603c64c2
SH
/*
 * cgroup_rmdir: recursively remove a cgroup directory
 *
 * @dirname: path of the directory to remove
 *
 * Every sub-directory is removed depth-first before rmdir() is attempted on
 * @dirname itself.  Non-directory entries are not unlinked.  The walk does
 * not stop at the first failure: it keeps going, remembers the first error
 * and reports it at the end.
 *
 * Returns 0 on success.  Returns -1 if anything failed, with errno set to
 * the (positive) code of the first failure encountered.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		/* keep opendir()'s errno intact across logging so that a
		 * recursive caller records the real cause
		 */
		saved_errno = errno;
		ERROR("%s: failed to open %s", __func__, dirname);
		errno = saved_errno;
		return -1;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; the result did not fit */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}
		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
226
6a9e0f26
SH
/*
 * rmdir_wrapper: callback that switches to uid/gid 0, clears the
 * supplementary groups and then recursively removes the directory
 * whose path is passed in @data.
 *
 * A failure to change credentials is only logged; the removal is
 * attempted regardless.  Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	char *target = data;

	if (setresgid(0, 0, 0) < 0)
		SYSERROR("Failed to setgid to 0");

	if (setresuid(0, 0, 0) < 0)
		SYSERROR("Failed to setuid to 0");

	if (setgroups(0, NULL) < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(target);
}
240
/*
 * lxc_cgroup_load_meta: load the cgroup metadata, restricted to the
 * subsystems listed in the global "lxc.cgroup.use" setting (a comma
 * separated list) when that setting is present.
 *
 * Returns the metadata object, or NULL on failure; errno as left by the
 * failing step is preserved across the cleanup.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *meta;
	char **whitelist = NULL;
	const char *use;
	int err;

	/* a NULL result is only an error if errno was set by the lookup */
	errno = 0;
	use = lxc_global_config_value("lxc.cgroup.use");
	if (use) {
		whitelist = lxc_string_split_and_trim(use, ',');
		if (!whitelist)
			return NULL;
	} else if (errno != 0) {
		return NULL;
	}

	meta = lxc_cgroup_load_meta2((const char **)whitelist);

	/* freeing the list must not disturb the errno of the load */
	err = errno;
	lxc_free_array((void **)whitelist, free);
	errno = err;

	return meta;
}
fd37327f 264
/* Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends a copy of the first tab-separated
 * field (the subsystem name) of every well-formed, non-comment line to
 * *kernel_subsystems.
 *
 * Returns true on success.  On failure false is returned and whatever
 * was stored in *kernel_subsystems so far is left for the caller to
 * release.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *fp;
	char *line = NULL;
	size_t line_sz = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	fp = fopen_cloexec("/proc/cgroups", "r");
	if (!fp)
		return false;

	while (getline(&line, &line_sz, fp) != -1) {
		char *name_end;
		char *number;
		char *number_end;
		char *parse_end = NULL;

		/* skip the header comment and empty lines */
		if (line[0] == '#' || line[0] == '\0')
			continue;

		/* terminate the subsystem name at the first tab */
		name_end = strchr(line, '\t');
		if (!name_end)
			continue;
		*name_end = '\0';
		number = name_end + 1;

		/* terminate the hierarchy number at the second tab */
		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the number itself is not needed, it only has to parse */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(line);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(fp);
	free(line);
	return ok;
}
320
321/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
322 * since mount points don't specify hierarchy number and
323 * /proc/cgroups does not contain named hierarchies
324 */
325static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
326 bool all_kernel_subsystems, bool all_named_subsystems,
327 const char **subsystem_whitelist)
328{
329 FILE *proc_self_cgroup;
330 char *line = NULL;
331 size_t sz = 0;
332 int r;
333 bool bret = false;
334 size_t hierarchy_capacity = 0;
ef6e34ee 335
33ad9f1a
CS
336 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
337 /* if for some reason (because of setns() and pid namespace for example),
338 * /proc/self is not valid, we try /proc/1/cgroup... */
339 if (!proc_self_cgroup)
340 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
341 if (!proc_self_cgroup)
b653309a 342 return false;
33ad9f1a
CS
343
344 while (getline(&line, &sz, proc_self_cgroup) != -1) {
345 /* file format: hierarchy:subsystems:group,
346 * we only extract hierarchy and subsystems
347 * here */
348 char *colon1;
349 char *colon2;
350 int hierarchy_number;
351 struct cgroup_hierarchy *h = NULL;
352 char **p;
353
354 if (!line[0])
355 continue;
ad08bbb7 356
33ad9f1a
CS
357 colon1 = strchr(line, ':');
358 if (!colon1)
8900b9eb 359 continue;
33ad9f1a
CS
360 *colon1++ = '\0';
361 colon2 = strchr(colon1, ':');
362 if (!colon2)
363 continue;
364 *colon2 = '\0';
ad08bbb7 365
33ad9f1a
CS
366 colon2 = NULL;
367 hierarchy_number = strtoul(line, &colon2, 10);
368 if (!colon2 || *colon2)
369 continue;
576f946d 370
33ad9f1a
CS
371 if (hierarchy_number > meta_data->maximum_hierarchy) {
372 /* lxc_grow_array will never shrink, so even if we find a lower
373 * hierarchy number here, the array will never be smaller
374 */
375 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
376 if (r < 0)
b653309a 377 goto out;
5193cc3d 378
33ad9f1a
CS
379 meta_data->maximum_hierarchy = hierarchy_number;
380 }
fd37327f 381
33ad9f1a
CS
382 /* this shouldn't happen, we had this already */
383 if (meta_data->hierarchies[hierarchy_number])
b653309a 384 goto out;
33ad9f1a
CS
385
386 h = calloc(1, sizeof(struct cgroup_hierarchy));
387 if (!h)
b653309a 388 goto out;
33ad9f1a
CS
389
390 meta_data->hierarchies[hierarchy_number] = h;
391
392 h->index = hierarchy_number;
393 h->subsystems = lxc_string_split_and_trim(colon1, ',');
394 if (!h->subsystems)
b653309a 395 goto out;
33ad9f1a
CS
396 /* see if this hierarchy should be considered */
397 if (!all_kernel_subsystems || !all_named_subsystems) {
398 for (p = h->subsystems; *p; p++) {
399 if (!strncmp(*p, "name=", 5)) {
400 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
401 h->used = true;
402 break;
403 }
404 } else {
405 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
406 h->used = true;
407 break;
408 }
409 }
410 }
411 } else {
412 /* we want all hierarchy anyway */
413 h->used = true;
ae5c8b8e 414 }
ae5c8b8e 415 }
b653309a 416 bret = true;
0b9c21ab 417
b653309a 418out:
33ad9f1a 419 fclose(proc_self_cgroup);
0ccf7c2a 420 free(line);
b653309a
SH
421 return bret;
422}
423
424/* Step 3: determine all mount points of each hierarchy */
425static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
426{
427 bool bret = false;
428 FILE *proc_self_mountinfo;
429 char *line = NULL;
430 size_t sz = 0;
431 char **tokens = NULL;
432 size_t mount_point_count = 0;
433 size_t mount_point_capacity = 0;
434 size_t token_capacity = 0;
435 int r;
fcca16bc 436 bool is_cgns = cgns_supported();
b653309a 437
33ad9f1a
CS
438 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
439 /* if for some reason (because of setns() and pid namespace for example),
440 * /proc/self is not valid, we try /proc/1/cgroup... */
441 if (!proc_self_mountinfo)
442 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
443 if (!proc_self_mountinfo)
b653309a 444 return false;
33ad9f1a
CS
445
446 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 447 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
448 size_t i, j, k;
449 struct cgroup_mount_point *mount_point;
450 struct cgroup_hierarchy *h;
451 char **subsystems;
836514a8 452 bool is_lxcfs = false;
33ad9f1a
CS
453
454 if (line[0] && line[strlen(line) - 1] == '\n')
455 line[strlen(line) - 1] = '\0';
456
178938fe 457 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
458 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
459 if (r < 0)
b653309a 460 goto out;
33ad9f1a
CS
461 tokens[i++] = token;
462 }
b98f7d6e 463
33ad9f1a
CS
464 /* layout of /proc/self/mountinfo:
465 * 0: id
466 * 1: parent id
467 * 2: device major:minor
468 * 3: mount prefix
8900b9eb 469 * 4: mount point
33ad9f1a
CS
470 * 5: per-mount options
471 * [optional X]: additional data
472 * X+7: "-"
473 * X+8: type
474 * X+9: source
475 * X+10: per-superblock options
476 */
477 for (j = 6; j < i && tokens[j]; j++)
478 if (!strcmp(tokens[j], "-"))
479 break;
fd4f5a56 480
33ad9f1a
CS
481 /* could not find separator */
482 if (j >= i || !tokens[j])
483 continue;
484 /* there should be exactly three fields after
485 * the separator
486 */
487 if (i != j + 4)
488 continue;
fd4f5a56 489
33ad9f1a 490 /* not a cgroup filesystem */
836514a8
U
491 if (strcmp(tokens[j + 1], "cgroup") != 0) {
492 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
493 continue;
494 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
495 continue;
496 is_lxcfs = true;
497 char *curtok = tokens[4] + 15;
498 subsystems = subsystems_from_mount_options(curtok,
499 kernel_subsystems);
500 } else
501 subsystems = subsystems_from_mount_options(tokens[j + 3],
502 kernel_subsystems);
33ad9f1a 503 if (!subsystems)
b653309a 504 goto out;
33ad9f1a
CS
505
506 h = NULL;
507 for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
508 if (meta_data->hierarchies[k] &&
509 meta_data->hierarchies[k]->subsystems[0] &&
510 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
511 /* TODO: we could also check if the lists really match completely,
512 * just to have an additional sanity check */
513 h = meta_data->hierarchies[k];
b98f7d6e 514 break;
33ad9f1a 515 }
b98f7d6e 516 }
33ad9f1a
CS
517 lxc_free_array((void **)subsystems, free);
518
519 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
520 if (r < 0)
b653309a 521 goto out;
33ad9f1a
CS
522
523 /* create mount point object */
524 mount_point = calloc(1, sizeof(*mount_point));
525 if (!mount_point)
b653309a 526 goto out;
33ad9f1a
CS
527
528 meta_data->mount_points[mount_point_count++] = mount_point;
529
530 mount_point->hierarchy = h;
fcca16bc 531 if (is_lxcfs || is_cgns)
836514a8
U
532 mount_point->mount_prefix = strdup("/");
533 else
534 mount_point->mount_prefix = strdup(tokens[3]);
33ad9f1a 535 mount_point->mount_point = strdup(tokens[4]);
33ad9f1a 536 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 537 goto out;
33ad9f1a
CS
538 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
539
540 if (!strcmp(mount_point->mount_prefix, "/")) {
541 if (mount_point->read_only) {
542 if (!h->ro_absolute_mount_point)
543 h->ro_absolute_mount_point = mount_point;
544 } else {
545 if (!h->rw_absolute_mount_point)
546 h->rw_absolute_mount_point = mount_point;
547 }
b98f7d6e 548 }
ae5c8b8e 549
33ad9f1a
CS
550 k = lxc_array_len((void **)h->all_mount_points);
551 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
552 if (r < 0)
b653309a 553 goto out;
33ad9f1a 554 h->all_mount_points[k] = mount_point;
fd4f5a56 555 }
b653309a
SH
556 bret = true;
557
558out:
b653309a 559 fclose(proc_self_mountinfo);
b653309a 560 free(tokens);
2cdafc54 561 free(line);
b653309a
SH
562 return bret;
563}
564
4fb3cba5 565static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
566{
567 bool all_kernel_subsystems = true;
568 bool all_named_subsystems = false;
569 struct cgroup_meta_data *meta_data = NULL;
570 char **kernel_subsystems = NULL;
571 int saved_errno = 0;
572
573 /* if the subsystem whitelist is not specified, include all
574 * hierarchies that contain kernel subsystems by default but
575 * no hierarchies that only contain named subsystems
576 *
577 * if it is specified, the specifier @all will select all
578 * hierarchies, @kernel will select all hierarchies with
579 * kernel subsystems and @named will select all named
580 * hierarchies
581 */
582 all_kernel_subsystems = subsystem_whitelist ?
583 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
584 true;
585 all_named_subsystems = subsystem_whitelist ?
586 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 587 true;
b653309a
SH
588
589 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
590 if (!meta_data)
591 return NULL;
592 meta_data->ref = 1;
593
594 if (!find_cgroup_subsystems(&kernel_subsystems))
595 goto out_error;
596
597 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
598 all_named_subsystems, subsystem_whitelist))
599 goto out_error;
600
601 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
602 goto out_error;
fd4f5a56 603
33ad9f1a
CS
604 /* oops, we couldn't find anything */
605 if (!meta_data->hierarchies || !meta_data->mount_points) {
606 errno = EINVAL;
607 goto out_error;
ae5c8b8e 608 }
fd4f5a56 609
3a0abb3a 610 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
611 return meta_data;
612
613out_error:
614 saved_errno = errno;
33ad9f1a
CS
615 lxc_free_array((void **)kernel_subsystems, free);
616 lxc_cgroup_put_meta(meta_data);
617 errno = saved_errno;
618 return NULL;
fd4f5a56
DL
619}
620
4fb3cba5 621static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 622{
33ad9f1a
CS
623 meta_data->ref++;
624 return meta_data;
625}
e14f67a7 626
4fb3cba5 627static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
628{
629 size_t i;
630 if (!meta_data)
631 return NULL;
632 if (--meta_data->ref > 0)
633 return meta_data;
634 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
2446c321 635 if (meta_data->hierarchies)
33ad9f1a 636 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
2446c321
CB
637 if (meta_data->hierarchies[i])
638 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
33ad9f1a 639 free(meta_data->hierarchies);
178938fe 640 free(meta_data);
33ad9f1a 641 return NULL;
e14f67a7
U
642}
643
4fb3cba5 644static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 645{
33ad9f1a
CS
646 size_t i;
647 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
648 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
649 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
650 return h;
e14f67a7 651 }
e14f67a7
U
652 return NULL;
653}
654
d3f99e96
SH
655static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
656{
657 return mp && access(mp->mount_point, F_OK) == 0;
658}
659
4fb3cba5 660static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 661{
33ad9f1a
CS
662 struct cgroup_mount_point **mps;
663 struct cgroup_mount_point *current_result = NULL;
664 ssize_t quality = -1;
b98f7d6e 665
33ad9f1a 666 /* trivial case */
d3f99e96 667 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
33ad9f1a 668 return hierarchy->rw_absolute_mount_point;
d3f99e96 669 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
33ad9f1a 670 return hierarchy->ro_absolute_mount_point;
b98f7d6e 671
33ad9f1a
CS
672 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
673 struct cgroup_mount_point *mp = *mps;
674 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 675
33ad9f1a
CS
676 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
677 prefix_len = 0;
b98f7d6e 678
d3f99e96
SH
679 if (!mountpoint_is_accessible(mp))
680 continue;
681
33ad9f1a
CS
682 if (should_be_writable && mp->read_only)
683 continue;
684
685 if (!prefix_len ||
686 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
687 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
688 /* search for the best quality match, i.e. the match with the
689 * shortest prefix where this group is still contained
690 */
691 if (quality == -1 || prefix_len < quality) {
692 current_result = mp;
693 quality = prefix_len;
694 }
b98f7d6e
SH
695 }
696 }
697
33ad9f1a
CS
698 if (!current_result)
699 errno = ENOENT;
700 return current_result;
b98f7d6e
SH
701}
702
/*
 * lxc_cgroup_find_abs_path: resolve the absolute filesystem path of the
 * cgroup @group in the hierarchy that carries @subsystem
 *
 * @subsystem:          subsystem identifying the hierarchy
 * @group:              cgroup path inside the hierarchy
 * @should_be_writable: only accept a writable mount point
 * @suffix:             passed on to cgroup_to_absolute_path()
 *
 * The metadata is loaded for the duration of the call only.  Returns the
 * string produced by cgroup_to_absolute_path(), or NULL on failure with
 * the errno of the failing step preserved.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_hierarchy *h;
	struct cgroup_mount_point *mp;
	char *abs_path = NULL;
	int err;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		return NULL;

	h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
	if (h) {
		mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
		if (mp)
			abs_path = cgroup_to_absolute_path(mp, group, suffix);
	}

	if (abs_path) {
		lxc_cgroup_put_meta(meta_data);
		return abs_path;
	}

	/* releasing the metadata must not hide why the lookup failed */
	err = errno;
	lxc_cgroup_put_meta(meta_data);
	errno = err;
	return NULL;
}
736
/*
 * lxc_cgroup_process_info_get: read the cgroup membership of process
 * @pid from /proc/<pid>/cgroup via lxc_cgroup_process_info_getx().
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
743
/*
 * lxc_cgroup_process_info_get_init: cgroup membership of pid 1.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 748
/*
 * lxc_cgroup_process_info_get_self: cgroup membership of the calling
 * process.  /proc/self/cgroup is tried first; if that yields nothing,
 * the lookup is repeated through the numeric pid.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *info;

	info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (info)
		return info;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
ae5c8b8e 757
692ba18f
SH
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid.  Rename that directory so
 * that it is named after the container instead.
 *
 * @mountpath: where the hierarchy is mounted, e.g. /sys/fs/cgroup
 * @oldname:   cgroup that contains the pid-named cgroup, e.g. /lxc
 * @pid:       pid the existing cgroup directory is named after
 * @name:      container name, e.g. c1
 *
 * With the examples above, /sys/fs/cgroup/lxc/<pid> is renamed to
 * /sys/fs/cgroup/lxc/c1 and the string "/lxc/c1" is returned.  An already
 * existing target directory is removed first; if that fails the rename is
 * not attempted.
 *
 * Returns the new cgroup path relative to the mount point (malloc()ed, to
 * be freed by the caller), or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = /ab/c1
	 */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* %lu matches the unsigned long the pid is converted to */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	/* an existing target can only be replaced if it can be removed */
	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
825
ea439aac
SH
826static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
827{
828 char **p;
829
830 for (p = h->subsystems; *p; p++) {
831 if (is_crucial_cgroup_subsystem(*p))
832 return true;
833 }
834 return false;
835}
836
33ad9f1a 837/* create a new cgroup */
4fb3cba5 838static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 839{
001b026e 840 char **cgroup_path_components = NULL;
33ad9f1a
CS
841 char **p = NULL;
842 char *path_so_far = NULL;
843 char **new_cgroup_paths = NULL;
844 char **new_cgroup_paths_sub = NULL;
845 struct cgroup_mount_point *mp;
846 struct cgroup_hierarchy *h;
847 struct cgroup_process_info *base_info = NULL;
848 struct cgroup_process_info *info_ptr;
849 int saved_errno;
850 int r;
851 unsigned suffix = 0;
852 bool had_sub_pattern = false;
853 size_t i;
ae5c8b8e 854
33ad9f1a
CS
855 if (!is_valid_cgroup(name)) {
856 ERROR("Invalid cgroup name: '%s'", name);
857 errno = EINVAL;
858 return NULL;
ae5c8b8e
SH
859 }
860
33ad9f1a
CS
861 if (!strstr(path_pattern, "%n")) {
862 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
863 errno = EINVAL;
864 return NULL;
865 }
fd37327f 866
33ad9f1a
CS
867 /* we will modify the result of this operation directly,
868 * so we don't have to copy the data structure
869 */
870 base_info = (path_pattern[0] == '/') ?
871 lxc_cgroup_process_info_get_init(meta_data) :
872 lxc_cgroup_process_info_get_self(meta_data);
873 if (!base_info)
874 return NULL;
c8f7c563 875
33ad9f1a
CS
876 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
877 if (!new_cgroup_paths)
878 goto out_initial_error;
879
880 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
881 if (!new_cgroup_paths_sub)
882 goto out_initial_error;
883
884 /* find mount points we can use */
885 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
886 h = info_ptr->hierarchy;
887 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
888 if (!mp) {
889 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
890 goto out_initial_error;
891 }
892 info_ptr->designated_mount_point = mp;
460a1cf0 893
692ba18f
SH
894 if (lxc_string_in_array("ns", (const char **)h->subsystems))
895 continue;
2edb53c7
SH
896 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
897 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 898 goto out_initial_error;
2edb53c7 899 }
33ad9f1a 900 }
b98f7d6e 901
33ad9f1a
CS
902 /* normalize the path */
903 cgroup_path_components = lxc_normalize_path(path_pattern);
904 if (!cgroup_path_components)
905 goto out_initial_error;
906
907 /* go through the path components to see if we can create them */
908 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
909 /* we only want to create the same component with -1, -2, etc.
910 * if the component contains the container name itself, otherwise
911 * it's not an error if it already exists
912 */
913 char *p_eff = *p ? *p : (char *)sub_pattern;
914 bool contains_name = strstr(p_eff, "%n");
915 char *current_component = NULL;
916 char *current_subpath = NULL;
917 char *current_entire_path = NULL;
918 char *parts[3];
919 size_t j = 0;
920 i = 0;
921
922 /* if we are processing the subpattern, we want to make sure
923 * loop is ended the next time around
924 */
925 if (!*p) {
926 had_sub_pattern = true;
927 p--;
928 }
b98f7d6e 929
33ad9f1a 930 goto find_name_on_this_level;
4fb3cba5 931
33ad9f1a
CS
932 cleanup_name_on_this_level:
933 /* This is reached if we found a name clash.
934 * In that case, remove the cgroup from all previous hierarchies
935 */
936 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
77afbedf
SH
937 if (info_ptr->created_paths_count < 1)
938 continue;
6a9e0f26 939 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
33ad9f1a
CS
940 if (r < 0)
941 WARN("could not clean up cgroup we created when trying to create container");
942 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
943 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
944 }
945 if (current_component != current_subpath)
946 free(current_subpath);
947 if (current_component != p_eff)
948 free(current_component);
949 current_component = current_subpath = NULL;
950 /* try again with another suffix */
951 ++suffix;
4fb3cba5 952
33ad9f1a
CS
953 find_name_on_this_level:
954 /* determine name of the path component we should create */
955 if (contains_name && suffix > 0) {
956 char *buf = calloc(strlen(name) + 32, 1);
957 if (!buf)
958 goto out_initial_error;
959 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
960 current_component = lxc_string_replace("%n", buf, p_eff);
961 free(buf);
962 } else {
963 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
964 }
965 parts[0] = path_so_far;
966 parts[1] = current_component;
967 parts[2] = NULL;
968 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
969
970 /* Now go through each hierarchy and try to create the
971 * corresponding cgroup
972 */
973 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
974 char *parts2[3];
692ba18f
SH
975
976 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
977 continue;
33ad9f1a
CS
978 current_entire_path = NULL;
979
980 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
981 parts2[1] = current_subpath;
982 parts2[2] = NULL;
983 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
984
985 if (!*p) {
986 /* we are processing the subpath, so only update that one */
987 free(new_cgroup_paths_sub[i]);
988 new_cgroup_paths_sub[i] = strdup(current_entire_path);
989 if (!new_cgroup_paths_sub[i])
990 goto cleanup_from_error;
991 } else {
992 /* remember which path was used on this controller */
993 free(new_cgroup_paths[i]);
994 new_cgroup_paths[i] = strdup(current_entire_path);
995 if (!new_cgroup_paths[i])
996 goto cleanup_from_error;
997 }
fd4f5a56 998
33ad9f1a
CS
999 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1000 if (r < 0 && errno == EEXIST && contains_name) {
1001 /* name clash => try new name with new suffix */
1002 free(current_entire_path);
1003 current_entire_path = NULL;
1004 goto cleanup_name_on_this_level;
1005 } else if (r < 0 && errno != EEXIST) {
ea439aac
SH
1006 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1007 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1008 goto cleanup_from_error;
1009 }
1010 goto skip;
33ad9f1a
CS
1011 } else if (r == 0) {
1012 /* successfully created */
1013 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1014 if (r < 0)
1015 goto cleanup_from_error;
d703c2b1 1016 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 1017 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
1018 goto cleanup_from_error;
1019 }
33ad9f1a
CS
1020 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1021 } else {
1022 /* if we didn't create the cgroup, then we have to make sure that
1023 * further cgroups will be created properly
1024 */
d703c2b1 1025 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 1026 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 1027 goto cleanup_from_error;
f6ac3b9e 1028 }
d703c2b1
RV
1029 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1030 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1031 goto cleanup_from_error;
1032 }
33ad9f1a 1033
ea439aac 1034skip:
33ad9f1a
CS
1035 /* already existed but path component of pattern didn't contain '%n',
1036 * so this is not an error; but then we don't need current_entire_path
1037 * anymore...
1038 */
1039 free(current_entire_path);
1040 current_entire_path = NULL;
1041 }
1042 }
fd4f5a56 1043
33ad9f1a
CS
1044 /* save path so far */
1045 free(path_so_far);
1046 path_so_far = strdup(current_subpath);
1047 if (!path_so_far)
1048 goto cleanup_from_error;
1049
1050 /* cleanup */
1051 if (current_component != current_subpath)
1052 free(current_subpath);
1053 if (current_component != p_eff)
1054 free(current_component);
1055 current_component = current_subpath = NULL;
1056 continue;
4fb3cba5 1057
33ad9f1a 1058 cleanup_from_error:
ec64264d 1059 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1060 * do some additional cleanup here
1061 */
1062 saved_errno = errno;
1063 if (current_component != current_subpath)
1064 free(current_subpath);
1065 if (current_component != p_eff)
1066 free(current_component);
1067 free(current_entire_path);
1068 errno = saved_errno;
1069 goto out_initial_error;
fd4f5a56
DL
1070 }
1071
33ad9f1a
CS
1072 /* we're done, now update the paths */
1073 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
47d8fb3b
CS
1074 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1075 * will take care of it
1076 * Since we do a continue in above loop, new_cgroup_paths[i] is
1077 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1078 */
47d8fb3b
CS
1079 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1080 continue;
1081 free(info_ptr->cgroup_path);
1082 info_ptr->cgroup_path = new_cgroup_paths[i];
1083 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1084 }
33ad9f1a
CS
1085 /* don't use lxc_free_array since we used the array members
1086 * to store them in our result...
1087 */
1088 free(new_cgroup_paths);
1089 free(new_cgroup_paths_sub);
1090 free(path_so_far);
1091 lxc_free_array((void **)cgroup_path_components, free);
1092 return base_info;
1093
1094out_initial_error:
1095 saved_errno = errno;
1096 free(path_so_far);
6a9e0f26 1097 lxc_cgroup_process_info_free_and_remove(base_info, NULL);
33ad9f1a
CS
1098 lxc_free_array((void **)new_cgroup_paths, free);
1099 lxc_free_array((void **)new_cgroup_paths_sub, free);
1100 lxc_free_array((void **)cgroup_path_components, free);
1101 errno = saved_errno;
1102 return NULL;
c8f7c563
CS
1103}
1104
4fb3cba5 1105static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1106{
1107 struct cgroup_process_info *info_ptr;
1108 int r;
1109
1110 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1111 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1112 continue;
1113 /*
1114 * For any path which has ns cgroup mounted, handler->pid is already
1115 * moved into a container called '%d % (handler->pid)'. Rename it to
1116 * the cgroup name and record that.
1117 */
1118 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1119 info_ptr->cgroup_path, pid, name);
1120 if (!tmp)
1121 return -1;
1122 free(info_ptr->cgroup_path);
1123 info_ptr->cgroup_path = tmp;
1124 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1125 if (r < 0)
1126 return -1;
1127 tmp = strdup(tmp);
1128 if (!tmp)
1129 return -1;
1130 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1131 }
1132 return 0;
1133}
1134
33ad9f1a 1135/* get the cgroup membership of a given container */
4fb3cba5 1136static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1137{
33ad9f1a
CS
1138 struct cgroup_process_info *result = NULL;
1139 int saved_errno = 0;
1140 size_t i;
1141 struct cgroup_process_info **cptr = &result;
1142 struct cgroup_process_info *entry = NULL;
1143 char *path = NULL;
1144
1145 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1146 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1147 if (!h || !h->used)
1148 continue;
c8f7c563 1149
33ad9f1a
CS
1150 /* use the command interface to look for the cgroup */
1151 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1152 if (!path) {
1153 h->used = false;
c661b0a8
DE
1154 continue;
1155 }
33ad9f1a
CS
1156
1157 entry = calloc(1, sizeof(struct cgroup_process_info));
1158 if (!entry)
1159 goto out_error;
1160 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1161 entry->hierarchy = h;
1162 entry->cgroup_path = path;
1163 path = NULL;
1164
1165 /* it is not an error if we don't find anything here,
1166 * it is up to the caller to decide what to do in that
1167 * case */
1168 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1169
1170 *cptr = entry;
1171 cptr = &entry->next;
1172 entry = NULL;
c8f7c563
CS
1173 }
1174
33ad9f1a
CS
1175 return result;
1176out_error:
1177 saved_errno = errno;
1178 free(path);
1179 lxc_cgroup_process_info_free(result);
1180 lxc_cgroup_process_info_free(entry);
1181 errno = saved_errno;
1182 return NULL;
fd4f5a56
DL
1183}
1184
33ad9f1a 1185/* move a processs to the cgroups specified by the membership */
4fb3cba5 1186static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1187{
33ad9f1a
CS
1188 char pid_buf[32];
1189 char *cgroup_tasks_fn;
1190 int r;
1191 struct cgroup_process_info *info_ptr;
1192
1193 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1194 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1195 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1196 info_ptr->cgroup_path_sub :
1197 info_ptr->cgroup_path;
1198
1199 if (!info_ptr->designated_mount_point) {
1200 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1201 if (!info_ptr->designated_mount_point) {
1202 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1203 return -1;
1204 }
1205 }
4f17323e 1206
33ad9f1a
CS
1207 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1208 if (!cgroup_tasks_fn) {
1209 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1210 return -1;
1211 }
4f17323e 1212
33ad9f1a 1213 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1214 free(cgroup_tasks_fn);
ea439aac 1215 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
33ad9f1a
CS
1216 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1217 return -1;
1218 }
4f17323e
CS
1219 }
1220
33ad9f1a 1221 return 0;
4f17323e
CS
1222}
1223
33ad9f1a
CS
1224/* free process membership information */
1225void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1226{
33ad9f1a
CS
1227 struct cgroup_process_info *next;
1228 if (!info)
b98f7d6e 1229 return;
33ad9f1a
CS
1230 next = info->next;
1231 lxc_cgroup_put_meta(info->meta_ref);
1232 free(info->cgroup_path);
1233 free(info->cgroup_path_sub);
1234 lxc_free_array((void **)info->created_paths, free);
1235 free(info);
1236 lxc_cgroup_process_info_free(next);
fc7de561
SH
1237}
1238
33ad9f1a 1239/* free process membership information and remove cgroups that were created */
6a9e0f26 1240void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
b98f7d6e 1241{
33ad9f1a
CS
1242 struct cgroup_process_info *next;
1243 char **pp;
1244 if (!info)
1245 return;
1246 next = info->next;
603c64c2 1247 {
33ad9f1a
CS
1248 struct cgroup_mount_point *mp = info->designated_mount_point;
1249 if (!mp)
1250 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1251 if (mp)
1252 /* ignore return value here, perhaps we created the
1253 * '/lxc' cgroup in this container but another container
1254 * is still running (for example)
1255 */
6a9e0f26 1256 (void)remove_cgroup(mp, info->cgroup_path, true, conf);
603c64c2
SH
1257 }
1258 for (pp = info->created_paths; pp && *pp; pp++);
1259 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1260 free(*pp);
b98f7d6e 1261 }
33ad9f1a
CS
1262 free(info->created_paths);
1263 lxc_cgroup_put_meta(info->meta_ref);
1264 free(info->cgroup_path);
1265 free(info->cgroup_path_sub);
1266 free(info);
6a9e0f26 1267 lxc_cgroup_process_info_free_and_remove(next, conf);
33ad9f1a 1268}
b98f7d6e 1269
4fb3cba5 1270static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1271{
d4ef7c50
SH
1272 struct cgroup_process_info *info = d->info;
1273 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1274 if (!info)
1275 return NULL;
f348e47c 1276 prune_init_scope(info->cgroup_path);
33ad9f1a 1277 return info->cgroup_path;
b98f7d6e
SH
1278}
1279
4fb3cba5 1280static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1281{
d4ef7c50 1282 struct cgroup_process_info *info = d->info;
33ad9f1a 1283 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1284
1285 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1286 if (!info)
1287 return NULL;
1288 if (info->designated_mount_point) {
8900b9eb 1289 mp = info->designated_mount_point;
33ad9f1a
CS
1290 } else {
1291 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1292 if (!mp)
1293 return NULL;
b98f7d6e 1294 }
33ad9f1a 1295 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1296}
55c76589 1297
4fb3cba5 1298static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1299{
33ad9f1a
CS
1300 struct cgroup_meta_data *meta;
1301 struct cgroup_process_info *base_info, *info;
1302 struct cgroup_mount_point *mp;
1303 char *result = NULL;
33ad9f1a
CS
1304
1305 meta = lxc_cgroup_load_meta();
1306 if (!meta)
9a93d992 1307 return NULL;
33ad9f1a
CS
1308 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1309 if (!base_info)
178938fe 1310 goto out1;
33ad9f1a
CS
1311 info = find_info_for_subsystem(base_info, subsystem);
1312 if (!info)
178938fe 1313 goto out2;
33ad9f1a 1314 if (info->designated_mount_point) {
8900b9eb 1315 mp = info->designated_mount_point;
33ad9f1a
CS
1316 } else {
1317 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1318 if (!mp)
178938fe 1319 goto out3;
33ad9f1a
CS
1320 }
1321 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1322out3:
178938fe 1323out2:
33ad9f1a 1324 lxc_cgroup_process_info_free(base_info);
178938fe 1325out1:
33ad9f1a 1326 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1327 return result;
1328}
9a93d992 1329
/*
 * Set a cgroup value using the membership list stored in d.
 *
 * The subsystem name is the part of `filename` before the first '.' (or the
 * whole name when there is no dot). errno is preset to ENOENT before the
 * cgroup directory is looked up.
 *
 * Returns the result of do_cgroup_set(), or -1 if no cgroup directory could
 * be determined. errno as left by do_cgroup_set() is preserved.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t subsys_len = strcspn(filename, ".");
	char *subsys = alloca(subsys_len + 1);
	char *cgroup_dir;
	int rc = -1;

	memcpy(subsys, filename, subsys_len);
	subsys[subsys_len] = '\0';

	errno = ENOENT;
	cgroup_dir = lxc_cgroup_get_hierarchy_abs_path_data(subsys, d);
	if (cgroup_dir != NULL) {
		int saved_errno;

		rc = do_cgroup_set(cgroup_dir, filename, value);
		saved_errno = errno;
		free(cgroup_dir);
		errno = saved_errno;
	}

	return rc;
}
9a93d992 1350
/*
 * Set a cgroup value for the container `name` under `lxcpath`.
 *
 * @filename: cgroup file name such as "<subsystem>.<key>"; the part before
 *            the first '.' selects the subsystem
 * @value:    value handed to do_cgroup_set()
 *
 * Returns the result of do_cgroup_set(), or -1 if the cgroup directory for
 * the subsystem could not be determined. errno as left by do_cgroup_set()
 * is preserved across the cleanup, matching lxc_cgroup_set_data().
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *subsystem = NULL, *p, *path;
	int ret = -1;

	subsystem = alloca(strlen(filename) + 1);
	strcpy(subsystem, filename);
	if ((p = strchr(subsystem, '.')) != NULL)
		*p = '\0';

	path = lxc_cgroup_get_hierarchy_abs_path(subsystem, name, lxcpath);
	if (path) {
		int saved_errno;

		ret = do_cgroup_set(path, filename, value);
		/* free() may clobber errno; keep the one callers care about */
		saved_errno = errno;
		free(path);
		errno = saved_errno;
	}
	return ret;
}
1368
/*
 * Read a cgroup value of the container `name` under `lxcpath`.
 *
 * @filename: cgroup file name such as "<subsystem>.<key>"; the part before
 *            the first '.' selects the subsystem
 * @value:    output buffer handed to do_cgroup_get()
 * @len:      size of the output buffer
 *
 * Returns the result of do_cgroup_get(), or -1 if the cgroup directory for
 * the subsystem could not be determined. errno as left by do_cgroup_get()
 * is preserved across the cleanup, matching lxc_cgroup_set_data().
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *subsystem = NULL, *p, *path;
	int ret = -1;

	subsystem = alloca(strlen(filename) + 1);
	strcpy(subsystem, filename);
	if ((p = strchr(subsystem, '.')) != NULL)
		*p = '\0';

	path = lxc_cgroup_get_hierarchy_abs_path(subsystem, name, lxcpath);
	if (path) {
		int saved_errno;

		ret = do_cgroup_get(path, filename, value, len);
		/* free() may clobber errno; keep the one callers care about */
		saved_errno = errno;
		free(path);
		errno = saved_errno;
	}
	return ret;
}
1386
4fb3cba5 1387static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1388{
1389 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1390 char *path = NULL;
1391 char **parts = NULL;
1392 char *dirname = NULL;
1393 char *abs_path = NULL;
1394 char *abs_path2 = NULL;
d4ef7c50
SH
1395 struct cgfs_data *cgfs_d;
1396 struct cgroup_process_info *info, *base_info;
aae1f3c4
CS
1397 int r, saved_errno = 0;
1398
4608594e
SH
1399 if (cgns_supported())
1400 return true;
1401
4fb3cba5
DE
1402 cgfs_d = hdata;
1403 if (!cgfs_d)
1404 return false;
d4ef7c50
SH
1405 base_info = cgfs_d->info;
1406
0769b82a
CS
1407 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1408 * have access to the lxc_conf object at this point. It really should be up
1409 * to the caller to fix this, but this doesn't really hurt.
1410 */
1411 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1412 type = LXC_AUTO_CGROUP_FULL_MIXED;
1413 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1414 type = LXC_AUTO_CGROUP_MIXED;
1415
7997d7da
CS
1416 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1417 ERROR("could not mount cgroups into container: invalid type specified internally");
1418 errno = EINVAL;
c476bdce 1419 return false;
7997d7da
CS
1420 }
1421
aae1f3c4
CS
1422 path = calloc(1, bufsz);
1423 if (!path)
c476bdce 1424 return false;
aae1f3c4 1425 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1426 r = safe_mount("cgroup_root", path, "tmpfs",
1427 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1428 "size=10240k,mode=755",
1429 root);
aae1f3c4
CS
1430 if (r < 0) {
1431 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1432 return false;
aae1f3c4
CS
1433 }
1434
1435 /* now mount all the hierarchies we care about */
1436 for (info = base_info; info; info = info->next) {
1437 size_t subsystem_count, i;
1438 struct cgroup_mount_point *mp = info->designated_mount_point;
d3f99e96 1439 if (!mountpoint_is_accessible(mp))
aae1f3c4 1440 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
d3f99e96 1441
aae1f3c4
CS
1442 if (!mp) {
1443 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1444 goto out_error;
1445 }
1446
1447 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1448 parts = calloc(subsystem_count + 1, sizeof(char *));
1449 if (!parts)
1450 goto out_error;
1451
1452 for (i = 0; i < subsystem_count; i++) {
1453 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1454 parts[i] = info->hierarchy->subsystems[i] + 5;
1455 else
1456 parts[i] = info->hierarchy->subsystems[i];
1457 }
1458 dirname = lxc_string_join(",", (const char **)parts, false);
1459 if (!dirname)
1460 goto out_error;
1461
1462 /* create subsystem directory */
1463 abs_path = lxc_append_paths(path, dirname);
1464 if (!abs_path)
1465 goto out_error;
1466 r = mkdir_p(abs_path, 0755);
1467 if (r < 0 && errno != EEXIST) {
1468 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1469 goto out_error;
1470 }
1471
aae1f3c4
CS
1472 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1473 if (!abs_path2)
1474 goto out_error;
aae1f3c4 1475
7997d7da
CS
1476 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1477 /* bind-mount the cgroup entire filesystem there */
1478 if (strcmp(mp->mount_prefix, "/") != 0) {
1479 /* FIXME: maybe we should just try to remount the entire hierarchy
1480 * with a regular mount command? may that works? */
1481 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1482 goto out_error;
1483 }
1484 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1485 if (r < 0) {
1486 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1487 goto out_error;
1488 }
f8f3c3c0
SG
1489 /* main cgroup path should be read-only */
1490 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1491 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1492 if (r < 0) {
1493 SYSERROR("error re-mounting %s readonly", abs_path);
1494 goto out_error;
1495 }
1496 }
7997d7da
CS
1497 /* own cgroup should be read-write */
1498 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1499 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1500 if (r < 0) {
1501 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1502 goto out_error;
1503 }
1504 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1505 if (r < 0) {
1506 SYSERROR("error re-mounting %s readwrite", abs_path2);
1507 goto out_error;
1508 }
1509 }
1510 } else {
1511 /* create path for container's cgroup */
1512 r = mkdir_p(abs_path2, 0755);
1513 if (r < 0 && errno != EEXIST) {
1514 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1515 goto out_error;
1516 }
aae1f3c4 1517
b46f0553
CS
1518 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1519 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1520 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1521 * read-write (see comment below)
1522 */
1523 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1524 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1525 if (r < 0) {
1526 SYSERROR("error bind-mounting %s onto itself", abs_path);
1527 goto out_error;
1528 }
1529 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1530 if (r < 0) {
1531 SYSERROR("error re-mounting %s readonly", abs_path);
1532 goto out_error;
1533 }
1534 }
1535
7997d7da
CS
1536 free(abs_path);
1537 abs_path = NULL;
1538
1539 /* bind-mount container's cgroup to that directory */
1540 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1541 if (!abs_path)
1542 goto out_error;
1543 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
ea439aac 1544 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
7997d7da
CS
1545 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1546 goto out_error;
1547 }
1548 if (type == LXC_AUTO_CGROUP_RO) {
1549 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1550 if (r < 0) {
1551 SYSERROR("error re-mounting %s readonly", abs_path2);
1552 goto out_error;
1553 }
1554 }
aae1f3c4
CS
1555 }
1556
1557 free(abs_path);
1558 free(abs_path2);
1559 abs_path = NULL;
1560 abs_path2 = NULL;
1561
1562 /* add symlinks for every single subsystem */
1563 if (subsystem_count > 1) {
1564 for (i = 0; i < subsystem_count; i++) {
1565 abs_path = lxc_append_paths(path, parts[i]);
1566 if (!abs_path)
1567 goto out_error;
1568 r = symlink(dirname, abs_path);
1569 if (r < 0)
1570 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1571 free(abs_path);
1572 abs_path = NULL;
1573 }
1574 }
1575 free(dirname);
1576 free(parts);
1577 dirname = NULL;
1578 parts = NULL;
1579 }
1580
b46f0553
CS
1581 /* We used to remount the entire tmpfs readonly if any :ro or
1582 * :mixed mode was specified. However, Ubuntu's mountall has the
1583 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1584 * mounted read-only and cannot be remounted read-write.
1585 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1586 * these if they are not already mounted with the right options;
1587 * it contains an entry for /sys/fs/cgroup. In case it can't do
1588 * that, it prompts for the user to either manually fix it or
1589 * boot anyway. But without user input, booting of the container
1590 * hangs.)
1591 *
1592 * Instead of remounting the entire tmpfs readonly, we only
1593 * remount the paths readonly that are part of the cgroup
1594 * hierarchy.
f8f3c3c0 1595 */
f8f3c3c0 1596
aae1f3c4
CS
1597 free(path);
1598
c476bdce 1599 return true;
aae1f3c4
CS
1600
1601out_error:
1602 saved_errno = errno;
1603 free(path);
1604 free(dirname);
1605 free(parts);
1606 free(abs_path);
1607 free(abs_path2);
1608 errno = saved_errno;
c476bdce 1609 return false;
aae1f3c4
CS
1610}
1611
4fb3cba5 1612static int cgfs_nrtasks(void *hdata)
33ad9f1a 1613{
4fb3cba5
DE
1614 struct cgfs_data *d = hdata;
1615 struct cgroup_process_info *info;
33ad9f1a
CS
1616 struct cgroup_mount_point *mp = NULL;
1617 char *abs_path = NULL;
1618 int ret;
460a1cf0 1619
4fb3cba5
DE
1620 if (!d) {
1621 errno = ENOENT;
1622 return -1;
1623 }
1624
1625 info = d->info;
33ad9f1a
CS
1626 if (!info) {
1627 errno = ENOENT;
1628 return -1;
b98f7d6e 1629 }
c8f7c563 1630
33ad9f1a 1631 if (info->designated_mount_point) {
8900b9eb 1632 mp = info->designated_mount_point;
33ad9f1a
CS
1633 } else {
1634 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1635 if (!mp)
1636 return -1;
c8f7c563
CS
1637 }
1638
33ad9f1a
CS
1639 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1640 if (!abs_path)
1641 return -1;
1642
1643 ret = cgroup_recursive_task_count(abs_path);
1644 free(abs_path);
1645 return ret;
c8f7c563
CS
1646}
1647
574c4428
QH
1648static struct cgroup_process_info *
1649lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1650 struct cgroup_meta_data *meta)
d08ba6ec 1651{
33ad9f1a
CS
1652 struct cgroup_process_info *result = NULL;
1653 FILE *proc_pid_cgroup = NULL;
1654 char *line = NULL;
1655 size_t sz = 0;
1656 int saved_errno = 0;
1657 struct cgroup_process_info **cptr = &result;
1658 struct cgroup_process_info *entry = NULL;
1659
1660 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1661 if (!proc_pid_cgroup)
b98f7d6e 1662 return NULL;
1ac470c0 1663
33ad9f1a
CS
1664 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1665 /* file format: hierarchy:subsystems:group */
1666 char *colon1;
1667 char *colon2;
1668 char *endptr;
1669 int hierarchy_number;
1670 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1671
33ad9f1a 1672 if (!line[0])
ae5c8b8e 1673 continue;
b98f7d6e 1674
33ad9f1a
CS
1675 if (line[strlen(line) - 1] == '\n')
1676 line[strlen(line) - 1] = '\0';
1677
1678 colon1 = strchr(line, ':');
1679 if (!colon1)
8900b9eb 1680 continue;
33ad9f1a
CS
1681 *colon1++ = '\0';
1682 colon2 = strchr(colon1, ':');
1683 if (!colon2)
ae5c8b8e 1684 continue;
33ad9f1a 1685 *colon2++ = '\0';
e4659536 1686
33ad9f1a
CS
1687 endptr = NULL;
1688 hierarchy_number = strtoul(line, &endptr, 10);
1689 if (!endptr || *endptr)
9a93d992 1690 continue;
9a93d992 1691
33ad9f1a
CS
1692 if (hierarchy_number > meta->maximum_hierarchy) {
1693 /* we encountered a hierarchy we didn't have before,
1694 * so probably somebody remounted some stuff in the
1695 * mean time...
1696 */
1697 errno = EAGAIN;
1698 goto out_error;
b98f7d6e 1699 }
33ad9f1a
CS
1700
1701 h = meta->hierarchies[hierarchy_number];
1702 if (!h) {
1703 /* we encountered a hierarchy that was thought to be
1704 * dead before, so probably somebody remounted some
1705 * stuff in the mean time...
1706 */
1707 errno = EAGAIN;
1708 goto out_error;
b98f7d6e 1709 }
33ad9f1a
CS
1710
1711 /* we are told that we should ignore this hierarchy */
1712 if (!h->used)
b98f7d6e 1713 continue;
5193cc3d 1714
33ad9f1a
CS
1715 entry = calloc(1, sizeof(struct cgroup_process_info));
1716 if (!entry)
1717 goto out_error;
fd4f5a56 1718
33ad9f1a
CS
1719 entry->meta_ref = lxc_cgroup_get_meta(meta);
1720 entry->hierarchy = h;
1721 entry->cgroup_path = strdup(colon2);
1722 if (!entry->cgroup_path)
1723 goto out_error;
3939a22a 1724 prune_init_scope(entry->cgroup_path);
d08ba6ec 1725
33ad9f1a
CS
1726 *cptr = entry;
1727 cptr = &entry->next;
1728 entry = NULL;
b98f7d6e 1729 }
b98f7d6e 1730
33ad9f1a
CS
1731 fclose(proc_pid_cgroup);
1732 free(line);
1733 return result;
1734
1735out_error:
1736 saved_errno = errno;
1737 if (proc_pid_cgroup)
1738 fclose(proc_pid_cgroup);
1739 lxc_cgroup_process_info_free(result);
1740 lxc_cgroup_process_info_free(entry);
1741 free(line);
1742 errno = saved_errno;
ae5c8b8e 1743 return NULL;
36b86299
DL
1744}
1745
/*
 * Turn a comma separated mount option string into a NULL-terminated array
 * of newly allocated subsystem names.
 *
 * A token is copied verbatim when it already starts with "name=" or when it
 * appears in kernel_list; every other token is stored with a "name=" prefix
 * (e.g. 'systemd' becomes 'name=systemd').
 *
 * Returns the array, NULL when the option string contains no token, or NULL
 * with errno preserved when an allocation fails.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **names = NULL;
	size_t names_capacity = 0;
	size_t names_count = 0;
	char *state = NULL;
	char *options_copy;
	char *tok;
	int saved_errno;

	/* strtok_r() modifies its input, so tokenize a private copy */
	options_copy = alloca(strlen(mount_options)+1);
	strcpy(options_copy, mount_options);

	tok = strtok_r(options_copy, ",", &state);
	while (tok) {
		bool verbatim;

		if (lxc_grow_array((void ***)&names, &names_capacity, names_count + 1, 12) < 0)
			goto out_free;
		names[names_count + 1] = NULL;

		verbatim = !strncmp(tok, "name=", 5) ||
			   lxc_string_in_array(tok, (const char **)kernel_list);
		if (verbatim) {
			names[names_count] = strdup(tok);
		} else {
			/* room for "name=", the token and the terminator */
			size_t needed = strlen(tok) + 6;

			names[names_count] = malloc(needed);
			if (names[names_count])
				snprintf(names[names_count], needed, "name=%s", tok);
		}
		if (!names[names_count])
			goto out_free;

		names_count++;
		tok = strtok_r(NULL, ",", &state);
	}

	return names;

out_free:
	saved_errno = errno;
	lxc_free_array((void**)names, free);
	errno = saved_errno;
	return NULL;
}
1787
574c4428 1788static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1789{
33ad9f1a
CS
1790 if (!mp)
1791 return;
1792 free(mp->mount_point);
1793 free(mp->mount_prefix);
1794 free(mp);
bcbd102c
SH
1795}
1796
574c4428 1797static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1798{
33ad9f1a
CS
1799 if (!h)
1800 return;
2446c321
CB
1801 if (h->subsystems) {
1802 lxc_free_array((void **)h->subsystems, free);
1803 h->subsystems = NULL;
1804 }
1805 if (h->all_mount_points) {
1806 free(h->all_mount_points);
1807 h->all_mount_points = NULL;
1808 }
33ad9f1a 1809 free(h);
2446c321 1810 h = NULL;
33ad9f1a 1811}
341a9bd8 1812
/*
 * Check whether `name` may be used as a single cgroup path component.
 *
 * Rejected are the special names "." and "..", as well as any name
 * containing '/', a byte outside the printable ASCII range, or a space
 * (value 32), which is excluded because it would break legacy lxc-ls.
 */
static bool is_valid_cgroup(const char *name)
{
	const unsigned char *cursor;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (cursor = (const unsigned char *)name; *cursor != '\0'; cursor++) {
		unsigned char c = *cursor;

		if (c <= 32 || c >= 127 || c == '/')
			return false;
	}

	return true;
}
341a9bd8 1826
574c4428 1827static int create_or_remove_cgroup(bool do_remove,
6a9e0f26
SH
1828 struct cgroup_mount_point *mp, const char *path, int recurse,
1829 struct lxc_conf *conf)
33ad9f1a
CS
1830{
1831 int r, saved_errno = 0;
1832 char *buf = cgroup_to_absolute_path(mp, path, NULL);
1833 if (!buf)
1834 return -1;
341a9bd8 1835
33ad9f1a 1836 /* create or remove directory */
603c64c2 1837 if (do_remove) {
01d59fe5
CB
1838 if (!dir_exists(buf))
1839 return 0;
6a9e0f26
SH
1840 if (recurse) {
1841 if (conf && !lxc_list_empty(&conf->id_map))
1842 r = userns_exec_1(conf, rmdir_wrapper, buf);
1843 else
1844 r = cgroup_rmdir(buf);
1845 } else
603c64c2
SH
1846 r = rmdir(buf);
1847 } else
1848 r = mkdir(buf, 0777);
33ad9f1a
CS
1849 saved_errno = errno;
1850 free(buf);
1851 errno = saved_errno;
1852 return r;
341a9bd8 1853}
bcbd102c 1854
/*
 * Create the directory for cgroup `path` below mount point `mp`.
 * Thin wrapper around create_or_remove_cgroup() in "create" mode.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const bool recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, NULL);
}
1859
/*
 * Remove the directory for cgroup `path` below mount point `mp`.
 * Thin wrapper around create_or_remove_cgroup() in "remove" mode;
 * `recurse` and `conf` are forwarded unchanged.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, conf);
}
576f946d 1865
574c4428
QH
1866static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1867 const char *path, const char *suffix)
33ad9f1a
CS
1868{
1869 /* first we have to make sure we subtract the mount point's prefix */
1870 char *prefix = mp->mount_prefix;
1871 char *buf;
1872 ssize_t len, rv;
1873
1874 /* we want to make sure only absolute paths to cgroups are passed to us */
1875 if (path[0] != '/') {
1876 errno = EINVAL;
1877 return NULL;
1878 }
b98f7d6e 1879
33ad9f1a
CS
1880 if (prefix && !strcmp(prefix, "/"))
1881 prefix = NULL;
b98f7d6e 1882
33ad9f1a
CS
1883 /* prefix doesn't match */
1884 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1885 errno = EINVAL;
1886 return NULL;
1887 }
1888 /* if prefix is /foo and path is /foobar */
1889 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1890 errno = EINVAL;
1891 return NULL;
1892 }
b98f7d6e 1893
33ad9f1a
CS
1894 /* remove prefix from path */
1895 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1896
33ad9f1a
CS
1897 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1898 buf = calloc(len + 1, 1);
50266dc6
DE
1899 if (!buf)
1900 return NULL;
33ad9f1a 1901 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1902 if (rv > len) {
33ad9f1a
CS
1903 free(buf);
1904 errno = ENOMEM;
8900b9eb 1905 return NULL;
8b92dc3a 1906 }
576f946d 1907
33ad9f1a 1908 return buf;
e0f888d9 1909}
283678ed 1910
574c4428
QH
1911static struct cgroup_process_info *
1912find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1913{
33ad9f1a
CS
1914 struct cgroup_process_info *info_ptr;
1915 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1916 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1917 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1918 return info_ptr;
b98f7d6e 1919 }
33ad9f1a
CS
1920 errno = ENOENT;
1921 return NULL;
1922}
283678ed 1923
574c4428
QH
/*
 * Read the file "<cgroup_path>/<sub_filename>" into `value` (at most
 * `len` bytes) using lxc_read_from_file().
 *
 * Returns -1 if the file name cannot be built, otherwise whatever
 * lxc_read_from_file() returned; errno from the read is preserved.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[3] = { cgroup_path, sub_filename, NULL };
	char *fullpath;
	int nread, err;

	fullpath = lxc_string_join("/", components, false);
	if (fullpath == NULL)
		return -1;

	nread = lxc_read_from_file(fullpath, value, len);

	/* keep the read's errno intact across free() */
	err = errno;
	free(fullpath);
	errno = err;

	return nread;
}
b113383b 1945
574c4428
QH
/*
 * Write the NUL-terminated string `value` (without its terminator) to
 * the file "<cgroup_path>/<sub_filename>" using lxc_write_to_file().
 *
 * Returns -1 if the file name cannot be built, otherwise whatever
 * lxc_write_to_file() returned; errno from the write is preserved.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[3] = { cgroup_path, sub_filename, NULL };
	char *fullpath;
	int rc, err;

	fullpath = lxc_string_join("/", components, false);
	if (fullpath == NULL)
		return -1;

	rc = lxc_write_to_file(fullpath, value, strlen(value), false);

	/* keep the write's errno intact across free() */
	err = errno;
	free(fullpath);
	errno = err;

	return rc;
}
1967
4fb3cba5 1968static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 1969 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 1970{
365d180a 1971 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
1972 struct lxc_cgroup *cg;
1973 int ret = -1;
1974
33ad9f1a 1975 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
1976 return 0;
1977
aaf26830 1978 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
1979 if (!sorted_cgroup_settings) {
1980 return -1;
1981 }
aaf26830
KT
1982
1983 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
1984 cg = iterator->elem;
1985
33ad9f1a 1986 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 1987 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 1988 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
1989 continue;
1990 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 1991 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 1992 continue;
4fb3cba5 1993 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 1994 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
1995 WARN("Error setting %s to %s for %s",
1996 cg->subsystem, cg->value, d->name);
1997 continue;
1998 }
dddf7c5b 1999 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 2000 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
2001 goto out;
2002 }
b113383b 2003 }
b98f7d6e
SH
2004
2005 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
2006 }
2007
b98f7d6e
SH
2008 ret = 0;
2009 INFO("cgroup has been setup");
2010out:
365d180a 2011 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
2012 lxc_list_del(iterator);
2013 free(iterator);
2014 }
365d180a 2015 free(sorted_cgroup_settings);
b113383b
SH
2016 return ret;
2017}
b98f7d6e 2018
/*
 * Report whether applying the devices setting `v` would be redundant,
 * judged by the container's current devices.list.
 *
 * for_allow == true:  returns true if devices.list contains either
 *                     "a *:* rwm" or a line equal to v.
 * for_allow == false: only the deny-all forms "a" and "a *:* rwm" are
 *                     considered (anything else returns false);
 *                     returns true if devices.list does NOT contain
 *                     "a *:* rwm".
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	/*
	 * XXX FIXME if users could use something other than
	 * 'lxc.devices.deny = a'. Not sure they ever do, but they *could*;
	 * right now we assume they do NOT.
	 */
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;

	path = lxc_string_join("/", parts, false);
	/* the directory string is only needed to build path; release it on
	 * every route, not just when the join fails */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2073
/*
 * Sum the number of lines of every "tasks" file found in cgroup_path
 * and, recursively, in all of its subdirectories.
 *
 * Returns 0 if cgroup_path cannot be opened as a directory, -1 if a
 * child path cannot be built or stat()ed, otherwise the total count.
 * Negative results from nested directories or from counting a tasks
 * file are ignored rather than propagated.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (dir == NULL)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		const char *parts[3] = { cgroup_path, entry->d_name, NULL };
		struct stat st;
		char *child;
		int count = -1;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		child = lxc_string_join("/", parts, false);
		if (child == NULL) {
			closedir(dir);
			return -1;
		}

		if (stat(child, &st) < 0) {
			closedir(dir);
			free(child);
			return -1;
		}

		if (S_ISDIR(st.st_mode))
			count = cgroup_recursive_task_count(child);
		else if (strcmp(entry->d_name, "tasks") == 0)
			count = lxc_count_file_lines(child);

		/* count stays negative for entries that contribute nothing */
		if (count >= 0)
			total += count;

		free(child);
	}

	closedir(dir);
	return total;
}
2121
574c4428
QH
2122static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2123 char *cgroup_path)
b98f7d6e 2124{
33ad9f1a 2125 int r, saved_errno = 0;
7e7243e1 2126 char buf[2];
1ea59ad2 2127
934b1673
SH
2128 mp->need_cpuset_init = false;
2129
1ea59ad2
SH
2130 /* If this is the memory cgroup, we want to enforce hierarchy.
2131 * But don't fail if for some reason we can't.
2132 */
2edb53c7
SH
2133 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2134 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2135 if (cc_path) {
2136 r = lxc_read_from_file(cc_path, buf, 1);
2137 if (r < 1 || buf[0] != '1') {
2138 r = lxc_write_to_file(cc_path, "1", 1, false);
2139 if (r < 0)
a8916143 2140 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2141 }
1ea59ad2
SH
2142 free(cc_path);
2143 }
2edb53c7 2144 }
1ea59ad2 2145
33ad9f1a
CS
2146 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2147 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2148 * and cpuset.cpus and then
2149 */
2edb53c7
SH
2150 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2151 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2152 struct stat sb;
2153
33ad9f1a 2154 if (!cc_path)
2edb53c7 2155 return -1;
d703c2b1
RV
2156 /* cgroup.clone_children is not available when running under
2157 * older kernel versions; in this case, we'll initialize
2158 * cpuset.cpus and cpuset.mems later, after the new cgroup
2159 * was created
2160 */
2161 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2162 mp->need_cpuset_init = true;
d703c2b1
RV
2163 free(cc_path);
2164 return 0;
2165 }
7e7243e1
SH
2166 r = lxc_read_from_file(cc_path, buf, 1);
2167 if (r == 1 && buf[0] == '1') {
2168 free(cc_path);
2edb53c7 2169 return 0;
7e7243e1 2170 }
33ad9f1a 2171 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2172 saved_errno = errno;
2173 free(cc_path);
2174 errno = saved_errno;
2175 return r < 0 ? -1 : 0;
33ad9f1a
CS
2176 }
2177 return 0;
b98f7d6e 2178}
484ed030 2179
/*
 * Read file `fn` into buf via lxc_read_from_file() and NUL-terminate
 * the result.
 *
 * If the read filled the whole buffer, the last byte is overwritten
 * with the terminator and bufsize is still returned, so a return value
 * equal to bufsize tells the caller the content may be truncated.
 *
 * Returns the (non-negative) read result, the negative result of a
 * failed read, or -1 when called with bufsize 0 and nothing was read.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	if (nread != bufsize) {
		/* room left: terminate right after the data */
		buf[nread] = '\0';
		return nread;
	}

	if (bufsize == 0) {
		/* Callers don't do this, but regression/sanity check */
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled, obviously this wasn't empty */
	buf[bufsize - 1] = '\0';
	return nread;
}
2200
/*
 * Initialise one cpuset file of cgroup `path` from its parent cgroup.
 *
 * `name` is the file name including its leading slash (e.g.
 * "/cpuset.cpus"). If the child's file already holds a non-empty
 * value it is left alone. Otherwise the value of the same file in the
 * parent cgroup (path with its last component removed) is copied in.
 *
 * Returns true if the child already had a value or the copy succeeded,
 * false on any failure, including a parent value that does not fit
 * into the local 1024 byte buffer.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char value[1024];
	char *childfile, *parentfile = NULL;
	char *parent_cgroup, *slash;
	int ret;
	bool ok = false;

	childfile = cgroup_to_absolute_path(mp, path, name);
	if (!childfile)
		return false;

	/* don't overwrite a non-empty value in the file */
	ret = cgroup_read_from_file(childfile, value, sizeof(value));
	if (ret < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting path at its last '/' */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto out;

	slash = strrchr(parent_cgroup, '/');
	if (!slash) {
		free(parent_cgroup);
		goto out;
	}
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	/* path to the same name in the parent cgroup */
	parentfile = cgroup_to_absolute_path(mp, parent_cgroup, name);
	free(parent_cgroup);
	if (!parentfile)
		goto out;

	/* copy from parent to child cgroup */
	ret = cgroup_read_from_file(parentfile, value, sizeof(value));
	if (ret < 0)
		goto out;
	if (ret == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	ok = (lxc_write_to_file(childfile, value, strlen(value), false) >= 0);
	if (!ok)
		SYSERROR("failed writing %s", childfile);

out:
	free(parentfile);
	free(childfile);
	return ok;
}
2257
2258static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2259 const char *path)
2260{
2261 /* the files we have to handle here are only in cpuset hierarchies */
2262 if (!lxc_string_in_array("cpuset",
2263 (const char **)mp->hierarchy->subsystems))
2264 return true;
2265
b1dad6f6
RV
2266 if (!mp->need_cpuset_init)
2267 return true;
2268
d703c2b1
RV
2269 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2270 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2271}
2272
1a704014
CB
2273static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2274{
2275 int i;
2276
2277 if (!getenv("LXC_DEBUG_CGFS"))
2278 return;
2279
2280 DEBUG("Cgroup information:");
2281 DEBUG(" container name: %s", d->name);
2282 if (!d->meta || !d->meta->hierarchies) {
2283 DEBUG(" No hierarchies found.");
2284 return;
2285 }
2286 DEBUG(" Controllers:");
2287 for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2288 char **p;
2289 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2290 if (!h) {
2291 DEBUG(" Empty hierarchy number %d.", i);
2292 continue;
2293 }
2294 for (p = h->subsystems; p && *p; p++) {
2295 DEBUG(" %2d: %s", i, *p);
2296 }
2297 }
2298}
2299
4fb3cba5 2300struct cgroup_ops *cgfs_ops_init(void)
484ed030 2301{
4fb3cba5 2302 return &cgfs_ops;
d4ef7c50 2303}
484ed030 2304
4fb3cba5 2305static void *cgfs_init(const char *name)
d4ef7c50 2306{
4fb3cba5 2307 struct cgfs_data *d;
484ed030 2308
4fb3cba5
DE
2309 d = malloc(sizeof(*d));
2310 if (!d)
2311 return NULL;
484ed030 2312
4fb3cba5
DE
2313 memset(d, 0, sizeof(*d));
2314 d->name = strdup(name);
2315 if (!d->name)
2316 goto err1;
2317
5e1c5795 2318 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2319
2320 d->meta = lxc_cgroup_load_meta();
2321 if (!d->meta) {
2322 ERROR("cgroupfs failed to detect cgroup metadata");
2323 goto err2;
2324 }
1a704014
CB
2325
2326 print_cgfs_init_debuginfo(d);
2327
4fb3cba5
DE
2328 return d;
2329
2330err2:
2331 free(d->name);
2332err1:
2333 free(d);
2334 return NULL;
d4ef7c50 2335}
484ed030 2336
6a9e0f26 2337static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
d4ef7c50 2338{
4fb3cba5
DE
2339 struct cgfs_data *d = hdata;
2340
d4ef7c50
SH
2341 if (!d)
2342 return;
f10fad2f 2343 free(d->name);
6a9e0f26 2344 lxc_cgroup_process_info_free_and_remove(d->info, conf);
c55d4505 2345 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2346 free(d);
d4ef7c50 2347}
484ed030 2348
4fb3cba5 2349static inline bool cgfs_create(void *hdata)
d4ef7c50 2350{
4fb3cba5
DE
2351 struct cgfs_data *d = hdata;
2352 struct cgroup_process_info *i;
2353 struct cgroup_meta_data *md;
484ed030 2354
4fb3cba5 2355 if (!d)
d4ef7c50 2356 return false;
4fb3cba5
DE
2357 md = d->meta;
2358 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2359 if (!i)
2360 return false;
2361 d->info = i;
2362 return true;
2363}
484ed030 2364
4fb3cba5 2365static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2366{
4fb3cba5
DE
2367 struct cgfs_data *d = hdata;
2368 struct cgroup_process_info *i;
d4ef7c50 2369 int ret;
4fb3cba5
DE
2370
2371 if (!d)
2372 return false;
2373 i = d->info;
2374 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2375
d4ef7c50
SH
2376 return ret == 0;
2377}
2378
4fb3cba5 2379static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2380{
4fb3cba5
DE
2381 struct cgfs_data *d = hdata;
2382 struct cgroup_process_info *i;
2383
2384 if (!d)
2385 return false;
2386 i = d->info;
2387 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2388 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2389 return false;
484ed030 2390 }
d4ef7c50
SH
2391 return true;
2392}
484ed030 2393
/*
 * Look up the container's cgroup path for `subsystem` through
 * lxc_cgroup_get_hierarchy_path_data(). Returns NULL if hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *d = hdata;

	return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2402
ccb4cabe 2403static bool cgfs_escape(void *hdata)
06078509
TA
2404{
2405 struct cgroup_meta_data *md;
2406 int i;
2407 bool ret = false;
2408
2409 md = lxc_cgroup_load_meta();
2410 if (!md)
2411 return false;
2412
2413 for (i = 1; i <= md->maximum_hierarchy; i++) {
2414 struct cgroup_hierarchy *h = md->hierarchies[i];
2415 struct cgroup_mount_point *mp;
2416 char *tasks;
2417 FILE *f;
2418 int written;
2419
2420 if (!h) {
2421 WARN("not escaping hierarchy %d", i);
2422 continue;
2423 }
2424
2425 mp = lxc_cgroup_find_mount_point(h, "/", true);
2426 if (!mp)
2427 goto out;
2428
2429 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2430 if (!tasks)
2431 goto out;
2432
2433 f = fopen(tasks, "a");
2434 free(tasks);
2435 if (!f)
2436 goto out;
2437
2438 written = fprintf(f, "%d\n", getpid());
2439 fclose(f);
2440 if (written < 0) {
2441 SYSERROR("writing tasks failed\n");
2442 goto out;
2443 }
2444 }
2445
2446 ret = true;
2447out:
2448 lxc_cgroup_put_meta(md);
2449 return ret;
2450}
2451
36662416
TA
/* Not implemented for the cgroupfs driver: always reports -1. */
static int cgfs_num_hierarchies(void)
{
	const int not_implemented = -1;

	return not_implemented;
}
2457
/*
 * Not implemented for the cgroupfs driver: always returns false and
 * never writes to *out.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
	(void)i;
	(void)out;

	return false;
}
2463
/*
 * Thaw the container by writing "THAWED" to freezer.state of its
 * freezer cgroup.
 *
 * Returns false if hdata is NULL, if the freezer cgroup path cannot be
 * determined or resolved to an absolute path, or if the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *d = hdata;
	char *cgabspath, *cgrelpath;
	int ret;

	if (!d)
		return false;

	cgrelpath = lxc_cgroup_get_hierarchy_path_data("freezer", d);
	/* NOTE(review): presumably NULL when the container has no freezer
	 * cgroup; do not hand a NULL group on to the path lookup */
	if (!cgrelpath)
		return false;

	cgabspath = lxc_cgroup_find_abs_path("freezer", cgrelpath, true, NULL);
	if (!cgabspath)
		return false;

	ret = do_cgroup_set(cgabspath, "freezer.state", "THAWED");
	free(cgabspath);
	return ret == 0;
}
2482
4fb3cba5
DE
/*
 * Apply the cgroup settings in cgroup_conf through
 * do_setup_cgroup_limits(); with_devices is forwarded as its
 * do_devices argument. Returns false if hdata is NULL or the helper
 * does not return 0.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
				  bool with_devices)
{
	struct cgfs_data *d = hdata;

	return d != NULL &&
	       do_setup_cgroup_limits(d, cgroup_conf, with_devices) == 0;
}
2492
/*
 * Move process `pid` into the cgroups of the running container
 * identified by name and lxcpath.
 *
 * Freshly loaded metadata is used to look up the container's cgroup
 * info and is released right after the lookup. Every failure is
 * logged with the same message and reported as false.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_process_info *container_info;
	int ret;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		goto fail;

	container_info = lxc_cgroup_get_container_info(name, lxcpath, meta_data);
	lxc_cgroup_put_meta(meta_data);
	if (!container_info)
		goto fail;

	ret = lxc_cgroupfs_enter(container_info, pid, false);
	lxc_cgroup_process_info_free(container_info);
	if (ret < 0)
		goto fail;

	return true;

fail:
	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2520
8b276860
SH
/* Argument block handed to chown_cgroup_wrapper(). */
struct chown_data {
	/* absolute path of the cgroup directory to chown */
	const char *cgroup_path;
	/* uid that is translated with get_ns_uid() to find the new owner */
	uid_t origuid;
};
2525
2526/*
2527 * TODO - someone should refactor this to unshare once passing all the paths
2528 * to be chowned in one go
2529 */
2530static int chown_cgroup_wrapper(void *data)
2531{
2532 struct chown_data *arg = data;
2533 uid_t destuid;
2534 char *fpath;
2535
8b276860
SH
2536 if (setresgid(0,0,0) < 0)
2537 SYSERROR("Failed to setgid to 0");
2538 if (setresuid(0,0,0) < 0)
2539 SYSERROR("Failed to setuid to 0");
2540 if (setgroups(0, NULL) < 0)
2541 SYSERROR("Failed to clear groups");
2542 destuid = get_ns_uid(arg->origuid);
2543
2544 if (chown(arg->cgroup_path, destuid, 0) < 0)
2545 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2546
2547 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2548 if (!fpath)
2549 return -1;
2550 if (chown(fpath, destuid, 0) < 0)
2551 SYSERROR("Error chowning %s\n", fpath);
2552 free(fpath);
01d59fe5 2553
8b276860
SH
2554 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2555 if (!fpath)
2556 return -1;
2557 if (chown(fpath, destuid, 0) < 0)
2558 SYSERROR("Error chowning %s", fpath);
2559 free(fpath);
2560
2561 return 0;
2562}
2563
2564static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2565{
2566 struct chown_data data;
2567 char *fpath;
2568
01d59fe5
CB
2569 if (!dir_exists(cgroup_path))
2570 return true;
2571
8b276860
SH
2572 if (lxc_list_empty(&conf->id_map))
2573 /* If there's no mapping then we don't need to chown */
2574 return true;
2575
2576 data.cgroup_path = cgroup_path;
2577 data.origuid = geteuid();
2578
2579 /* Unpriv users can't chown it themselves, so chown from
2580 * a child namespace mapping both our own and the target uid
2581 */
2582 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2583 ERROR("Error requesting cgroup chown in new namespace");
2584 return false;
2585 }
2586
2587 /*
2588 * Now chmod 775 the directory else the container cannot create cgroups.
2589 * This can't be done in the child namespace because it only group-owns
2590 * the cgroup
2591 */
2592 if (chmod(cgroup_path, 0775) < 0) {
2593 SYSERROR("Error chmoding %s\n", cgroup_path);
2594 return false;
2595 }
2596 fpath = lxc_append_paths(cgroup_path, "tasks");
2597 if (!fpath)
2598 return false;
2599 if (chmod(fpath, 0664) < 0)
2600 SYSERROR("Error chmoding %s\n", fpath);
2601 free(fpath);
2602 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2603 if (!fpath)
2604 return false;
2605 if (chmod(fpath, 0664) < 0)
2606 SYSERROR("Error chmoding %s\n", fpath);
2607 free(fpath);
2608
2609 return true;
2610}
2611
2612static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2613{
2614 struct cgfs_data *d = hdata;
2615 struct cgroup_process_info *info_ptr;
2616 char *cgpath;
2617 bool r = true;
2618
2619 if (!d)
2620 return false;
2621
2622 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2623 if (!info_ptr->designated_mount_point) {
2624 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2625 if (!info_ptr->designated_mount_point) {
2626 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2627 return false;
2628 }
2629 }
2630
2631 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2632 if (!cgpath) {
2633 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2634 continue;
2635 }
2636 r = do_cgfs_chown(cgpath, conf);
ea439aac 2637 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
8b276860
SH
2638 ERROR("Failed chowning %s\n", cgpath);
2639 free(cgpath);
2640 return false;
2641 }
2642 free(cgpath);
2643 }
2644
2645 return true;
2646}
2647
d4ef7c50 2648static struct cgroup_ops cgfs_ops = {
d4ef7c50 2649 .init = cgfs_init,
4fb3cba5 2650 .destroy = cgfs_destroy,
d4ef7c50
SH
2651 .create = cgfs_create,
2652 .enter = cgfs_enter,
2653 .create_legacy = cgfs_create_legacy,
2654 .get_cgroup = cgfs_get_cgroup,
06078509 2655 .escape = cgfs_escape,
36662416
TA
2656 .num_hierarchies = cgfs_num_hierarchies,
2657 .get_hierarchies = cgfs_get_hierarchies,
d4ef7c50
SH
2658 .get = lxc_cgroupfs_get,
2659 .set = lxc_cgroupfs_set,
4fb3cba5 2660 .unfreeze = cgfs_unfreeze,
9daf6f5d 2661 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2662 .name = "cgroupfs",
5d897655 2663 .attach = lxc_cgroupfs_attach,
8b276860 2664 .chown = cgfs_chown,
c476bdce 2665 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2666 .nrtasks = cgfs_nrtasks,
23befb18 2667 .driver = CGFS,
d4ef7c50 2668};