]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgroups/cgfs.c
cgfsng: controller_found()
[mirror_lxc.git] / src / lxc / cgroups / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
e2bcd7db 42#include "error.h"
ae5c8b8e 43#include "commands.h"
b98f7d6e
SH
44#include "list.h"
45#include "conf.h"
33ad9f1a 46#include "utils.h"
f2363e38
ÇO
47#include "log.h"
48#include "cgroup.h"
49#include "start.h"
484ed030 50#include "state.h"
28d832c4 51#include "storage.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
/* Forward declarations: the three types below point at one another. */
struct cgroup_meta_data;
struct cgroup_hierarchy;
struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: everything lxc knows about the cgroup setup of the
 * host (hierarchies and where they are mounted).
 */
struct cgroup_meta_data {
	/* simple refcount; the object is freed when it drops to zero */
	ptrdiff_t ref;
	/* indexed by hierarchy number; slots may be NULL */
	struct cgroup_hierarchy **hierarchies;
	/* NULL-terminated list of every mount point found */
	struct cgroup_mount_point **mount_points;
	/* highest valid index into 'hierarchies' */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: one cgroup hierarchy as listed in /proc/<pid>/cgroup.
 * A hierarchy may be mounted at more than one place.
 */
struct cgroup_hierarchy {
	/* hierarchy number as reported by the kernel */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* NULL-terminated list of subsystem names bound to this hierarchy */
	char **subsystems;
	/* first writable mount point whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* NULL-terminated list of all mount points of this hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocated capacity of 'all_mount_points' */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: a single place in the filesystem where a
 * hierarchy is mounted.
 */
struct cgroup_mount_point {
	/* hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* absolute path of the mount in the filesystem */
	char *mount_point;
	/* cgroup inside the hierarchy that forms the root of this mount */
	char *mount_prefix;
	/* true if the mount is not mounted "rw" */
	bool read_only;
	/* NOTE(review): presumably set once cpuset needs initialising here - confirm */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: describes which cgroup a process belongs to in
 * each of the different cgroup hierarchies. The entries form a singly
 * linked list, one node per hierarchy.
 *
 * Note this is the per-process info tracked by the cgfs_ops.
 */
struct cgroup_process_info {
	/* next node of the list, NULL at the end */
	struct cgroup_process_info *next;
	/* NOTE(review): presumably a counted reference to the meta data - confirm */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy this node describes (may be NULL) */
	struct cgroup_hierarchy *hierarchy;
	/* path of the cgroup inside the hierarchy */
	char *cgroup_path;
	/* NOTE(review): presumably the path built from the sub pattern - confirm */
	char *cgroup_path_sub;
	/* paths of the cgroups lxc created in this hierarchy */
	char **created_paths;
	size_t created_paths_capacity;
	size_t created_paths_count;
	/* mount point chosen for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
118
/*
 * cgfs_data: state kept by the cgfs backend.
 * NOTE(review): presumably one instance per container - confirm against
 * the code that allocates it.
 */
struct cgfs_data {
	char *name;
	const char *cgroup_pattern;
	/* cgroup layout of the host */
	struct cgroup_meta_data *meta;
	/* per-hierarchy cgroup membership list */
	struct cgroup_process_info *info;
};
125
126lxc_log_define(lxc_cgfs, lxc);
576f946d 127
33ad9f1a
CS
128static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
129static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
130static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
131static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
132static bool is_valid_cgroup(const char *name);
33ad9f1a 133static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
6a9e0f26
SH
134static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
135 struct lxc_conf *conf);
33ad9f1a
CS
136static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
137static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
138static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
139static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
140static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
141static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a 142static int cgroup_recursive_task_count(const char *cgroup_path);
1ea59ad2 143static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 144static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 145
4fb3cba5
DE
146static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
147static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
148static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
149
150/* free process membership information */
151static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
6a9e0f26
SH
152static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
153 struct lxc_conf *conf);
4fb3cba5 154
d4ef7c50 155static struct cgroup_ops cgfs_ops;
d4ef7c50 156
603c64c2
SH
/*
 * cgroup_rmdir: recursively remove the cgroup directory @dirname.
 *
 * Every sub-directory is removed first (depth-first), then @dirname
 * itself. Only rmdir(2) is used, so non-directory entries are never
 * touched. A failure does not stop the walk: the function keeps going so
 * that as much as possible is removed.
 *
 * Returns 0 on success and -1 if any step failed. errno is set to the
 * first error that was recorded (0 if there was none).
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		/* keep the opendir() error visible to the caller even if
		 * logging modifies errno
		 */
		saved_errno = errno;
		ERROR("Failed to open %s", dirname);
		errno = saved_errno;
		return -1;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; report the truncation
			 * with the matching error code
			 */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}

		ret = lstat(pathname, &mystat);
		if (ret) {
			/* grab errno before the log call can overwrite it */
			int lstat_errno = errno;

			SYSERROR("Failed to stat %s", pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = lstat_errno;
			continue;
		}

		if (S_ISDIR(mystat.st_mode)) {
			/* the recursive call leaves its first error in errno */
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		int rmdir_errno = errno;

		SYSERROR("Failed to delete %s", dirname);
		if (!saved_errno)
			saved_errno = rmdir_errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		int close_errno = errno;

		SYSERROR("Failed to close directory %s", dirname);
		if (!saved_errno)
			saved_errno = close_errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
225
6a9e0f26
SH
/*
 * rmdir_wrapper: callback-style wrapper around cgroup_rmdir().
 *
 * @data is the path (a char *) of the cgroup directory to remove. Before
 * the removal the process switches to gid 0, uid 0 and drops all
 * supplementary groups; a failure of any of these steps is only logged
 * and the removal is attempted anyway.
 * NOTE(review): presumably meant to be run by a helper inside a user
 * namespace - confirm against the caller.
 *
 * Returns the result of cgroup_rmdir().
 */
static int rmdir_wrapper(void *data)
{
	int ret;

	ret = setresgid(0,0,0);
	if (ret < 0)
		SYSERROR("Failed to setgid to 0");

	ret = setresuid(0,0,0);
	if (ret < 0)
		SYSERROR("Failed to setuid to 0");

	ret = setgroups(0, NULL);
	if (ret < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir((char *)data);
}
239
/*
 * lxc_cgroup_load_meta: load the cgroup meta data of the host, honouring
 * the global "lxc.cgroup.use" setting.
 *
 * If the setting is present it is split at ',' and handed to
 * lxc_cgroup_load_meta2() as subsystem whitelist; otherwise no whitelist
 * is used.
 *
 * Returns the meta data, or NULL on error. The errno left behind by
 * lxc_cgroup_load_meta2() is preserved across the cleanup.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *result;
	char **whitelist = NULL;
	const char *setting;
	int load_errno;

	/* cleared so that "not configured" and "lookup failed" can be told apart */
	errno = 0;
	setting = lxc_global_config_value("lxc.cgroup.use");
	if (setting) {
		whitelist = lxc_string_split_and_trim(setting, ',');
		if (!whitelist)
			return NULL;
	} else if (errno != 0) {
		return NULL;
	}

	result = lxc_cgroup_load_meta2((const char **)whitelist);

	load_errno = errno;
	lxc_free_array((void **)whitelist, free);
	errno = load_errno;

	return result;
}
fd37327f 263
/*
 * Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends a copy of the first column (the
 * subsystem name) of every well-formed line to *kernel_subsystems, which
 * is grown with lxc_grow_array(). Comment lines, empty lines and lines
 * whose second column is not a plain decimal number are ignored.
 *
 * Returns true on success. On failure false is returned and
 * *kernel_subsystems may hold the entries collected so far; freeing it
 * is left to the caller.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *f;
	char *line = NULL;
	size_t line_cap = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	f = fopen_cloexec("/proc/cgroups", "r");
	if (!f)
		return false;

	while (getline(&line, &line_cap, f) != -1) {
		char *number;
		char *end;

		if (line[0] == '#' || line[0] == '\0')
			continue;

		/* terminate the name column */
		number = strchr(line, '\t');
		if (!number)
			continue;
		*number = '\0';
		number++;

		/* terminate the hierarchy number column */
		end = strchr(number, '\t');
		if (!end)
			continue;
		*end = '\0';

		/* the value is not needed, only that the column is numeric */
		end = NULL;
		(void)strtoul(number, &end, 10);
		if (!end || *end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(line);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(f);
	free(line);
	return ok;
}
319
320/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
321 * since mount points don't specify hierarchy number and
322 * /proc/cgroups does not contain named hierarchies
323 */
324static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
325 bool all_kernel_subsystems, bool all_named_subsystems,
326 const char **subsystem_whitelist)
327{
328 FILE *proc_self_cgroup;
329 char *line = NULL;
330 size_t sz = 0;
331 int r;
332 bool bret = false;
333 size_t hierarchy_capacity = 0;
ef6e34ee 334
33ad9f1a
CS
335 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
336 /* if for some reason (because of setns() and pid namespace for example),
337 * /proc/self is not valid, we try /proc/1/cgroup... */
338 if (!proc_self_cgroup)
339 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
340 if (!proc_self_cgroup)
b653309a 341 return false;
33ad9f1a
CS
342
343 while (getline(&line, &sz, proc_self_cgroup) != -1) {
344 /* file format: hierarchy:subsystems:group,
345 * we only extract hierarchy and subsystems
346 * here */
347 char *colon1;
348 char *colon2;
349 int hierarchy_number;
350 struct cgroup_hierarchy *h = NULL;
351 char **p;
352
353 if (!line[0])
354 continue;
ad08bbb7 355
33ad9f1a
CS
356 colon1 = strchr(line, ':');
357 if (!colon1)
8900b9eb 358 continue;
33ad9f1a
CS
359 *colon1++ = '\0';
360 colon2 = strchr(colon1, ':');
361 if (!colon2)
362 continue;
363 *colon2 = '\0';
ad08bbb7 364
33ad9f1a 365 colon2 = NULL;
82a2fe03
CB
366
367 /* With cgroupv2 /proc/self/cgroup can contain entries of the
368 * form: 0::/
369 * These entries need to be skipped.
370 */
371 if (!strcmp(colon1, ""))
372 continue;
373
33ad9f1a
CS
374 hierarchy_number = strtoul(line, &colon2, 10);
375 if (!colon2 || *colon2)
376 continue;
576f946d 377
33ad9f1a
CS
378 if (hierarchy_number > meta_data->maximum_hierarchy) {
379 /* lxc_grow_array will never shrink, so even if we find a lower
380 * hierarchy number here, the array will never be smaller
381 */
382 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
383 if (r < 0)
b653309a 384 goto out;
5193cc3d 385
33ad9f1a
CS
386 meta_data->maximum_hierarchy = hierarchy_number;
387 }
fd37327f 388
33ad9f1a
CS
389 /* this shouldn't happen, we had this already */
390 if (meta_data->hierarchies[hierarchy_number])
b653309a 391 goto out;
33ad9f1a
CS
392
393 h = calloc(1, sizeof(struct cgroup_hierarchy));
394 if (!h)
b653309a 395 goto out;
33ad9f1a
CS
396
397 meta_data->hierarchies[hierarchy_number] = h;
398
399 h->index = hierarchy_number;
400 h->subsystems = lxc_string_split_and_trim(colon1, ',');
401 if (!h->subsystems)
b653309a 402 goto out;
33ad9f1a
CS
403 /* see if this hierarchy should be considered */
404 if (!all_kernel_subsystems || !all_named_subsystems) {
405 for (p = h->subsystems; *p; p++) {
406 if (!strncmp(*p, "name=", 5)) {
407 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
408 h->used = true;
409 break;
410 }
411 } else {
412 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
413 h->used = true;
414 break;
415 }
416 }
417 }
418 } else {
419 /* we want all hierarchy anyway */
420 h->used = true;
ae5c8b8e 421 }
ae5c8b8e 422 }
b653309a 423 bret = true;
0b9c21ab 424
b653309a 425out:
33ad9f1a 426 fclose(proc_self_cgroup);
0ccf7c2a 427 free(line);
b653309a
SH
428 return bret;
429}
430
431/* Step 3: determine all mount points of each hierarchy */
432static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
433{
434 bool bret = false;
435 FILE *proc_self_mountinfo;
436 char *line = NULL;
437 size_t sz = 0;
438 char **tokens = NULL;
439 size_t mount_point_count = 0;
440 size_t mount_point_capacity = 0;
441 size_t token_capacity = 0;
442 int r;
fcca16bc 443 bool is_cgns = cgns_supported();
b653309a 444
33ad9f1a
CS
445 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
446 /* if for some reason (because of setns() and pid namespace for example),
447 * /proc/self is not valid, we try /proc/1/cgroup... */
448 if (!proc_self_mountinfo)
449 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
450 if (!proc_self_mountinfo)
b653309a 451 return false;
33ad9f1a
CS
452
453 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 454 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
455 size_t i, j, k;
456 struct cgroup_mount_point *mount_point;
457 struct cgroup_hierarchy *h;
458 char **subsystems;
836514a8 459 bool is_lxcfs = false;
33ad9f1a
CS
460
461 if (line[0] && line[strlen(line) - 1] == '\n')
462 line[strlen(line) - 1] = '\0';
463
178938fe 464 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
465 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
466 if (r < 0)
b653309a 467 goto out;
33ad9f1a
CS
468 tokens[i++] = token;
469 }
b98f7d6e 470
33ad9f1a
CS
471 /* layout of /proc/self/mountinfo:
472 * 0: id
473 * 1: parent id
474 * 2: device major:minor
475 * 3: mount prefix
8900b9eb 476 * 4: mount point
33ad9f1a
CS
477 * 5: per-mount options
478 * [optional X]: additional data
479 * X+7: "-"
480 * X+8: type
481 * X+9: source
482 * X+10: per-superblock options
483 */
484 for (j = 6; j < i && tokens[j]; j++)
485 if (!strcmp(tokens[j], "-"))
486 break;
fd4f5a56 487
33ad9f1a
CS
488 /* could not find separator */
489 if (j >= i || !tokens[j])
490 continue;
491 /* there should be exactly three fields after
492 * the separator
493 */
494 if (i != j + 4)
495 continue;
fd4f5a56 496
33ad9f1a 497 /* not a cgroup filesystem */
836514a8
U
498 if (strcmp(tokens[j + 1], "cgroup") != 0) {
499 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
500 continue;
501 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
502 continue;
503 is_lxcfs = true;
504 char *curtok = tokens[4] + 15;
505 subsystems = subsystems_from_mount_options(curtok,
506 kernel_subsystems);
507 } else
508 subsystems = subsystems_from_mount_options(tokens[j + 3],
509 kernel_subsystems);
33ad9f1a 510 if (!subsystems)
b653309a 511 goto out;
33ad9f1a
CS
512
513 h = NULL;
517587ef 514 for (k = 0; k <= meta_data->maximum_hierarchy; k++) {
33ad9f1a
CS
515 if (meta_data->hierarchies[k] &&
516 meta_data->hierarchies[k]->subsystems[0] &&
517 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
518 /* TODO: we could also check if the lists really match completely,
519 * just to have an additional sanity check */
520 h = meta_data->hierarchies[k];
b98f7d6e 521 break;
33ad9f1a 522 }
b98f7d6e 523 }
33ad9f1a 524 lxc_free_array((void **)subsystems, free);
287df277
CB
525 if (!h)
526 goto out;
33ad9f1a
CS
527
528 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
529 if (r < 0)
b653309a 530 goto out;
33ad9f1a
CS
531
532 /* create mount point object */
533 mount_point = calloc(1, sizeof(*mount_point));
534 if (!mount_point)
b653309a 535 goto out;
33ad9f1a
CS
536
537 meta_data->mount_points[mount_point_count++] = mount_point;
538
539 mount_point->hierarchy = h;
fcca16bc 540 if (is_lxcfs || is_cgns)
836514a8
U
541 mount_point->mount_prefix = strdup("/");
542 else
543 mount_point->mount_prefix = strdup(tokens[3]);
33ad9f1a 544 mount_point->mount_point = strdup(tokens[4]);
33ad9f1a 545 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 546 goto out;
33ad9f1a
CS
547 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
548
549 if (!strcmp(mount_point->mount_prefix, "/")) {
550 if (mount_point->read_only) {
551 if (!h->ro_absolute_mount_point)
552 h->ro_absolute_mount_point = mount_point;
553 } else {
554 if (!h->rw_absolute_mount_point)
555 h->rw_absolute_mount_point = mount_point;
556 }
b98f7d6e 557 }
ae5c8b8e 558
9d6514f2
CB
559 if (h)
560 k = lxc_array_len((void **)h->all_mount_points);
561 else
562 k = 0;
33ad9f1a
CS
563 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
564 if (r < 0)
b653309a 565 goto out;
33ad9f1a 566 h->all_mount_points[k] = mount_point;
fd4f5a56 567 }
b653309a
SH
568 bret = true;
569
570out:
b653309a 571 fclose(proc_self_mountinfo);
b653309a 572 free(tokens);
2cdafc54 573 free(line);
b653309a
SH
574 return bret;
575}
576
4fb3cba5 577static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
578{
579 bool all_kernel_subsystems = true;
580 bool all_named_subsystems = false;
581 struct cgroup_meta_data *meta_data = NULL;
582 char **kernel_subsystems = NULL;
583 int saved_errno = 0;
584
585 /* if the subsystem whitelist is not specified, include all
586 * hierarchies that contain kernel subsystems by default but
587 * no hierarchies that only contain named subsystems
588 *
589 * if it is specified, the specifier @all will select all
590 * hierarchies, @kernel will select all hierarchies with
591 * kernel subsystems and @named will select all named
592 * hierarchies
593 */
594 all_kernel_subsystems = subsystem_whitelist ?
595 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
596 true;
597 all_named_subsystems = subsystem_whitelist ?
598 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 599 true;
b653309a
SH
600
601 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
602 if (!meta_data)
603 return NULL;
604 meta_data->ref = 1;
605
606 if (!find_cgroup_subsystems(&kernel_subsystems))
607 goto out_error;
608
609 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
610 all_named_subsystems, subsystem_whitelist))
611 goto out_error;
612
613 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
614 goto out_error;
fd4f5a56 615
33ad9f1a
CS
616 /* oops, we couldn't find anything */
617 if (!meta_data->hierarchies || !meta_data->mount_points) {
618 errno = EINVAL;
619 goto out_error;
ae5c8b8e 620 }
fd4f5a56 621
3a0abb3a 622 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
623 return meta_data;
624
625out_error:
626 saved_errno = errno;
33ad9f1a
CS
627 lxc_free_array((void **)kernel_subsystems, free);
628 lxc_cgroup_put_meta(meta_data);
629 errno = saved_errno;
630 return NULL;
fd4f5a56
DL
631}
632
4fb3cba5 633static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 634{
33ad9f1a
CS
635 meta_data->ref++;
636 return meta_data;
637}
e14f67a7 638
4fb3cba5 639static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
640{
641 size_t i;
642 if (!meta_data)
643 return NULL;
644 if (--meta_data->ref > 0)
645 return meta_data;
646 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
2446c321 647 if (meta_data->hierarchies)
33ad9f1a 648 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
2446c321
CB
649 if (meta_data->hierarchies[i])
650 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
33ad9f1a 651 free(meta_data->hierarchies);
178938fe 652 free(meta_data);
33ad9f1a 653 return NULL;
e14f67a7
U
654}
655
4fb3cba5 656static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 657{
33ad9f1a
CS
658 size_t i;
659 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
660 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
517587ef
CB
661 if (!h)
662 continue;
33ad9f1a
CS
663 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
664 return h;
e14f67a7 665 }
e14f67a7
U
666 return NULL;
667}
668
d3f99e96
SH
669static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
670{
671 return mp && access(mp->mount_point, F_OK) == 0;
672}
673
4fb3cba5 674static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 675{
33ad9f1a
CS
676 struct cgroup_mount_point **mps;
677 struct cgroup_mount_point *current_result = NULL;
678 ssize_t quality = -1;
b98f7d6e 679
33ad9f1a 680 /* trivial case */
d3f99e96 681 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
33ad9f1a 682 return hierarchy->rw_absolute_mount_point;
d3f99e96 683 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
33ad9f1a 684 return hierarchy->ro_absolute_mount_point;
b98f7d6e 685
33ad9f1a
CS
686 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
687 struct cgroup_mount_point *mp = *mps;
688 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 689
33ad9f1a
CS
690 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
691 prefix_len = 0;
b98f7d6e 692
d3f99e96
SH
693 if (!mountpoint_is_accessible(mp))
694 continue;
695
33ad9f1a
CS
696 if (should_be_writable && mp->read_only)
697 continue;
698
699 if (!prefix_len ||
700 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
701 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
702 /* search for the best quality match, i.e. the match with the
703 * shortest prefix where this group is still contained
704 */
705 if (quality == -1 || prefix_len < quality) {
706 current_result = mp;
707 quality = prefix_len;
708 }
b98f7d6e
SH
709 }
710 }
711
33ad9f1a
CS
712 if (!current_result)
713 errno = ENOENT;
714 return current_result;
b98f7d6e
SH
715}
716
/*
 * lxc_cgroup_find_abs_path: translate the cgroup @group of @subsystem
 * into an absolute filesystem path.
 *
 * The cgroup meta data is loaded just for this call. The path is built by
 * cgroup_to_absolute_path() from the mount point found for the
 * subsystem's hierarchy, @group and @suffix.
 *
 * @should_be_writable: only accept a mount point that is not read-only
 *
 * Returns the string produced by cgroup_to_absolute_path(), or NULL on
 * error; in the error case errno is preserved across the cleanup.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *md;
	struct cgroup_hierarchy *hier;
	struct cgroup_mount_point *mnt = NULL;
	char *abs_path = NULL;
	int err = 0;

	md = lxc_cgroup_load_meta();
	if (!md)
		return NULL;

	hier = lxc_cgroup_find_hierarchy(md, subsystem);
	if (hier)
		mnt = lxc_cgroup_find_mount_point(hier, group, should_be_writable);
	if (mnt)
		abs_path = cgroup_to_absolute_path(mnt, group, suffix);

	/* remember why we failed before releasing the meta data */
	if (!abs_path)
		err = errno;

	lxc_cgroup_put_meta(md);

	if (!abs_path)
		errno = err;

	return abs_path;
}
750
/*
 * lxc_cgroup_process_info_get: cgroup membership of process @pid, read
 * from /proc/<pid>/cgroup.
 *
 * Returns whatever lxc_cgroup_process_info_getx() returns for that file.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
757
/*
 * lxc_cgroup_process_info_get_init: cgroup membership of the process with
 * pid 1.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 762
/*
 * lxc_cgroup_process_info_get_self: cgroup membership of the calling
 * process.
 *
 * /proc/self/cgroup is tried first; if that yields nothing, the lookup is
 * repeated through /proc/<pid>/cgroup using the pid returned by
 * lxc_raw_getpid().
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *info;

	info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (info)
		return info;

	return lxc_cgroup_process_info_get(lxc_raw_getpid(), meta);
}
ae5c8b8e 771
692ba18f
SH
/*
 * If a controller has the ns cgroup mounted, the container's init process
 * already sits in a new cgroup that is named after its pid. This function
 * renames that cgroup directory after the container.
 *
 * @mountpath: where the hierarchy is mounted
 * @oldname:   cgroup (relative to the mount) that contains the pid cgroup
 * @pid:       pid the existing cgroup is named after
 * @name:      container name, used as new name of the cgroup
 *
 * Example: with the cgroup mounted at /cgroup, the task in cgroup /ab,
 * pid 2375 and name c1:
 *   old directory = /cgroup/ab/2375
 *   new directory = /cgroup/ab/c1
 *   return value  = /ab/c1
 *
 * If the new directory already exists it is removed with rmdir(2) first;
 * if that fails, the rename is not attempted.
 *
 * Returns the new cgroup name as a malloc()ed string that the caller has
 * to free, or NULL on error.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath, *fullnewpath;
	char *newname;
	int len, newlen, ret;

	/* <mountpath>/<oldname>/<pid> */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	/* <oldname>/<name>: this is what gets returned */
	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len)
		goto fail;

	/* <mountpath>/<oldname>/<name> */
	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen)
		goto fail;

	/* a left-over cgroup of that name has to make room */
	if (access(fullnewpath, F_OK) == 0 && rmdir(fullnewpath) != 0) {
		SYSERROR("container cgroup %s already exists.", fullnewpath);
		goto fail;
	}

	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		goto fail;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;

fail:
	free(newname);
	return NULL;
}
839
ea439aac
SH
840static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
841{
842 char **p;
843
844 for (p = h->subsystems; *p; p++) {
845 if (is_crucial_cgroup_subsystem(*p))
846 return true;
847 }
848 return false;
849}
850
33ad9f1a 851/* create a new cgroup */
4fb3cba5 852static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 853{
001b026e 854 char **cgroup_path_components = NULL;
33ad9f1a
CS
855 char **p = NULL;
856 char *path_so_far = NULL;
857 char **new_cgroup_paths = NULL;
858 char **new_cgroup_paths_sub = NULL;
859 struct cgroup_mount_point *mp;
860 struct cgroup_hierarchy *h;
861 struct cgroup_process_info *base_info = NULL;
862 struct cgroup_process_info *info_ptr;
863 int saved_errno;
864 int r;
865 unsigned suffix = 0;
866 bool had_sub_pattern = false;
867 size_t i;
ae5c8b8e 868
33ad9f1a
CS
869 if (!is_valid_cgroup(name)) {
870 ERROR("Invalid cgroup name: '%s'", name);
871 errno = EINVAL;
872 return NULL;
ae5c8b8e
SH
873 }
874
33ad9f1a
CS
875 if (!strstr(path_pattern, "%n")) {
876 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
877 errno = EINVAL;
878 return NULL;
879 }
fd37327f 880
33ad9f1a
CS
881 /* we will modify the result of this operation directly,
882 * so we don't have to copy the data structure
883 */
884 base_info = (path_pattern[0] == '/') ?
885 lxc_cgroup_process_info_get_init(meta_data) :
886 lxc_cgroup_process_info_get_self(meta_data);
887 if (!base_info)
888 return NULL;
c8f7c563 889
33ad9f1a
CS
890 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
891 if (!new_cgroup_paths)
892 goto out_initial_error;
893
894 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
895 if (!new_cgroup_paths_sub)
896 goto out_initial_error;
897
898 /* find mount points we can use */
899 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
900 h = info_ptr->hierarchy;
517587ef
CB
901 if (!h)
902 continue;
33ad9f1a
CS
903 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
904 if (!mp) {
905 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
906 goto out_initial_error;
907 }
908 info_ptr->designated_mount_point = mp;
460a1cf0 909
692ba18f
SH
910 if (lxc_string_in_array("ns", (const char **)h->subsystems))
911 continue;
2edb53c7
SH
912 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
913 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 914 goto out_initial_error;
2edb53c7 915 }
33ad9f1a 916 }
b98f7d6e 917
33ad9f1a
CS
918 /* normalize the path */
919 cgroup_path_components = lxc_normalize_path(path_pattern);
920 if (!cgroup_path_components)
921 goto out_initial_error;
922
923 /* go through the path components to see if we can create them */
924 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
925 /* we only want to create the same component with -1, -2, etc.
926 * if the component contains the container name itself, otherwise
927 * it's not an error if it already exists
928 */
929 char *p_eff = *p ? *p : (char *)sub_pattern;
930 bool contains_name = strstr(p_eff, "%n");
931 char *current_component = NULL;
932 char *current_subpath = NULL;
933 char *current_entire_path = NULL;
934 char *parts[3];
935 size_t j = 0;
936 i = 0;
937
938 /* if we are processing the subpattern, we want to make sure
939 * loop is ended the next time around
940 */
941 if (!*p) {
942 had_sub_pattern = true;
943 p--;
944 }
b98f7d6e 945
33ad9f1a 946 goto find_name_on_this_level;
4fb3cba5 947
33ad9f1a
CS
948 cleanup_name_on_this_level:
949 /* This is reached if we found a name clash.
950 * In that case, remove the cgroup from all previous hierarchies
951 */
952 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
77afbedf
SH
953 if (info_ptr->created_paths_count < 1)
954 continue;
6a9e0f26 955 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
33ad9f1a
CS
956 if (r < 0)
957 WARN("could not clean up cgroup we created when trying to create container");
958 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
959 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
960 }
961 if (current_component != current_subpath)
962 free(current_subpath);
963 if (current_component != p_eff)
964 free(current_component);
965 current_component = current_subpath = NULL;
966 /* try again with another suffix */
967 ++suffix;
4fb3cba5 968
33ad9f1a
CS
969 find_name_on_this_level:
970 /* determine name of the path component we should create */
971 if (contains_name && suffix > 0) {
972 char *buf = calloc(strlen(name) + 32, 1);
973 if (!buf)
974 goto out_initial_error;
975 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
976 current_component = lxc_string_replace("%n", buf, p_eff);
977 free(buf);
978 } else {
979 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
980 }
981 parts[0] = path_so_far;
982 parts[1] = current_component;
983 parts[2] = NULL;
984 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
985
986 /* Now go through each hierarchy and try to create the
987 * corresponding cgroup
988 */
989 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
990 char *parts2[3];
692ba18f 991
517587ef
CB
992 if (!info_ptr->hierarchy)
993 continue;
994
692ba18f
SH
995 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
996 continue;
33ad9f1a
CS
997 current_entire_path = NULL;
998
999 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
1000 parts2[1] = current_subpath;
1001 parts2[2] = NULL;
1002 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
1003
1004 if (!*p) {
1005 /* we are processing the subpath, so only update that one */
1006 free(new_cgroup_paths_sub[i]);
1007 new_cgroup_paths_sub[i] = strdup(current_entire_path);
1008 if (!new_cgroup_paths_sub[i])
1009 goto cleanup_from_error;
1010 } else {
1011 /* remember which path was used on this controller */
1012 free(new_cgroup_paths[i]);
1013 new_cgroup_paths[i] = strdup(current_entire_path);
1014 if (!new_cgroup_paths[i])
1015 goto cleanup_from_error;
1016 }
fd4f5a56 1017
33ad9f1a
CS
1018 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1019 if (r < 0 && errno == EEXIST && contains_name) {
1020 /* name clash => try new name with new suffix */
1021 free(current_entire_path);
1022 current_entire_path = NULL;
1023 goto cleanup_name_on_this_level;
1024 } else if (r < 0 && errno != EEXIST) {
ea439aac
SH
1025 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1026 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1027 goto cleanup_from_error;
1028 }
1029 goto skip;
33ad9f1a
CS
1030 } else if (r == 0) {
1031 /* successfully created */
1032 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1033 if (r < 0)
1034 goto cleanup_from_error;
d703c2b1 1035 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 1036 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
1037 goto cleanup_from_error;
1038 }
33ad9f1a
CS
1039 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1040 } else {
1041 /* if we didn't create the cgroup, then we have to make sure that
1042 * further cgroups will be created properly
1043 */
d703c2b1 1044 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 1045 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 1046 goto cleanup_from_error;
f6ac3b9e 1047 }
d703c2b1
RV
1048 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1049 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1050 goto cleanup_from_error;
1051 }
33ad9f1a 1052
ea439aac 1053skip:
33ad9f1a
CS
1054 /* already existed but path component of pattern didn't contain '%n',
1055 * so this is not an error; but then we don't need current_entire_path
1056 * anymore...
1057 */
1058 free(current_entire_path);
1059 current_entire_path = NULL;
1060 }
1061 }
fd4f5a56 1062
33ad9f1a
CS
1063 /* save path so far */
1064 free(path_so_far);
1065 path_so_far = strdup(current_subpath);
1066 if (!path_so_far)
1067 goto cleanup_from_error;
1068
1069 /* cleanup */
1070 if (current_component != current_subpath)
1071 free(current_subpath);
1072 if (current_component != p_eff)
1073 free(current_component);
1074 current_component = current_subpath = NULL;
1075 continue;
4fb3cba5 1076
33ad9f1a 1077 cleanup_from_error:
ec64264d 1078 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1079 * do some additional cleanup here
1080 */
1081 saved_errno = errno;
1082 if (current_component != current_subpath)
1083 free(current_subpath);
1084 if (current_component != p_eff)
1085 free(current_component);
1086 free(current_entire_path);
1087 errno = saved_errno;
1088 goto out_initial_error;
fd4f5a56
DL
1089 }
1090
33ad9f1a
CS
1091 /* we're done, now update the paths */
1092 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
517587ef
CB
1093 if (!info_ptr->hierarchy)
1094 continue;
47d8fb3b
CS
1095 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1096 * will take care of it
1097 * Since we do a continue in above loop, new_cgroup_paths[i] is
1098 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1099 */
47d8fb3b
CS
1100 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1101 continue;
1102 free(info_ptr->cgroup_path);
1103 info_ptr->cgroup_path = new_cgroup_paths[i];
1104 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1105 }
33ad9f1a
CS
1106 /* don't use lxc_free_array since we used the array members
1107 * to store them in our result...
1108 */
1109 free(new_cgroup_paths);
1110 free(new_cgroup_paths_sub);
1111 free(path_so_far);
1112 lxc_free_array((void **)cgroup_path_components, free);
1113 return base_info;
1114
1115out_initial_error:
1116 saved_errno = errno;
1117 free(path_so_far);
6a9e0f26 1118 lxc_cgroup_process_info_free_and_remove(base_info, NULL);
33ad9f1a
CS
1119 lxc_free_array((void **)new_cgroup_paths, free);
1120 lxc_free_array((void **)new_cgroup_paths_sub, free);
1121 lxc_free_array((void **)cgroup_path_components, free);
1122 errno = saved_errno;
1123 return NULL;
c8f7c563
CS
1124}
1125
4fb3cba5 1126static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1127{
1128 struct cgroup_process_info *info_ptr;
1129 int r;
1130
1131 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
517587ef
CB
1132 if (!info_ptr->hierarchy)
1133 continue;
1134
47d8fb3b
CS
1135 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1136 continue;
1137 /*
1138 * For any path which has ns cgroup mounted, handler->pid is already
1139 * moved into a container called '%d % (handler->pid)'. Rename it to
1140 * the cgroup name and record that.
1141 */
1142 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1143 info_ptr->cgroup_path, pid, name);
1144 if (!tmp)
1145 return -1;
1146 free(info_ptr->cgroup_path);
1147 info_ptr->cgroup_path = tmp;
1148 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1149 if (r < 0)
1150 return -1;
1151 tmp = strdup(tmp);
1152 if (!tmp)
1153 return -1;
1154 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1155 }
1156 return 0;
1157}
1158
33ad9f1a 1159/* get the cgroup membership of a given container */
4fb3cba5 1160static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1161{
33ad9f1a
CS
1162 struct cgroup_process_info *result = NULL;
1163 int saved_errno = 0;
1164 size_t i;
1165 struct cgroup_process_info **cptr = &result;
1166 struct cgroup_process_info *entry = NULL;
1167 char *path = NULL;
1168
1169 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1170 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1171 if (!h || !h->used)
1172 continue;
c8f7c563 1173
33ad9f1a
CS
1174 /* use the command interface to look for the cgroup */
1175 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1176 if (!path) {
1177 h->used = false;
c661b0a8
DE
1178 continue;
1179 }
33ad9f1a
CS
1180
1181 entry = calloc(1, sizeof(struct cgroup_process_info));
1182 if (!entry)
1183 goto out_error;
1184 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1185 entry->hierarchy = h;
1186 entry->cgroup_path = path;
1187 path = NULL;
1188
1189 /* it is not an error if we don't find anything here,
1190 * it is up to the caller to decide what to do in that
1191 * case */
1192 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1193
1194 *cptr = entry;
1195 cptr = &entry->next;
1196 entry = NULL;
c8f7c563
CS
1197 }
1198
33ad9f1a
CS
1199 return result;
1200out_error:
1201 saved_errno = errno;
1202 free(path);
1203 lxc_cgroup_process_info_free(result);
1204 lxc_cgroup_process_info_free(entry);
1205 errno = saved_errno;
1206 return NULL;
fd4f5a56
DL
1207}
1208
33ad9f1a 1209/* move a processs to the cgroups specified by the membership */
4fb3cba5 1210static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1211{
33ad9f1a
CS
1212 char pid_buf[32];
1213 char *cgroup_tasks_fn;
1214 int r;
1215 struct cgroup_process_info *info_ptr;
1216
1217 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1218 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
517587ef
CB
1219 if (!info_ptr->hierarchy)
1220 continue;
1221
33ad9f1a
CS
1222 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1223 info_ptr->cgroup_path_sub :
1224 info_ptr->cgroup_path;
1225
1226 if (!info_ptr->designated_mount_point) {
1227 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1228 if (!info_ptr->designated_mount_point) {
1229 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1230 return -1;
1231 }
1232 }
4f17323e 1233
33ad9f1a
CS
1234 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1235 if (!cgroup_tasks_fn) {
1236 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1237 return -1;
1238 }
4f17323e 1239
33ad9f1a 1240 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1241 free(cgroup_tasks_fn);
ea439aac 1242 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
33ad9f1a
CS
1243 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1244 return -1;
1245 }
4f17323e
CS
1246 }
1247
33ad9f1a 1248 return 0;
4f17323e
CS
1249}
1250
33ad9f1a
CS
1251/* free process membership information */
1252void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1253{
33ad9f1a
CS
1254 struct cgroup_process_info *next;
1255 if (!info)
b98f7d6e 1256 return;
33ad9f1a
CS
1257 next = info->next;
1258 lxc_cgroup_put_meta(info->meta_ref);
1259 free(info->cgroup_path);
1260 free(info->cgroup_path_sub);
1261 lxc_free_array((void **)info->created_paths, free);
1262 free(info);
1263 lxc_cgroup_process_info_free(next);
fc7de561
SH
1264}
1265
33ad9f1a 1266/* free process membership information and remove cgroups that were created */
6a9e0f26 1267void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
b98f7d6e 1268{
33ad9f1a
CS
1269 struct cgroup_process_info *next;
1270 char **pp;
1271 if (!info)
1272 return;
1273 next = info->next;
603c64c2 1274 {
33ad9f1a
CS
1275 struct cgroup_mount_point *mp = info->designated_mount_point;
1276 if (!mp)
1277 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1278 if (mp)
1279 /* ignore return value here, perhaps we created the
1280 * '/lxc' cgroup in this container but another container
1281 * is still running (for example)
1282 */
6a9e0f26 1283 (void)remove_cgroup(mp, info->cgroup_path, true, conf);
603c64c2
SH
1284 }
1285 for (pp = info->created_paths; pp && *pp; pp++);
1286 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1287 free(*pp);
b98f7d6e 1288 }
33ad9f1a
CS
1289 free(info->created_paths);
1290 lxc_cgroup_put_meta(info->meta_ref);
1291 free(info->cgroup_path);
1292 free(info->cgroup_path_sub);
1293 free(info);
6a9e0f26 1294 lxc_cgroup_process_info_free_and_remove(next, conf);
33ad9f1a 1295}
b98f7d6e 1296
4fb3cba5 1297static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1298{
d4ef7c50
SH
1299 struct cgroup_process_info *info = d->info;
1300 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1301 if (!info)
1302 return NULL;
f348e47c 1303 prune_init_scope(info->cgroup_path);
33ad9f1a 1304 return info->cgroup_path;
b98f7d6e
SH
1305}
1306
4fb3cba5 1307static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1308{
d4ef7c50 1309 struct cgroup_process_info *info = d->info;
33ad9f1a 1310 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1311
1312 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1313 if (!info)
1314 return NULL;
1315 if (info->designated_mount_point) {
8900b9eb 1316 mp = info->designated_mount_point;
33ad9f1a
CS
1317 } else {
1318 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1319 if (!mp)
1320 return NULL;
b98f7d6e 1321 }
33ad9f1a 1322 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1323}
55c76589 1324
4fb3cba5 1325static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1326{
33ad9f1a
CS
1327 struct cgroup_meta_data *meta;
1328 struct cgroup_process_info *base_info, *info;
1329 struct cgroup_mount_point *mp;
1330 char *result = NULL;
33ad9f1a
CS
1331
1332 meta = lxc_cgroup_load_meta();
1333 if (!meta)
9a93d992 1334 return NULL;
33ad9f1a
CS
1335 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1336 if (!base_info)
178938fe 1337 goto out1;
33ad9f1a
CS
1338 info = find_info_for_subsystem(base_info, subsystem);
1339 if (!info)
178938fe 1340 goto out2;
33ad9f1a 1341 if (info->designated_mount_point) {
8900b9eb 1342 mp = info->designated_mount_point;
33ad9f1a
CS
1343 } else {
1344 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1345 if (!mp)
178938fe 1346 goto out3;
33ad9f1a
CS
1347 }
1348 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1349out3:
178938fe 1350out2:
33ad9f1a 1351 lxc_cgroup_process_info_free(base_info);
178938fe 1352out1:
33ad9f1a 1353 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1354 return result;
1355}
9a93d992 1356
/*
 * Write value to the cgroup file filename, using the membership in d.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot). errno is preset to ENOENT before the path
 * lookup, and the errno left by do_cgroup_set() is preserved across the
 * cleanup.
 *
 * Returns the result of do_cgroup_set(), or -1 when no path is found.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t prefix_len = strcspn(filename, ".");
	char *controller = alloca(prefix_len + 1);
	char *abs_dir;
	int rc, saved_errno;

	memcpy(controller, filename, prefix_len);
	controller[prefix_len] = '\0';

	errno = ENOENT;
	abs_dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (abs_dir == NULL)
		return -1;

	rc = do_cgroup_set(abs_dir, filename, value);
	saved_errno = errno;
	free(abs_dir);
	errno = saved_errno;

	return rc;
}
9a93d992 1377
/*
 * Write value to the cgroup file filename of container name (in lxcpath).
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot).
 *
 * Returns the result of do_cgroup_set(), or -1 when the container's cgroup
 * path cannot be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	char *controller = alloca(prefix_len + 1);
	char *abs_dir;
	int rc;

	memcpy(controller, filename, prefix_len);
	controller[prefix_len] = '\0';

	abs_dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (abs_dir == NULL)
		return -1;

	rc = do_cgroup_set(abs_dir, filename, value);
	free(abs_dir);
	return rc;
}
1395
/*
 * Read the cgroup file filename of container name (in lxcpath) into value,
 * which has room for len bytes.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no dot).
 *
 * Returns the result of do_cgroup_get(), or -1 when the container's cgroup
 * path cannot be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t prefix_len = strcspn(filename, ".");
	char *controller = alloca(prefix_len + 1);
	char *abs_dir;
	int rc;

	memcpy(controller, filename, prefix_len);
	controller[prefix_len] = '\0';

	abs_dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (abs_dir == NULL)
		return -1;

	rc = do_cgroup_get(abs_dir, filename, value, len);
	free(abs_dir);
	return rc;
}
1413
4fb3cba5 1414static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1415{
1416 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1417 char *path = NULL;
1418 char **parts = NULL;
1419 char *dirname = NULL;
1420 char *abs_path = NULL;
1421 char *abs_path2 = NULL;
d4ef7c50
SH
1422 struct cgfs_data *cgfs_d;
1423 struct cgroup_process_info *info, *base_info;
aae1f3c4 1424 int r, saved_errno = 0;
b635e92d 1425 struct lxc_handler *handler = hdata;
aae1f3c4 1426
4608594e
SH
1427 if (cgns_supported())
1428 return true;
1429
b635e92d 1430 cgfs_d = handler->cgroup_data;
4fb3cba5
DE
1431 if (!cgfs_d)
1432 return false;
d4ef7c50
SH
1433 base_info = cgfs_d->info;
1434
0769b82a
CS
1435 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1436 * have access to the lxc_conf object at this point. It really should be up
1437 * to the caller to fix this, but this doesn't really hurt.
1438 */
1439 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1440 type = LXC_AUTO_CGROUP_FULL_MIXED;
1441 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1442 type = LXC_AUTO_CGROUP_MIXED;
1443
7997d7da
CS
1444 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1445 ERROR("could not mount cgroups into container: invalid type specified internally");
1446 errno = EINVAL;
c476bdce 1447 return false;
7997d7da
CS
1448 }
1449
aae1f3c4
CS
1450 path = calloc(1, bufsz);
1451 if (!path)
c476bdce 1452 return false;
aae1f3c4 1453 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1454 r = safe_mount("cgroup_root", path, "tmpfs",
1455 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1456 "size=10240k,mode=755",
1457 root);
aae1f3c4
CS
1458 if (r < 0) {
1459 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1460 return false;
aae1f3c4
CS
1461 }
1462
1463 /* now mount all the hierarchies we care about */
1464 for (info = base_info; info; info = info->next) {
1465 size_t subsystem_count, i;
1466 struct cgroup_mount_point *mp = info->designated_mount_point;
517587ef
CB
1467
1468 if (!info->hierarchy)
1469 continue;
1470
d3f99e96 1471 if (!mountpoint_is_accessible(mp))
aae1f3c4 1472 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
d3f99e96 1473
aae1f3c4
CS
1474 if (!mp) {
1475 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1476 goto out_error;
1477 }
1478
1479 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1480 parts = calloc(subsystem_count + 1, sizeof(char *));
1481 if (!parts)
1482 goto out_error;
1483
1484 for (i = 0; i < subsystem_count; i++) {
1485 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1486 parts[i] = info->hierarchy->subsystems[i] + 5;
1487 else
1488 parts[i] = info->hierarchy->subsystems[i];
1489 }
1490 dirname = lxc_string_join(",", (const char **)parts, false);
1491 if (!dirname)
1492 goto out_error;
1493
1494 /* create subsystem directory */
1495 abs_path = lxc_append_paths(path, dirname);
1496 if (!abs_path)
1497 goto out_error;
1498 r = mkdir_p(abs_path, 0755);
1499 if (r < 0 && errno != EEXIST) {
1500 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1501 goto out_error;
1502 }
1503
aae1f3c4
CS
1504 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1505 if (!abs_path2)
1506 goto out_error;
aae1f3c4 1507
7997d7da
CS
1508 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1509 /* bind-mount the cgroup entire filesystem there */
1510 if (strcmp(mp->mount_prefix, "/") != 0) {
1511 /* FIXME: maybe we should just try to remount the entire hierarchy
1512 * with a regular mount command? may that works? */
1513 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1514 goto out_error;
1515 }
1516 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1517 if (r < 0) {
1518 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1519 goto out_error;
1520 }
f8f3c3c0
SG
1521 /* main cgroup path should be read-only */
1522 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1523 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1524 if (r < 0) {
1525 SYSERROR("error re-mounting %s readonly", abs_path);
1526 goto out_error;
1527 }
1528 }
7997d7da
CS
1529 /* own cgroup should be read-write */
1530 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1531 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1532 if (r < 0) {
1533 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1534 goto out_error;
1535 }
1536 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1537 if (r < 0) {
1538 SYSERROR("error re-mounting %s readwrite", abs_path2);
1539 goto out_error;
1540 }
1541 }
1542 } else {
1543 /* create path for container's cgroup */
1544 r = mkdir_p(abs_path2, 0755);
1545 if (r < 0 && errno != EEXIST) {
1546 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1547 goto out_error;
1548 }
aae1f3c4 1549
b46f0553
CS
1550 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1551 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1552 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1553 * read-write (see comment below)
1554 */
1555 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1556 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1557 if (r < 0) {
1558 SYSERROR("error bind-mounting %s onto itself", abs_path);
1559 goto out_error;
1560 }
1561 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1562 if (r < 0) {
1563 SYSERROR("error re-mounting %s readonly", abs_path);
1564 goto out_error;
1565 }
1566 }
1567
7997d7da
CS
1568 free(abs_path);
1569 abs_path = NULL;
1570
1571 /* bind-mount container's cgroup to that directory */
1572 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1573 if (!abs_path)
1574 goto out_error;
1575 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
ea439aac 1576 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
7997d7da
CS
1577 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1578 goto out_error;
1579 }
1580 if (type == LXC_AUTO_CGROUP_RO) {
1581 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1582 if (r < 0) {
1583 SYSERROR("error re-mounting %s readonly", abs_path2);
1584 goto out_error;
1585 }
1586 }
aae1f3c4
CS
1587 }
1588
1589 free(abs_path);
1590 free(abs_path2);
1591 abs_path = NULL;
1592 abs_path2 = NULL;
1593
1594 /* add symlinks for every single subsystem */
1595 if (subsystem_count > 1) {
1596 for (i = 0; i < subsystem_count; i++) {
1597 abs_path = lxc_append_paths(path, parts[i]);
1598 if (!abs_path)
1599 goto out_error;
1600 r = symlink(dirname, abs_path);
1601 if (r < 0)
1602 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1603 free(abs_path);
1604 abs_path = NULL;
1605 }
1606 }
1607 free(dirname);
1608 free(parts);
1609 dirname = NULL;
1610 parts = NULL;
1611 }
1612
b46f0553
CS
1613 /* We used to remount the entire tmpfs readonly if any :ro or
1614 * :mixed mode was specified. However, Ubuntu's mountall has the
1615 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1616 * mounted read-only and cannot be remounted read-write.
1617 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1618 * these if they are not already mounted with the right options;
1619 * it contains an entry for /sys/fs/cgroup. In case it can't do
1620 * that, it prompts for the user to either manually fix it or
1621 * boot anyway. But without user input, booting of the container
1622 * hangs.)
1623 *
1624 * Instead of remounting the entire tmpfs readonly, we only
1625 * remount the paths readonly that are part of the cgroup
1626 * hierarchy.
f8f3c3c0 1627 */
f8f3c3c0 1628
aae1f3c4
CS
1629 free(path);
1630
c476bdce 1631 return true;
aae1f3c4
CS
1632
1633out_error:
1634 saved_errno = errno;
1635 free(path);
1636 free(dirname);
1637 free(parts);
1638 free(abs_path);
1639 free(abs_path2);
1640 errno = saved_errno;
c476bdce 1641 return false;
aae1f3c4
CS
1642}
1643
4fb3cba5 1644static int cgfs_nrtasks(void *hdata)
33ad9f1a 1645{
4fb3cba5
DE
1646 struct cgfs_data *d = hdata;
1647 struct cgroup_process_info *info;
33ad9f1a
CS
1648 struct cgroup_mount_point *mp = NULL;
1649 char *abs_path = NULL;
1650 int ret;
460a1cf0 1651
4fb3cba5
DE
1652 if (!d) {
1653 errno = ENOENT;
1654 return -1;
1655 }
1656
1657 info = d->info;
33ad9f1a
CS
1658 if (!info) {
1659 errno = ENOENT;
1660 return -1;
b98f7d6e 1661 }
c8f7c563 1662
33ad9f1a 1663 if (info->designated_mount_point) {
8900b9eb 1664 mp = info->designated_mount_point;
33ad9f1a
CS
1665 } else {
1666 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1667 if (!mp)
1668 return -1;
c8f7c563
CS
1669 }
1670
33ad9f1a
CS
1671 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1672 if (!abs_path)
1673 return -1;
1674
1675 ret = cgroup_recursive_task_count(abs_path);
1676 free(abs_path);
1677 return ret;
c8f7c563
CS
1678}
1679
574c4428
QH
1680static struct cgroup_process_info *
1681lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1682 struct cgroup_meta_data *meta)
d08ba6ec 1683{
33ad9f1a
CS
1684 struct cgroup_process_info *result = NULL;
1685 FILE *proc_pid_cgroup = NULL;
1686 char *line = NULL;
1687 size_t sz = 0;
1688 int saved_errno = 0;
1689 struct cgroup_process_info **cptr = &result;
1690 struct cgroup_process_info *entry = NULL;
1691
1692 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1693 if (!proc_pid_cgroup)
b98f7d6e 1694 return NULL;
1ac470c0 1695
33ad9f1a
CS
1696 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1697 /* file format: hierarchy:subsystems:group */
1698 char *colon1;
1699 char *colon2;
1700 char *endptr;
1701 int hierarchy_number;
1702 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1703
33ad9f1a 1704 if (!line[0])
ae5c8b8e 1705 continue;
b98f7d6e 1706
33ad9f1a
CS
1707 if (line[strlen(line) - 1] == '\n')
1708 line[strlen(line) - 1] = '\0';
1709
1710 colon1 = strchr(line, ':');
1711 if (!colon1)
8900b9eb 1712 continue;
33ad9f1a
CS
1713 *colon1++ = '\0';
1714 colon2 = strchr(colon1, ':');
1715 if (!colon2)
ae5c8b8e 1716 continue;
33ad9f1a 1717 *colon2++ = '\0';
e4659536 1718
33ad9f1a 1719 endptr = NULL;
82a2fe03
CB
1720
1721 /* With cgroupv2 /proc/self/cgroup can contain entries of the
1722 * form: 0::/
1723 * These entries need to be skipped.
1724 */
1725 if (!strcmp(colon1, ""))
1726 continue;
1727
33ad9f1a
CS
1728 hierarchy_number = strtoul(line, &endptr, 10);
1729 if (!endptr || *endptr)
9a93d992 1730 continue;
9a93d992 1731
33ad9f1a
CS
1732 if (hierarchy_number > meta->maximum_hierarchy) {
1733 /* we encountered a hierarchy we didn't have before,
1734 * so probably somebody remounted some stuff in the
1735 * mean time...
1736 */
1737 errno = EAGAIN;
1738 goto out_error;
b98f7d6e 1739 }
33ad9f1a
CS
1740
1741 h = meta->hierarchies[hierarchy_number];
1742 if (!h) {
1743 /* we encountered a hierarchy that was thought to be
1744 * dead before, so probably somebody remounted some
1745 * stuff in the mean time...
1746 */
1747 errno = EAGAIN;
1748 goto out_error;
b98f7d6e 1749 }
33ad9f1a
CS
1750
1751 /* we are told that we should ignore this hierarchy */
1752 if (!h->used)
b98f7d6e 1753 continue;
5193cc3d 1754
33ad9f1a
CS
1755 entry = calloc(1, sizeof(struct cgroup_process_info));
1756 if (!entry)
1757 goto out_error;
fd4f5a56 1758
33ad9f1a
CS
1759 entry->meta_ref = lxc_cgroup_get_meta(meta);
1760 entry->hierarchy = h;
1761 entry->cgroup_path = strdup(colon2);
1762 if (!entry->cgroup_path)
1763 goto out_error;
3939a22a 1764 prune_init_scope(entry->cgroup_path);
d08ba6ec 1765
33ad9f1a
CS
1766 *cptr = entry;
1767 cptr = &entry->next;
1768 entry = NULL;
b98f7d6e 1769 }
b98f7d6e 1770
33ad9f1a
CS
1771 fclose(proc_pid_cgroup);
1772 free(line);
1773 return result;
1774
1775out_error:
1776 saved_errno = errno;
1777 if (proc_pid_cgroup)
1778 fclose(proc_pid_cgroup);
1779 lxc_cgroup_process_info_free(result);
1780 lxc_cgroup_process_info_free(entry);
1781 free(line);
1782 errno = saved_errno;
ae5c8b8e 1783 return NULL;
36b86299
DL
1784}
1785
574c4428
QH
/*
 * Turn a comma-separated mount option string into a NULL-terminated,
 * newly allocated array of subsystem names.
 *
 * An option is copied unchanged when it starts with "name=" or appears in
 * kernel_list; any other option is stored with a "name=" prefix added
 * (e.g. 'systemd' becomes 'name=systemd').
 *
 * Returns the array (NULL if mount_options contains no option), or NULL
 * with errno preserved when an allocation fails.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **subsystems = NULL;
	size_t capacity = 0;
	size_t count = 0;
	char *cursor, *opt, *state = NULL;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a private copy */
	cursor = alloca(strlen(mount_options)+1);
	strcpy(cursor, mount_options);

	while ((opt = strtok_r(cursor, ",", &state)) != NULL) {
		cursor = NULL;

		if (lxc_grow_array((void ***)&subsystems, &capacity, count + 1, 12) < 0)
			goto out_free;
		/* keep the array terminated behind the slot filled next */
		subsystems[count + 1] = NULL;

		if (strncmp(opt, "name=", 5) == 0 || lxc_string_in_array(opt, (const char **)kernel_list)) {
			subsystems[count] = strdup(opt);
		} else {
			/* this is eg 'systemd' but the mount will be
			 * 'name=systemd'
			 */
			subsystems[count] = malloc(strlen(opt) + 6);
			if (subsystems[count])
				sprintf(subsystems[count], "name=%s", opt);
		}

		if (!subsystems[count])
			goto out_free;
		count++;
	}

	return subsystems;

out_free:
	saved_errno = errno;
	lxc_free_array((void**)subsystems, free);
	errno = saved_errno;
	return NULL;
}
1829
574c4428 1830static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1831{
33ad9f1a
CS
1832 if (!mp)
1833 return;
1834 free(mp->mount_point);
1835 free(mp->mount_prefix);
1836 free(mp);
bcbd102c
SH
1837}
1838
574c4428 1839static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1840{
33ad9f1a
CS
1841 if (!h)
1842 return;
2446c321
CB
1843 if (h->subsystems) {
1844 lxc_free_array((void **)h->subsystems, free);
1845 h->subsystems = NULL;
1846 }
1847 if (h->all_mount_points) {
1848 free(h->all_mount_points);
1849 h->all_mount_points = NULL;
1850 }
33ad9f1a 1851 free(h);
2446c321 1852 h = NULL;
33ad9f1a 1853}
341a9bd8 1854
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * Every character must lie in the ASCII range 33..126 (printable, with the
 * space excluded because it would break legacy lxc-ls) and must not be '/'.
 * The names "." and ".." are rejected as well.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t idx;

	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
		return false;

	for (idx = 0; name[idx] != '\0'; idx++) {
		char c = name[idx];
		bool printable = c > 32 && c < 127;

		if (!printable || c == '/')
			return false;
	}

	return true;
}
341a9bd8 1868
574c4428 1869static int create_or_remove_cgroup(bool do_remove,
6a9e0f26
SH
1870 struct cgroup_mount_point *mp, const char *path, int recurse,
1871 struct lxc_conf *conf)
33ad9f1a
CS
1872{
1873 int r, saved_errno = 0;
1874 char *buf = cgroup_to_absolute_path(mp, path, NULL);
1875 if (!buf)
1876 return -1;
341a9bd8 1877
33ad9f1a 1878 /* create or remove directory */
603c64c2 1879 if (do_remove) {
01d59fe5
CB
1880 if (!dir_exists(buf))
1881 return 0;
6a9e0f26
SH
1882 if (recurse) {
1883 if (conf && !lxc_list_empty(&conf->id_map))
c9b7c33e
CB
1884 r = userns_exec_1(conf, rmdir_wrapper, buf,
1885 "rmdir_wrapper");
6a9e0f26
SH
1886 else
1887 r = cgroup_rmdir(buf);
1888 } else
603c64c2
SH
1889 r = rmdir(buf);
1890 } else
2f604eb5 1891 r = mkdir_p(buf, 0777);
33ad9f1a
CS
1892 saved_errno = errno;
1893 free(buf);
1894 errno = saved_errno;
1895 return r;
341a9bd8 1896}
bcbd102c 1897
/* Create the directory for cgroup `path` below mount point `mp`.
 * Thin wrapper around create_or_remove_cgroup(); returns its result.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const int recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, NULL);
}
1902
/* Remove the directory of cgroup `path` below mount point `mp`,
 * recursively if `recurse` is set. Thin wrapper around
 * create_or_remove_cgroup(); returns its result.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, conf);
}
576f946d 1908
574c4428
QH
1909static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1910 const char *path, const char *suffix)
33ad9f1a
CS
1911{
1912 /* first we have to make sure we subtract the mount point's prefix */
1913 char *prefix = mp->mount_prefix;
1914 char *buf;
1915 ssize_t len, rv;
1916
1917 /* we want to make sure only absolute paths to cgroups are passed to us */
1918 if (path[0] != '/') {
1919 errno = EINVAL;
1920 return NULL;
1921 }
b98f7d6e 1922
33ad9f1a
CS
1923 if (prefix && !strcmp(prefix, "/"))
1924 prefix = NULL;
b98f7d6e 1925
33ad9f1a
CS
1926 /* prefix doesn't match */
1927 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1928 errno = EINVAL;
1929 return NULL;
1930 }
1931 /* if prefix is /foo and path is /foobar */
1932 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1933 errno = EINVAL;
1934 return NULL;
1935 }
b98f7d6e 1936
33ad9f1a
CS
1937 /* remove prefix from path */
1938 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1939
33ad9f1a
CS
1940 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1941 buf = calloc(len + 1, 1);
50266dc6
DE
1942 if (!buf)
1943 return NULL;
33ad9f1a 1944 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1945 if (rv > len) {
33ad9f1a
CS
1946 free(buf);
1947 errno = ENOMEM;
8900b9eb 1948 return NULL;
8b92dc3a 1949 }
576f946d 1950
33ad9f1a 1951 return buf;
e0f888d9 1952}
283678ed 1953
574c4428
QH
1954static struct cgroup_process_info *
1955find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1956{
33ad9f1a
CS
1957 struct cgroup_process_info *info_ptr;
1958 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1959 struct cgroup_hierarchy *h = info_ptr->hierarchy;
517587ef
CB
1960 if (!h)
1961 continue;
33ad9f1a
CS
1962 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1963 return info_ptr;
b98f7d6e 1964 }
33ad9f1a
CS
1965 errno = ENOENT;
1966 return NULL;
1967}
283678ed 1968
/* Read the file `sub_filename` inside directory `cgroup_path` into
 * `value` (at most `len` bytes).
 *
 * Returns -1 if the file name cannot be built, otherwise whatever
 * lxc_read_from_file() returns. errno from the read is preserved.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *full_path;
	int rc, err;

	full_path = lxc_string_join("/", components, false);
	if (!full_path)
		return -1;

	rc = lxc_read_from_file(full_path, value, len);

	/* free() may clobber errno, keep the one from the read */
	err = errno;
	free(full_path);
	errno = err;

	return rc;
}
b113383b 1990
/* Write the string `value` to the file `sub_filename` inside directory
 * `cgroup_path`.
 *
 * Returns -1 if the file name cannot be built, otherwise whatever
 * lxc_write_to_file() returns. errno from the write is preserved.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *full_path;
	int rc, err;

	full_path = lxc_string_join("/", components, false);
	if (!full_path)
		return -1;

	rc = lxc_write_to_file(full_path, value, strlen(value), false);

	/* free() may clobber errno, keep the one from the write */
	err = errno;
	free(full_path);
	errno = err;

	return rc;
}
2012
4fb3cba5 2013static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 2014 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 2015{
365d180a 2016 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
2017 struct lxc_cgroup *cg;
2018 int ret = -1;
2019
33ad9f1a 2020 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
2021 return 0;
2022
aaf26830 2023 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
2024 if (!sorted_cgroup_settings) {
2025 return -1;
2026 }
aaf26830
KT
2027
2028 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
2029 cg = iterator->elem;
2030
33ad9f1a 2031 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 2032 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 2033 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
2034 continue;
2035 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 2036 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 2037 continue;
4fb3cba5 2038 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 2039 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
2040 WARN("Error setting %s to %s for %s",
2041 cg->subsystem, cg->value, d->name);
2042 continue;
2043 }
dddf7c5b 2044 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 2045 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
2046 goto out;
2047 }
b113383b 2048 }
b98f7d6e
SH
2049
2050 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
2051 }
2052
b98f7d6e
SH
2053 ret = 0;
2054 INFO("cgroup has been setup");
2055out:
365d180a 2056 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
2057 lxc_list_del(iterator);
2058 free(iterator);
2059 }
365d180a 2060 free(sorted_cgroup_settings);
b113383b
SH
2061 return ret;
2062}
b98f7d6e 2063
/* Check the container's devices.list to find out whether a devices.allow
 * or devices.deny setting is already in effect.
 *
 * v:         value of the setting to check.
 * for_allow: true when checking a devices.allow entry, false for
 *            devices.deny.
 *
 * For deny, only the values "a" and "a *:* rwm" are considered; the
 * result is true unless devices.list still contains "a *:* rwm".
 * For allow, the result is true if devices.list contains "a *:* rwm" or
 * a line equal to `v`.
 *
 * Returns false if devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	/* XXX FIXME if users could use something other than 'lxc.devices.deny =
	 * a'. not sure they ever do, but they *could* right now, I'm assuming
	 * they do NOT
	 */
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;

	path = lxc_string_join("/", parts, false);
	/* the hierarchy path is heap-allocated and only needed for the join */
	free((void *)parts[0]);
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	free(path);
	if (!devices_list)
		return false;

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	return ret;
}
2119
/* Count the lines of every "tasks" file found in `cgroup_path` and in
 * all directories below it.
 *
 * Returns the accumulated count, 0 if `cgroup_path` cannot be opened,
 * or -1 if an entry's path cannot be built or stat()ed. A failure while
 * counting inside a subdirectory or a tasks file only drops that
 * entry's contribution.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	for (entry = readdir(dir); entry; entry = readdir(dir)) {
		const char *parts[3] = { cgroup_path, entry->d_name, NULL };
		struct stat st;
		char *sub_path;
		int count = -1;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		sub_path = lxc_string_join("/", parts, false);
		if (!sub_path) {
			closedir(dir);
			return -1;
		}

		if (stat(sub_path, &st) < 0) {
			closedir(dir);
			free(sub_path);
			return -1;
		}

		if (S_ISDIR(st.st_mode))
			count = cgroup_recursive_task_count(sub_path);
		else if (strcmp(entry->d_name, "tasks") == 0)
			count = lxc_count_file_lines(sub_path);

		/* negative means "nothing to add" or a failed sub-count */
		if (count >= 0)
			total += count;

		free(sub_path);
	}
	closedir(dir);

	return total;
}
2167
574c4428
QH
2168static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2169 char *cgroup_path)
b98f7d6e 2170{
33ad9f1a 2171 int r, saved_errno = 0;
7e7243e1 2172 char buf[2];
1ea59ad2 2173
934b1673
SH
2174 mp->need_cpuset_init = false;
2175
1ea59ad2
SH
2176 /* If this is the memory cgroup, we want to enforce hierarchy.
2177 * But don't fail if for some reason we can't.
2178 */
2edb53c7
SH
2179 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2180 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2181 if (cc_path) {
2182 r = lxc_read_from_file(cc_path, buf, 1);
2183 if (r < 1 || buf[0] != '1') {
2184 r = lxc_write_to_file(cc_path, "1", 1, false);
2185 if (r < 0)
a8916143 2186 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2187 }
1ea59ad2
SH
2188 free(cc_path);
2189 }
2edb53c7 2190 }
1ea59ad2 2191
33ad9f1a
CS
2192 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2193 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2194 * and cpuset.cpus and then
2195 */
2edb53c7
SH
2196 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2197 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2198 struct stat sb;
2199
33ad9f1a 2200 if (!cc_path)
2edb53c7 2201 return -1;
d703c2b1
RV
2202 /* cgroup.clone_children is not available when running under
2203 * older kernel versions; in this case, we'll initialize
2204 * cpuset.cpus and cpuset.mems later, after the new cgroup
2205 * was created
2206 */
2207 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2208 mp->need_cpuset_init = true;
d703c2b1
RV
2209 free(cc_path);
2210 return 0;
2211 }
7e7243e1
SH
2212 r = lxc_read_from_file(cc_path, buf, 1);
2213 if (r == 1 && buf[0] == '1') {
2214 free(cc_path);
2edb53c7 2215 return 0;
7e7243e1 2216 }
33ad9f1a 2217 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2218 saved_errno = errno;
2219 free(cc_path);
2220 errno = saved_errno;
2221 return r < 0 ? -1 : 0;
33ad9f1a
CS
2222 }
2223 return 0;
b98f7d6e 2224}
484ed030 2225
/* Read file `fn` into `buf` and NUL-terminate the result.
 *
 * If the read filled the whole buffer, the last byte is overwritten with
 * the terminator, so a return value equal to `bufsize` signals possible
 * truncation.
 *
 * Returns the value of lxc_read_from_file() on success, a negative value
 * if reading failed, or -1 if `bufsize` is 0.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if ((size_t)nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	/* Callers don't do this, but regression/sanity check */
	if (bufsize == 0) {
		ERROR("was not expecting 0 bufsize");
		return -1;
	}

	/* buffer is full: obviously this wasn't empty */
	buf[bufsize - 1] = '\0';
	return nread;
}
2246
/* Initialize the cpuset file `name` (given with a leading '/') of cgroup
 * `path` from the file of the same name in the parent cgroup.
 *
 * A child file that already holds a non-empty value is left untouched
 * and counts as success.
 *
 * Returns true on success, false on any failure.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char value[1024];
	char *child, *parent = NULL, *parent_cgroup, *slash;
	bool ok = false;
	int n;

	child = cgroup_to_absolute_path(mp, path, name);
	if (!child)
		return false;

	/* don't overwrite a non-empty value in the file */
	n = cgroup_read_from_file(child, value, sizeof(value));
	if (n < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting off the last path component */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto out;

	slash = strrchr(parent_cgroup, '/');
	if (!slash) {
		free(parent_cgroup);
		goto out;
	}
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	parent = cgroup_to_absolute_path(mp, parent_cgroup, name);
	free(parent_cgroup);
	if (!parent)
		goto out;

	/* copy from parent to child cgroup */
	n = cgroup_read_from_file(parent, value, sizeof(value));
	if (n < 0)
		goto out;
	if ((size_t)n == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	if (lxc_write_to_file(child, value, strlen(value), false) >= 0)
		ok = true;
	else
		SYSERROR("failed writing %s", child);

out:
	free(parent);
	free(child);
	return ok;
}
2303
2304static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2305 const char *path)
2306{
2307 /* the files we have to handle here are only in cpuset hierarchies */
2308 if (!lxc_string_in_array("cpuset",
2309 (const char **)mp->hierarchy->subsystems))
2310 return true;
2311
b1dad6f6
RV
2312 if (!mp->need_cpuset_init)
2313 return true;
2314
d703c2b1
RV
2315 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2316 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2317}
2318
1a704014
CB
2319static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2320{
2321 int i;
2322
2323 if (!getenv("LXC_DEBUG_CGFS"))
2324 return;
2325
2326 DEBUG("Cgroup information:");
2327 DEBUG(" container name: %s", d->name);
2328 if (!d->meta || !d->meta->hierarchies) {
2329 DEBUG(" No hierarchies found.");
2330 return;
2331 }
2332 DEBUG(" Controllers:");
2333 for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2334 char **p;
2335 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2336 if (!h) {
2337 DEBUG(" Empty hierarchy number %d.", i);
2338 continue;
2339 }
2340 for (p = h->subsystems; p && *p; p++) {
2341 DEBUG(" %2d: %s", i, *p);
2342 }
2343 }
2344}
2345
4fb3cba5 2346struct cgroup_ops *cgfs_ops_init(void)
484ed030 2347{
4fb3cba5 2348 return &cgfs_ops;
d4ef7c50 2349}
484ed030 2350
43654d34 2351static void *cgfs_init(struct lxc_handler *handler)
d4ef7c50 2352{
4fb3cba5 2353 struct cgfs_data *d;
484ed030 2354
4fb3cba5
DE
2355 d = malloc(sizeof(*d));
2356 if (!d)
2357 return NULL;
484ed030 2358
4fb3cba5 2359 memset(d, 0, sizeof(*d));
43654d34 2360 d->name = strdup(handler->name);
4fb3cba5
DE
2361 if (!d->name)
2362 goto err1;
2363
5e1c5795 2364 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2365
2366 d->meta = lxc_cgroup_load_meta();
2367 if (!d->meta) {
2368 ERROR("cgroupfs failed to detect cgroup metadata");
2369 goto err2;
2370 }
1a704014
CB
2371
2372 print_cgfs_init_debuginfo(d);
2373
4fb3cba5
DE
2374 return d;
2375
2376err2:
2377 free(d->name);
2378err1:
2379 free(d);
2380 return NULL;
d4ef7c50 2381}
484ed030 2382
6a9e0f26 2383static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
d4ef7c50 2384{
4fb3cba5
DE
2385 struct cgfs_data *d = hdata;
2386
d4ef7c50
SH
2387 if (!d)
2388 return;
f10fad2f 2389 free(d->name);
6a9e0f26 2390 lxc_cgroup_process_info_free_and_remove(d->info, conf);
c55d4505 2391 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2392 free(d);
d4ef7c50 2393}
484ed030 2394
4fb3cba5 2395static inline bool cgfs_create(void *hdata)
d4ef7c50 2396{
4fb3cba5
DE
2397 struct cgfs_data *d = hdata;
2398 struct cgroup_process_info *i;
2399 struct cgroup_meta_data *md;
484ed030 2400
4fb3cba5 2401 if (!d)
d4ef7c50 2402 return false;
4fb3cba5
DE
2403 md = d->meta;
2404 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2405 if (!i)
2406 return false;
2407 d->info = i;
2408 return true;
2409}
484ed030 2410
4fb3cba5 2411static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2412{
4fb3cba5
DE
2413 struct cgfs_data *d = hdata;
2414 struct cgroup_process_info *i;
d4ef7c50 2415 int ret;
4fb3cba5
DE
2416
2417 if (!d)
2418 return false;
2419 i = d->info;
2420 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2421
d4ef7c50
SH
2422 return ret == 0;
2423}
2424
4fb3cba5 2425static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2426{
4fb3cba5
DE
2427 struct cgfs_data *d = hdata;
2428 struct cgroup_process_info *i;
2429
2430 if (!d)
2431 return false;
2432 i = d->info;
2433 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2434 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2435 return false;
484ed030 2436 }
d4ef7c50
SH
2437 return true;
2438}
484ed030 2439
/* Look up the container's cgroup path for `subsystem`.
 *
 * Returns the result of lxc_cgroup_get_hierarchy_path_data(), or NULL
 * if hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *data = hdata;

	return data ? lxc_cgroup_get_hierarchy_path_data(subsystem, data) : NULL;
}
2448
ccb4cabe 2449static bool cgfs_escape(void *hdata)
06078509
TA
2450{
2451 struct cgroup_meta_data *md;
2452 int i;
2453 bool ret = false;
2454
2455 md = lxc_cgroup_load_meta();
2456 if (!md)
2457 return false;
2458
517587ef 2459 for (i = 0; i <= md->maximum_hierarchy; i++) {
06078509
TA
2460 struct cgroup_hierarchy *h = md->hierarchies[i];
2461 struct cgroup_mount_point *mp;
2462 char *tasks;
2463 FILE *f;
2464 int written;
2465
2466 if (!h) {
2467 WARN("not escaping hierarchy %d", i);
2468 continue;
2469 }
2470
2471 mp = lxc_cgroup_find_mount_point(h, "/", true);
2472 if (!mp)
2473 goto out;
2474
2475 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2476 if (!tasks)
2477 goto out;
2478
2479 f = fopen(tasks, "a");
2480 free(tasks);
2481 if (!f)
2482 goto out;
2483
0059379f 2484 written = fprintf(f, "%d\n", lxc_raw_getpid());
06078509
TA
2485 fclose(f);
2486 if (written < 0) {
2487 SYSERROR("writing tasks failed\n");
2488 goto out;
2489 }
2490 }
2491
2492 ret = true;
2493out:
2494 lxc_cgroup_put_meta(md);
2495 return ret;
2496}
2497
/* Counting hierarchies is not implemented by the cgroupfs backend;
 * this always returns -1.
 */
static int cgfs_num_hierarchies(void)
{
	const int not_implemented = -1;

	return not_implemented;
}
2503
/* Querying hierarchy `i` is not implemented by the cgroupfs backend;
 * `out` is never written and the function always returns false.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
	(void)i;
	(void)out;

	return false;
}
2509
/* Thaw the container by writing "THAWED" to freezer.state of its
 * freezer cgroup.
 *
 * Returns true if the write succeeded; false if hdata is NULL, the
 * absolute freezer path cannot be determined, or the write failed.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *data = hdata;
	char *rel_path, *abs_path;
	int rc;

	if (!data)
		return false;

	rel_path = lxc_cgroup_get_hierarchy_path_data("freezer", data);
	abs_path = lxc_cgroup_find_abs_path("freezer", rel_path, true, NULL);
	if (!abs_path)
		return false;

	rc = do_cgroup_set(abs_path, "freezer.state", "THAWED");
	free(abs_path);

	return rc == 0;
}
2528
6b38e644 2529static bool cgroupfs_setup_limits(void *hdata, struct lxc_conf *conf,
4fb3cba5 2530 bool with_devices)
9daf6f5d 2531{
4fb3cba5
DE
2532 struct cgfs_data *d = hdata;
2533
2534 if (!d)
2535 return false;
6b38e644 2536 return do_setup_cgroup_limits(d, &conf->cgroup, with_devices) == 0;
9daf6f5d
SH
2537}
2538
/* Move process `pid` into the cgroups of the running container `name`
 * found under `lxcpath`.
 *
 * Metadata and container info are loaded for the duration of the call
 * and released again. Every failure is logged with the same message.
 *
 * Returns true on success, false otherwise.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta;
	struct cgroup_process_info *info;
	bool attached = false;

	meta = lxc_cgroup_load_meta();
	if (meta) {
		info = lxc_cgroup_get_container_info(name, lxcpath, meta);
		/* the metadata is only needed for the lookup */
		lxc_cgroup_put_meta(meta);

		if (info) {
			attached = lxc_cgroupfs_enter(info, pid, false) >= 0;
			lxc_cgroup_process_info_free(info);
		}
	}

	if (!attached)
		ERROR("could not move attached process %d to cgroup of container", pid);

	return attached;
}
2566
/* Argument bundle passed to chown_cgroup_wrapper() via userns_exec_1(). */
struct chown_data {
	/* absolute path of the cgroup directory to chown */
	const char *cgroup_path;
	/* effective uid of the caller, as seen outside the namespace */
	uid_t origuid;
};
2571
2572/*
2573 * TODO - someone should refactor this to unshare once passing all the paths
2574 * to be chowned in one go
2575 */
2576static int chown_cgroup_wrapper(void *data)
2577{
2578 struct chown_data *arg = data;
2579 uid_t destuid;
2580 char *fpath;
2581
8b276860
SH
2582 if (setresgid(0,0,0) < 0)
2583 SYSERROR("Failed to setgid to 0");
2584 if (setresuid(0,0,0) < 0)
2585 SYSERROR("Failed to setuid to 0");
2586 if (setgroups(0, NULL) < 0)
2587 SYSERROR("Failed to clear groups");
2588 destuid = get_ns_uid(arg->origuid);
2589
2590 if (chown(arg->cgroup_path, destuid, 0) < 0)
2591 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2592
2593 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2594 if (!fpath)
2595 return -1;
2596 if (chown(fpath, destuid, 0) < 0)
2597 SYSERROR("Error chowning %s\n", fpath);
2598 free(fpath);
01d59fe5 2599
8b276860
SH
2600 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2601 if (!fpath)
2602 return -1;
2603 if (chown(fpath, destuid, 0) < 0)
2604 SYSERROR("Error chowning %s", fpath);
2605 free(fpath);
2606
2607 return 0;
2608}
2609
2610static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2611{
2612 struct chown_data data;
2613 char *fpath;
2614
01d59fe5
CB
2615 if (!dir_exists(cgroup_path))
2616 return true;
2617
8b276860
SH
2618 if (lxc_list_empty(&conf->id_map))
2619 /* If there's no mapping then we don't need to chown */
2620 return true;
2621
2622 data.cgroup_path = cgroup_path;
2623 data.origuid = geteuid();
2624
2625 /* Unpriv users can't chown it themselves, so chown from
2626 * a child namespace mapping both our own and the target uid
2627 */
c9b7c33e
CB
2628 if (userns_exec_1(conf, chown_cgroup_wrapper, &data,
2629 "chown_cgroup_wrapper") < 0) {
8b276860
SH
2630 ERROR("Error requesting cgroup chown in new namespace");
2631 return false;
2632 }
2633
2634 /*
2635 * Now chmod 775 the directory else the container cannot create cgroups.
2636 * This can't be done in the child namespace because it only group-owns
2637 * the cgroup
2638 */
2639 if (chmod(cgroup_path, 0775) < 0) {
2640 SYSERROR("Error chmoding %s\n", cgroup_path);
2641 return false;
2642 }
2643 fpath = lxc_append_paths(cgroup_path, "tasks");
2644 if (!fpath)
2645 return false;
2646 if (chmod(fpath, 0664) < 0)
2647 SYSERROR("Error chmoding %s\n", fpath);
2648 free(fpath);
2649 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2650 if (!fpath)
2651 return false;
2652 if (chmod(fpath, 0664) < 0)
2653 SYSERROR("Error chmoding %s\n", fpath);
2654 free(fpath);
2655
2656 return true;
2657}
2658
2659static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2660{
2661 struct cgfs_data *d = hdata;
2662 struct cgroup_process_info *info_ptr;
2663 char *cgpath;
2664 bool r = true;
2665
2666 if (!d)
2667 return false;
2668
2669 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
517587ef
CB
2670 if (!info_ptr->hierarchy)
2671 continue;
2672
8b276860
SH
2673 if (!info_ptr->designated_mount_point) {
2674 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2675 if (!info_ptr->designated_mount_point) {
2676 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2677 return false;
2678 }
2679 }
2680
2681 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2682 if (!cgpath) {
2683 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2684 continue;
2685 }
2686 r = do_cgfs_chown(cgpath, conf);
ea439aac 2687 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
8b276860
SH
2688 ERROR("Failed chowning %s\n", cgpath);
2689 free(cgpath);
2690 return false;
2691 }
2692 free(cgpath);
2693 }
2694
2695 return true;
2696}
2697
d4ef7c50 2698static struct cgroup_ops cgfs_ops = {
d4ef7c50 2699 .init = cgfs_init,
4fb3cba5 2700 .destroy = cgfs_destroy,
d4ef7c50
SH
2701 .create = cgfs_create,
2702 .enter = cgfs_enter,
2703 .create_legacy = cgfs_create_legacy,
2704 .get_cgroup = cgfs_get_cgroup,
06078509 2705 .escape = cgfs_escape,
36662416
TA
2706 .num_hierarchies = cgfs_num_hierarchies,
2707 .get_hierarchies = cgfs_get_hierarchies,
d4ef7c50
SH
2708 .get = lxc_cgroupfs_get,
2709 .set = lxc_cgroupfs_set,
4fb3cba5 2710 .unfreeze = cgfs_unfreeze,
9daf6f5d 2711 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2712 .name = "cgroupfs",
5d897655 2713 .attach = lxc_cgroupfs_attach,
8b276860 2714 .chown = cgfs_chown,
c476bdce 2715 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2716 .nrtasks = cgfs_nrtasks,
23befb18 2717 .driver = CGFS,
d4ef7c50 2718};