]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgfs.c
cgfs: also check for EACCES when writing devices
[mirror_lxc.git] / src / lxc / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
e2bcd7db 42#include "error.h"
ae5c8b8e 43#include "commands.h"
b98f7d6e
SH
44#include "list.h"
45#include "conf.h"
33ad9f1a 46#include "utils.h"
4ec31c52 47#include "bdev/bdev.h"
f2363e38
ÇO
48#include "log.h"
49#include "cgroup.h"
50#include "start.h"
484ed030 51#include "state.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
59struct cgroup_hierarchy;
60struct cgroup_meta_data;
61struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: description of the cgroup infrastructure found on
 * this host.
 *
 * The object is reference counted: lxc_cgroup_get_meta() takes a
 * reference and lxc_cgroup_put_meta() drops one, releasing the whole
 * object once the count is no longer positive.
 */
struct cgroup_meta_data {
	/* simple refcount */
	ptrdiff_t ref;
	/* indexed by hierarchy number; individual slots may be NULL */
	struct cgroup_hierarchy **hierarchies;
	/* every cgroup mount point that was discovered */
	struct cgroup_mount_point **mount_points;
	/* highest hierarchy number stored in 'hierarchies' */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: a single cgroup hierarchy as listed in
 * /proc/self/cgroup. One hierarchy may be mounted in several places.
 */
struct cgroup_hierarchy {
	/* hierarchy number, also the slot in cgroup_meta_data.hierarchies */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* NULL-terminated list of the subsystems bound to this hierarchy */
	char **subsystems;
	/* first writable mount whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount whose mount prefix is "/" (may be NULL) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* all mount points of this hierarchy, NULL-terminated */
	struct cgroup_mount_point **all_mount_points;
	/* allocation bookkeeping for all_mount_points */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: one place in the filesystem where a cgroup
 * hierarchy is mounted.
 */
struct cgroup_mount_point {
	/* hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* mount point field of the mountinfo entry */
	char *mount_point;
	/* mount prefix field of the mountinfo entry; "/" when the whole
	 * hierarchy is visible through this mount
	 */
	char *mount_prefix;
	/* true if the per-mount options do not contain "rw" */
	bool read_only;
	/* NOTE(review): presumably set when cpuset files still need to be
	 * initialized for cgroups under this mount - confirm against
	 * init_cpuset_if_needed()
	 */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: membership of one process in the different
 * cgroup hierarchies. The entries form a singly linked list through
 * 'next', one entry per hierarchy.
 *
 * This is the per-process information tracked by cgfs_ops; it is not
 * used with cgmanager.
 */
struct cgroup_process_info {
	/* next list entry, NULL at the end of the list */
	struct cgroup_process_info *next;
	/* meta data this entry was built from */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy described by this entry */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup of the process inside 'hierarchy' */
	char *cgroup_path;
	/* NOTE(review): appears to be the cgroup derived from the optional
	 * sub pattern - confirm against the users of this field
	 */
	char *cgroup_path_sub;
	/* cgroups created on behalf of this entry, with bookkeeping */
	char **created_paths;
	size_t created_paths_capacity;
	size_t created_paths_count;
	/* mount point selected for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/*
 * cgfs_data: state the cgfs driver keeps for one container.
 */
struct cgfs_data {
	/* container name */
	char *name;
	/* pattern from which the container's cgroup path is built */
	const char *cgroup_pattern;
	/* cgroup layout of the host */
	struct cgroup_meta_data *meta;
	/* head of the per-hierarchy membership list */
	struct cgroup_process_info *info;
};
126
127lxc_log_define(lxc_cgfs, lxc);
576f946d 128
33ad9f1a
CS
129static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133static bool is_valid_cgroup(const char *name);
33ad9f1a 134static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
603c64c2 135static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse);
33ad9f1a
CS
136static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
137static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
138static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
139static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
140static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
141static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a
CS
142static int cgroup_recursive_task_count(const char *cgroup_path);
143static int count_lines(const char *fn);
1ea59ad2 144static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 145static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 146
4fb3cba5
DE
147static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151/* free process membership information */
152static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
153static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info);
154
d4ef7c50 155static struct cgroup_ops cgfs_ops;
d4ef7c50 156
603c64c2
SH
/*
 * cgroup_rmdir: remove the directory @dirname together with all of its
 * subdirectories.
 *
 * Only directories are descended into and removed with rmdir(); other
 * directory entries are left untouched. The function keeps going after
 * an error so that as much as possible gets removed.
 *
 * Returns 0 on success. Returns -1 if anything failed, with errno set to
 * the first error that was recorded.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("%s: failed to open %s", __func__, dirname);
		return -1;
	}

	for (;;) {
		struct stat mystat;
		int rc;

		/* readdir() returns NULL for both end-of-directory and
		 * errors; only errno tells the two apart
		 */
		errno = 0;
		direntp = readdir(dir);
		if (!direntp) {
			if (errno) {
				SYSERROR("%s: failed to read directory %s", __func__, dirname);
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
			break;
		}

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}
		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			/* the recursive call leaves its first error in errno */
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
225
/*
 * lxc_cgroup_load_meta: load the cgroup meta data of this host, limited
 * to the subsystems listed in the global "lxc.cgroup.use" setting when
 * that setting is present.
 *
 * Returns the new meta data object, or NULL on failure. The errno value
 * left behind by lxc_cgroup_load_meta2() is preserved.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *meta;
	char **whitelist = NULL;
	const char *use;
	int err;

	/* a NULL value only counts as an error when errno was set */
	errno = 0;
	use = lxc_global_config_value("lxc.cgroup.use");
	if (use) {
		whitelist = lxc_string_split_and_trim(use, ',');
		if (!whitelist)
			return NULL;
	} else if (errno != 0) {
		return NULL;
	}

	meta = lxc_cgroup_load_meta2((const char **)whitelist);

	/* freeing the list must not disturb the reported errno */
	err = errno;
	lxc_free_array((void **)whitelist, free);
	errno = err;

	return meta;
}
fd37327f 249
/* Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends a copy of every subsystem name to
 * *kernel_subsystems. Lines starting with '#', lines without two tab
 * separated fields and lines whose second field is not a number are
 * skipped.
 *
 * Returns true on success and false on failure. Entries appended before
 * a failure stay in *kernel_subsystems for the caller to free.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *f;
	char *buf = NULL;
	size_t buflen = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	f = fopen_cloexec("/proc/cgroups", "r");
	if (!f)
		return false;

	while (getline(&buf, &buflen, f) != -1) {
		char *number;
		char *number_end;
		char *parse_end = NULL;

		if (buf[0] == '#' || buf[0] == '\0')
			continue;

		/* first field: subsystem name, terminated in place */
		number = strchr(buf, '\t');
		if (!number)
			continue;
		*number++ = '\0';

		/* second field: hierarchy number */
		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the value is not needed, it only has to parse completely */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(buf);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(f);
	free(buf);
	return ok;
}
305
306/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
307 * since mount points don't specify hierarchy number and
308 * /proc/cgroups does not contain named hierarchies
309 */
310static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
311 bool all_kernel_subsystems, bool all_named_subsystems,
312 const char **subsystem_whitelist)
313{
314 FILE *proc_self_cgroup;
315 char *line = NULL;
316 size_t sz = 0;
317 int r;
318 bool bret = false;
319 size_t hierarchy_capacity = 0;
ef6e34ee 320
33ad9f1a
CS
321 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
322 /* if for some reason (because of setns() and pid namespace for example),
323 * /proc/self is not valid, we try /proc/1/cgroup... */
324 if (!proc_self_cgroup)
325 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
326 if (!proc_self_cgroup)
b653309a 327 return false;
33ad9f1a
CS
328
329 while (getline(&line, &sz, proc_self_cgroup) != -1) {
330 /* file format: hierarchy:subsystems:group,
331 * we only extract hierarchy and subsystems
332 * here */
333 char *colon1;
334 char *colon2;
335 int hierarchy_number;
336 struct cgroup_hierarchy *h = NULL;
337 char **p;
338
339 if (!line[0])
340 continue;
ad08bbb7 341
33ad9f1a
CS
342 colon1 = strchr(line, ':');
343 if (!colon1)
8900b9eb 344 continue;
33ad9f1a
CS
345 *colon1++ = '\0';
346 colon2 = strchr(colon1, ':');
347 if (!colon2)
348 continue;
349 *colon2 = '\0';
ad08bbb7 350
33ad9f1a
CS
351 colon2 = NULL;
352 hierarchy_number = strtoul(line, &colon2, 10);
353 if (!colon2 || *colon2)
354 continue;
576f946d 355
33ad9f1a
CS
356 if (hierarchy_number > meta_data->maximum_hierarchy) {
357 /* lxc_grow_array will never shrink, so even if we find a lower
358 * hierarchy number here, the array will never be smaller
359 */
360 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
361 if (r < 0)
b653309a 362 goto out;
5193cc3d 363
33ad9f1a
CS
364 meta_data->maximum_hierarchy = hierarchy_number;
365 }
fd37327f 366
33ad9f1a
CS
367 /* this shouldn't happen, we had this already */
368 if (meta_data->hierarchies[hierarchy_number])
b653309a 369 goto out;
33ad9f1a
CS
370
371 h = calloc(1, sizeof(struct cgroup_hierarchy));
372 if (!h)
b653309a 373 goto out;
33ad9f1a
CS
374
375 meta_data->hierarchies[hierarchy_number] = h;
376
377 h->index = hierarchy_number;
378 h->subsystems = lxc_string_split_and_trim(colon1, ',');
379 if (!h->subsystems)
b653309a 380 goto out;
33ad9f1a
CS
381 /* see if this hierarchy should be considered */
382 if (!all_kernel_subsystems || !all_named_subsystems) {
383 for (p = h->subsystems; *p; p++) {
384 if (!strncmp(*p, "name=", 5)) {
385 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
386 h->used = true;
387 break;
388 }
389 } else {
390 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
391 h->used = true;
392 break;
393 }
394 }
395 }
396 } else {
397 /* we want all hierarchy anyway */
398 h->used = true;
ae5c8b8e 399 }
ae5c8b8e 400 }
b653309a 401 bret = true;
0b9c21ab 402
b653309a 403out:
33ad9f1a 404 fclose(proc_self_cgroup);
0ccf7c2a 405 free(line);
b653309a
SH
406 return bret;
407}
408
409/* Step 3: determine all mount points of each hierarchy */
410static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
411{
412 bool bret = false;
413 FILE *proc_self_mountinfo;
414 char *line = NULL;
415 size_t sz = 0;
416 char **tokens = NULL;
417 size_t mount_point_count = 0;
418 size_t mount_point_capacity = 0;
419 size_t token_capacity = 0;
420 int r;
421
33ad9f1a
CS
422 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
423 /* if for some reason (because of setns() and pid namespace for example),
424 * /proc/self is not valid, we try /proc/1/cgroup... */
425 if (!proc_self_mountinfo)
426 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
427 if (!proc_self_mountinfo)
b653309a 428 return false;
33ad9f1a
CS
429
430 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 431 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
432 size_t i, j, k;
433 struct cgroup_mount_point *mount_point;
434 struct cgroup_hierarchy *h;
435 char **subsystems;
436
437 if (line[0] && line[strlen(line) - 1] == '\n')
438 line[strlen(line) - 1] = '\0';
439
178938fe 440 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
441 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
442 if (r < 0)
b653309a 443 goto out;
33ad9f1a
CS
444 tokens[i++] = token;
445 }
b98f7d6e 446
33ad9f1a
CS
447 /* layout of /proc/self/mountinfo:
448 * 0: id
449 * 1: parent id
450 * 2: device major:minor
451 * 3: mount prefix
8900b9eb 452 * 4: mount point
33ad9f1a
CS
453 * 5: per-mount options
454 * [optional X]: additional data
455 * X+7: "-"
456 * X+8: type
457 * X+9: source
458 * X+10: per-superblock options
459 */
460 for (j = 6; j < i && tokens[j]; j++)
461 if (!strcmp(tokens[j], "-"))
462 break;
fd4f5a56 463
33ad9f1a
CS
464 /* could not find separator */
465 if (j >= i || !tokens[j])
466 continue;
467 /* there should be exactly three fields after
468 * the separator
469 */
470 if (i != j + 4)
471 continue;
fd4f5a56 472
33ad9f1a
CS
473 /* not a cgroup filesystem */
474 if (strcmp(tokens[j + 1], "cgroup") != 0)
475 continue;
b98f7d6e 476
33ad9f1a
CS
477 subsystems = subsystems_from_mount_options(tokens[j + 3], kernel_subsystems);
478 if (!subsystems)
b653309a 479 goto out;
33ad9f1a
CS
480
481 h = NULL;
482 for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
483 if (meta_data->hierarchies[k] &&
484 meta_data->hierarchies[k]->subsystems[0] &&
485 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
486 /* TODO: we could also check if the lists really match completely,
487 * just to have an additional sanity check */
488 h = meta_data->hierarchies[k];
b98f7d6e 489 break;
33ad9f1a 490 }
b98f7d6e 491 }
33ad9f1a
CS
492 lxc_free_array((void **)subsystems, free);
493
494 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
495 if (r < 0)
b653309a 496 goto out;
33ad9f1a
CS
497
498 /* create mount point object */
499 mount_point = calloc(1, sizeof(*mount_point));
500 if (!mount_point)
b653309a 501 goto out;
33ad9f1a
CS
502
503 meta_data->mount_points[mount_point_count++] = mount_point;
504
505 mount_point->hierarchy = h;
506 mount_point->mount_point = strdup(tokens[4]);
507 mount_point->mount_prefix = strdup(tokens[3]);
508 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 509 goto out;
33ad9f1a
CS
510 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
511
512 if (!strcmp(mount_point->mount_prefix, "/")) {
513 if (mount_point->read_only) {
514 if (!h->ro_absolute_mount_point)
515 h->ro_absolute_mount_point = mount_point;
516 } else {
517 if (!h->rw_absolute_mount_point)
518 h->rw_absolute_mount_point = mount_point;
519 }
b98f7d6e 520 }
ae5c8b8e 521
33ad9f1a
CS
522 k = lxc_array_len((void **)h->all_mount_points);
523 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
524 if (r < 0)
b653309a 525 goto out;
33ad9f1a 526 h->all_mount_points[k] = mount_point;
fd4f5a56 527 }
b653309a
SH
528 bret = true;
529
530out:
b653309a 531 fclose(proc_self_mountinfo);
b653309a 532 free(tokens);
2cdafc54 533 free(line);
b653309a
SH
534 return bret;
535}
536
4fb3cba5 537static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
538{
539 bool all_kernel_subsystems = true;
540 bool all_named_subsystems = false;
541 struct cgroup_meta_data *meta_data = NULL;
542 char **kernel_subsystems = NULL;
543 int saved_errno = 0;
544
545 /* if the subsystem whitelist is not specified, include all
546 * hierarchies that contain kernel subsystems by default but
547 * no hierarchies that only contain named subsystems
548 *
549 * if it is specified, the specifier @all will select all
550 * hierarchies, @kernel will select all hierarchies with
551 * kernel subsystems and @named will select all named
552 * hierarchies
553 */
554 all_kernel_subsystems = subsystem_whitelist ?
555 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
556 true;
557 all_named_subsystems = subsystem_whitelist ?
558 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 559 true;
b653309a
SH
560
561 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
562 if (!meta_data)
563 return NULL;
564 meta_data->ref = 1;
565
566 if (!find_cgroup_subsystems(&kernel_subsystems))
567 goto out_error;
568
569 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
570 all_named_subsystems, subsystem_whitelist))
571 goto out_error;
572
573 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
574 goto out_error;
fd4f5a56 575
33ad9f1a
CS
576 /* oops, we couldn't find anything */
577 if (!meta_data->hierarchies || !meta_data->mount_points) {
578 errno = EINVAL;
579 goto out_error;
ae5c8b8e 580 }
fd4f5a56 581
3a0abb3a 582 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
583 return meta_data;
584
585out_error:
586 saved_errno = errno;
33ad9f1a
CS
587 lxc_free_array((void **)kernel_subsystems, free);
588 lxc_cgroup_put_meta(meta_data);
589 errno = saved_errno;
590 return NULL;
fd4f5a56
DL
591}
592
4fb3cba5 593static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 594{
33ad9f1a
CS
595 meta_data->ref++;
596 return meta_data;
597}
e14f67a7 598
4fb3cba5 599static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
600{
601 size_t i;
602 if (!meta_data)
603 return NULL;
604 if (--meta_data->ref > 0)
605 return meta_data;
606 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
607 if (meta_data->hierarchies) {
608 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
609 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
e14f67a7 610 }
33ad9f1a 611 free(meta_data->hierarchies);
178938fe 612 free(meta_data);
33ad9f1a 613 return NULL;
e14f67a7
U
614}
615
4fb3cba5 616static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 617{
33ad9f1a
CS
618 size_t i;
619 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
620 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
621 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
622 return h;
e14f67a7 623 }
e14f67a7
U
624 return NULL;
625}
626
4fb3cba5 627static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 628{
33ad9f1a
CS
629 struct cgroup_mount_point **mps;
630 struct cgroup_mount_point *current_result = NULL;
631 ssize_t quality = -1;
b98f7d6e 632
33ad9f1a
CS
633 /* trivial case */
634 if (hierarchy->rw_absolute_mount_point)
635 return hierarchy->rw_absolute_mount_point;
636 if (!should_be_writable && hierarchy->ro_absolute_mount_point)
637 return hierarchy->ro_absolute_mount_point;
b98f7d6e 638
33ad9f1a
CS
639 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
640 struct cgroup_mount_point *mp = *mps;
641 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 642
33ad9f1a
CS
643 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
644 prefix_len = 0;
b98f7d6e 645
33ad9f1a
CS
646 if (should_be_writable && mp->read_only)
647 continue;
648
649 if (!prefix_len ||
650 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
651 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
652 /* search for the best quality match, i.e. the match with the
653 * shortest prefix where this group is still contained
654 */
655 if (quality == -1 || prefix_len < quality) {
656 current_result = mp;
657 quality = prefix_len;
658 }
b98f7d6e
SH
659 }
660 }
661
33ad9f1a
CS
662 if (!current_result)
663 errno = ENOENT;
664 return current_result;
b98f7d6e
SH
665}
666
/*
 * lxc_cgroup_find_abs_path: turn the cgroup 'group' of 'subsystem' into
 * an absolute path.
 *
 * The host meta data is loaded for the duration of the call. The
 * hierarchy of subsystem and a suitable mount point are looked up, and
 * the path is built by cgroup_to_absolute_path() from that mount point,
 * group and suffix.
 *
 * Returns the string produced by cgroup_to_absolute_path(), or NULL on
 * failure. On failure, releasing the meta data does not change errno.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta;
	struct cgroup_hierarchy *hier;
	struct cgroup_mount_point *mnt;
	char *abs_path = NULL;
	int err;

	meta = lxc_cgroup_load_meta();
	if (!meta)
		return NULL;

	hier = lxc_cgroup_find_hierarchy(meta, subsystem);
	if (hier) {
		mnt = lxc_cgroup_find_mount_point(hier, group, should_be_writable);
		if (mnt)
			abs_path = cgroup_to_absolute_path(mnt, group, suffix);
	}

	if (abs_path) {
		lxc_cgroup_put_meta(meta);
		return abs_path;
	}

	err = errno;
	lxc_cgroup_put_meta(meta);
	errno = err;
	return NULL;
}
700
/*
 * lxc_cgroup_process_info_get: read the cgroup membership of process
 * pid from /proc/<pid>/cgroup.
 *
 * Returns whatever lxc_cgroup_process_info_getx() returns for that file.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);
	return lxc_cgroup_process_info_getx(proc_path, meta);
}
707
/*
 * lxc_cgroup_process_info_get_init: read the cgroup membership of the
 * process with pid 1.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 712
/*
 * lxc_cgroup_process_info_get_self: read the cgroup membership of the
 * calling process.
 *
 * /proc/self/cgroup is tried first; if that yields nothing, the lookup
 * is repeated through /proc/<own pid>/cgroup.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *self_info;

	self_info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (self_info)
		return self_info;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
ae5c8b8e 721
692ba18f
SH
/*
 * If a controller has ns cgroup mounted, then in that cgroup the
 * container's init is already in a new cgroup named after its pid.
 * This function renames that per-pid cgroup directory to the container
 * name.
 *
 * @mountpath: where the hierarchy is mounted, e.g. /cgroup
 * @oldname:   cgroup that contains the per-pid cgroup, e.g. /ab
 * @pid:       pid the existing cgroup directory is named after, e.g. 2375
 * @name:      container name, e.g. c1
 *
 * With the example values /cgroup/ab/2375 is renamed to /cgroup/ab/c1
 * and "/ab/c1" is returned. If the target exists already, it is removed
 * first with rmdir(), which only works when it is empty.
 *
 * Returns the new cgroup name as a malloc()ed string, or NULL on
 * failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = /ab/c1
	 */

	/* 22 bytes of headroom for the separators, the pid and the NUL */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* the argument is cast to unsigned long, so it has to be printed with %lu */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
789
33ad9f1a 790/* create a new cgroup */
4fb3cba5 791static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 792{
001b026e 793 char **cgroup_path_components = NULL;
33ad9f1a
CS
794 char **p = NULL;
795 char *path_so_far = NULL;
796 char **new_cgroup_paths = NULL;
797 char **new_cgroup_paths_sub = NULL;
798 struct cgroup_mount_point *mp;
799 struct cgroup_hierarchy *h;
800 struct cgroup_process_info *base_info = NULL;
801 struct cgroup_process_info *info_ptr;
802 int saved_errno;
803 int r;
804 unsigned suffix = 0;
805 bool had_sub_pattern = false;
806 size_t i;
ae5c8b8e 807
33ad9f1a
CS
808 if (!is_valid_cgroup(name)) {
809 ERROR("Invalid cgroup name: '%s'", name);
810 errno = EINVAL;
811 return NULL;
ae5c8b8e
SH
812 }
813
33ad9f1a
CS
814 if (!strstr(path_pattern, "%n")) {
815 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
816 errno = EINVAL;
817 return NULL;
818 }
fd37327f 819
33ad9f1a
CS
820 /* we will modify the result of this operation directly,
821 * so we don't have to copy the data structure
822 */
823 base_info = (path_pattern[0] == '/') ?
824 lxc_cgroup_process_info_get_init(meta_data) :
825 lxc_cgroup_process_info_get_self(meta_data);
826 if (!base_info)
827 return NULL;
c8f7c563 828
33ad9f1a
CS
829 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
830 if (!new_cgroup_paths)
831 goto out_initial_error;
832
833 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
834 if (!new_cgroup_paths_sub)
835 goto out_initial_error;
836
837 /* find mount points we can use */
838 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
839 h = info_ptr->hierarchy;
840 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
841 if (!mp) {
842 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
843 goto out_initial_error;
844 }
845 info_ptr->designated_mount_point = mp;
460a1cf0 846
692ba18f
SH
847 if (lxc_string_in_array("ns", (const char **)h->subsystems))
848 continue;
2edb53c7
SH
849 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
850 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 851 goto out_initial_error;
2edb53c7 852 }
33ad9f1a 853 }
b98f7d6e 854
33ad9f1a
CS
855 /* normalize the path */
856 cgroup_path_components = lxc_normalize_path(path_pattern);
857 if (!cgroup_path_components)
858 goto out_initial_error;
859
860 /* go through the path components to see if we can create them */
861 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
862 /* we only want to create the same component with -1, -2, etc.
863 * if the component contains the container name itself, otherwise
864 * it's not an error if it already exists
865 */
866 char *p_eff = *p ? *p : (char *)sub_pattern;
867 bool contains_name = strstr(p_eff, "%n");
868 char *current_component = NULL;
869 char *current_subpath = NULL;
870 char *current_entire_path = NULL;
871 char *parts[3];
872 size_t j = 0;
873 i = 0;
874
875 /* if we are processing the subpattern, we want to make sure
876 * loop is ended the next time around
877 */
878 if (!*p) {
879 had_sub_pattern = true;
880 p--;
881 }
b98f7d6e 882
33ad9f1a 883 goto find_name_on_this_level;
4fb3cba5 884
33ad9f1a
CS
885 cleanup_name_on_this_level:
886 /* This is reached if we found a name clash.
887 * In that case, remove the cgroup from all previous hierarchies
888 */
889 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
603c64c2 890 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false);
33ad9f1a
CS
891 if (r < 0)
892 WARN("could not clean up cgroup we created when trying to create container");
893 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
894 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
895 }
896 if (current_component != current_subpath)
897 free(current_subpath);
898 if (current_component != p_eff)
899 free(current_component);
900 current_component = current_subpath = NULL;
901 /* try again with another suffix */
902 ++suffix;
4fb3cba5 903
33ad9f1a
CS
904 find_name_on_this_level:
905 /* determine name of the path component we should create */
906 if (contains_name && suffix > 0) {
907 char *buf = calloc(strlen(name) + 32, 1);
908 if (!buf)
909 goto out_initial_error;
910 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
911 current_component = lxc_string_replace("%n", buf, p_eff);
912 free(buf);
913 } else {
914 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
915 }
916 parts[0] = path_so_far;
917 parts[1] = current_component;
918 parts[2] = NULL;
919 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
920
921 /* Now go through each hierarchy and try to create the
922 * corresponding cgroup
923 */
924 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
925 char *parts2[3];
692ba18f
SH
926
927 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
928 continue;
33ad9f1a
CS
929 current_entire_path = NULL;
930
931 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
932 parts2[1] = current_subpath;
933 parts2[2] = NULL;
934 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
935
936 if (!*p) {
937 /* we are processing the subpath, so only update that one */
938 free(new_cgroup_paths_sub[i]);
939 new_cgroup_paths_sub[i] = strdup(current_entire_path);
940 if (!new_cgroup_paths_sub[i])
941 goto cleanup_from_error;
942 } else {
943 /* remember which path was used on this controller */
944 free(new_cgroup_paths[i]);
945 new_cgroup_paths[i] = strdup(current_entire_path);
946 if (!new_cgroup_paths[i])
947 goto cleanup_from_error;
948 }
fd4f5a56 949
33ad9f1a
CS
950 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
951 if (r < 0 && errno == EEXIST && contains_name) {
952 /* name clash => try new name with new suffix */
953 free(current_entire_path);
954 current_entire_path = NULL;
955 goto cleanup_name_on_this_level;
956 } else if (r < 0 && errno != EEXIST) {
b38b62a6 957 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
33ad9f1a
CS
958 goto cleanup_from_error;
959 } else if (r == 0) {
960 /* successfully created */
961 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
962 if (r < 0)
963 goto cleanup_from_error;
d703c2b1 964 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 965 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
966 goto cleanup_from_error;
967 }
33ad9f1a
CS
968 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
969 } else {
970 /* if we didn't create the cgroup, then we have to make sure that
971 * further cgroups will be created properly
972 */
d703c2b1 973 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 974 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 975 goto cleanup_from_error;
f6ac3b9e 976 }
d703c2b1
RV
977 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
978 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
979 goto cleanup_from_error;
980 }
33ad9f1a
CS
981
982 /* already existed but path component of pattern didn't contain '%n',
983 * so this is not an error; but then we don't need current_entire_path
984 * anymore...
985 */
986 free(current_entire_path);
987 current_entire_path = NULL;
988 }
989 }
fd4f5a56 990
33ad9f1a
CS
991 /* save path so far */
992 free(path_so_far);
993 path_so_far = strdup(current_subpath);
994 if (!path_so_far)
995 goto cleanup_from_error;
996
997 /* cleanup */
998 if (current_component != current_subpath)
999 free(current_subpath);
1000 if (current_component != p_eff)
1001 free(current_component);
1002 current_component = current_subpath = NULL;
1003 continue;
4fb3cba5 1004
33ad9f1a 1005 cleanup_from_error:
ec64264d 1006 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1007 * do some additional cleanup here
1008 */
1009 saved_errno = errno;
1010 if (current_component != current_subpath)
1011 free(current_subpath);
1012 if (current_component != p_eff)
1013 free(current_component);
1014 free(current_entire_path);
1015 errno = saved_errno;
1016 goto out_initial_error;
fd4f5a56
DL
1017 }
1018
33ad9f1a
CS
1019 /* we're done, now update the paths */
1020 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
47d8fb3b
CS
1021 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1022 * will take care of it
1023 * Since we do a continue in above loop, new_cgroup_paths[i] is
1024 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1025 */
47d8fb3b
CS
1026 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1027 continue;
1028 free(info_ptr->cgroup_path);
1029 info_ptr->cgroup_path = new_cgroup_paths[i];
1030 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1031 }
33ad9f1a
CS
1032 /* don't use lxc_free_array since we used the array members
1033 * to store them in our result...
1034 */
1035 free(new_cgroup_paths);
1036 free(new_cgroup_paths_sub);
1037 free(path_so_far);
1038 lxc_free_array((void **)cgroup_path_components, free);
1039 return base_info;
1040
1041out_initial_error:
1042 saved_errno = errno;
1043 free(path_so_far);
1044 lxc_cgroup_process_info_free_and_remove(base_info);
1045 lxc_free_array((void **)new_cgroup_paths, free);
1046 lxc_free_array((void **)new_cgroup_paths_sub, free);
1047 lxc_free_array((void **)cgroup_path_components, free);
1048 errno = saved_errno;
1049 return NULL;
c8f7c563
CS
1050}
1051
4fb3cba5 1052static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1053{
1054 struct cgroup_process_info *info_ptr;
1055 int r;
1056
1057 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1058 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1059 continue;
1060 /*
1061 * For any path which has ns cgroup mounted, handler->pid is already
1062 * moved into a container called '%d % (handler->pid)'. Rename it to
1063 * the cgroup name and record that.
1064 */
1065 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1066 info_ptr->cgroup_path, pid, name);
1067 if (!tmp)
1068 return -1;
1069 free(info_ptr->cgroup_path);
1070 info_ptr->cgroup_path = tmp;
1071 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1072 if (r < 0)
1073 return -1;
1074 tmp = strdup(tmp);
1075 if (!tmp)
1076 return -1;
1077 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1078 }
1079 return 0;
1080}
1081
33ad9f1a 1082/* get the cgroup membership of a given container */
4fb3cba5 1083static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1084{
33ad9f1a
CS
1085 struct cgroup_process_info *result = NULL;
1086 int saved_errno = 0;
1087 size_t i;
1088 struct cgroup_process_info **cptr = &result;
1089 struct cgroup_process_info *entry = NULL;
1090 char *path = NULL;
1091
1092 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1093 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1094 if (!h || !h->used)
1095 continue;
c8f7c563 1096
33ad9f1a
CS
1097 /* use the command interface to look for the cgroup */
1098 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1099 if (!path) {
1100 h->used = false;
1101 WARN("Not attaching to cgroup %s unknown to %s %s", h->subsystems[0], lxcpath, name);
1102 continue;
1103 }
33ad9f1a
CS
1104
1105 entry = calloc(1, sizeof(struct cgroup_process_info));
1106 if (!entry)
1107 goto out_error;
1108 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1109 entry->hierarchy = h;
1110 entry->cgroup_path = path;
1111 path = NULL;
1112
1113 /* it is not an error if we don't find anything here,
1114 * it is up to the caller to decide what to do in that
1115 * case */
1116 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1117
1118 *cptr = entry;
1119 cptr = &entry->next;
1120 entry = NULL;
c8f7c563
CS
1121 }
1122
33ad9f1a
CS
1123 return result;
1124out_error:
1125 saved_errno = errno;
1126 free(path);
1127 lxc_cgroup_process_info_free(result);
1128 lxc_cgroup_process_info_free(entry);
1129 errno = saved_errno;
1130 return NULL;
fd4f5a56
DL
1131}
1132
33ad9f1a 1133/* move a processs to the cgroups specified by the membership */
4fb3cba5 1134static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1135{
33ad9f1a
CS
1136 char pid_buf[32];
1137 char *cgroup_tasks_fn;
1138 int r;
1139 struct cgroup_process_info *info_ptr;
1140
1141 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1142 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1143 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1144 info_ptr->cgroup_path_sub :
1145 info_ptr->cgroup_path;
1146
1147 if (!info_ptr->designated_mount_point) {
1148 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1149 if (!info_ptr->designated_mount_point) {
1150 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1151 return -1;
1152 }
1153 }
4f17323e 1154
33ad9f1a
CS
1155 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1156 if (!cgroup_tasks_fn) {
1157 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1158 return -1;
1159 }
4f17323e 1160
33ad9f1a 1161 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1162 free(cgroup_tasks_fn);
33ad9f1a
CS
1163 if (r < 0) {
1164 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1165 return -1;
1166 }
4f17323e
CS
1167 }
1168
33ad9f1a 1169 return 0;
4f17323e
CS
1170}
1171
33ad9f1a
CS
1172/* free process membership information */
1173void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1174{
33ad9f1a
CS
1175 struct cgroup_process_info *next;
1176 if (!info)
b98f7d6e 1177 return;
33ad9f1a
CS
1178 next = info->next;
1179 lxc_cgroup_put_meta(info->meta_ref);
1180 free(info->cgroup_path);
1181 free(info->cgroup_path_sub);
1182 lxc_free_array((void **)info->created_paths, free);
1183 free(info);
1184 lxc_cgroup_process_info_free(next);
fc7de561
SH
1185}
1186
33ad9f1a
CS
1187/* free process membership information and remove cgroups that were created */
1188void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info)
b98f7d6e 1189{
33ad9f1a
CS
1190 struct cgroup_process_info *next;
1191 char **pp;
1192 if (!info)
1193 return;
1194 next = info->next;
603c64c2 1195 {
33ad9f1a
CS
1196 struct cgroup_mount_point *mp = info->designated_mount_point;
1197 if (!mp)
1198 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1199 if (mp)
1200 /* ignore return value here, perhaps we created the
1201 * '/lxc' cgroup in this container but another container
1202 * is still running (for example)
1203 */
603c64c2
SH
1204 (void)remove_cgroup(mp, info->cgroup_path, true);
1205 }
1206 for (pp = info->created_paths; pp && *pp; pp++);
1207 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1208 free(*pp);
b98f7d6e 1209 }
33ad9f1a
CS
1210 free(info->created_paths);
1211 lxc_cgroup_put_meta(info->meta_ref);
1212 free(info->cgroup_path);
1213 free(info->cgroup_path_sub);
1214 free(info);
9431aa65 1215 lxc_cgroup_process_info_free_and_remove(next);
33ad9f1a 1216}
b98f7d6e 1217
4fb3cba5 1218static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1219{
d4ef7c50
SH
1220 struct cgroup_process_info *info = d->info;
1221 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1222 if (!info)
1223 return NULL;
f348e47c 1224 prune_init_scope(info->cgroup_path);
33ad9f1a 1225 return info->cgroup_path;
b98f7d6e
SH
1226}
1227
4fb3cba5 1228static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1229{
d4ef7c50 1230 struct cgroup_process_info *info = d->info;
33ad9f1a 1231 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1232
1233 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1234 if (!info)
1235 return NULL;
1236 if (info->designated_mount_point) {
8900b9eb 1237 mp = info->designated_mount_point;
33ad9f1a
CS
1238 } else {
1239 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1240 if (!mp)
1241 return NULL;
b98f7d6e 1242 }
33ad9f1a 1243 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1244}
55c76589 1245
4fb3cba5 1246static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1247{
33ad9f1a
CS
1248 struct cgroup_meta_data *meta;
1249 struct cgroup_process_info *base_info, *info;
1250 struct cgroup_mount_point *mp;
1251 char *result = NULL;
33ad9f1a
CS
1252
1253 meta = lxc_cgroup_load_meta();
1254 if (!meta)
9a93d992 1255 return NULL;
33ad9f1a
CS
1256 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1257 if (!base_info)
178938fe 1258 goto out1;
33ad9f1a
CS
1259 info = find_info_for_subsystem(base_info, subsystem);
1260 if (!info)
178938fe 1261 goto out2;
33ad9f1a 1262 if (info->designated_mount_point) {
8900b9eb 1263 mp = info->designated_mount_point;
33ad9f1a
CS
1264 } else {
1265 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1266 if (!mp)
178938fe 1267 goto out3;
33ad9f1a
CS
1268 }
1269 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1270out3:
178938fe 1271out2:
33ad9f1a 1272 lxc_cgroup_process_info_free(base_info);
178938fe 1273out1:
33ad9f1a 1274 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1275 return result;
1276}
9a93d992 1277
/*
 * Write value to the cgroup control file filename (e.g. "subsys.key")
 * using the membership data in d.
 *
 * The subsystem is the part of filename before the first '.', or all of
 * filename if it contains no '.'.  errno is preset to ENOENT before the
 * path lookup.  Returns the result of do_cgroup_set(), or -1 if no path
 * could be determined; errno from the write survives the cleanup.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t name_len = strlen(filename);
	char *subsys = alloca(name_len + 1);
	char *dot, *abs;
	int rc, err;

	/* work on a stack copy so the subsystem part can be cut off */
	memcpy(subsys, filename, name_len + 1);
	dot = strchr(subsys, '.');
	if (dot)
		*dot = '\0';

	errno = ENOENT;
	abs = lxc_cgroup_get_hierarchy_abs_path_data(subsys, d);
	if (!abs)
		return -1;

	rc = do_cgroup_set(abs, filename, value);
	err = errno;
	free(abs);
	errno = err;

	return rc;
}
9a93d992 1298
/*
 * Write value to the cgroup control file filename of the container
 * identified by name and lxcpath.
 *
 * The subsystem is the part of filename before the first '.', or all of
 * filename if it contains no '.'.  Returns the result of do_cgroup_set(),
 * or -1 if the cgroup path could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t name_len = strlen(filename);
	char *subsys = alloca(name_len + 1);
	char *dot, *abs;
	int rc;

	/* stack copy of filename, truncated at the first '.' */
	memcpy(subsys, filename, name_len + 1);
	dot = strchr(subsys, '.');
	if (dot)
		*dot = '\0';

	abs = lxc_cgroup_get_hierarchy_abs_path(subsys, name, lxcpath);
	if (!abs)
		return -1;

	rc = do_cgroup_set(abs, filename, value);
	free(abs);

	return rc;
}
1316
/*
 * Read the cgroup control file filename of the container identified by
 * name and lxcpath into value (at most len bytes, as handled by
 * do_cgroup_get).
 *
 * The subsystem is the part of filename before the first '.', or all of
 * filename if it contains no '.'.  Returns the result of do_cgroup_get(),
 * or -1 if the cgroup path could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t name_len = strlen(filename);
	char *subsys = alloca(name_len + 1);
	char *dot, *abs;
	int rc;

	/* stack copy of filename, truncated at the first '.' */
	memcpy(subsys, filename, name_len + 1);
	dot = strchr(subsys, '.');
	if (dot)
		*dot = '\0';

	abs = lxc_cgroup_get_hierarchy_abs_path(subsys, name, lxcpath);
	if (!abs)
		return -1;

	rc = do_cgroup_get(abs, filename, value, len);
	free(abs);

	return rc;
}
1334
4fb3cba5 1335static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1336{
1337 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1338 char *path = NULL;
1339 char **parts = NULL;
1340 char *dirname = NULL;
1341 char *abs_path = NULL;
1342 char *abs_path2 = NULL;
d4ef7c50
SH
1343 struct cgfs_data *cgfs_d;
1344 struct cgroup_process_info *info, *base_info;
aae1f3c4
CS
1345 int r, saved_errno = 0;
1346
4fb3cba5
DE
1347 cgfs_d = hdata;
1348 if (!cgfs_d)
1349 return false;
d4ef7c50
SH
1350 base_info = cgfs_d->info;
1351
0769b82a
CS
1352 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1353 * have access to the lxc_conf object at this point. It really should be up
1354 * to the caller to fix this, but this doesn't really hurt.
1355 */
1356 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1357 type = LXC_AUTO_CGROUP_FULL_MIXED;
1358 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1359 type = LXC_AUTO_CGROUP_MIXED;
1360
7997d7da
CS
1361 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1362 ERROR("could not mount cgroups into container: invalid type specified internally");
1363 errno = EINVAL;
c476bdce 1364 return false;
7997d7da
CS
1365 }
1366
aae1f3c4
CS
1367 path = calloc(1, bufsz);
1368 if (!path)
c476bdce 1369 return false;
aae1f3c4 1370 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1371 r = safe_mount("cgroup_root", path, "tmpfs",
1372 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1373 "size=10240k,mode=755",
1374 root);
aae1f3c4
CS
1375 if (r < 0) {
1376 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1377 return false;
aae1f3c4
CS
1378 }
1379
1380 /* now mount all the hierarchies we care about */
1381 for (info = base_info; info; info = info->next) {
1382 size_t subsystem_count, i;
1383 struct cgroup_mount_point *mp = info->designated_mount_point;
1384 if (!mp)
1385 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1386 if (!mp) {
1387 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1388 goto out_error;
1389 }
1390
1391 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1392 parts = calloc(subsystem_count + 1, sizeof(char *));
1393 if (!parts)
1394 goto out_error;
1395
1396 for (i = 0; i < subsystem_count; i++) {
1397 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1398 parts[i] = info->hierarchy->subsystems[i] + 5;
1399 else
1400 parts[i] = info->hierarchy->subsystems[i];
1401 }
1402 dirname = lxc_string_join(",", (const char **)parts, false);
1403 if (!dirname)
1404 goto out_error;
1405
1406 /* create subsystem directory */
1407 abs_path = lxc_append_paths(path, dirname);
1408 if (!abs_path)
1409 goto out_error;
1410 r = mkdir_p(abs_path, 0755);
1411 if (r < 0 && errno != EEXIST) {
1412 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1413 goto out_error;
1414 }
1415
aae1f3c4
CS
1416 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1417 if (!abs_path2)
1418 goto out_error;
aae1f3c4 1419
7997d7da
CS
1420 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1421 /* bind-mount the cgroup entire filesystem there */
1422 if (strcmp(mp->mount_prefix, "/") != 0) {
1423 /* FIXME: maybe we should just try to remount the entire hierarchy
1424 * with a regular mount command? may that works? */
1425 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1426 goto out_error;
1427 }
1428 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1429 if (r < 0) {
1430 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1431 goto out_error;
1432 }
f8f3c3c0
SG
1433 /* main cgroup path should be read-only */
1434 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1435 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1436 if (r < 0) {
1437 SYSERROR("error re-mounting %s readonly", abs_path);
1438 goto out_error;
1439 }
1440 }
7997d7da
CS
1441 /* own cgroup should be read-write */
1442 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1443 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1444 if (r < 0) {
1445 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1446 goto out_error;
1447 }
1448 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1449 if (r < 0) {
1450 SYSERROR("error re-mounting %s readwrite", abs_path2);
1451 goto out_error;
1452 }
1453 }
1454 } else {
1455 /* create path for container's cgroup */
1456 r = mkdir_p(abs_path2, 0755);
1457 if (r < 0 && errno != EEXIST) {
1458 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1459 goto out_error;
1460 }
aae1f3c4 1461
b46f0553
CS
1462 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1463 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1464 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1465 * read-write (see comment below)
1466 */
1467 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1468 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1469 if (r < 0) {
1470 SYSERROR("error bind-mounting %s onto itself", abs_path);
1471 goto out_error;
1472 }
1473 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1474 if (r < 0) {
1475 SYSERROR("error re-mounting %s readonly", abs_path);
1476 goto out_error;
1477 }
1478 }
1479
7997d7da
CS
1480 free(abs_path);
1481 abs_path = NULL;
1482
1483 /* bind-mount container's cgroup to that directory */
1484 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1485 if (!abs_path)
1486 goto out_error;
1487 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
1488 if (r < 0) {
1489 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1490 goto out_error;
1491 }
1492 if (type == LXC_AUTO_CGROUP_RO) {
1493 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1494 if (r < 0) {
1495 SYSERROR("error re-mounting %s readonly", abs_path2);
1496 goto out_error;
1497 }
1498 }
aae1f3c4
CS
1499 }
1500
1501 free(abs_path);
1502 free(abs_path2);
1503 abs_path = NULL;
1504 abs_path2 = NULL;
1505
1506 /* add symlinks for every single subsystem */
1507 if (subsystem_count > 1) {
1508 for (i = 0; i < subsystem_count; i++) {
1509 abs_path = lxc_append_paths(path, parts[i]);
1510 if (!abs_path)
1511 goto out_error;
1512 r = symlink(dirname, abs_path);
1513 if (r < 0)
1514 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1515 free(abs_path);
1516 abs_path = NULL;
1517 }
1518 }
1519 free(dirname);
1520 free(parts);
1521 dirname = NULL;
1522 parts = NULL;
1523 }
1524
b46f0553
CS
1525 /* We used to remount the entire tmpfs readonly if any :ro or
1526 * :mixed mode was specified. However, Ubuntu's mountall has the
1527 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1528 * mounted read-only and cannot be remounted read-write.
1529 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1530 * these if they are not already mounted with the right options;
1531 * it contains an entry for /sys/fs/cgroup. In case it can't do
1532 * that, it prompts for the user to either manually fix it or
1533 * boot anyway. But without user input, booting of the container
1534 * hangs.)
1535 *
1536 * Instead of remounting the entire tmpfs readonly, we only
1537 * remount the paths readonly that are part of the cgroup
1538 * hierarchy.
f8f3c3c0 1539 */
f8f3c3c0 1540
aae1f3c4
CS
1541 free(path);
1542
c476bdce 1543 return true;
aae1f3c4
CS
1544
1545out_error:
1546 saved_errno = errno;
1547 free(path);
1548 free(dirname);
1549 free(parts);
1550 free(abs_path);
1551 free(abs_path2);
1552 errno = saved_errno;
c476bdce 1553 return false;
aae1f3c4
CS
1554}
1555
4fb3cba5 1556static int cgfs_nrtasks(void *hdata)
33ad9f1a 1557{
4fb3cba5
DE
1558 struct cgfs_data *d = hdata;
1559 struct cgroup_process_info *info;
33ad9f1a
CS
1560 struct cgroup_mount_point *mp = NULL;
1561 char *abs_path = NULL;
1562 int ret;
460a1cf0 1563
4fb3cba5
DE
1564 if (!d) {
1565 errno = ENOENT;
1566 return -1;
1567 }
1568
1569 info = d->info;
33ad9f1a
CS
1570 if (!info) {
1571 errno = ENOENT;
1572 return -1;
b98f7d6e 1573 }
c8f7c563 1574
33ad9f1a 1575 if (info->designated_mount_point) {
8900b9eb 1576 mp = info->designated_mount_point;
33ad9f1a
CS
1577 } else {
1578 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1579 if (!mp)
1580 return -1;
c8f7c563
CS
1581 }
1582
33ad9f1a
CS
1583 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1584 if (!abs_path)
1585 return -1;
1586
1587 ret = cgroup_recursive_task_count(abs_path);
1588 free(abs_path);
1589 return ret;
c8f7c563
CS
1590}
1591
574c4428
QH
1592static struct cgroup_process_info *
1593lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1594 struct cgroup_meta_data *meta)
d08ba6ec 1595{
33ad9f1a
CS
1596 struct cgroup_process_info *result = NULL;
1597 FILE *proc_pid_cgroup = NULL;
1598 char *line = NULL;
1599 size_t sz = 0;
1600 int saved_errno = 0;
1601 struct cgroup_process_info **cptr = &result;
1602 struct cgroup_process_info *entry = NULL;
1603
1604 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1605 if (!proc_pid_cgroup)
b98f7d6e 1606 return NULL;
1ac470c0 1607
33ad9f1a
CS
1608 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1609 /* file format: hierarchy:subsystems:group */
1610 char *colon1;
1611 char *colon2;
1612 char *endptr;
1613 int hierarchy_number;
1614 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1615
33ad9f1a 1616 if (!line[0])
ae5c8b8e 1617 continue;
b98f7d6e 1618
33ad9f1a
CS
1619 if (line[strlen(line) - 1] == '\n')
1620 line[strlen(line) - 1] = '\0';
1621
1622 colon1 = strchr(line, ':');
1623 if (!colon1)
8900b9eb 1624 continue;
33ad9f1a
CS
1625 *colon1++ = '\0';
1626 colon2 = strchr(colon1, ':');
1627 if (!colon2)
ae5c8b8e 1628 continue;
33ad9f1a 1629 *colon2++ = '\0';
e4659536 1630
33ad9f1a
CS
1631 endptr = NULL;
1632 hierarchy_number = strtoul(line, &endptr, 10);
1633 if (!endptr || *endptr)
9a93d992 1634 continue;
9a93d992 1635
33ad9f1a
CS
1636 if (hierarchy_number > meta->maximum_hierarchy) {
1637 /* we encountered a hierarchy we didn't have before,
1638 * so probably somebody remounted some stuff in the
1639 * mean time...
1640 */
1641 errno = EAGAIN;
1642 goto out_error;
b98f7d6e 1643 }
33ad9f1a
CS
1644
1645 h = meta->hierarchies[hierarchy_number];
1646 if (!h) {
1647 /* we encountered a hierarchy that was thought to be
1648 * dead before, so probably somebody remounted some
1649 * stuff in the mean time...
1650 */
1651 errno = EAGAIN;
1652 goto out_error;
b98f7d6e 1653 }
33ad9f1a
CS
1654
1655 /* we are told that we should ignore this hierarchy */
1656 if (!h->used)
b98f7d6e 1657 continue;
5193cc3d 1658
33ad9f1a
CS
1659 entry = calloc(1, sizeof(struct cgroup_process_info));
1660 if (!entry)
1661 goto out_error;
fd4f5a56 1662
33ad9f1a
CS
1663 entry->meta_ref = lxc_cgroup_get_meta(meta);
1664 entry->hierarchy = h;
1665 entry->cgroup_path = strdup(colon2);
1666 if (!entry->cgroup_path)
1667 goto out_error;
3939a22a 1668 prune_init_scope(entry->cgroup_path);
d08ba6ec 1669
33ad9f1a
CS
1670 *cptr = entry;
1671 cptr = &entry->next;
1672 entry = NULL;
b98f7d6e 1673 }
b98f7d6e 1674
33ad9f1a
CS
1675 fclose(proc_pid_cgroup);
1676 free(line);
1677 return result;
1678
1679out_error:
1680 saved_errno = errno;
1681 if (proc_pid_cgroup)
1682 fclose(proc_pid_cgroup);
1683 lxc_cgroup_process_info_free(result);
1684 lxc_cgroup_process_info_free(entry);
1685 free(line);
1686 errno = saved_errno;
ae5c8b8e 1687 return NULL;
36b86299
DL
1688}
1689
574c4428
QH
/*
 * Extract the cgroup subsystems named in a comma-separated mount option
 * string.
 *
 * A token counts as a subsystem when it starts with "name=" (named
 * hierarchy) or appears in kernel_list.  Returns a newly allocated,
 * NULL-terminated array of newly allocated strings, NULL when no token
 * qualified, or NULL with errno preserved when an allocation failed.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **found = NULL;
	size_t capacity = 0;
	size_t count = 0;
	char *state = NULL;
	char *opts, *tok;
	int err;

	/* strtok_r modifies its input, so tokenize a stack copy */
	opts = alloca(strlen(mount_options) + 1);
	strcpy(opts, mount_options);

	tok = strtok_r(opts, ",", &state);
	while (tok) {
		bool is_subsystem = strncmp(tok, "name=", 5) == 0 ||
				    lxc_string_in_array(tok, (const char **)kernel_list);

		if (is_subsystem) {
			if (lxc_grow_array((void ***)&found, &capacity, count + 1, 12) < 0)
				goto fail;
			/* keep the array terminated even if strdup fails */
			found[count + 1] = NULL;
			found[count] = strdup(tok);
			if (!found[count])
				goto fail;
			count++;
		}

		tok = strtok_r(NULL, ",", &state);
	}

	return found;

fail:
	err = errno;
	lxc_free_array((void **)found, free);
	errno = err;
	return NULL;
}
1727
574c4428 1728static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1729{
33ad9f1a
CS
1730 if (!mp)
1731 return;
1732 free(mp->mount_point);
1733 free(mp->mount_prefix);
1734 free(mp);
bcbd102c
SH
1735}
1736
574c4428 1737static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1738{
33ad9f1a
CS
1739 if (!h)
1740 return;
1741 lxc_free_array((void **)h->subsystems, free);
8bfcb981 1742 free(h->all_mount_points);
33ad9f1a
CS
1743 free(h);
1744}
341a9bd8 1745
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * Every character must lie strictly between 32 and 127 and must not be
 * '/'.  Space (32) is excluded on purpose because it would break legacy
 * lxc-ls.  The names "." and ".." are rejected as well.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t idx;

	for (idx = 0; name[idx] != '\0'; idx++) {
		char c = name[idx];
		bool in_range = c > 32 && c < 127;

		if (!in_range || c == '/')
			return false;
	}

	if (strcmp(name, ".") == 0)
		return false;

	return strcmp(name, "..") != 0;
}
341a9bd8 1759
574c4428
QH
/*
 * Create or remove the directory backing a cgroup.
 *
 * do_remove: false => mkdir with mode 0777; true => remove the directory
 * recurse:   when removing, use cgroup_rmdir() instead of plain rmdir()
 *
 * Returns the result of the underlying call, or -1 if the absolute path
 * could not be built.  errno from the operation survives the cleanup.
 */
static int create_or_remove_cgroup(bool do_remove,
		struct cgroup_mount_point *mp, const char *path, int recurse)
{
	char *abs = cgroup_to_absolute_path(mp, path, NULL);
	int rc, err;

	if (!abs)
		return -1;

	if (!do_remove)
		rc = mkdir(abs, 0777);
	else if (recurse)
		rc = cgroup_rmdir(abs);
	else
		rc = rmdir(abs);

	/* free() must not clobber the errno of the call above */
	err = errno;
	free(abs);
	errno = err;

	return rc;
}
bcbd102c 1781
/* Create the cgroup directory for path below mp (non-recursive mkdir). */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool remove_it = false;

	return create_or_remove_cgroup(remove_it, mp, path, false);
}
1786
574c4428
QH
/* Remove the cgroup directory for path below mp, optionally recursing. */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse)
{
	const bool remove_it = true;

	return create_or_remove_cgroup(remove_it, mp, path, recurse);
}
576f946d 1792
574c4428
QH
1793static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1794 const char *path, const char *suffix)
33ad9f1a
CS
1795{
1796 /* first we have to make sure we subtract the mount point's prefix */
1797 char *prefix = mp->mount_prefix;
1798 char *buf;
1799 ssize_t len, rv;
1800
1801 /* we want to make sure only absolute paths to cgroups are passed to us */
1802 if (path[0] != '/') {
1803 errno = EINVAL;
1804 return NULL;
1805 }
b98f7d6e 1806
33ad9f1a
CS
1807 if (prefix && !strcmp(prefix, "/"))
1808 prefix = NULL;
b98f7d6e 1809
33ad9f1a
CS
1810 /* prefix doesn't match */
1811 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1812 errno = EINVAL;
1813 return NULL;
1814 }
1815 /* if prefix is /foo and path is /foobar */
1816 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1817 errno = EINVAL;
1818 return NULL;
1819 }
b98f7d6e 1820
33ad9f1a
CS
1821 /* remove prefix from path */
1822 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1823
33ad9f1a
CS
1824 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1825 buf = calloc(len + 1, 1);
50266dc6
DE
1826 if (!buf)
1827 return NULL;
33ad9f1a 1828 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1829 if (rv > len) {
33ad9f1a
CS
1830 free(buf);
1831 errno = ENOMEM;
8900b9eb 1832 return NULL;
8b92dc3a 1833 }
576f946d 1834
33ad9f1a 1835 return buf;
e0f888d9 1836}
283678ed 1837
574c4428
QH
1838static struct cgroup_process_info *
1839find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1840{
33ad9f1a
CS
1841 struct cgroup_process_info *info_ptr;
1842 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1843 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1844 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1845 return info_ptr;
b98f7d6e 1846 }
33ad9f1a
CS
1847 errno = ENOENT;
1848 return NULL;
1849}
283678ed 1850
574c4428
QH
/*
 * Read the file "<cgroup_path>/<sub_filename>" into value (len is passed
 * through to lxc_read_from_file).
 *
 * Returns the result of lxc_read_from_file(), or -1 if the file name
 * could not be assembled.  errno from the read survives the cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[3];
	char *file;
	int rc, err;

	components[0] = cgroup_path;
	components[1] = sub_filename;
	components[2] = NULL;

	file = lxc_string_join("/", components, false);
	if (!file)
		return -1;

	rc = lxc_read_from_file(file, value, len);
	err = errno;
	free(file);
	errno = err;

	return rc;
}
b113383b 1872
574c4428
QH
/*
 * Write the string @value to the file <cgroup_path>/<sub_filename>.
 *
 * Returns whatever lxc_write_to_file() returns, or -1 if the file name could
 * not be built. errno from the write is preserved across the cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *fpath;
	int rc, err;

	fpath = lxc_string_join("/", components, false);
	if (fpath == NULL)
		return -1;

	rc = lxc_write_to_file(fpath, value, strlen(value), false);

	/* free() must not clobber the errno the caller will inspect */
	err = errno;
	free(fpath);
	errno = err;

	return rc;
}
1894
4fb3cba5 1895static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 1896 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 1897{
365d180a 1898 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
1899 struct lxc_cgroup *cg;
1900 int ret = -1;
1901
33ad9f1a 1902 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
1903 return 0;
1904
aaf26830 1905 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
1906 if (!sorted_cgroup_settings) {
1907 return -1;
1908 }
aaf26830
KT
1909
1910 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
1911 cg = iterator->elem;
1912
33ad9f1a 1913 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 1914 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 1915 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
1916 continue;
1917 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 1918 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 1919 continue;
4fb3cba5 1920 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 1921 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
1922 WARN("Error setting %s to %s for %s",
1923 cg->subsystem, cg->value, d->name);
1924 continue;
1925 }
dddf7c5b 1926 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 1927 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
1928 goto out;
1929 }
b113383b 1930 }
b98f7d6e
SH
1931
1932 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
1933 }
1934
b98f7d6e
SH
1935 ret = 0;
1936 INFO("cgroup has been setup");
1937out:
365d180a 1938 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
1939 lxc_list_del(iterator);
1940 free(iterator);
1941 }
365d180a 1942 free(sorted_cgroup_settings);
b113383b
SH
1943 return ret;
1944}
b98f7d6e 1945
/*
 * Check the container's devices.list to decide whether writing @v to
 * devices.allow (@for_allow == true) or devices.deny (@for_allow == false)
 * would be redundant.
 *
 * For deny, only the values "a" and "a *:* rwm" are considered; any other
 * value yields false. The result is then true unless devices.list contains
 * the line "a *:* rwm".
 * For allow, the result is true when devices.list contains either
 * "a *:* rwm" or a line equal to @v.
 *
 * Returns false if devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
	   char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	// not sure they ever do, but they *could*
	// right now, I'm assuming they do NOT
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;

	path = lxc_string_join("/", parts, false);
	/*
	 * The hierarchy path is owned by us and is only needed to build the
	 * joined path; release it on every path, not just on join failure.
	 */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		/* strip the trailing newline so the comparisons below are exact */
		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			/* everything is allowed: allow is redundant, deny is not */
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2000
/*
 * Count the lines of every file named "tasks" found in @cgroup_path and,
 * recursively, in all of its subdirectories.
 *
 * Returns the accumulated count, 0 if @cgroup_path cannot be opened as a
 * directory, or -1 if a path cannot be built or an entry cannot be stat()ed.
 * Negative results from subdirectories or from count_lines() are ignored
 * rather than propagated.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *dent;
	int n = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	/*
	 * readdir() instead of the deprecated readdir_r(): each invocation
	 * (including recursive ones) iterates over its own DIR stream, and no
	 * caller-sized dirent buffer has to be guessed via pathconf().
	 */
	while ((dent = readdir(dir)) != NULL) {
		const char *parts[3] = {
			cgroup_path,
			dent->d_name,
			NULL
		};
		char *sub_path;
		struct stat st;
		int r;

		if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
			continue;

		sub_path = lxc_string_join("/", parts, false);
		if (!sub_path) {
			n = -1;
			break;
		}

		if (stat(sub_path, &st) < 0) {
			free(sub_path);
			n = -1;
			break;
		}

		if (S_ISDIR(st.st_mode)) {
			r = cgroup_recursive_task_count(sub_path);
			if (r >= 0)
				n += r;
		} else if (!strcmp(dent->d_name, "tasks")) {
			r = count_lines(sub_path);
			if (r >= 0)
				n += r;
		}
		free(sub_path);
	}

	closedir(dir);
	return n;
}
2063
/*
 * Return the number of lines getline() can read from the file @fn,
 * or -1 if the file cannot be opened.
 */
static int count_lines(const char *fn)
{
	size_t cap = 0;
	char *buf = NULL;
	int lines;
	FILE *fp;

	fp = fopen_cloexec(fn, "r");
	if (fp == NULL)
		return -1;

	for (lines = 0; getline(&buf, &cap, fp) != -1; lines++)
		;

	free(buf);
	fclose(fp);
	return lines;
}
2082
574c4428
QH
2083static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2084 char *cgroup_path)
b98f7d6e 2085{
33ad9f1a 2086 int r, saved_errno = 0;
7e7243e1 2087 char buf[2];
1ea59ad2 2088
934b1673
SH
2089 mp->need_cpuset_init = false;
2090
1ea59ad2
SH
2091 /* If this is the memory cgroup, we want to enforce hierarchy.
2092 * But don't fail if for some reason we can't.
2093 */
2edb53c7
SH
2094 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2095 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2096 if (cc_path) {
2097 r = lxc_read_from_file(cc_path, buf, 1);
2098 if (r < 1 || buf[0] != '1') {
2099 r = lxc_write_to_file(cc_path, "1", 1, false);
2100 if (r < 0)
a8916143 2101 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2102 }
1ea59ad2
SH
2103 free(cc_path);
2104 }
2edb53c7 2105 }
1ea59ad2 2106
33ad9f1a
CS
2107 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2108 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2109 * and cpuset.cpus and then
2110 */
2edb53c7
SH
2111 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2112 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2113 struct stat sb;
2114
33ad9f1a 2115 if (!cc_path)
2edb53c7 2116 return -1;
d703c2b1
RV
2117 /* cgroup.clone_children is not available when running under
2118 * older kernel versions; in this case, we'll initialize
2119 * cpuset.cpus and cpuset.mems later, after the new cgroup
2120 * was created
2121 */
2122 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2123 mp->need_cpuset_init = true;
d703c2b1
RV
2124 free(cc_path);
2125 return 0;
2126 }
7e7243e1
SH
2127 r = lxc_read_from_file(cc_path, buf, 1);
2128 if (r == 1 && buf[0] == '1') {
2129 free(cc_path);
2edb53c7 2130 return 0;
7e7243e1 2131 }
33ad9f1a 2132 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2133 saved_errno = errno;
2134 free(cc_path);
2135 errno = saved_errno;
2136 return r < 0 ? -1 : 0;
33ad9f1a
CS
2137 }
2138 return 0;
b98f7d6e 2139}
484ed030 2140
/*
 * Read @fn into @buf via lxc_read_from_file() and NUL-terminate the result.
 *
 * When the read fills the whole buffer, the last byte is overwritten with
 * the terminator and the (untruncated) byte count is still returned, so a
 * return value equal to @bufsize tells the caller the data was truncated.
 *
 * Returns the byte count from lxc_read_from_file(), a negative value on
 * read failure, or -1 if @bufsize is 0.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if (nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	if (bufsize == 0) {
		/* Callers don't do this, but regression/sanity check */
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: obviously this wasn't empty */
	buf[bufsize-1] = '\0';
	return nread;
}
2161
/*
 * Initialize the cpuset file @name (e.g. "/cpuset.cpus") of cgroup @path
 * from the same file in the parent cgroup.
 *
 * If the child's file already holds a non-empty value it is left untouched
 * and the call succeeds. Otherwise the parent's value is read and written
 * to the child's file.
 *
 * Returns true on success, false on any failure.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char contents[1024];
	char *child, *parent = NULL, *parent_cgroup = NULL, *slash;
	bool success = false;
	int n;

	child = cgroup_to_absolute_path(mp, path, name);
	if (!child)
		return false;

	/* don't overwrite a non-empty value in the file */
	n = cgroup_read_from_file(child, contents, sizeof(contents));
	if (n < 0)
		goto done;
	if (contents[0] != '\0' && contents[0] != '\n') {
		success = true;
		goto done;
	}

	/* derive the parent cgroup by cutting @path at its last '/' */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto done;

	slash = strrchr(parent_cgroup, '/');
	if (!slash)
		goto done;
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	/* path to the same name in the parent cgroup */
	parent = cgroup_to_absolute_path(mp, parent_cgroup, name);
	if (!parent)
		goto done;

	/* copy from parent to child cgroup */
	n = cgroup_read_from_file(parent, contents, sizeof(contents));
	if (n < 0)
		goto done;
	if (n == sizeof(contents)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto done;
	}

	success = (lxc_write_to_file(child, contents, strlen(contents), false) >= 0);
	if (!success)
		SYSERROR("failed writing %s", child);

done:
	free(parent_cgroup);
	free(parent);
	free(child);
	return success;
}
2218
2219static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2220 const char *path)
2221{
2222 /* the files we have to handle here are only in cpuset hierarchies */
2223 if (!lxc_string_in_array("cpuset",
2224 (const char **)mp->hierarchy->subsystems))
2225 return true;
2226
b1dad6f6
RV
2227 if (!mp->need_cpuset_init)
2228 return true;
2229
d703c2b1
RV
2230 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2231 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2232}
2233
4fb3cba5 2234struct cgroup_ops *cgfs_ops_init(void)
484ed030 2235{
4fb3cba5 2236 return &cgfs_ops;
d4ef7c50 2237}
484ed030 2238
4fb3cba5 2239static void *cgfs_init(const char *name)
d4ef7c50 2240{
4fb3cba5 2241 struct cgfs_data *d;
484ed030 2242
4fb3cba5
DE
2243 d = malloc(sizeof(*d));
2244 if (!d)
2245 return NULL;
484ed030 2246
4fb3cba5
DE
2247 memset(d, 0, sizeof(*d));
2248 d->name = strdup(name);
2249 if (!d->name)
2250 goto err1;
2251
5e1c5795 2252 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2253
2254 d->meta = lxc_cgroup_load_meta();
2255 if (!d->meta) {
2256 ERROR("cgroupfs failed to detect cgroup metadata");
2257 goto err2;
2258 }
2259 return d;
2260
2261err2:
2262 free(d->name);
2263err1:
2264 free(d);
2265 return NULL;
d4ef7c50 2266}
484ed030 2267
4fb3cba5 2268static void cgfs_destroy(void *hdata)
d4ef7c50 2269{
4fb3cba5
DE
2270 struct cgfs_data *d = hdata;
2271
d4ef7c50
SH
2272 if (!d)
2273 return;
f10fad2f 2274 free(d->name);
c55d4505
ME
2275 lxc_cgroup_process_info_free_and_remove(d->info);
2276 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2277 free(d);
d4ef7c50 2278}
484ed030 2279
4fb3cba5 2280static inline bool cgfs_create(void *hdata)
d4ef7c50 2281{
4fb3cba5
DE
2282 struct cgfs_data *d = hdata;
2283 struct cgroup_process_info *i;
2284 struct cgroup_meta_data *md;
484ed030 2285
4fb3cba5 2286 if (!d)
d4ef7c50 2287 return false;
4fb3cba5
DE
2288 md = d->meta;
2289 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2290 if (!i)
2291 return false;
2292 d->info = i;
2293 return true;
2294}
484ed030 2295
4fb3cba5 2296static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2297{
4fb3cba5
DE
2298 struct cgfs_data *d = hdata;
2299 struct cgroup_process_info *i;
d4ef7c50 2300 int ret;
4fb3cba5
DE
2301
2302 if (!d)
2303 return false;
2304 i = d->info;
2305 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2306
d4ef7c50
SH
2307 return ret == 0;
2308}
2309
4fb3cba5 2310static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2311{
4fb3cba5
DE
2312 struct cgfs_data *d = hdata;
2313 struct cgroup_process_info *i;
2314
2315 if (!d)
2316 return false;
2317 i = d->info;
2318 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2319 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2320 return false;
484ed030 2321 }
d4ef7c50
SH
2322 return true;
2323}
484ed030 2324
/*
 * Return the container's cgroup path for @subsystem as reported by
 * lxc_cgroup_get_hierarchy_path_data(), or NULL if @hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *data = hdata;

	return data ? lxc_cgroup_get_hierarchy_path_data(subsystem, data) : NULL;
}
2333
2ba7a429
TA
2334static const char *cgfs_canonical_path(void *hdata)
2335{
2336 struct cgfs_data *d = hdata;
2337 struct cgroup_process_info *info_ptr;
2338 char *path = NULL;
2339
2340 if (!d)
2341 return NULL;
2342
2343 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2344 if (!path)
2345 path = info_ptr->cgroup_path;
2346 else if (strcmp(path, info_ptr->cgroup_path) != 0) {
2347 ERROR("not all paths match %s, %s has path %s", path,
2348 info_ptr->hierarchy->subsystems[0], info_ptr->cgroup_path);
2349 return NULL;
2350 }
2351 }
2352
2353 return path;
2354}
2355
06078509
TA
2356static bool cgfs_escape(void)
2357{
2358 struct cgroup_meta_data *md;
2359 int i;
2360 bool ret = false;
2361
2362 md = lxc_cgroup_load_meta();
2363 if (!md)
2364 return false;
2365
2366 for (i = 1; i <= md->maximum_hierarchy; i++) {
2367 struct cgroup_hierarchy *h = md->hierarchies[i];
2368 struct cgroup_mount_point *mp;
2369 char *tasks;
2370 FILE *f;
2371 int written;
2372
2373 if (!h) {
2374 WARN("not escaping hierarchy %d", i);
2375 continue;
2376 }
2377
2378 mp = lxc_cgroup_find_mount_point(h, "/", true);
2379 if (!mp)
2380 goto out;
2381
2382 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2383 if (!tasks)
2384 goto out;
2385
2386 f = fopen(tasks, "a");
2387 free(tasks);
2388 if (!f)
2389 goto out;
2390
2391 written = fprintf(f, "%d\n", getpid());
2392 fclose(f);
2393 if (written < 0) {
2394 SYSERROR("writing tasks failed\n");
2395 goto out;
2396 }
2397 }
2398
2399 ret = true;
2400out:
2401 lxc_cgroup_put_meta(md);
2402 return ret;
2403}
2404
/*
 * Thaw the container by writing "THAWED" to freezer.state in its freezer
 * cgroup.
 *
 * Returns true if the write succeeded; false if @hdata is NULL, the absolute
 * cgroup path cannot be determined, or the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *data = hdata;
	char *rel, *abs;
	bool thawed;

	if (data == NULL)
		return false;

	rel = lxc_cgroup_get_hierarchy_path_data("freezer", data);
	abs = lxc_cgroup_find_abs_path("freezer", rel, true, NULL);
	if (abs == NULL)
		return false;

	thawed = do_cgroup_set(abs, "freezer.state", "THAWED") == 0;
	free(abs);
	return thawed;
}
2423
4fb3cba5
DE
/*
 * cgroup_ops wrapper around do_setup_cgroup_limits(): apply @cgroup_conf to
 * the container's cgroups, handling either the devices entries
 * (@with_devices true) or all other entries.
 *
 * Returns true on success; false on failure or if @hdata is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
				  bool with_devices)
{
	struct cgfs_data *data = hdata;
	int rc;

	if (data == NULL)
		return false;

	rc = do_setup_cgroup_limits(data, cgroup_conf, with_devices);
	return rc == 0;
}
2433
/*
 * Move process @pid into the cgroups of the running container @name under
 * @lxcpath.
 *
 * The container's cgroup info is looked up freshly, used for
 * lxc_cgroupfs_enter() and released again. Returns true on success; every
 * failure is logged and yields false.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *md;
	struct cgroup_process_info *info;
	int rc;

	md = lxc_cgroup_load_meta();
	if (md == NULL) {
		ERROR("could not move attached process %d to cgroup of container", pid);
		return false;
	}

	/* the metadata is only needed for the lookup itself */
	info = lxc_cgroup_get_container_info(name, lxcpath, md);
	lxc_cgroup_put_meta(md);
	if (info == NULL) {
		ERROR("could not move attached process %d to cgroup of container", pid);
		return false;
	}

	rc = lxc_cgroupfs_enter(info, pid, false);
	lxc_cgroup_process_info_free(info);
	if (rc >= 0)
		return true;

	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2461
8b276860
SH
/* Argument block handed to chown_cgroup_wrapper() through userns_exec_1(). */
struct chown_data {
	/* absolute path of the cgroup directory to chown */
	const char *cgroup_path;
	/* effective uid of the caller, taken before entering the namespace */
	uid_t origuid;
};
2466
2467/*
2468 * TODO - someone should refactor this to unshare once passing all the paths
2469 * to be chowned in one go
2470 */
2471static int chown_cgroup_wrapper(void *data)
2472{
2473 struct chown_data *arg = data;
2474 uid_t destuid;
2475 char *fpath;
2476
2477
2478 if (setresgid(0,0,0) < 0)
2479 SYSERROR("Failed to setgid to 0");
2480 if (setresuid(0,0,0) < 0)
2481 SYSERROR("Failed to setuid to 0");
2482 if (setgroups(0, NULL) < 0)
2483 SYSERROR("Failed to clear groups");
2484 destuid = get_ns_uid(arg->origuid);
2485
2486 if (chown(arg->cgroup_path, destuid, 0) < 0)
2487 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2488
2489 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2490 if (!fpath)
2491 return -1;
2492 if (chown(fpath, destuid, 0) < 0)
2493 SYSERROR("Error chowning %s\n", fpath);
2494 free(fpath);
2495 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2496 if (!fpath)
2497 return -1;
2498 if (chown(fpath, destuid, 0) < 0)
2499 SYSERROR("Error chowning %s", fpath);
2500 free(fpath);
2501
2502 return 0;
2503}
2504
2505static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2506{
2507 struct chown_data data;
2508 char *fpath;
2509
2510 if (lxc_list_empty(&conf->id_map))
2511 /* If there's no mapping then we don't need to chown */
2512 return true;
2513
2514 data.cgroup_path = cgroup_path;
2515 data.origuid = geteuid();
2516
2517 /* Unpriv users can't chown it themselves, so chown from
2518 * a child namespace mapping both our own and the target uid
2519 */
2520 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2521 ERROR("Error requesting cgroup chown in new namespace");
2522 return false;
2523 }
2524
2525 /*
2526 * Now chmod 775 the directory else the container cannot create cgroups.
2527 * This can't be done in the child namespace because it only group-owns
2528 * the cgroup
2529 */
2530 if (chmod(cgroup_path, 0775) < 0) {
2531 SYSERROR("Error chmoding %s\n", cgroup_path);
2532 return false;
2533 }
2534 fpath = lxc_append_paths(cgroup_path, "tasks");
2535 if (!fpath)
2536 return false;
2537 if (chmod(fpath, 0664) < 0)
2538 SYSERROR("Error chmoding %s\n", fpath);
2539 free(fpath);
2540 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2541 if (!fpath)
2542 return false;
2543 if (chmod(fpath, 0664) < 0)
2544 SYSERROR("Error chmoding %s\n", fpath);
2545 free(fpath);
2546
2547 return true;
2548}
2549
2550static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2551{
2552 struct cgfs_data *d = hdata;
2553 struct cgroup_process_info *info_ptr;
2554 char *cgpath;
2555 bool r = true;
2556
2557 if (!d)
2558 return false;
2559
2560 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2561 if (!info_ptr->designated_mount_point) {
2562 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2563 if (!info_ptr->designated_mount_point) {
2564 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2565 return false;
2566 }
2567 }
2568
2569 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2570 if (!cgpath) {
2571 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2572 continue;
2573 }
2574 r = do_cgfs_chown(cgpath, conf);
2575 if (!r) {
2576 ERROR("Failed chowning %s\n", cgpath);
2577 free(cgpath);
2578 return false;
2579 }
2580 free(cgpath);
2581 }
2582
2583 return true;
2584}
2585
d4ef7c50 2586static struct cgroup_ops cgfs_ops = {
d4ef7c50 2587 .init = cgfs_init,
4fb3cba5 2588 .destroy = cgfs_destroy,
d4ef7c50
SH
2589 .create = cgfs_create,
2590 .enter = cgfs_enter,
2591 .create_legacy = cgfs_create_legacy,
2592 .get_cgroup = cgfs_get_cgroup,
2ba7a429 2593 .canonical_path = cgfs_canonical_path,
06078509 2594 .escape = cgfs_escape,
d4ef7c50
SH
2595 .get = lxc_cgroupfs_get,
2596 .set = lxc_cgroupfs_set,
4fb3cba5 2597 .unfreeze = cgfs_unfreeze,
9daf6f5d 2598 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2599 .name = "cgroupfs",
5d897655 2600 .attach = lxc_cgroupfs_attach,
8b276860 2601 .chown = cgfs_chown,
c476bdce 2602 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2603 .nrtasks = cgfs_nrtasks,
23befb18 2604 .driver = CGFS,
d4ef7c50 2605};