]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgfs.c
Merge pull request #855 from hallyn/2016-02-26/cgfs.crucial
[mirror_lxc.git] / src / lxc / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
e2bcd7db 42#include "error.h"
ae5c8b8e 43#include "commands.h"
b98f7d6e
SH
44#include "list.h"
45#include "conf.h"
33ad9f1a 46#include "utils.h"
4ec31c52 47#include "bdev/bdev.h"
f2363e38
ÇO
48#include "log.h"
49#include "cgroup.h"
50#include "start.h"
484ed030 51#include "state.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
/* Forward declarations: the structures below reference each other by pointer. */
struct cgroup_hierarchy;
struct cgroup_meta_data;
struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: reference-counted description of the cgroup
 * infrastructure found on this host.
 */
struct cgroup_meta_data {
	/* reference count; the object is released once it is no longer positive */
	ptrdiff_t ref;
	/* hierarchies indexed by hierarchy number; slots without a hierarchy are NULL */
	struct cgroup_hierarchy **hierarchies;
	/* mount points recorded while scanning mountinfo */
	struct cgroup_mount_point **mount_points;
	/* highest hierarchy number stored in hierarchies */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: one cgroup hierarchy as listed in /proc/<pid>/cgroup.
 * A hierarchy may be mounted at several places.
 */
struct cgroup_hierarchy {
	/* hierarchy number */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* names of the subsystems attached to this hierarchy */
	char **subsystems;
	/* first writable mount whose mount prefix is "/" (if any) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount whose mount prefix is "/" (if any) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* every mount point found for this hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocation bookkeeping for all_mount_points */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: a single location at which a hierarchy is mounted.
 */
struct cgroup_mount_point {
	/* hierarchy mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* directory the hierarchy is mounted on */
	char *mount_point;
	/* part of the hierarchy visible through this mount ("/" for the whole tree) */
	char *mount_prefix;
	/* true when the per-mount options do not contain "rw" */
	bool read_only;
	/* NOTE(review): presumably consumed by init_cpuset_if_needed() - confirm */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: membership of one process in the different cgroup
 * hierarchies, kept as a singly linked list with one node per hierarchy.
 *
 * This is the per-process information tracked by cgfs_ops; it is not used
 * with cgmanager.
 */
struct cgroup_process_info {
	/* next list node, NULL at the end of the list */
	struct cgroup_process_info *next;
	/* meta data this node belongs to */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy described by this node */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup of the process inside that hierarchy */
	char *cgroup_path;
	/* NOTE(review): looks like the path built from the optional sub pattern - confirm */
	char *cgroup_path_sub;
	/* cgroup paths created on behalf of this node */
	char **created_paths;
	/* allocation bookkeeping for created_paths */
	size_t created_paths_capacity;
	/* number of valid entries in created_paths */
	size_t created_paths_count;
	/* mount point selected for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/* cgfs_data: state bundle handed to the cgfs helpers in this file. */
struct cgfs_data {
	/* NOTE(review): presumably the container name - confirm against callers */
	char *name;
	/* pattern used to derive the cgroup path */
	const char *cgroup_pattern;
	/* cgroup meta data in use */
	struct cgroup_meta_data *meta;
	/* per-hierarchy process information list */
	struct cgroup_process_info *info;
};
126
127lxc_log_define(lxc_cgfs, lxc);
576f946d 128
33ad9f1a
CS
129static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133static bool is_valid_cgroup(const char *name);
33ad9f1a 134static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
603c64c2 135static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse);
33ad9f1a
CS
136static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
137static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
138static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
139static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
140static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
141static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a
CS
142static int cgroup_recursive_task_count(const char *cgroup_path);
143static int count_lines(const char *fn);
1ea59ad2 144static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 145static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 146
4fb3cba5
DE
147static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151/* free process membership information */
152static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
153static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info);
154
d4ef7c50 155static struct cgroup_ops cgfs_ops;
d4ef7c50 156
603c64c2
SH
/*
 * cgroup_rmdir: recursively remove the cgroup directory @dirname.
 *
 * Sub-directories are removed depth first, then @dirname itself is removed
 * with rmdir(). Entries that are not directories are not touched. A failure
 * does not stop the walk, so that as much as possible gets removed.
 *
 * Returns 0 on success and -1 if any step failed. When the walk itself
 * reports a failure, errno holds the first error that was recorded.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("%s: failed to open %s", __func__, dirname);
		return -1;
	}

	/* readdir() replaces the deprecated readdir_r(); the stream is private
	 * to this call, so no caller-provided entry buffer is needed
	 */
	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; storing a negated code here
			 * would later be written to errno verbatim
			 */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}

		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}

		if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
225
/*
 * lxc_cgroup_load_meta: load the cgroup meta data, restricted to the
 * subsystems named in the global "lxc.cgroup.use" setting when it is set.
 *
 * Returns the meta data, or NULL on failure. errno as left by the loader is
 * preserved across the cleanup of the temporary list.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta(void)
{
	const char *cgroup_use = NULL;
	char **cgroup_use_list = NULL;
	struct cgroup_meta_data *md = NULL;
	int saved_errno;

	/* a NULL value is only an error if the lookup also set errno */
	errno = 0;
	cgroup_use = lxc_global_config_value("lxc.cgroup.use");
	if (!cgroup_use && errno != 0)
		return NULL;
	if (cgroup_use) {
		cgroup_use_list = lxc_string_split_and_trim(cgroup_use, ',');
		if (!cgroup_use_list)
			return NULL;
	}

	md = lxc_cgroup_load_meta2((const char **)cgroup_use_list);
	saved_errno = errno;
	lxc_free_array((void **)cgroup_use_list, free);
	errno = saved_errno;
	return md;
}
fd37327f 249
/*
 * Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends a copy of the first tab-separated field of
 * every well-formed line to *kernel_subsystems. Lines starting with '#',
 * empty lines and lines whose second field is not a plain number are skipped.
 *
 * Returns true on success. On failure the entries collected so far remain in
 * *kernel_subsystems and are left for the caller to release.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *fp;
	char *buf = NULL;
	size_t buflen = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	fp = fopen_cloexec("/proc/cgroups", "r");
	if (!fp)
		return false;

	while (getline(&buf, &buflen, fp) != -1) {
		char *name_end;
		char *number;
		char *number_end;
		char *parse_end = NULL;

		if (buf[0] == '#' || buf[0] == '\0')
			continue;

		/* first field: subsystem name */
		name_end = strchr(buf, '\t');
		if (!name_end)
			continue;
		*name_end = '\0';
		number = name_end + 1;

		/* second field: hierarchy number, only validated here */
		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(buf);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(fp);
	free(buf);
	return ok;
}
305
306/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
307 * since mount points don't specify hierarchy number and
308 * /proc/cgroups does not contain named hierarchies
309 */
310static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
311 bool all_kernel_subsystems, bool all_named_subsystems,
312 const char **subsystem_whitelist)
313{
314 FILE *proc_self_cgroup;
315 char *line = NULL;
316 size_t sz = 0;
317 int r;
318 bool bret = false;
319 size_t hierarchy_capacity = 0;
ef6e34ee 320
33ad9f1a
CS
321 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
322 /* if for some reason (because of setns() and pid namespace for example),
323 * /proc/self is not valid, we try /proc/1/cgroup... */
324 if (!proc_self_cgroup)
325 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
326 if (!proc_self_cgroup)
b653309a 327 return false;
33ad9f1a
CS
328
329 while (getline(&line, &sz, proc_self_cgroup) != -1) {
330 /* file format: hierarchy:subsystems:group,
331 * we only extract hierarchy and subsystems
332 * here */
333 char *colon1;
334 char *colon2;
335 int hierarchy_number;
336 struct cgroup_hierarchy *h = NULL;
337 char **p;
338
339 if (!line[0])
340 continue;
ad08bbb7 341
33ad9f1a
CS
342 colon1 = strchr(line, ':');
343 if (!colon1)
8900b9eb 344 continue;
33ad9f1a
CS
345 *colon1++ = '\0';
346 colon2 = strchr(colon1, ':');
347 if (!colon2)
348 continue;
349 *colon2 = '\0';
ad08bbb7 350
33ad9f1a
CS
351 colon2 = NULL;
352 hierarchy_number = strtoul(line, &colon2, 10);
353 if (!colon2 || *colon2)
354 continue;
576f946d 355
33ad9f1a
CS
356 if (hierarchy_number > meta_data->maximum_hierarchy) {
357 /* lxc_grow_array will never shrink, so even if we find a lower
358 * hierarchy number here, the array will never be smaller
359 */
360 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
361 if (r < 0)
b653309a 362 goto out;
5193cc3d 363
33ad9f1a
CS
364 meta_data->maximum_hierarchy = hierarchy_number;
365 }
fd37327f 366
33ad9f1a
CS
367 /* this shouldn't happen, we had this already */
368 if (meta_data->hierarchies[hierarchy_number])
b653309a 369 goto out;
33ad9f1a
CS
370
371 h = calloc(1, sizeof(struct cgroup_hierarchy));
372 if (!h)
b653309a 373 goto out;
33ad9f1a
CS
374
375 meta_data->hierarchies[hierarchy_number] = h;
376
377 h->index = hierarchy_number;
378 h->subsystems = lxc_string_split_and_trim(colon1, ',');
379 if (!h->subsystems)
b653309a 380 goto out;
33ad9f1a
CS
381 /* see if this hierarchy should be considered */
382 if (!all_kernel_subsystems || !all_named_subsystems) {
383 for (p = h->subsystems; *p; p++) {
384 if (!strncmp(*p, "name=", 5)) {
385 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
386 h->used = true;
387 break;
388 }
389 } else {
390 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
391 h->used = true;
392 break;
393 }
394 }
395 }
396 } else {
397 /* we want all hierarchy anyway */
398 h->used = true;
ae5c8b8e 399 }
ae5c8b8e 400 }
b653309a 401 bret = true;
0b9c21ab 402
b653309a 403out:
33ad9f1a 404 fclose(proc_self_cgroup);
0ccf7c2a 405 free(line);
b653309a
SH
406 return bret;
407}
408
409/* Step 3: determine all mount points of each hierarchy */
410static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
411{
412 bool bret = false;
413 FILE *proc_self_mountinfo;
414 char *line = NULL;
415 size_t sz = 0;
416 char **tokens = NULL;
417 size_t mount_point_count = 0;
418 size_t mount_point_capacity = 0;
419 size_t token_capacity = 0;
420 int r;
421
33ad9f1a
CS
422 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
423 /* if for some reason (because of setns() and pid namespace for example),
424 * /proc/self is not valid, we try /proc/1/cgroup... */
425 if (!proc_self_mountinfo)
426 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
427 if (!proc_self_mountinfo)
b653309a 428 return false;
33ad9f1a
CS
429
430 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 431 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
432 size_t i, j, k;
433 struct cgroup_mount_point *mount_point;
434 struct cgroup_hierarchy *h;
435 char **subsystems;
836514a8 436 bool is_lxcfs = false;
33ad9f1a
CS
437
438 if (line[0] && line[strlen(line) - 1] == '\n')
439 line[strlen(line) - 1] = '\0';
440
178938fe 441 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
442 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
443 if (r < 0)
b653309a 444 goto out;
33ad9f1a
CS
445 tokens[i++] = token;
446 }
b98f7d6e 447
33ad9f1a
CS
448 /* layout of /proc/self/mountinfo:
449 * 0: id
450 * 1: parent id
451 * 2: device major:minor
452 * 3: mount prefix
8900b9eb 453 * 4: mount point
33ad9f1a
CS
454 * 5: per-mount options
455 * [optional X]: additional data
456 * X+7: "-"
457 * X+8: type
458 * X+9: source
459 * X+10: per-superblock options
460 */
461 for (j = 6; j < i && tokens[j]; j++)
462 if (!strcmp(tokens[j], "-"))
463 break;
fd4f5a56 464
33ad9f1a
CS
465 /* could not find separator */
466 if (j >= i || !tokens[j])
467 continue;
468 /* there should be exactly three fields after
469 * the separator
470 */
471 if (i != j + 4)
472 continue;
fd4f5a56 473
33ad9f1a 474 /* not a cgroup filesystem */
836514a8
U
475 if (strcmp(tokens[j + 1], "cgroup") != 0) {
476 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
477 continue;
478 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
479 continue;
480 is_lxcfs = true;
481 char *curtok = tokens[4] + 15;
482 subsystems = subsystems_from_mount_options(curtok,
483 kernel_subsystems);
484 } else
485 subsystems = subsystems_from_mount_options(tokens[j + 3],
486 kernel_subsystems);
33ad9f1a 487 if (!subsystems)
b653309a 488 goto out;
33ad9f1a
CS
489
490 h = NULL;
491 for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
492 if (meta_data->hierarchies[k] &&
493 meta_data->hierarchies[k]->subsystems[0] &&
494 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
495 /* TODO: we could also check if the lists really match completely,
496 * just to have an additional sanity check */
497 h = meta_data->hierarchies[k];
b98f7d6e 498 break;
33ad9f1a 499 }
b98f7d6e 500 }
33ad9f1a
CS
501 lxc_free_array((void **)subsystems, free);
502
503 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
504 if (r < 0)
b653309a 505 goto out;
33ad9f1a
CS
506
507 /* create mount point object */
508 mount_point = calloc(1, sizeof(*mount_point));
509 if (!mount_point)
b653309a 510 goto out;
33ad9f1a
CS
511
512 meta_data->mount_points[mount_point_count++] = mount_point;
513
514 mount_point->hierarchy = h;
836514a8
U
515 if (is_lxcfs)
516 mount_point->mount_prefix = strdup("/");
517 else
518 mount_point->mount_prefix = strdup(tokens[3]);
33ad9f1a 519 mount_point->mount_point = strdup(tokens[4]);
33ad9f1a 520 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 521 goto out;
33ad9f1a
CS
522 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
523
524 if (!strcmp(mount_point->mount_prefix, "/")) {
525 if (mount_point->read_only) {
526 if (!h->ro_absolute_mount_point)
527 h->ro_absolute_mount_point = mount_point;
528 } else {
529 if (!h->rw_absolute_mount_point)
530 h->rw_absolute_mount_point = mount_point;
531 }
b98f7d6e 532 }
ae5c8b8e 533
33ad9f1a
CS
534 k = lxc_array_len((void **)h->all_mount_points);
535 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
536 if (r < 0)
b653309a 537 goto out;
33ad9f1a 538 h->all_mount_points[k] = mount_point;
fd4f5a56 539 }
b653309a
SH
540 bret = true;
541
542out:
b653309a 543 fclose(proc_self_mountinfo);
b653309a 544 free(tokens);
2cdafc54 545 free(line);
b653309a
SH
546 return bret;
547}
548
4fb3cba5 549static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
550{
551 bool all_kernel_subsystems = true;
552 bool all_named_subsystems = false;
553 struct cgroup_meta_data *meta_data = NULL;
554 char **kernel_subsystems = NULL;
555 int saved_errno = 0;
556
557 /* if the subsystem whitelist is not specified, include all
558 * hierarchies that contain kernel subsystems by default but
559 * no hierarchies that only contain named subsystems
560 *
561 * if it is specified, the specifier @all will select all
562 * hierarchies, @kernel will select all hierarchies with
563 * kernel subsystems and @named will select all named
564 * hierarchies
565 */
566 all_kernel_subsystems = subsystem_whitelist ?
567 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
568 true;
569 all_named_subsystems = subsystem_whitelist ?
570 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 571 true;
b653309a
SH
572
573 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
574 if (!meta_data)
575 return NULL;
576 meta_data->ref = 1;
577
578 if (!find_cgroup_subsystems(&kernel_subsystems))
579 goto out_error;
580
581 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
582 all_named_subsystems, subsystem_whitelist))
583 goto out_error;
584
585 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
586 goto out_error;
fd4f5a56 587
33ad9f1a
CS
588 /* oops, we couldn't find anything */
589 if (!meta_data->hierarchies || !meta_data->mount_points) {
590 errno = EINVAL;
591 goto out_error;
ae5c8b8e 592 }
fd4f5a56 593
3a0abb3a 594 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
595 return meta_data;
596
597out_error:
598 saved_errno = errno;
33ad9f1a
CS
599 lxc_free_array((void **)kernel_subsystems, free);
600 lxc_cgroup_put_meta(meta_data);
601 errno = saved_errno;
602 return NULL;
fd4f5a56
DL
603}
604
4fb3cba5 605static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 606{
33ad9f1a
CS
607 meta_data->ref++;
608 return meta_data;
609}
e14f67a7 610
4fb3cba5 611static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
612{
613 size_t i;
614 if (!meta_data)
615 return NULL;
616 if (--meta_data->ref > 0)
617 return meta_data;
618 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
619 if (meta_data->hierarchies) {
620 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
621 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
e14f67a7 622 }
33ad9f1a 623 free(meta_data->hierarchies);
178938fe 624 free(meta_data);
33ad9f1a 625 return NULL;
e14f67a7
U
626}
627
4fb3cba5 628static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 629{
33ad9f1a
CS
630 size_t i;
631 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
632 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
633 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
634 return h;
e14f67a7 635 }
e14f67a7
U
636 return NULL;
637}
638
d3f99e96
SH
639static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
640{
641 return mp && access(mp->mount_point, F_OK) == 0;
642}
643
4fb3cba5 644static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 645{
33ad9f1a
CS
646 struct cgroup_mount_point **mps;
647 struct cgroup_mount_point *current_result = NULL;
648 ssize_t quality = -1;
b98f7d6e 649
33ad9f1a 650 /* trivial case */
d3f99e96 651 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
33ad9f1a 652 return hierarchy->rw_absolute_mount_point;
d3f99e96 653 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
33ad9f1a 654 return hierarchy->ro_absolute_mount_point;
b98f7d6e 655
33ad9f1a
CS
656 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
657 struct cgroup_mount_point *mp = *mps;
658 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 659
33ad9f1a
CS
660 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
661 prefix_len = 0;
b98f7d6e 662
d3f99e96
SH
663 if (!mountpoint_is_accessible(mp))
664 continue;
665
33ad9f1a
CS
666 if (should_be_writable && mp->read_only)
667 continue;
668
669 if (!prefix_len ||
670 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
671 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
672 /* search for the best quality match, i.e. the match with the
673 * shortest prefix where this group is still contained
674 */
675 if (quality == -1 || prefix_len < quality) {
676 current_result = mp;
677 quality = prefix_len;
678 }
b98f7d6e
SH
679 }
680 }
681
33ad9f1a
CS
682 if (!current_result)
683 errno = ENOENT;
684 return current_result;
b98f7d6e
SH
685}
686
/*
 * Resolve the cgroup @group of @subsystem (plus @suffix) to an absolute
 * path, using freshly loaded meta data that is released again before
 * returning.
 *
 * Returns the string produced by cgroup_to_absolute_path(), or NULL on
 * failure; on failure errno is restored to its value from before the
 * meta data was released.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_hierarchy *h;
	struct cgroup_mount_point *mp;
	char *abs_path = NULL;
	int err;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		return NULL;

	h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
	if (h) {
		mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
		if (mp)
			abs_path = cgroup_to_absolute_path(mp, group, suffix);
	}

	if (abs_path) {
		lxc_cgroup_put_meta(meta_data);
		return abs_path;
	}

	err = errno;
	lxc_cgroup_put_meta(meta_data);
	errno = err;
	return NULL;
}
720
/* Load the cgroup membership of process @pid by parsing /proc/<pid>/cgroup. */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
727
/* Load the cgroup membership of pid 1. */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 732
/*
 * Load the cgroup membership of the calling process. /proc/self/cgroup is
 * tried first; if that yields nothing, /proc/<getpid()>/cgroup is used.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *info;

	info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (info)
		return info;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
ae5c8b8e 741
692ba18f
SH
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid. Say that cgroup is
 * /sys/fs/cgroup/lxc/2975 and the container name is c1. We want to rename
 * the cgroup directory to /sys/fs/cgroup/lxc/c1.
 *
 * @mountpath: directory the hierarchy is mounted on
 * @oldname:   cgroup that contains the pid-named cgroup
 * @pid:       pid the existing cgroup is named after
 * @name:      container name to rename the cgroup to
 *
 * An already existing target directory is removed first (this only works
 * if it is empty).
 *
 * Returns the new cgroup name relative to the mount ("<oldname>/<name>") as
 * a malloc()ed string the caller has to free, or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = /ab/c1
	 */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* the conversion has to match the unsigned long argument */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
809
ea439aac
SH
810static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
811{
812 char **p;
813
814 for (p = h->subsystems; *p; p++) {
815 if (is_crucial_cgroup_subsystem(*p))
816 return true;
817 }
818 return false;
819}
820
33ad9f1a 821/* create a new cgroup */
4fb3cba5 822static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 823{
001b026e 824 char **cgroup_path_components = NULL;
33ad9f1a
CS
825 char **p = NULL;
826 char *path_so_far = NULL;
827 char **new_cgroup_paths = NULL;
828 char **new_cgroup_paths_sub = NULL;
829 struct cgroup_mount_point *mp;
830 struct cgroup_hierarchy *h;
831 struct cgroup_process_info *base_info = NULL;
832 struct cgroup_process_info *info_ptr;
833 int saved_errno;
834 int r;
835 unsigned suffix = 0;
836 bool had_sub_pattern = false;
837 size_t i;
ae5c8b8e 838
33ad9f1a
CS
839 if (!is_valid_cgroup(name)) {
840 ERROR("Invalid cgroup name: '%s'", name);
841 errno = EINVAL;
842 return NULL;
ae5c8b8e
SH
843 }
844
33ad9f1a
CS
845 if (!strstr(path_pattern, "%n")) {
846 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
847 errno = EINVAL;
848 return NULL;
849 }
fd37327f 850
33ad9f1a
CS
851 /* we will modify the result of this operation directly,
852 * so we don't have to copy the data structure
853 */
854 base_info = (path_pattern[0] == '/') ?
855 lxc_cgroup_process_info_get_init(meta_data) :
856 lxc_cgroup_process_info_get_self(meta_data);
857 if (!base_info)
858 return NULL;
c8f7c563 859
33ad9f1a
CS
860 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
861 if (!new_cgroup_paths)
862 goto out_initial_error;
863
864 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
865 if (!new_cgroup_paths_sub)
866 goto out_initial_error;
867
868 /* find mount points we can use */
869 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
870 h = info_ptr->hierarchy;
871 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
872 if (!mp) {
873 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
874 goto out_initial_error;
875 }
876 info_ptr->designated_mount_point = mp;
460a1cf0 877
692ba18f
SH
878 if (lxc_string_in_array("ns", (const char **)h->subsystems))
879 continue;
2edb53c7
SH
880 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
881 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 882 goto out_initial_error;
2edb53c7 883 }
33ad9f1a 884 }
b98f7d6e 885
33ad9f1a
CS
886 /* normalize the path */
887 cgroup_path_components = lxc_normalize_path(path_pattern);
888 if (!cgroup_path_components)
889 goto out_initial_error;
890
891 /* go through the path components to see if we can create them */
892 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
893 /* we only want to create the same component with -1, -2, etc.
894 * if the component contains the container name itself, otherwise
895 * it's not an error if it already exists
896 */
897 char *p_eff = *p ? *p : (char *)sub_pattern;
898 bool contains_name = strstr(p_eff, "%n");
899 char *current_component = NULL;
900 char *current_subpath = NULL;
901 char *current_entire_path = NULL;
902 char *parts[3];
903 size_t j = 0;
904 i = 0;
905
906 /* if we are processing the subpattern, we want to make sure
907 * loop is ended the next time around
908 */
909 if (!*p) {
910 had_sub_pattern = true;
911 p--;
912 }
b98f7d6e 913
33ad9f1a 914 goto find_name_on_this_level;
4fb3cba5 915
33ad9f1a
CS
916 cleanup_name_on_this_level:
917 /* This is reached if we found a name clash.
918 * In that case, remove the cgroup from all previous hierarchies
919 */
920 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
603c64c2 921 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false);
33ad9f1a
CS
922 if (r < 0)
923 WARN("could not clean up cgroup we created when trying to create container");
924 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
925 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
926 }
927 if (current_component != current_subpath)
928 free(current_subpath);
929 if (current_component != p_eff)
930 free(current_component);
931 current_component = current_subpath = NULL;
932 /* try again with another suffix */
933 ++suffix;
4fb3cba5 934
33ad9f1a
CS
935 find_name_on_this_level:
936 /* determine name of the path component we should create */
937 if (contains_name && suffix > 0) {
938 char *buf = calloc(strlen(name) + 32, 1);
939 if (!buf)
940 goto out_initial_error;
941 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
942 current_component = lxc_string_replace("%n", buf, p_eff);
943 free(buf);
944 } else {
945 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
946 }
947 parts[0] = path_so_far;
948 parts[1] = current_component;
949 parts[2] = NULL;
950 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
951
952 /* Now go through each hierarchy and try to create the
953 * corresponding cgroup
954 */
955 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
956 char *parts2[3];
692ba18f
SH
957
958 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
959 continue;
33ad9f1a
CS
960 current_entire_path = NULL;
961
962 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
963 parts2[1] = current_subpath;
964 parts2[2] = NULL;
965 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
966
967 if (!*p) {
968 /* we are processing the subpath, so only update that one */
969 free(new_cgroup_paths_sub[i]);
970 new_cgroup_paths_sub[i] = strdup(current_entire_path);
971 if (!new_cgroup_paths_sub[i])
972 goto cleanup_from_error;
973 } else {
974 /* remember which path was used on this controller */
975 free(new_cgroup_paths[i]);
976 new_cgroup_paths[i] = strdup(current_entire_path);
977 if (!new_cgroup_paths[i])
978 goto cleanup_from_error;
979 }
fd4f5a56 980
33ad9f1a
CS
981 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
982 if (r < 0 && errno == EEXIST && contains_name) {
983 /* name clash => try new name with new suffix */
984 free(current_entire_path);
985 current_entire_path = NULL;
986 goto cleanup_name_on_this_level;
987 } else if (r < 0 && errno != EEXIST) {
ea439aac
SH
988 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
989 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
990 goto cleanup_from_error;
991 }
992 goto skip;
33ad9f1a
CS
993 } else if (r == 0) {
994 /* successfully created */
995 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
996 if (r < 0)
997 goto cleanup_from_error;
d703c2b1 998 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 999 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
1000 goto cleanup_from_error;
1001 }
33ad9f1a
CS
1002 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1003 } else {
1004 /* if we didn't create the cgroup, then we have to make sure that
1005 * further cgroups will be created properly
1006 */
d703c2b1 1007 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 1008 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 1009 goto cleanup_from_error;
f6ac3b9e 1010 }
d703c2b1
RV
1011 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1012 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1013 goto cleanup_from_error;
1014 }
33ad9f1a 1015
ea439aac 1016skip:
33ad9f1a
CS
1017 /* already existed but path component of pattern didn't contain '%n',
1018 * so this is not an error; but then we don't need current_entire_path
1019 * anymore...
1020 */
1021 free(current_entire_path);
1022 current_entire_path = NULL;
1023 }
1024 }
fd4f5a56 1025
33ad9f1a
CS
1026 /* save path so far */
1027 free(path_so_far);
1028 path_so_far = strdup(current_subpath);
1029 if (!path_so_far)
1030 goto cleanup_from_error;
1031
1032 /* cleanup */
1033 if (current_component != current_subpath)
1034 free(current_subpath);
1035 if (current_component != p_eff)
1036 free(current_component);
1037 current_component = current_subpath = NULL;
1038 continue;
4fb3cba5 1039
33ad9f1a 1040 cleanup_from_error:
ec64264d 1041 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1042 * do some additional cleanup here
1043 */
1044 saved_errno = errno;
1045 if (current_component != current_subpath)
1046 free(current_subpath);
1047 if (current_component != p_eff)
1048 free(current_component);
1049 free(current_entire_path);
1050 errno = saved_errno;
1051 goto out_initial_error;
fd4f5a56
DL
1052 }
1053
33ad9f1a
CS
1054 /* we're done, now update the paths */
1055 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
47d8fb3b
CS
1056 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1057 * will take care of it
1058 * Since we do a continue in above loop, new_cgroup_paths[i] is
1059 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1060 */
47d8fb3b
CS
1061 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1062 continue;
1063 free(info_ptr->cgroup_path);
1064 info_ptr->cgroup_path = new_cgroup_paths[i];
1065 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1066 }
33ad9f1a
CS
1067 /* don't use lxc_free_array since we used the array members
1068 * to store them in our result...
1069 */
1070 free(new_cgroup_paths);
1071 free(new_cgroup_paths_sub);
1072 free(path_so_far);
1073 lxc_free_array((void **)cgroup_path_components, free);
1074 return base_info;
1075
1076out_initial_error:
1077 saved_errno = errno;
1078 free(path_so_far);
1079 lxc_cgroup_process_info_free_and_remove(base_info);
1080 lxc_free_array((void **)new_cgroup_paths, free);
1081 lxc_free_array((void **)new_cgroup_paths_sub, free);
1082 lxc_free_array((void **)cgroup_path_components, free);
1083 errno = saved_errno;
1084 return NULL;
c8f7c563
CS
1085}
1086
4fb3cba5 1087static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1088{
1089 struct cgroup_process_info *info_ptr;
1090 int r;
1091
1092 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1093 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1094 continue;
1095 /*
1096 * For any path which has ns cgroup mounted, handler->pid is already
1097 * moved into a container called '%d % (handler->pid)'. Rename it to
1098 * the cgroup name and record that.
1099 */
1100 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1101 info_ptr->cgroup_path, pid, name);
1102 if (!tmp)
1103 return -1;
1104 free(info_ptr->cgroup_path);
1105 info_ptr->cgroup_path = tmp;
1106 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1107 if (r < 0)
1108 return -1;
1109 tmp = strdup(tmp);
1110 if (!tmp)
1111 return -1;
1112 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1113 }
1114 return 0;
1115}
1116
33ad9f1a 1117/* get the cgroup membership of a given container */
4fb3cba5 1118static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1119{
33ad9f1a
CS
1120 struct cgroup_process_info *result = NULL;
1121 int saved_errno = 0;
1122 size_t i;
1123 struct cgroup_process_info **cptr = &result;
1124 struct cgroup_process_info *entry = NULL;
1125 char *path = NULL;
1126
1127 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1128 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1129 if (!h || !h->used)
1130 continue;
c8f7c563 1131
33ad9f1a
CS
1132 /* use the command interface to look for the cgroup */
1133 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1134 if (!path) {
1135 h->used = false;
1136 WARN("Not attaching to cgroup %s unknown to %s %s", h->subsystems[0], lxcpath, name);
1137 continue;
1138 }
33ad9f1a
CS
1139
1140 entry = calloc(1, sizeof(struct cgroup_process_info));
1141 if (!entry)
1142 goto out_error;
1143 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1144 entry->hierarchy = h;
1145 entry->cgroup_path = path;
1146 path = NULL;
1147
1148 /* it is not an error if we don't find anything here,
1149 * it is up to the caller to decide what to do in that
1150 * case */
1151 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1152
1153 *cptr = entry;
1154 cptr = &entry->next;
1155 entry = NULL;
c8f7c563
CS
1156 }
1157
33ad9f1a
CS
1158 return result;
1159out_error:
1160 saved_errno = errno;
1161 free(path);
1162 lxc_cgroup_process_info_free(result);
1163 lxc_cgroup_process_info_free(entry);
1164 errno = saved_errno;
1165 return NULL;
fd4f5a56
DL
1166}
1167
33ad9f1a 1168/* move a processs to the cgroups specified by the membership */
4fb3cba5 1169static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1170{
33ad9f1a
CS
1171 char pid_buf[32];
1172 char *cgroup_tasks_fn;
1173 int r;
1174 struct cgroup_process_info *info_ptr;
1175
1176 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1177 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1178 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1179 info_ptr->cgroup_path_sub :
1180 info_ptr->cgroup_path;
1181
1182 if (!info_ptr->designated_mount_point) {
1183 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1184 if (!info_ptr->designated_mount_point) {
1185 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1186 return -1;
1187 }
1188 }
4f17323e 1189
33ad9f1a
CS
1190 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1191 if (!cgroup_tasks_fn) {
1192 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1193 return -1;
1194 }
4f17323e 1195
33ad9f1a 1196 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1197 free(cgroup_tasks_fn);
ea439aac 1198 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
33ad9f1a
CS
1199 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1200 return -1;
1201 }
4f17323e
CS
1202 }
1203
33ad9f1a 1204 return 0;
4f17323e
CS
1205}
1206
33ad9f1a
CS
1207/* free process membership information */
1208void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1209{
33ad9f1a
CS
1210 struct cgroup_process_info *next;
1211 if (!info)
b98f7d6e 1212 return;
33ad9f1a
CS
1213 next = info->next;
1214 lxc_cgroup_put_meta(info->meta_ref);
1215 free(info->cgroup_path);
1216 free(info->cgroup_path_sub);
1217 lxc_free_array((void **)info->created_paths, free);
1218 free(info);
1219 lxc_cgroup_process_info_free(next);
fc7de561
SH
1220}
1221
33ad9f1a
CS
1222/* free process membership information and remove cgroups that were created */
1223void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info)
b98f7d6e 1224{
33ad9f1a
CS
1225 struct cgroup_process_info *next;
1226 char **pp;
1227 if (!info)
1228 return;
1229 next = info->next;
603c64c2 1230 {
33ad9f1a
CS
1231 struct cgroup_mount_point *mp = info->designated_mount_point;
1232 if (!mp)
1233 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1234 if (mp)
1235 /* ignore return value here, perhaps we created the
1236 * '/lxc' cgroup in this container but another container
1237 * is still running (for example)
1238 */
603c64c2
SH
1239 (void)remove_cgroup(mp, info->cgroup_path, true);
1240 }
1241 for (pp = info->created_paths; pp && *pp; pp++);
1242 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1243 free(*pp);
b98f7d6e 1244 }
33ad9f1a
CS
1245 free(info->created_paths);
1246 lxc_cgroup_put_meta(info->meta_ref);
1247 free(info->cgroup_path);
1248 free(info->cgroup_path_sub);
1249 free(info);
9431aa65 1250 lxc_cgroup_process_info_free_and_remove(next);
33ad9f1a 1251}
b98f7d6e 1252
4fb3cba5 1253static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1254{
d4ef7c50
SH
1255 struct cgroup_process_info *info = d->info;
1256 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1257 if (!info)
1258 return NULL;
f348e47c 1259 prune_init_scope(info->cgroup_path);
33ad9f1a 1260 return info->cgroup_path;
b98f7d6e
SH
1261}
1262
4fb3cba5 1263static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1264{
d4ef7c50 1265 struct cgroup_process_info *info = d->info;
33ad9f1a 1266 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1267
1268 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1269 if (!info)
1270 return NULL;
1271 if (info->designated_mount_point) {
8900b9eb 1272 mp = info->designated_mount_point;
33ad9f1a
CS
1273 } else {
1274 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1275 if (!mp)
1276 return NULL;
b98f7d6e 1277 }
33ad9f1a 1278 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1279}
55c76589 1280
4fb3cba5 1281static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1282{
33ad9f1a
CS
1283 struct cgroup_meta_data *meta;
1284 struct cgroup_process_info *base_info, *info;
1285 struct cgroup_mount_point *mp;
1286 char *result = NULL;
33ad9f1a
CS
1287
1288 meta = lxc_cgroup_load_meta();
1289 if (!meta)
9a93d992 1290 return NULL;
33ad9f1a
CS
1291 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1292 if (!base_info)
178938fe 1293 goto out1;
33ad9f1a
CS
1294 info = find_info_for_subsystem(base_info, subsystem);
1295 if (!info)
178938fe 1296 goto out2;
33ad9f1a 1297 if (info->designated_mount_point) {
8900b9eb 1298 mp = info->designated_mount_point;
33ad9f1a
CS
1299 } else {
1300 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1301 if (!mp)
178938fe 1302 goto out3;
33ad9f1a
CS
1303 }
1304 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1305out3:
178938fe 1306out2:
33ad9f1a 1307 lxc_cgroup_process_info_free(base_info);
178938fe 1308out1:
33ad9f1a 1309 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1310 return result;
1311}
9a93d992 1312
/*
 * Write value to the cgroup file filename using the membership data in d.
 *
 * The subsystem is the part of filename before its first '.' (the whole
 * name if there is none). errno is preset to ENOENT before the hierarchy
 * lookup. Returns the result of do_cgroup_set(), with its errno preserved
 * across the cleanup, or -1 when no path could be determined.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller = alloca(ctrl_len + 1);
	char *dir;
	int rc, saved_errno;

	/* controller := filename up to (not including) the first '.' */
	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	errno = ENOENT;
	dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	saved_errno = errno;
	free(dir);
	errno = saved_errno;

	return rc;
}
9a93d992 1333
/*
 * Write value to the cgroup file filename of the container (name, lxcpath).
 *
 * The subsystem is the part of filename before its first '.' (the whole
 * name if there is none). Returns the result of do_cgroup_set(), or -1 when
 * the cgroup directory of the container could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller = alloca(ctrl_len + 1);
	char *dir;
	int rc;

	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	free(dir);
	return rc;
}
1351
/*
 * Read the cgroup file filename of the container (name, lxcpath) into
 * value, passing len through to do_cgroup_get().
 *
 * The subsystem is the part of filename before its first '.' (the whole
 * name if there is none). Returns the result of do_cgroup_get(), or -1 when
 * the cgroup directory of the container could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t ctrl_len = strcspn(filename, ".");
	char *controller = alloca(ctrl_len + 1);
	char *dir;
	int rc;

	memcpy(controller, filename, ctrl_len);
	controller[ctrl_len] = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_get(dir, filename, value, len);
	free(dir);
	return rc;
}
1369
4fb3cba5 1370static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1371{
1372 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1373 char *path = NULL;
1374 char **parts = NULL;
1375 char *dirname = NULL;
1376 char *abs_path = NULL;
1377 char *abs_path2 = NULL;
d4ef7c50
SH
1378 struct cgfs_data *cgfs_d;
1379 struct cgroup_process_info *info, *base_info;
aae1f3c4
CS
1380 int r, saved_errno = 0;
1381
4608594e
SH
1382 if (cgns_supported())
1383 return true;
1384
4fb3cba5
DE
1385 cgfs_d = hdata;
1386 if (!cgfs_d)
1387 return false;
d4ef7c50
SH
1388 base_info = cgfs_d->info;
1389
0769b82a
CS
1390 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1391 * have access to the lxc_conf object at this point. It really should be up
1392 * to the caller to fix this, but this doesn't really hurt.
1393 */
1394 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1395 type = LXC_AUTO_CGROUP_FULL_MIXED;
1396 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1397 type = LXC_AUTO_CGROUP_MIXED;
1398
7997d7da
CS
1399 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1400 ERROR("could not mount cgroups into container: invalid type specified internally");
1401 errno = EINVAL;
c476bdce 1402 return false;
7997d7da
CS
1403 }
1404
aae1f3c4
CS
1405 path = calloc(1, bufsz);
1406 if (!path)
c476bdce 1407 return false;
aae1f3c4 1408 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1409 r = safe_mount("cgroup_root", path, "tmpfs",
1410 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1411 "size=10240k,mode=755",
1412 root);
aae1f3c4
CS
1413 if (r < 0) {
1414 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1415 return false;
aae1f3c4
CS
1416 }
1417
1418 /* now mount all the hierarchies we care about */
1419 for (info = base_info; info; info = info->next) {
1420 size_t subsystem_count, i;
1421 struct cgroup_mount_point *mp = info->designated_mount_point;
d3f99e96 1422 if (!mountpoint_is_accessible(mp))
aae1f3c4 1423 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
d3f99e96 1424
aae1f3c4
CS
1425 if (!mp) {
1426 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1427 goto out_error;
1428 }
1429
1430 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1431 parts = calloc(subsystem_count + 1, sizeof(char *));
1432 if (!parts)
1433 goto out_error;
1434
1435 for (i = 0; i < subsystem_count; i++) {
1436 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1437 parts[i] = info->hierarchy->subsystems[i] + 5;
1438 else
1439 parts[i] = info->hierarchy->subsystems[i];
1440 }
1441 dirname = lxc_string_join(",", (const char **)parts, false);
1442 if (!dirname)
1443 goto out_error;
1444
1445 /* create subsystem directory */
1446 abs_path = lxc_append_paths(path, dirname);
1447 if (!abs_path)
1448 goto out_error;
1449 r = mkdir_p(abs_path, 0755);
1450 if (r < 0 && errno != EEXIST) {
1451 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1452 goto out_error;
1453 }
1454
aae1f3c4
CS
1455 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1456 if (!abs_path2)
1457 goto out_error;
aae1f3c4 1458
7997d7da
CS
1459 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1460 /* bind-mount the cgroup entire filesystem there */
1461 if (strcmp(mp->mount_prefix, "/") != 0) {
1462 /* FIXME: maybe we should just try to remount the entire hierarchy
1463 * with a regular mount command? may that works? */
1464 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1465 goto out_error;
1466 }
1467 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1468 if (r < 0) {
1469 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1470 goto out_error;
1471 }
f8f3c3c0
SG
1472 /* main cgroup path should be read-only */
1473 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1474 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1475 if (r < 0) {
1476 SYSERROR("error re-mounting %s readonly", abs_path);
1477 goto out_error;
1478 }
1479 }
7997d7da
CS
1480 /* own cgroup should be read-write */
1481 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1482 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1483 if (r < 0) {
1484 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1485 goto out_error;
1486 }
1487 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1488 if (r < 0) {
1489 SYSERROR("error re-mounting %s readwrite", abs_path2);
1490 goto out_error;
1491 }
1492 }
1493 } else {
1494 /* create path for container's cgroup */
1495 r = mkdir_p(abs_path2, 0755);
1496 if (r < 0 && errno != EEXIST) {
1497 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1498 goto out_error;
1499 }
aae1f3c4 1500
b46f0553
CS
1501 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1502 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1503 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1504 * read-write (see comment below)
1505 */
1506 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1507 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1508 if (r < 0) {
1509 SYSERROR("error bind-mounting %s onto itself", abs_path);
1510 goto out_error;
1511 }
1512 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1513 if (r < 0) {
1514 SYSERROR("error re-mounting %s readonly", abs_path);
1515 goto out_error;
1516 }
1517 }
1518
7997d7da
CS
1519 free(abs_path);
1520 abs_path = NULL;
1521
1522 /* bind-mount container's cgroup to that directory */
1523 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1524 if (!abs_path)
1525 goto out_error;
1526 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
ea439aac 1527 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
7997d7da
CS
1528 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1529 goto out_error;
1530 }
1531 if (type == LXC_AUTO_CGROUP_RO) {
1532 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1533 if (r < 0) {
1534 SYSERROR("error re-mounting %s readonly", abs_path2);
1535 goto out_error;
1536 }
1537 }
aae1f3c4
CS
1538 }
1539
1540 free(abs_path);
1541 free(abs_path2);
1542 abs_path = NULL;
1543 abs_path2 = NULL;
1544
1545 /* add symlinks for every single subsystem */
1546 if (subsystem_count > 1) {
1547 for (i = 0; i < subsystem_count; i++) {
1548 abs_path = lxc_append_paths(path, parts[i]);
1549 if (!abs_path)
1550 goto out_error;
1551 r = symlink(dirname, abs_path);
1552 if (r < 0)
1553 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1554 free(abs_path);
1555 abs_path = NULL;
1556 }
1557 }
1558 free(dirname);
1559 free(parts);
1560 dirname = NULL;
1561 parts = NULL;
1562 }
1563
b46f0553
CS
1564 /* We used to remount the entire tmpfs readonly if any :ro or
1565 * :mixed mode was specified. However, Ubuntu's mountall has the
1566 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1567 * mounted read-only and cannot be remounted read-write.
1568 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1569 * these if they are not already mounted with the right options;
1570 * it contains an entry for /sys/fs/cgroup. In case it can't do
1571 * that, it prompts for the user to either manually fix it or
1572 * boot anyway. But without user input, booting of the container
1573 * hangs.)
1574 *
1575 * Instead of remounting the entire tmpfs readonly, we only
1576 * remount the paths readonly that are part of the cgroup
1577 * hierarchy.
f8f3c3c0 1578 */
f8f3c3c0 1579
aae1f3c4
CS
1580 free(path);
1581
c476bdce 1582 return true;
aae1f3c4
CS
1583
1584out_error:
1585 saved_errno = errno;
1586 free(path);
1587 free(dirname);
1588 free(parts);
1589 free(abs_path);
1590 free(abs_path2);
1591 errno = saved_errno;
c476bdce 1592 return false;
aae1f3c4
CS
1593}
1594
4fb3cba5 1595static int cgfs_nrtasks(void *hdata)
33ad9f1a 1596{
4fb3cba5
DE
1597 struct cgfs_data *d = hdata;
1598 struct cgroup_process_info *info;
33ad9f1a
CS
1599 struct cgroup_mount_point *mp = NULL;
1600 char *abs_path = NULL;
1601 int ret;
460a1cf0 1602
4fb3cba5
DE
1603 if (!d) {
1604 errno = ENOENT;
1605 return -1;
1606 }
1607
1608 info = d->info;
33ad9f1a
CS
1609 if (!info) {
1610 errno = ENOENT;
1611 return -1;
b98f7d6e 1612 }
c8f7c563 1613
33ad9f1a 1614 if (info->designated_mount_point) {
8900b9eb 1615 mp = info->designated_mount_point;
33ad9f1a
CS
1616 } else {
1617 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1618 if (!mp)
1619 return -1;
c8f7c563
CS
1620 }
1621
33ad9f1a
CS
1622 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1623 if (!abs_path)
1624 return -1;
1625
1626 ret = cgroup_recursive_task_count(abs_path);
1627 free(abs_path);
1628 return ret;
c8f7c563
CS
1629}
1630
574c4428
QH
1631static struct cgroup_process_info *
1632lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1633 struct cgroup_meta_data *meta)
d08ba6ec 1634{
33ad9f1a
CS
1635 struct cgroup_process_info *result = NULL;
1636 FILE *proc_pid_cgroup = NULL;
1637 char *line = NULL;
1638 size_t sz = 0;
1639 int saved_errno = 0;
1640 struct cgroup_process_info **cptr = &result;
1641 struct cgroup_process_info *entry = NULL;
1642
1643 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1644 if (!proc_pid_cgroup)
b98f7d6e 1645 return NULL;
1ac470c0 1646
33ad9f1a
CS
1647 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1648 /* file format: hierarchy:subsystems:group */
1649 char *colon1;
1650 char *colon2;
1651 char *endptr;
1652 int hierarchy_number;
1653 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1654
33ad9f1a 1655 if (!line[0])
ae5c8b8e 1656 continue;
b98f7d6e 1657
33ad9f1a
CS
1658 if (line[strlen(line) - 1] == '\n')
1659 line[strlen(line) - 1] = '\0';
1660
1661 colon1 = strchr(line, ':');
1662 if (!colon1)
8900b9eb 1663 continue;
33ad9f1a
CS
1664 *colon1++ = '\0';
1665 colon2 = strchr(colon1, ':');
1666 if (!colon2)
ae5c8b8e 1667 continue;
33ad9f1a 1668 *colon2++ = '\0';
e4659536 1669
33ad9f1a
CS
1670 endptr = NULL;
1671 hierarchy_number = strtoul(line, &endptr, 10);
1672 if (!endptr || *endptr)
9a93d992 1673 continue;
9a93d992 1674
33ad9f1a
CS
1675 if (hierarchy_number > meta->maximum_hierarchy) {
1676 /* we encountered a hierarchy we didn't have before,
1677 * so probably somebody remounted some stuff in the
1678 * mean time...
1679 */
1680 errno = EAGAIN;
1681 goto out_error;
b98f7d6e 1682 }
33ad9f1a
CS
1683
1684 h = meta->hierarchies[hierarchy_number];
1685 if (!h) {
1686 /* we encountered a hierarchy that was thought to be
1687 * dead before, so probably somebody remounted some
1688 * stuff in the mean time...
1689 */
1690 errno = EAGAIN;
1691 goto out_error;
b98f7d6e 1692 }
33ad9f1a
CS
1693
1694 /* we are told that we should ignore this hierarchy */
1695 if (!h->used)
b98f7d6e 1696 continue;
5193cc3d 1697
33ad9f1a
CS
1698 entry = calloc(1, sizeof(struct cgroup_process_info));
1699 if (!entry)
1700 goto out_error;
fd4f5a56 1701
33ad9f1a
CS
1702 entry->meta_ref = lxc_cgroup_get_meta(meta);
1703 entry->hierarchy = h;
1704 entry->cgroup_path = strdup(colon2);
1705 if (!entry->cgroup_path)
1706 goto out_error;
3939a22a 1707 prune_init_scope(entry->cgroup_path);
d08ba6ec 1708
33ad9f1a
CS
1709 *cptr = entry;
1710 cptr = &entry->next;
1711 entry = NULL;
b98f7d6e 1712 }
b98f7d6e 1713
33ad9f1a
CS
1714 fclose(proc_pid_cgroup);
1715 free(line);
1716 return result;
1717
1718out_error:
1719 saved_errno = errno;
1720 if (proc_pid_cgroup)
1721 fclose(proc_pid_cgroup);
1722 lxc_cgroup_process_info_free(result);
1723 lxc_cgroup_process_info_free(entry);
1724 free(line);
1725 errno = saved_errno;
ae5c8b8e 1726 return NULL;
36b86299
DL
1727}
1728
574c4428
QH
/*
 * Turn a comma-separated mount option string into a NULL-terminated array
 * of newly allocated subsystem names.
 *
 * A token is copied verbatim when it already starts with "name=" or when
 * it appears in kernel_list. Any other token is stored with "name="
 * prepended (e.g. 'systemd' becomes 'name=systemd').
 *
 * Returns the array, or NULL with errno preserved when an allocation
 * fails. If mount_options contains no token the result is NULL as well.
 */
static char **subsystems_from_mount_options(const char *mount_options,
		char **kernel_list)
{
	char **list = NULL;
	size_t capacity = 0;
	size_t count = 0;
	char *state = NULL;
	char *opts;
	char *tok;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a private copy */
	opts = alloca(strlen(mount_options)+1);
	strcpy(opts, mount_options);

	tok = strtok_r(opts, ",", &state);
	while (tok) {
		char *entry;

		if (lxc_grow_array((void ***)&list, &capacity, count + 1, 12) < 0)
			goto fail;
		list[count + 1] = NULL;

		if (!strncmp(tok, "name=", 5) || lxc_string_in_array(tok, (const char **)kernel_list)) {
			entry = strdup(tok);
		} else {
			/* this is eg 'systemd' but the mount will be 'name=systemd' */
			entry = malloc(strlen(tok) + 6);
			if (entry)
				sprintf(entry, "name=%s", tok);
		}

		list[count] = entry;
		if (!entry)
			goto fail;
		count++;

		tok = strtok_r(NULL, ",", &state);
	}

	return list;

fail:
	saved_errno = errno;
	lxc_free_array((void**)list, free);
	errno = saved_errno;
	return NULL;
}
1770
574c4428 1771static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1772{
33ad9f1a
CS
1773 if (!mp)
1774 return;
1775 free(mp->mount_point);
1776 free(mp->mount_prefix);
1777 free(mp);
bcbd102c
SH
1778}
1779
574c4428 1780static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1781{
33ad9f1a
CS
1782 if (!h)
1783 return;
1784 lxc_free_array((void **)h->subsystems, free);
8bfcb981 1785 free(h->all_mount_points);
33ad9f1a
CS
1786 free(h);
1787}
341a9bd8 1788
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * Rejected are "." and "..", and any name containing '/' or a character
 * outside the printable ASCII range. SPACE (32) is kicked out as well
 * because it would break legacy lxc-ls.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t idx;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (idx = 0; name[idx] != '\0'; idx++) {
		char c = name[idx];

		if (c <= 32 || c >= 127 || c == '/')
			return false;
	}

	return true;
}
341a9bd8 1802
574c4428
QH
/*
 * Create or remove the directory of cgroup path below mount point mp.
 *
 * With do_remove unset the directory is created with mkdir(). With
 * do_remove set it is removed, through cgroup_rmdir() when recurse is
 * non-zero and through plain rmdir() otherwise.
 *
 * Returns -1 if the absolute path cannot be built, else the result of the
 * directory operation with its errno preserved.
 */
static int create_or_remove_cgroup(bool do_remove,
		struct cgroup_mount_point *mp, const char *path, int recurse)
{
	char *target = cgroup_to_absolute_path(mp, path, NULL);
	int rc, err;

	if (!target)
		return -1;

	if (!do_remove)
		rc = mkdir(target, 0777);
	else if (recurse)
		rc = cgroup_rmdir(target);
	else
		rc = rmdir(target);

	/* free() must not clobber the errno of the operation above */
	err = errno;
	free(target);
	errno = err;

	return rc;
}
bcbd102c 1824
/*
 * Create the cgroup path below mount point mp. This is
 * create_or_remove_cgroup() in "create" mode; its return value is passed on.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	int rc;

	rc = create_or_remove_cgroup(false, mp, path, false);
	return rc;
}
1829
574c4428
QH
/*
 * Remove the cgroup path below mount point mp, recursively if recurse is
 * set. This is create_or_remove_cgroup() in "remove" mode; its return value
 * is passed on.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
		const char *path, bool recurse)
{
	int rc;

	rc = create_or_remove_cgroup(true, mp, path, recurse);
	return rc;
}
576f946d 1835
574c4428
QH
1836static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1837 const char *path, const char *suffix)
33ad9f1a
CS
1838{
1839 /* first we have to make sure we subtract the mount point's prefix */
1840 char *prefix = mp->mount_prefix;
1841 char *buf;
1842 ssize_t len, rv;
1843
1844 /* we want to make sure only absolute paths to cgroups are passed to us */
1845 if (path[0] != '/') {
1846 errno = EINVAL;
1847 return NULL;
1848 }
b98f7d6e 1849
33ad9f1a
CS
1850 if (prefix && !strcmp(prefix, "/"))
1851 prefix = NULL;
b98f7d6e 1852
33ad9f1a
CS
1853 /* prefix doesn't match */
1854 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1855 errno = EINVAL;
1856 return NULL;
1857 }
1858 /* if prefix is /foo and path is /foobar */
1859 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1860 errno = EINVAL;
1861 return NULL;
1862 }
b98f7d6e 1863
33ad9f1a
CS
1864 /* remove prefix from path */
1865 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1866
33ad9f1a
CS
1867 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1868 buf = calloc(len + 1, 1);
50266dc6
DE
1869 if (!buf)
1870 return NULL;
33ad9f1a 1871 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1872 if (rv > len) {
33ad9f1a
CS
1873 free(buf);
1874 errno = ENOMEM;
8900b9eb 1875 return NULL;
8b92dc3a 1876 }
576f946d 1877
33ad9f1a 1878 return buf;
e0f888d9 1879}
283678ed 1880
574c4428
QH
1881static struct cgroup_process_info *
1882find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1883{
33ad9f1a
CS
1884 struct cgroup_process_info *info_ptr;
1885 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1886 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1887 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1888 return info_ptr;
b98f7d6e 1889 }
33ad9f1a
CS
1890 errno = ENOENT;
1891 return NULL;
1892}
283678ed 1893
574c4428
QH
/*
 * Read the cgroup file "<cgroup_path>/<sub_filename>" into 'value'
 * (at most 'len' bytes, as handled by lxc_read_from_file()).
 *
 * Returns whatever lxc_read_from_file() returns, or -1 if the file name
 * could not be assembled. errno of the read is preserved.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
		char *value, size_t len)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int rc, err;

	file_path = lxc_string_join("/", components, false);
	if (!file_path)
		return -1;

	rc = lxc_read_from_file(file_path, value, len);

	/* keep the read's errno across free() */
	err = errno;
	free(file_path);
	errno = err;

	return rc;
}
b113383b 1915
574c4428
QH
/*
 * Write the NUL-terminated string 'value' (without its terminator) to the
 * cgroup file "<cgroup_path>/<sub_filename>".
 *
 * Returns whatever lxc_write_to_file() returns, or -1 if the file name
 * could not be assembled. errno of the write is preserved.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
		const char *value)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int rc, err;

	file_path = lxc_string_join("/", components, false);
	if (!file_path)
		return -1;

	rc = lxc_write_to_file(file_path, value, strlen(value), false);

	/* keep the write's errno across free() */
	err = errno;
	free(file_path);
	errno = err;

	return rc;
}
1937
4fb3cba5 1938static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 1939 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 1940{
365d180a 1941 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
1942 struct lxc_cgroup *cg;
1943 int ret = -1;
1944
33ad9f1a 1945 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
1946 return 0;
1947
aaf26830 1948 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
1949 if (!sorted_cgroup_settings) {
1950 return -1;
1951 }
aaf26830
KT
1952
1953 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
1954 cg = iterator->elem;
1955
33ad9f1a 1956 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 1957 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 1958 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
1959 continue;
1960 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 1961 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 1962 continue;
4fb3cba5 1963 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 1964 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
1965 WARN("Error setting %s to %s for %s",
1966 cg->subsystem, cg->value, d->name);
1967 continue;
1968 }
dddf7c5b 1969 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 1970 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
1971 goto out;
1972 }
b113383b 1973 }
b98f7d6e
SH
1974
1975 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
1976 }
1977
b98f7d6e
SH
1978 ret = 0;
1979 INFO("cgroup has been setup");
1980out:
365d180a 1981 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
1982 lxc_list_del(iterator);
1983 free(iterator);
1984 }
365d180a 1985 free(sorted_cgroup_settings);
b113383b
SH
1986 return ret;
1987}
b98f7d6e 1988
/*
 * Check the container's devices.list to find out whether writing the
 * devices.allow / devices.deny value 'v' would be redundant.
 *
 * for_allow == true:  returns true if devices.list already contains
 *                     "a *:* rwm" or a line equal to 'v'.
 * for_allow == false: only "a" and "a *:* rwm" are considered (everything
 *                     else returns false); returns true if devices.list
 *                     does not contain "a *:* rwm".
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
		char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	// not sure they ever do, but they *could*
	// right now, I'm assuming they do NOT
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;
	path = lxc_string_join("/", parts, false);
	/* the hierarchy path is only needed to build 'path'; release it on
	 * every path, not just when the join fails
	 */
	free((void *)parts[0]);
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		/* strip the trailing newline before comparing */
		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';
		if (strcmp(line, "a *:* rwm") == 0) {
			ret = for_allow;
			goto out;
		} else if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			goto out;
		}
	}

out:
	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2043
/*
 * Count the lines of every "tasks" file found in 'cgroup_path' and,
 * recursively, in all of its subdirectories.
 *
 * Returns the accumulated count; 0 if the directory cannot be opened;
 * -1 if a sub path cannot be built or stat()ed. Negative results from
 * subdirectories or from count_lines() are ignored rather than propagated.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *d;
	struct dirent *dent;
	int n = 0, r;

	d = opendir(cgroup_path);
	if (!d)
		return 0;

	/* readdir() instead of the deprecated readdir_r(): no caller-sized
	 * entry buffer is needed and each DIR stream is private to this call
	 */
	while ((dent = readdir(d)) != NULL) {
		const char *parts[3] = {
			cgroup_path,
			dent->d_name,
			NULL
		};
		char *sub_path;
		struct stat st;

		if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
			continue;
		sub_path = lxc_string_join("/", parts, false);
		if (!sub_path) {
			closedir(d);
			return -1;
		}
		r = stat(sub_path, &st);
		if (r < 0) {
			closedir(d);
			free(sub_path);
			return -1;
		}
		if (S_ISDIR(st.st_mode)) {
			r = cgroup_recursive_task_count(sub_path);
			if (r >= 0)
				n += r;
		} else if (!strcmp(dent->d_name, "tasks")) {
			r = count_lines(sub_path);
			if (r >= 0)
				n += r;
		}
		free(sub_path);
	}
	closedir(d);

	return n;
}
2106
/*
 * Count how many lines getline() can read from file 'fn'.
 *
 * Returns the number of lines, or -1 if the file cannot be opened.
 */
static int count_lines(const char *fn)
{
	char *buf = NULL;
	size_t cap = 0;
	int lines;
	FILE *fp = fopen_cloexec(fn, "r");

	if (!fp)
		return -1;

	for (lines = 0; getline(&buf, &cap, fp) != -1; lines++)
		;

	free(buf);
	fclose(fp);
	return lines;
}
2125
574c4428
QH
2126static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2127 char *cgroup_path)
b98f7d6e 2128{
33ad9f1a 2129 int r, saved_errno = 0;
7e7243e1 2130 char buf[2];
1ea59ad2 2131
934b1673
SH
2132 mp->need_cpuset_init = false;
2133
1ea59ad2
SH
2134 /* If this is the memory cgroup, we want to enforce hierarchy.
2135 * But don't fail if for some reason we can't.
2136 */
2edb53c7
SH
2137 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2138 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2139 if (cc_path) {
2140 r = lxc_read_from_file(cc_path, buf, 1);
2141 if (r < 1 || buf[0] != '1') {
2142 r = lxc_write_to_file(cc_path, "1", 1, false);
2143 if (r < 0)
a8916143 2144 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2145 }
1ea59ad2
SH
2146 free(cc_path);
2147 }
2edb53c7 2148 }
1ea59ad2 2149
33ad9f1a
CS
2150 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2151 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2152 * and cpuset.cpus and then
2153 */
2edb53c7
SH
2154 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2155 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2156 struct stat sb;
2157
33ad9f1a 2158 if (!cc_path)
2edb53c7 2159 return -1;
d703c2b1
RV
2160 /* cgroup.clone_children is not available when running under
2161 * older kernel versions; in this case, we'll initialize
2162 * cpuset.cpus and cpuset.mems later, after the new cgroup
2163 * was created
2164 */
2165 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2166 mp->need_cpuset_init = true;
d703c2b1
RV
2167 free(cc_path);
2168 return 0;
2169 }
7e7243e1
SH
2170 r = lxc_read_from_file(cc_path, buf, 1);
2171 if (r == 1 && buf[0] == '1') {
2172 free(cc_path);
2edb53c7 2173 return 0;
7e7243e1 2174 }
33ad9f1a 2175 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2176 saved_errno = errno;
2177 free(cc_path);
2178 errno = saved_errno;
2179 return r < 0 ? -1 : 0;
33ad9f1a
CS
2180 }
2181 return 0;
b98f7d6e 2182}
484ed030 2183
/*
 * Read file 'fn' into 'buf' via lxc_read_from_file() and NUL-terminate the
 * result.
 *
 * If the read filled the whole buffer, the last byte is overwritten with
 * the terminator and the full count is still returned, so a return value
 * equal to 'bufsize' tells the caller the content may be truncated.
 *
 * Returns the number of bytes read, a negative value if the read failed,
 * or -1 for the unexpected case of a zero-sized buffer.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if (nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	/* Callers don't do this, but regression/sanity check */
	if (bufsize == 0) {
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: obviously this wasn't empty */
	buf[bufsize - 1] = '\0';
	return nread;
}
2204
/*
 * Initialize one cpuset file of cgroup 'path' from its parent cgroup.
 *
 * mp:   mount point of the cpuset hierarchy
 * path: cgroup path of the child cgroup
 * name: file name including the leading '/', e.g. "/cpuset.cpus"
 *
 * If the child's file already holds a non-empty value nothing is written.
 * Otherwise the value of the same file in the parent cgroup is copied.
 *
 * Returns true if the child file holds a value afterwards, false on any
 * failure (including a parent value that does not fit the local buffer).
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
		const char *path, const char *name)
{
	char value[1024];
	char *childfile, *parentfile = NULL;
	char *parent_cgroup, *slash;
	bool ok = false;
	int nread;

	childfile = cgroup_to_absolute_path(mp, path, name);
	if (!childfile)
		return false;

	/* don't overwrite a non-empty value in the file */
	nread = cgroup_read_from_file(childfile, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting off the last path component */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto out;

	slash = strrchr(parent_cgroup, '/');
	if (!slash) {
		free(parent_cgroup);
		goto out;
	}
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	/* path to the same name in the parent cgroup */
	parentfile = cgroup_to_absolute_path(mp, parent_cgroup, name);
	free(parent_cgroup);
	if (!parentfile)
		goto out;

	/* copy from parent to child cgroup */
	nread = cgroup_read_from_file(parentfile, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (nread == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	ok = (lxc_write_to_file(childfile, value, strlen(value), false) >= 0);
	if (!ok)
		SYSERROR("failed writing %s", childfile);

out:
	free(parentfile);
	free(childfile);
	return ok;
}
2261
2262static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2263 const char *path)
2264{
2265 /* the files we have to handle here are only in cpuset hierarchies */
2266 if (!lxc_string_in_array("cpuset",
2267 (const char **)mp->hierarchy->subsystems))
2268 return true;
2269
b1dad6f6
RV
2270 if (!mp->need_cpuset_init)
2271 return true;
2272
d703c2b1
RV
2273 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2274 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2275}
2276
4fb3cba5 2277struct cgroup_ops *cgfs_ops_init(void)
484ed030 2278{
4fb3cba5 2279 return &cgfs_ops;
d4ef7c50 2280}
484ed030 2281
4fb3cba5 2282static void *cgfs_init(const char *name)
d4ef7c50 2283{
4fb3cba5 2284 struct cgfs_data *d;
484ed030 2285
4fb3cba5
DE
2286 d = malloc(sizeof(*d));
2287 if (!d)
2288 return NULL;
484ed030 2289
4fb3cba5
DE
2290 memset(d, 0, sizeof(*d));
2291 d->name = strdup(name);
2292 if (!d->name)
2293 goto err1;
2294
5e1c5795 2295 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2296
2297 d->meta = lxc_cgroup_load_meta();
2298 if (!d->meta) {
2299 ERROR("cgroupfs failed to detect cgroup metadata");
2300 goto err2;
2301 }
2302 return d;
2303
2304err2:
2305 free(d->name);
2306err1:
2307 free(d);
2308 return NULL;
d4ef7c50 2309}
484ed030 2310
4fb3cba5 2311static void cgfs_destroy(void *hdata)
d4ef7c50 2312{
4fb3cba5
DE
2313 struct cgfs_data *d = hdata;
2314
d4ef7c50
SH
2315 if (!d)
2316 return;
f10fad2f 2317 free(d->name);
c55d4505
ME
2318 lxc_cgroup_process_info_free_and_remove(d->info);
2319 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2320 free(d);
d4ef7c50 2321}
484ed030 2322
4fb3cba5 2323static inline bool cgfs_create(void *hdata)
d4ef7c50 2324{
4fb3cba5
DE
2325 struct cgfs_data *d = hdata;
2326 struct cgroup_process_info *i;
2327 struct cgroup_meta_data *md;
484ed030 2328
4fb3cba5 2329 if (!d)
d4ef7c50 2330 return false;
4fb3cba5
DE
2331 md = d->meta;
2332 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2333 if (!i)
2334 return false;
2335 d->info = i;
2336 return true;
2337}
484ed030 2338
4fb3cba5 2339static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2340{
4fb3cba5
DE
2341 struct cgfs_data *d = hdata;
2342 struct cgroup_process_info *i;
d4ef7c50 2343 int ret;
4fb3cba5
DE
2344
2345 if (!d)
2346 return false;
2347 i = d->info;
2348 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2349
d4ef7c50
SH
2350 return ret == 0;
2351}
2352
4fb3cba5 2353static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2354{
4fb3cba5
DE
2355 struct cgfs_data *d = hdata;
2356 struct cgroup_process_info *i;
2357
2358 if (!d)
2359 return false;
2360 i = d->info;
2361 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2362 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2363 return false;
484ed030 2364 }
d4ef7c50
SH
2365 return true;
2366}
484ed030 2367
/*
 * Look up the container's cgroup path for 'subsystem' by delegating to
 * lxc_cgroup_get_hierarchy_path_data(). Returns NULL if 'hdata' is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *d = hdata;

	return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2376
2ba7a429
TA
2377static const char *cgfs_canonical_path(void *hdata)
2378{
2379 struct cgfs_data *d = hdata;
2380 struct cgroup_process_info *info_ptr;
2381 char *path = NULL;
2382
2383 if (!d)
2384 return NULL;
2385
2386 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2387 if (!path)
2388 path = info_ptr->cgroup_path;
2389 else if (strcmp(path, info_ptr->cgroup_path) != 0) {
2390 ERROR("not all paths match %s, %s has path %s", path,
2391 info_ptr->hierarchy->subsystems[0], info_ptr->cgroup_path);
2392 return NULL;
2393 }
2394 }
2395
2396 return path;
2397}
2398
06078509
TA
2399static bool cgfs_escape(void)
2400{
2401 struct cgroup_meta_data *md;
2402 int i;
2403 bool ret = false;
2404
2405 md = lxc_cgroup_load_meta();
2406 if (!md)
2407 return false;
2408
2409 for (i = 1; i <= md->maximum_hierarchy; i++) {
2410 struct cgroup_hierarchy *h = md->hierarchies[i];
2411 struct cgroup_mount_point *mp;
2412 char *tasks;
2413 FILE *f;
2414 int written;
2415
2416 if (!h) {
2417 WARN("not escaping hierarchy %d", i);
2418 continue;
2419 }
2420
2421 mp = lxc_cgroup_find_mount_point(h, "/", true);
2422 if (!mp)
2423 goto out;
2424
2425 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2426 if (!tasks)
2427 goto out;
2428
2429 f = fopen(tasks, "a");
2430 free(tasks);
2431 if (!f)
2432 goto out;
2433
2434 written = fprintf(f, "%d\n", getpid());
2435 fclose(f);
2436 if (written < 0) {
2437 SYSERROR("writing tasks failed\n");
2438 goto out;
2439 }
2440 }
2441
2442 ret = true;
2443out:
2444 lxc_cgroup_put_meta(md);
2445 return ret;
2446}
2447
/*
 * Thaw the container by writing "THAWED" to freezer.state of its freezer
 * cgroup.
 *
 * Returns true if the write returned 0; false if 'hdata' is NULL, the
 * absolute freezer path cannot be resolved, or the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *d = hdata;
	char *rel_path, *abs_path;
	bool thawed;

	if (!d)
		return false;

	rel_path = lxc_cgroup_get_hierarchy_path_data("freezer", d);
	abs_path = lxc_cgroup_find_abs_path("freezer", rel_path, true, NULL);
	if (!abs_path)
		return false;

	thawed = (do_cgroup_set(abs_path, "freezer.state", "THAWED") == 0);
	free(abs_path);

	return thawed;
}
2466
4fb3cba5
DE
/*
 * cgroup_ops adapter around do_setup_cgroup_limits(): converts its 0 / -1
 * result into a bool. Returns false if 'hdata' is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
		bool with_devices)
{
	struct cgfs_data *d = hdata;
	int rc;

	if (!d)
		return false;

	rc = do_setup_cgroup_limits(d, cgroup_conf, with_devices);
	return rc == 0;
}
2476
/*
 * Move process 'pid' into the cgroups of the running container 'name'
 * found under 'lxcpath'.
 *
 * The container's cgroup info is looked up freshly (metadata is loaded and
 * released again inside this function) and freed before returning.
 *
 * Returns true on success; on any failure an error is logged and false is
 * returned.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_process_info *container_info;
	int ret;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		goto fail;

	container_info = lxc_cgroup_get_container_info(name, lxcpath, meta_data);
	lxc_cgroup_put_meta(meta_data);
	if (!container_info)
		goto fail;

	ret = lxc_cgroupfs_enter(container_info, pid, false);
	lxc_cgroup_process_info_free(container_info);
	if (ret < 0)
		goto fail;

	return true;

fail:
	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2504
8b276860
SH
/* Arguments handed to chown_cgroup_wrapper() through its void * parameter. */
struct chown_data {
	/* absolute filesystem path of the cgroup directory to chown */
	const char *cgroup_path;

	/* effective uid of the caller, translated with get_ns_uid() later */
	uid_t origuid;
};
2509
2510/*
2511 * TODO - someone should refactor this to unshare once passing all the paths
2512 * to be chowned in one go
2513 */
2514static int chown_cgroup_wrapper(void *data)
2515{
2516 struct chown_data *arg = data;
2517 uid_t destuid;
2518 char *fpath;
2519
2520
2521 if (setresgid(0,0,0) < 0)
2522 SYSERROR("Failed to setgid to 0");
2523 if (setresuid(0,0,0) < 0)
2524 SYSERROR("Failed to setuid to 0");
2525 if (setgroups(0, NULL) < 0)
2526 SYSERROR("Failed to clear groups");
2527 destuid = get_ns_uid(arg->origuid);
2528
2529 if (chown(arg->cgroup_path, destuid, 0) < 0)
2530 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2531
2532 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2533 if (!fpath)
2534 return -1;
2535 if (chown(fpath, destuid, 0) < 0)
2536 SYSERROR("Error chowning %s\n", fpath);
2537 free(fpath);
2538 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2539 if (!fpath)
2540 return -1;
2541 if (chown(fpath, destuid, 0) < 0)
2542 SYSERROR("Error chowning %s", fpath);
2543 free(fpath);
2544
2545 return 0;
2546}
2547
2548static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2549{
2550 struct chown_data data;
2551 char *fpath;
2552
2553 if (lxc_list_empty(&conf->id_map))
2554 /* If there's no mapping then we don't need to chown */
2555 return true;
2556
2557 data.cgroup_path = cgroup_path;
2558 data.origuid = geteuid();
2559
2560 /* Unpriv users can't chown it themselves, so chown from
2561 * a child namespace mapping both our own and the target uid
2562 */
2563 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2564 ERROR("Error requesting cgroup chown in new namespace");
2565 return false;
2566 }
2567
2568 /*
2569 * Now chmod 775 the directory else the container cannot create cgroups.
2570 * This can't be done in the child namespace because it only group-owns
2571 * the cgroup
2572 */
2573 if (chmod(cgroup_path, 0775) < 0) {
2574 SYSERROR("Error chmoding %s\n", cgroup_path);
2575 return false;
2576 }
2577 fpath = lxc_append_paths(cgroup_path, "tasks");
2578 if (!fpath)
2579 return false;
2580 if (chmod(fpath, 0664) < 0)
2581 SYSERROR("Error chmoding %s\n", fpath);
2582 free(fpath);
2583 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2584 if (!fpath)
2585 return false;
2586 if (chmod(fpath, 0664) < 0)
2587 SYSERROR("Error chmoding %s\n", fpath);
2588 free(fpath);
2589
2590 return true;
2591}
2592
2593static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2594{
2595 struct cgfs_data *d = hdata;
2596 struct cgroup_process_info *info_ptr;
2597 char *cgpath;
2598 bool r = true;
2599
2600 if (!d)
2601 return false;
2602
2603 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2604 if (!info_ptr->designated_mount_point) {
2605 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2606 if (!info_ptr->designated_mount_point) {
2607 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2608 return false;
2609 }
2610 }
2611
2612 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2613 if (!cgpath) {
2614 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2615 continue;
2616 }
2617 r = do_cgfs_chown(cgpath, conf);
ea439aac 2618 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
8b276860
SH
2619 ERROR("Failed chowning %s\n", cgpath);
2620 free(cgpath);
2621 return false;
2622 }
2623 free(cgpath);
2624 }
2625
2626 return true;
2627}
2628
d4ef7c50 2629static struct cgroup_ops cgfs_ops = {
d4ef7c50 2630 .init = cgfs_init,
4fb3cba5 2631 .destroy = cgfs_destroy,
d4ef7c50
SH
2632 .create = cgfs_create,
2633 .enter = cgfs_enter,
2634 .create_legacy = cgfs_create_legacy,
2635 .get_cgroup = cgfs_get_cgroup,
2ba7a429 2636 .canonical_path = cgfs_canonical_path,
06078509 2637 .escape = cgfs_escape,
d4ef7c50
SH
2638 .get = lxc_cgroupfs_get,
2639 .set = lxc_cgroupfs_set,
4fb3cba5 2640 .unfreeze = cgfs_unfreeze,
9daf6f5d 2641 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2642 .name = "cgroupfs",
5d897655 2643 .attach = lxc_cgroupfs_attach,
8b276860 2644 .chown = cgfs_chown,
c476bdce 2645 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2646 .nrtasks = cgfs_nrtasks,
23befb18 2647 .driver = CGFS,
d4ef7c50 2648};