]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/cgfs.c
change version to 2.0.0.rc4 in configure.ac
[mirror_lxc.git] / src / lxc / cgfs.c
CommitLineData
576f946d 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
576f946d 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
576f946d 22 */
d06245b8
NC
23#include "config.h"
24
576f946d 25#include <stdio.h>
576f946d 26#include <stdlib.h>
27#include <errno.h>
576f946d 28#include <unistd.h>
29#include <string.h>
341a9bd8 30#include <dirent.h>
576f946d 31#include <fcntl.h>
8b276860 32#include <grp.h>
b98f7d6e 33#include <ctype.h>
576f946d 34#include <sys/types.h>
35#include <sys/stat.h>
36#include <sys/param.h>
37#include <sys/inotify.h>
aae1f3c4 38#include <sys/mount.h>
576f946d 39#include <netinet/in.h>
40#include <net/if.h>
41
e2bcd7db 42#include "error.h"
ae5c8b8e 43#include "commands.h"
b98f7d6e
SH
44#include "list.h"
45#include "conf.h"
33ad9f1a 46#include "utils.h"
4ec31c52 47#include "bdev/bdev.h"
f2363e38
ÇO
48#include "log.h"
49#include "cgroup.h"
50#include "start.h"
484ed030 51#include "state.h"
36eb9bde 52
edaf8b1b
SG
53#if IS_BIONIC
54#include <../include/lxcmntent.h>
55#else
56#include <mntent.h>
57#endif
58
4fb3cba5
DE
59struct cgroup_hierarchy;
60struct cgroup_meta_data;
61struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: what is known about the cgroup setup of this host.
 * Instances are shared and released through a reference count.
 */
struct cgroup_meta_data {
	/* reference count: starts at 1, object is freed when it drops to 0 */
	ptrdiff_t ref;
	/* hierarchies, indexed by hierarchy number; slots may be NULL */
	struct cgroup_hierarchy **hierarchies;
	/* every cgroup mount point that was discovered */
	struct cgroup_mount_point **mount_points;
	/* highest hierarchy number that was seen */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: a single cgroup hierarchy; the same hierarchy can be
 * mounted at more than one place.
 */
struct cgroup_hierarchy {
	/* hierarchy number */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* NULL-terminated list of the subsystems bound to this hierarchy */
	char **subsystems;
	/* first writable mount point whose mount prefix is "/" (if any) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point whose mount prefix is "/" (if any) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* every mount point of this hierarchy, with its allocated capacity */
	struct cgroup_mount_point **all_mount_points;
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: one place in the filesystem where a hierarchy is
 * mounted.
 */
struct cgroup_mount_point {
	/* hierarchy this mount belongs to */
	struct cgroup_hierarchy *hierarchy;
	/* directory where the hierarchy is mounted */
	char *mount_point;
	/* cgroup path that is visible at the root of this mount */
	char *mount_prefix;
	/* true when "rw" is absent from the per-mount options */
	bool read_only;
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: membership of one process in the cgroup
 *                      hierarchies, kept as a singly linked list
 *                      (iterated through ->next).
 *
 * This is the per-process information tracked by cgfs_ops; it is not
 * used with cgmanager.
 */
struct cgroup_process_info {
	/* next list element */
	struct cgroup_process_info *next;
	struct cgroup_meta_data *meta_ref;
	/* hierarchy this entry describes */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup path of the process within that hierarchy */
	char *cgroup_path;
	char *cgroup_path_sub;
	/* paths created while setting up a container, with capacity/count */
	char **created_paths;
	size_t created_paths_capacity;
	size_t created_paths_count;
	/* mount point selected for operating on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/* State bundle handed around by the cgfs code.
 * NOTE(review): presumably one instance per container - confirm at the
 * allocation site.
 */
struct cgfs_data {
	char *name;
	const char *cgroup_pattern;
	struct cgroup_meta_data *meta;
	struct cgroup_process_info *info;
};
126
127lxc_log_define(lxc_cgfs, lxc);
576f946d 128
33ad9f1a
CS
129static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133static bool is_valid_cgroup(const char *name);
33ad9f1a 134static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
603c64c2 135static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse);
33ad9f1a
CS
136static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
137static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
138static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
139static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
4fb3cba5
DE
140static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
141static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
33ad9f1a
CS
142static int cgroup_recursive_task_count(const char *cgroup_path);
143static int count_lines(const char *fn);
1ea59ad2 144static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
d703c2b1 145static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
33ad9f1a 146
4fb3cba5
DE
147static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151/* free process membership information */
152static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
153static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info);
154
d4ef7c50 155static struct cgroup_ops cgfs_ops;
d4ef7c50 156
603c64c2
SH
/*
 * Recursively remove the cgroup directory @dirname.
 *
 * Sub-directories are removed depth-first. Entries that are not
 * directories are left alone: only rmdir() is ever called. The walk
 * continues after a failure so that as much as possible gets removed.
 *
 * Returns 0 on success and -1 if any step failed. On return errno holds
 * the first error that was recorded (0 if there was none).
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("%s: failed to open %s", __func__, dirname);
		return -1;
	}

	/* readdir() replaces the deprecated readdir_r(); the stream is
	 * private to this call, so no caller-supplied buffer is needed
	 */
	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; this one ends up in errno */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}
		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			/* the recursive call leaves its first error in errno */
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
225
/*
 * Load the host cgroup metadata, restricted by the global configuration
 * key "lxc.cgroup.use" when that key is set (comma separated list).
 *
 * Returns the metadata object or NULL on failure; errno as left by the
 * failing step is preserved across the cleanup.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *meta;
	char **whitelist = NULL;
	const char *use_value;
	int err;

	errno = 0;
	use_value = lxc_global_config_value("lxc.cgroup.use");
	if (!use_value) {
		/* no value: only an error if the lookup itself failed */
		if (errno != 0)
			return NULL;
	} else {
		whitelist = lxc_string_split_and_trim(use_value, ',');
		if (!whitelist)
			return NULL;
	}

	meta = lxc_cgroup_load_meta2((const char **)whitelist);

	/* freeing the list must not clobber the loader's errno */
	err = errno;
	lxc_free_array((void **)whitelist, free);
	errno = err;

	return meta;
}
fd37327f 249
/* Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends a copy of the first (tab separated)
 * field of every well-formed line to *kernel_subsystems. Lines starting
 * with '#', empty lines and lines whose second field is not a plain
 * number are skipped.
 *
 * Returns true on success. On failure entries stored so far remain in
 * *kernel_subsystems; nothing is freed here.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *fp;
	char *buf = NULL;
	size_t buflen = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	fp = fopen_cloexec("/proc/cgroups", "r");
	if (!fp)
		return false;

	while (getline(&buf, &buflen, fp) != -1) {
		char *name_end;
		char *number;
		char *number_end;
		char *parse_end = NULL;
		char *entry;

		if (buf[0] == '#' || buf[0] == '\0')
			continue;

		/* terminate the subsystem name */
		name_end = strchr(buf, '\t');
		if (!name_end)
			continue;
		*name_end = '\0';
		number = name_end + 1;

		/* terminate the hierarchy number */
		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the value is not needed, the field only has to be numeric */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		entry = strdup(buf);
		(*kernel_subsystems)[count] = entry;
		if (!entry)
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(fp);
	free(buf);
	return ok;
}
305
306/* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
307 * since mount points don't specify hierarchy number and
308 * /proc/cgroups does not contain named hierarchies
309 */
310static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
311 bool all_kernel_subsystems, bool all_named_subsystems,
312 const char **subsystem_whitelist)
313{
314 FILE *proc_self_cgroup;
315 char *line = NULL;
316 size_t sz = 0;
317 int r;
318 bool bret = false;
319 size_t hierarchy_capacity = 0;
ef6e34ee 320
33ad9f1a
CS
321 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
322 /* if for some reason (because of setns() and pid namespace for example),
323 * /proc/self is not valid, we try /proc/1/cgroup... */
324 if (!proc_self_cgroup)
325 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
326 if (!proc_self_cgroup)
b653309a 327 return false;
33ad9f1a
CS
328
329 while (getline(&line, &sz, proc_self_cgroup) != -1) {
330 /* file format: hierarchy:subsystems:group,
331 * we only extract hierarchy and subsystems
332 * here */
333 char *colon1;
334 char *colon2;
335 int hierarchy_number;
336 struct cgroup_hierarchy *h = NULL;
337 char **p;
338
339 if (!line[0])
340 continue;
ad08bbb7 341
33ad9f1a
CS
342 colon1 = strchr(line, ':');
343 if (!colon1)
8900b9eb 344 continue;
33ad9f1a
CS
345 *colon1++ = '\0';
346 colon2 = strchr(colon1, ':');
347 if (!colon2)
348 continue;
349 *colon2 = '\0';
ad08bbb7 350
33ad9f1a
CS
351 colon2 = NULL;
352 hierarchy_number = strtoul(line, &colon2, 10);
353 if (!colon2 || *colon2)
354 continue;
576f946d 355
33ad9f1a
CS
356 if (hierarchy_number > meta_data->maximum_hierarchy) {
357 /* lxc_grow_array will never shrink, so even if we find a lower
358 * hierarchy number here, the array will never be smaller
359 */
360 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
361 if (r < 0)
b653309a 362 goto out;
5193cc3d 363
33ad9f1a
CS
364 meta_data->maximum_hierarchy = hierarchy_number;
365 }
fd37327f 366
33ad9f1a
CS
367 /* this shouldn't happen, we had this already */
368 if (meta_data->hierarchies[hierarchy_number])
b653309a 369 goto out;
33ad9f1a
CS
370
371 h = calloc(1, sizeof(struct cgroup_hierarchy));
372 if (!h)
b653309a 373 goto out;
33ad9f1a
CS
374
375 meta_data->hierarchies[hierarchy_number] = h;
376
377 h->index = hierarchy_number;
378 h->subsystems = lxc_string_split_and_trim(colon1, ',');
379 if (!h->subsystems)
b653309a 380 goto out;
33ad9f1a
CS
381 /* see if this hierarchy should be considered */
382 if (!all_kernel_subsystems || !all_named_subsystems) {
383 for (p = h->subsystems; *p; p++) {
384 if (!strncmp(*p, "name=", 5)) {
385 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
386 h->used = true;
387 break;
388 }
389 } else {
390 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
391 h->used = true;
392 break;
393 }
394 }
395 }
396 } else {
397 /* we want all hierarchy anyway */
398 h->used = true;
ae5c8b8e 399 }
ae5c8b8e 400 }
b653309a 401 bret = true;
0b9c21ab 402
b653309a 403out:
33ad9f1a 404 fclose(proc_self_cgroup);
0ccf7c2a 405 free(line);
b653309a
SH
406 return bret;
407}
408
409/* Step 3: determine all mount points of each hierarchy */
410static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
411{
412 bool bret = false;
413 FILE *proc_self_mountinfo;
414 char *line = NULL;
415 size_t sz = 0;
416 char **tokens = NULL;
417 size_t mount_point_count = 0;
418 size_t mount_point_capacity = 0;
419 size_t token_capacity = 0;
420 int r;
fcca16bc 421 bool is_cgns = cgns_supported();
b653309a 422
33ad9f1a
CS
423 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
424 /* if for some reason (because of setns() and pid namespace for example),
425 * /proc/self is not valid, we try /proc/1/cgroup... */
426 if (!proc_self_mountinfo)
427 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
428 if (!proc_self_mountinfo)
b653309a 429 return false;
33ad9f1a
CS
430
431 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
178938fe 432 char *token, *line_tok, *saveptr = NULL;
33ad9f1a
CS
433 size_t i, j, k;
434 struct cgroup_mount_point *mount_point;
435 struct cgroup_hierarchy *h;
436 char **subsystems;
836514a8 437 bool is_lxcfs = false;
33ad9f1a
CS
438
439 if (line[0] && line[strlen(line) - 1] == '\n')
440 line[strlen(line) - 1] = '\0';
441
178938fe 442 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
33ad9f1a
CS
443 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
444 if (r < 0)
b653309a 445 goto out;
33ad9f1a
CS
446 tokens[i++] = token;
447 }
b98f7d6e 448
33ad9f1a
CS
449 /* layout of /proc/self/mountinfo:
450 * 0: id
451 * 1: parent id
452 * 2: device major:minor
453 * 3: mount prefix
8900b9eb 454 * 4: mount point
33ad9f1a
CS
455 * 5: per-mount options
456 * [optional X]: additional data
457 * X+7: "-"
458 * X+8: type
459 * X+9: source
460 * X+10: per-superblock options
461 */
462 for (j = 6; j < i && tokens[j]; j++)
463 if (!strcmp(tokens[j], "-"))
464 break;
fd4f5a56 465
33ad9f1a
CS
466 /* could not find separator */
467 if (j >= i || !tokens[j])
468 continue;
469 /* there should be exactly three fields after
470 * the separator
471 */
472 if (i != j + 4)
473 continue;
fd4f5a56 474
33ad9f1a 475 /* not a cgroup filesystem */
836514a8
U
476 if (strcmp(tokens[j + 1], "cgroup") != 0) {
477 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
478 continue;
479 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
480 continue;
481 is_lxcfs = true;
482 char *curtok = tokens[4] + 15;
483 subsystems = subsystems_from_mount_options(curtok,
484 kernel_subsystems);
485 } else
486 subsystems = subsystems_from_mount_options(tokens[j + 3],
487 kernel_subsystems);
33ad9f1a 488 if (!subsystems)
b653309a 489 goto out;
33ad9f1a
CS
490
491 h = NULL;
492 for (k = 1; k <= meta_data->maximum_hierarchy; k++) {
493 if (meta_data->hierarchies[k] &&
494 meta_data->hierarchies[k]->subsystems[0] &&
495 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
496 /* TODO: we could also check if the lists really match completely,
497 * just to have an additional sanity check */
498 h = meta_data->hierarchies[k];
b98f7d6e 499 break;
33ad9f1a 500 }
b98f7d6e 501 }
33ad9f1a
CS
502 lxc_free_array((void **)subsystems, free);
503
504 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
505 if (r < 0)
b653309a 506 goto out;
33ad9f1a
CS
507
508 /* create mount point object */
509 mount_point = calloc(1, sizeof(*mount_point));
510 if (!mount_point)
b653309a 511 goto out;
33ad9f1a
CS
512
513 meta_data->mount_points[mount_point_count++] = mount_point;
514
515 mount_point->hierarchy = h;
fcca16bc 516 if (is_lxcfs || is_cgns)
836514a8
U
517 mount_point->mount_prefix = strdup("/");
518 else
519 mount_point->mount_prefix = strdup(tokens[3]);
33ad9f1a 520 mount_point->mount_point = strdup(tokens[4]);
33ad9f1a 521 if (!mount_point->mount_point || !mount_point->mount_prefix)
b653309a 522 goto out;
33ad9f1a
CS
523 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
524
525 if (!strcmp(mount_point->mount_prefix, "/")) {
526 if (mount_point->read_only) {
527 if (!h->ro_absolute_mount_point)
528 h->ro_absolute_mount_point = mount_point;
529 } else {
530 if (!h->rw_absolute_mount_point)
531 h->rw_absolute_mount_point = mount_point;
532 }
b98f7d6e 533 }
ae5c8b8e 534
33ad9f1a
CS
535 k = lxc_array_len((void **)h->all_mount_points);
536 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
537 if (r < 0)
b653309a 538 goto out;
33ad9f1a 539 h->all_mount_points[k] = mount_point;
fd4f5a56 540 }
b653309a
SH
541 bret = true;
542
543out:
b653309a 544 fclose(proc_self_mountinfo);
b653309a 545 free(tokens);
2cdafc54 546 free(line);
b653309a
SH
547 return bret;
548}
549
4fb3cba5 550static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
b653309a
SH
551{
552 bool all_kernel_subsystems = true;
553 bool all_named_subsystems = false;
554 struct cgroup_meta_data *meta_data = NULL;
555 char **kernel_subsystems = NULL;
556 int saved_errno = 0;
557
558 /* if the subsystem whitelist is not specified, include all
559 * hierarchies that contain kernel subsystems by default but
560 * no hierarchies that only contain named subsystems
561 *
562 * if it is specified, the specifier @all will select all
563 * hierarchies, @kernel will select all hierarchies with
564 * kernel subsystems and @named will select all named
565 * hierarchies
566 */
567 all_kernel_subsystems = subsystem_whitelist ?
568 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
569 true;
570 all_named_subsystems = subsystem_whitelist ?
571 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
79c59e6b 572 true;
b653309a
SH
573
574 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
575 if (!meta_data)
576 return NULL;
577 meta_data->ref = 1;
578
579 if (!find_cgroup_subsystems(&kernel_subsystems))
580 goto out_error;
581
582 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
583 all_named_subsystems, subsystem_whitelist))
584 goto out_error;
585
586 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
587 goto out_error;
fd4f5a56 588
33ad9f1a
CS
589 /* oops, we couldn't find anything */
590 if (!meta_data->hierarchies || !meta_data->mount_points) {
591 errno = EINVAL;
592 goto out_error;
ae5c8b8e 593 }
fd4f5a56 594
3a0abb3a 595 lxc_free_array((void **)kernel_subsystems, free);
33ad9f1a
CS
596 return meta_data;
597
598out_error:
599 saved_errno = errno;
33ad9f1a
CS
600 lxc_free_array((void **)kernel_subsystems, free);
601 lxc_cgroup_put_meta(meta_data);
602 errno = saved_errno;
603 return NULL;
fd4f5a56
DL
604}
605
4fb3cba5 606static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
e14f67a7 607{
33ad9f1a
CS
608 meta_data->ref++;
609 return meta_data;
610}
e14f67a7 611
4fb3cba5 612static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
33ad9f1a
CS
613{
614 size_t i;
615 if (!meta_data)
616 return NULL;
617 if (--meta_data->ref > 0)
618 return meta_data;
619 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
620 if (meta_data->hierarchies) {
621 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
622 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
e14f67a7 623 }
33ad9f1a 624 free(meta_data->hierarchies);
178938fe 625 free(meta_data);
33ad9f1a 626 return NULL;
e14f67a7
U
627}
628
4fb3cba5 629static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
e14f67a7 630{
33ad9f1a
CS
631 size_t i;
632 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
633 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
634 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
635 return h;
e14f67a7 636 }
e14f67a7
U
637 return NULL;
638}
639
d3f99e96
SH
640static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
641{
642 return mp && access(mp->mount_point, F_OK) == 0;
643}
644
4fb3cba5 645static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
b98f7d6e 646{
33ad9f1a
CS
647 struct cgroup_mount_point **mps;
648 struct cgroup_mount_point *current_result = NULL;
649 ssize_t quality = -1;
b98f7d6e 650
33ad9f1a 651 /* trivial case */
d3f99e96 652 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
33ad9f1a 653 return hierarchy->rw_absolute_mount_point;
d3f99e96 654 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
33ad9f1a 655 return hierarchy->ro_absolute_mount_point;
b98f7d6e 656
33ad9f1a
CS
657 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
658 struct cgroup_mount_point *mp = *mps;
659 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
b98f7d6e 660
33ad9f1a
CS
661 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
662 prefix_len = 0;
b98f7d6e 663
d3f99e96
SH
664 if (!mountpoint_is_accessible(mp))
665 continue;
666
33ad9f1a
CS
667 if (should_be_writable && mp->read_only)
668 continue;
669
670 if (!prefix_len ||
671 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
672 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
673 /* search for the best quality match, i.e. the match with the
674 * shortest prefix where this group is still contained
675 */
676 if (quality == -1 || prefix_len < quality) {
677 current_result = mp;
678 quality = prefix_len;
679 }
b98f7d6e
SH
680 }
681 }
682
33ad9f1a
CS
683 if (!current_result)
684 errno = ENOENT;
685 return current_result;
b98f7d6e
SH
686}
687
/*
 * Resolve cgroup 'group' of 'subsystem' to a path in the filesystem,
 * built by cgroup_to_absolute_path() from the chosen mount point, the
 * group and 'suffix'.
 *
 * The host metadata is loaded for the duration of the call only.
 * Returns the path (released by the caller) or NULL; on failure the
 * errno of the failing step is preserved.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *md;
	struct cgroup_hierarchy *hier;
	struct cgroup_mount_point *mount;
	char *abs_path = NULL;
	int err;

	md = lxc_cgroup_load_meta();
	if (!md)
		return NULL;

	hier = lxc_cgroup_find_hierarchy(md, subsystem);
	if (hier) {
		mount = lxc_cgroup_find_mount_point(hier, group, should_be_writable);
		if (mount)
			abs_path = cgroup_to_absolute_path(mount, group, suffix);
	}

	if (abs_path) {
		lxc_cgroup_put_meta(md);
		return abs_path;
	}

	/* releasing the metadata must not hide the reason for the failure */
	err = errno;
	lxc_cgroup_put_meta(md);
	errno = err;
	return NULL;
}
721
/* Read the cgroup membership of process 'pid' from /proc/<pid>/cgroup. */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
728
/* Cgroup membership of pid 1. */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
b98f7d6e 733
/*
 * Cgroup membership of the calling process: /proc/self/cgroup is tried
 * first, /proc/<getpid()>/cgroup is the fallback.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *self_info;

	self_info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (self_info)
		return self_info;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
ae5c8b8e 742
692ba18f
SH
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid. 'mountpath' is the mount
 * point of the hierarchy and 'oldname' the cgroup the task was in. Say the
 * pid cgroup is /sys/fs/cgroup/lxc/2975 and the container name is c1. We
 * want to rename the cgroup directory to /sys/fs/cgroup/lxc/c1.
 *
 * An already existing target directory is removed first (this only
 * succeeds if rmdir() accepts it).
 *
 * Returns the new cgroup name relative to the mount point
 * ("<oldname>/<name>", allocated with malloc(), to be freed by the caller)
 * or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup /ab/, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = /ab/c1
	 */

	/* 22: two '/', up to 19 characters for the pid, and the NUL byte */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* the pid is passed as unsigned long, so it has to be printed with %lu */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
810
ea439aac
SH
811static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
812{
813 char **p;
814
815 for (p = h->subsystems; *p; p++) {
816 if (is_crucial_cgroup_subsystem(*p))
817 return true;
818 }
819 return false;
820}
821
33ad9f1a 822/* create a new cgroup */
4fb3cba5 823static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
33ad9f1a 824{
001b026e 825 char **cgroup_path_components = NULL;
33ad9f1a
CS
826 char **p = NULL;
827 char *path_so_far = NULL;
828 char **new_cgroup_paths = NULL;
829 char **new_cgroup_paths_sub = NULL;
830 struct cgroup_mount_point *mp;
831 struct cgroup_hierarchy *h;
832 struct cgroup_process_info *base_info = NULL;
833 struct cgroup_process_info *info_ptr;
834 int saved_errno;
835 int r;
836 unsigned suffix = 0;
837 bool had_sub_pattern = false;
838 size_t i;
ae5c8b8e 839
33ad9f1a
CS
840 if (!is_valid_cgroup(name)) {
841 ERROR("Invalid cgroup name: '%s'", name);
842 errno = EINVAL;
843 return NULL;
ae5c8b8e
SH
844 }
845
33ad9f1a
CS
846 if (!strstr(path_pattern, "%n")) {
847 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
848 errno = EINVAL;
849 return NULL;
850 }
fd37327f 851
33ad9f1a
CS
852 /* we will modify the result of this operation directly,
853 * so we don't have to copy the data structure
854 */
855 base_info = (path_pattern[0] == '/') ?
856 lxc_cgroup_process_info_get_init(meta_data) :
857 lxc_cgroup_process_info_get_self(meta_data);
858 if (!base_info)
859 return NULL;
c8f7c563 860
33ad9f1a
CS
861 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
862 if (!new_cgroup_paths)
863 goto out_initial_error;
864
865 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
866 if (!new_cgroup_paths_sub)
867 goto out_initial_error;
868
869 /* find mount points we can use */
870 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
871 h = info_ptr->hierarchy;
872 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
873 if (!mp) {
874 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
875 goto out_initial_error;
876 }
877 info_ptr->designated_mount_point = mp;
460a1cf0 878
692ba18f
SH
879 if (lxc_string_in_array("ns", (const char **)h->subsystems))
880 continue;
2edb53c7
SH
881 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
882 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
33ad9f1a 883 goto out_initial_error;
2edb53c7 884 }
33ad9f1a 885 }
b98f7d6e 886
33ad9f1a
CS
887 /* normalize the path */
888 cgroup_path_components = lxc_normalize_path(path_pattern);
889 if (!cgroup_path_components)
890 goto out_initial_error;
891
892 /* go through the path components to see if we can create them */
893 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
894 /* we only want to create the same component with -1, -2, etc.
895 * if the component contains the container name itself, otherwise
896 * it's not an error if it already exists
897 */
898 char *p_eff = *p ? *p : (char *)sub_pattern;
899 bool contains_name = strstr(p_eff, "%n");
900 char *current_component = NULL;
901 char *current_subpath = NULL;
902 char *current_entire_path = NULL;
903 char *parts[3];
904 size_t j = 0;
905 i = 0;
906
907 /* if we are processing the subpattern, we want to make sure
908 * loop is ended the next time around
909 */
910 if (!*p) {
911 had_sub_pattern = true;
912 p--;
913 }
b98f7d6e 914
33ad9f1a 915 goto find_name_on_this_level;
4fb3cba5 916
33ad9f1a
CS
917 cleanup_name_on_this_level:
918 /* This is reached if we found a name clash.
919 * In that case, remove the cgroup from all previous hierarchies
920 */
921 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
603c64c2 922 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false);
33ad9f1a
CS
923 if (r < 0)
924 WARN("could not clean up cgroup we created when trying to create container");
925 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
926 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
927 }
928 if (current_component != current_subpath)
929 free(current_subpath);
930 if (current_component != p_eff)
931 free(current_component);
932 current_component = current_subpath = NULL;
933 /* try again with another suffix */
934 ++suffix;
4fb3cba5 935
33ad9f1a
CS
936 find_name_on_this_level:
937 /* determine name of the path component we should create */
938 if (contains_name && suffix > 0) {
939 char *buf = calloc(strlen(name) + 32, 1);
940 if (!buf)
941 goto out_initial_error;
942 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
943 current_component = lxc_string_replace("%n", buf, p_eff);
944 free(buf);
945 } else {
946 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
947 }
948 parts[0] = path_so_far;
949 parts[1] = current_component;
950 parts[2] = NULL;
951 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
952
953 /* Now go through each hierarchy and try to create the
954 * corresponding cgroup
955 */
956 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
957 char *parts2[3];
692ba18f
SH
958
959 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
960 continue;
33ad9f1a
CS
961 current_entire_path = NULL;
962
963 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
964 parts2[1] = current_subpath;
965 parts2[2] = NULL;
966 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
967
968 if (!*p) {
969 /* we are processing the subpath, so only update that one */
970 free(new_cgroup_paths_sub[i]);
971 new_cgroup_paths_sub[i] = strdup(current_entire_path);
972 if (!new_cgroup_paths_sub[i])
973 goto cleanup_from_error;
974 } else {
975 /* remember which path was used on this controller */
976 free(new_cgroup_paths[i]);
977 new_cgroup_paths[i] = strdup(current_entire_path);
978 if (!new_cgroup_paths[i])
979 goto cleanup_from_error;
980 }
fd4f5a56 981
33ad9f1a
CS
982 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
983 if (r < 0 && errno == EEXIST && contains_name) {
984 /* name clash => try new name with new suffix */
985 free(current_entire_path);
986 current_entire_path = NULL;
987 goto cleanup_name_on_this_level;
988 } else if (r < 0 && errno != EEXIST) {
ea439aac
SH
989 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
990 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
991 goto cleanup_from_error;
992 }
993 goto skip;
33ad9f1a
CS
994 } else if (r == 0) {
995 /* successfully created */
996 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
997 if (r < 0)
998 goto cleanup_from_error;
d703c2b1 999 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
b38b62a6 1000 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
d703c2b1
RV
1001 goto cleanup_from_error;
1002 }
33ad9f1a
CS
1003 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1004 } else {
1005 /* if we didn't create the cgroup, then we have to make sure that
1006 * further cgroups will be created properly
1007 */
d703c2b1 1008 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
f6ac3b9e 1009 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
33ad9f1a 1010 goto cleanup_from_error;
f6ac3b9e 1011 }
d703c2b1
RV
1012 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1013 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1014 goto cleanup_from_error;
1015 }
33ad9f1a 1016
ea439aac 1017skip:
33ad9f1a
CS
1018 /* already existed but path component of pattern didn't contain '%n',
1019 * so this is not an error; but then we don't need current_entire_path
1020 * anymore...
1021 */
1022 free(current_entire_path);
1023 current_entire_path = NULL;
1024 }
1025 }
fd4f5a56 1026
33ad9f1a
CS
1027 /* save path so far */
1028 free(path_so_far);
1029 path_so_far = strdup(current_subpath);
1030 if (!path_so_far)
1031 goto cleanup_from_error;
1032
1033 /* cleanup */
1034 if (current_component != current_subpath)
1035 free(current_subpath);
1036 if (current_component != p_eff)
1037 free(current_component);
1038 current_component = current_subpath = NULL;
1039 continue;
4fb3cba5 1040
33ad9f1a 1041 cleanup_from_error:
ec64264d 1042 /* called if an error occurred in the loop, so we
33ad9f1a
CS
1043 * do some additional cleanup here
1044 */
1045 saved_errno = errno;
1046 if (current_component != current_subpath)
1047 free(current_subpath);
1048 if (current_component != p_eff)
1049 free(current_component);
1050 free(current_entire_path);
1051 errno = saved_errno;
1052 goto out_initial_error;
fd4f5a56
DL
1053 }
1054
33ad9f1a
CS
1055 /* we're done, now update the paths */
1056 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
47d8fb3b
CS
1057 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1058 * will take care of it
1059 * Since we do a continue in above loop, new_cgroup_paths[i] is
1060 * unset anyway, as is new_cgroup_paths_sub[i]
692ba18f 1061 */
47d8fb3b
CS
1062 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1063 continue;
1064 free(info_ptr->cgroup_path);
1065 info_ptr->cgroup_path = new_cgroup_paths[i];
1066 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
fd4f5a56 1067 }
33ad9f1a
CS
1068 /* don't use lxc_free_array since we used the array members
1069 * to store them in our result...
1070 */
1071 free(new_cgroup_paths);
1072 free(new_cgroup_paths_sub);
1073 free(path_so_far);
1074 lxc_free_array((void **)cgroup_path_components, free);
1075 return base_info;
1076
1077out_initial_error:
1078 saved_errno = errno;
1079 free(path_so_far);
1080 lxc_cgroup_process_info_free_and_remove(base_info);
1081 lxc_free_array((void **)new_cgroup_paths, free);
1082 lxc_free_array((void **)new_cgroup_paths_sub, free);
1083 lxc_free_array((void **)cgroup_path_components, free);
1084 errno = saved_errno;
1085 return NULL;
c8f7c563
CS
1086}
1087
4fb3cba5 1088static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
47d8fb3b
CS
1089{
1090 struct cgroup_process_info *info_ptr;
1091 int r;
1092
1093 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1094 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1095 continue;
1096 /*
1097 * For any path which has ns cgroup mounted, handler->pid is already
1098 * moved into a container called '%d % (handler->pid)'. Rename it to
1099 * the cgroup name and record that.
1100 */
1101 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1102 info_ptr->cgroup_path, pid, name);
1103 if (!tmp)
1104 return -1;
1105 free(info_ptr->cgroup_path);
1106 info_ptr->cgroup_path = tmp;
1107 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1108 if (r < 0)
1109 return -1;
1110 tmp = strdup(tmp);
1111 if (!tmp)
1112 return -1;
1113 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1114 }
1115 return 0;
1116}
1117
33ad9f1a 1118/* get the cgroup membership of a given container */
4fb3cba5 1119static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
c8f7c563 1120{
33ad9f1a
CS
1121 struct cgroup_process_info *result = NULL;
1122 int saved_errno = 0;
1123 size_t i;
1124 struct cgroup_process_info **cptr = &result;
1125 struct cgroup_process_info *entry = NULL;
1126 char *path = NULL;
1127
1128 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1129 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1130 if (!h || !h->used)
1131 continue;
c8f7c563 1132
33ad9f1a
CS
1133 /* use the command interface to look for the cgroup */
1134 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
c661b0a8
DE
1135 if (!path) {
1136 h->used = false;
1137 WARN("Not attaching to cgroup %s unknown to %s %s", h->subsystems[0], lxcpath, name);
1138 continue;
1139 }
33ad9f1a
CS
1140
1141 entry = calloc(1, sizeof(struct cgroup_process_info));
1142 if (!entry)
1143 goto out_error;
1144 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1145 entry->hierarchy = h;
1146 entry->cgroup_path = path;
1147 path = NULL;
1148
1149 /* it is not an error if we don't find anything here,
1150 * it is up to the caller to decide what to do in that
1151 * case */
1152 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1153
1154 *cptr = entry;
1155 cptr = &entry->next;
1156 entry = NULL;
c8f7c563
CS
1157 }
1158
33ad9f1a
CS
1159 return result;
1160out_error:
1161 saved_errno = errno;
1162 free(path);
1163 lxc_cgroup_process_info_free(result);
1164 lxc_cgroup_process_info_free(entry);
1165 errno = saved_errno;
1166 return NULL;
fd4f5a56
DL
1167}
1168
33ad9f1a 1169/* move a processs to the cgroups specified by the membership */
4fb3cba5 1170static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
4f17323e 1171{
33ad9f1a
CS
1172 char pid_buf[32];
1173 char *cgroup_tasks_fn;
1174 int r;
1175 struct cgroup_process_info *info_ptr;
1176
1177 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1178 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1179 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1180 info_ptr->cgroup_path_sub :
1181 info_ptr->cgroup_path;
1182
1183 if (!info_ptr->designated_mount_point) {
1184 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1185 if (!info_ptr->designated_mount_point) {
1186 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1187 return -1;
1188 }
1189 }
4f17323e 1190
33ad9f1a
CS
1191 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1192 if (!cgroup_tasks_fn) {
1193 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1194 return -1;
1195 }
4f17323e 1196
33ad9f1a 1197 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
5903da82 1198 free(cgroup_tasks_fn);
ea439aac 1199 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
33ad9f1a
CS
1200 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1201 return -1;
1202 }
4f17323e
CS
1203 }
1204
33ad9f1a 1205 return 0;
4f17323e
CS
1206}
1207
33ad9f1a
CS
1208/* free process membership information */
1209void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
fc7de561 1210{
33ad9f1a
CS
1211 struct cgroup_process_info *next;
1212 if (!info)
b98f7d6e 1213 return;
33ad9f1a
CS
1214 next = info->next;
1215 lxc_cgroup_put_meta(info->meta_ref);
1216 free(info->cgroup_path);
1217 free(info->cgroup_path_sub);
1218 lxc_free_array((void **)info->created_paths, free);
1219 free(info);
1220 lxc_cgroup_process_info_free(next);
fc7de561
SH
1221}
1222
33ad9f1a
CS
1223/* free process membership information and remove cgroups that were created */
1224void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info)
b98f7d6e 1225{
33ad9f1a
CS
1226 struct cgroup_process_info *next;
1227 char **pp;
1228 if (!info)
1229 return;
1230 next = info->next;
603c64c2 1231 {
33ad9f1a
CS
1232 struct cgroup_mount_point *mp = info->designated_mount_point;
1233 if (!mp)
1234 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1235 if (mp)
1236 /* ignore return value here, perhaps we created the
1237 * '/lxc' cgroup in this container but another container
1238 * is still running (for example)
1239 */
603c64c2
SH
1240 (void)remove_cgroup(mp, info->cgroup_path, true);
1241 }
1242 for (pp = info->created_paths; pp && *pp; pp++);
1243 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
33ad9f1a 1244 free(*pp);
b98f7d6e 1245 }
33ad9f1a
CS
1246 free(info->created_paths);
1247 lxc_cgroup_put_meta(info->meta_ref);
1248 free(info->cgroup_path);
1249 free(info->cgroup_path_sub);
1250 free(info);
9431aa65 1251 lxc_cgroup_process_info_free_and_remove(next);
33ad9f1a 1252}
b98f7d6e 1253
4fb3cba5 1254static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
33ad9f1a 1255{
d4ef7c50
SH
1256 struct cgroup_process_info *info = d->info;
1257 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1258 if (!info)
1259 return NULL;
f348e47c 1260 prune_init_scope(info->cgroup_path);
33ad9f1a 1261 return info->cgroup_path;
b98f7d6e
SH
1262}
1263
4fb3cba5 1264static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
b98f7d6e 1265{
d4ef7c50 1266 struct cgroup_process_info *info = d->info;
33ad9f1a 1267 struct cgroup_mount_point *mp = NULL;
d4ef7c50
SH
1268
1269 info = find_info_for_subsystem(info, subsystem);
33ad9f1a
CS
1270 if (!info)
1271 return NULL;
1272 if (info->designated_mount_point) {
8900b9eb 1273 mp = info->designated_mount_point;
33ad9f1a
CS
1274 } else {
1275 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1276 if (!mp)
1277 return NULL;
b98f7d6e 1278 }
33ad9f1a 1279 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
b98f7d6e 1280}
55c76589 1281
4fb3cba5 1282static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
9a93d992 1283{
33ad9f1a
CS
1284 struct cgroup_meta_data *meta;
1285 struct cgroup_process_info *base_info, *info;
1286 struct cgroup_mount_point *mp;
1287 char *result = NULL;
33ad9f1a
CS
1288
1289 meta = lxc_cgroup_load_meta();
1290 if (!meta)
9a93d992 1291 return NULL;
33ad9f1a
CS
1292 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1293 if (!base_info)
178938fe 1294 goto out1;
33ad9f1a
CS
1295 info = find_info_for_subsystem(base_info, subsystem);
1296 if (!info)
178938fe 1297 goto out2;
33ad9f1a 1298 if (info->designated_mount_point) {
8900b9eb 1299 mp = info->designated_mount_point;
33ad9f1a
CS
1300 } else {
1301 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1302 if (!mp)
178938fe 1303 goto out3;
33ad9f1a
CS
1304 }
1305 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
178938fe 1306out3:
178938fe 1307out2:
33ad9f1a 1308 lxc_cgroup_process_info_free(base_info);
178938fe 1309out1:
33ad9f1a 1310 lxc_cgroup_put_meta(meta);
33ad9f1a
CS
1311 return result;
1312}
9a93d992 1313
/*
 * Write value to the cgroup file filename, resolving the cgroup directory
 * from the membership information stored in d.
 *
 * The subsystem is the part of filename before the first '.', or all of it
 * when there is no '.'. errno is set to ENOENT before the lookup.
 *
 * Returns the result of do_cgroup_set() with its errno preserved, or -1 when
 * no path can be resolved.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	char *controller, *dot, *dir;
	int rc, keep_errno;

	/* stack copy of filename, cut at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	errno = ENOENT;
	dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	keep_errno = errno;
	free(dir);
	errno = keep_errno;

	return rc;
}
9a93d992 1334
/*
 * Write value to the cgroup file filename of the container identified by
 * name/lxcpath.
 *
 * The subsystem is the part of filename before the first '.', or all of it
 * when there is no '.'. Returns the result of do_cgroup_set(), or -1 when the
 * cgroup directory cannot be resolved.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	char *controller, *dot, *dir;
	int rc;

	/* stack copy of filename, cut at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	free(dir);

	return rc;
}
1352
/*
 * Read the cgroup file filename of the container identified by name/lxcpath
 * into value (at most len bytes, as handled by do_cgroup_get()).
 *
 * The subsystem is the part of filename before the first '.', or all of it
 * when there is no '.'. Returns the result of do_cgroup_get(), or -1 when the
 * cgroup directory cannot be resolved.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	char *controller, *dot, *dir;
	int rc;

	/* stack copy of filename, cut at the first '.' */
	controller = alloca(strlen(filename) + 1);
	strcpy(controller, filename);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_get(dir, filename, value, len);
	free(dir);

	return rc;
}
1370
4fb3cba5 1371static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
aae1f3c4
CS
1372{
1373 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1374 char *path = NULL;
1375 char **parts = NULL;
1376 char *dirname = NULL;
1377 char *abs_path = NULL;
1378 char *abs_path2 = NULL;
d4ef7c50
SH
1379 struct cgfs_data *cgfs_d;
1380 struct cgroup_process_info *info, *base_info;
aae1f3c4
CS
1381 int r, saved_errno = 0;
1382
4608594e
SH
1383 if (cgns_supported())
1384 return true;
1385
4fb3cba5
DE
1386 cgfs_d = hdata;
1387 if (!cgfs_d)
1388 return false;
d4ef7c50
SH
1389 base_info = cgfs_d->info;
1390
0769b82a
CS
1391 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1392 * have access to the lxc_conf object at this point. It really should be up
1393 * to the caller to fix this, but this doesn't really hurt.
1394 */
1395 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1396 type = LXC_AUTO_CGROUP_FULL_MIXED;
1397 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1398 type = LXC_AUTO_CGROUP_MIXED;
1399
7997d7da
CS
1400 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1401 ERROR("could not mount cgroups into container: invalid type specified internally");
1402 errno = EINVAL;
c476bdce 1403 return false;
7997d7da
CS
1404 }
1405
aae1f3c4
CS
1406 path = calloc(1, bufsz);
1407 if (!path)
c476bdce 1408 return false;
aae1f3c4 1409 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
592fd47a
SH
1410 r = safe_mount("cgroup_root", path, "tmpfs",
1411 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1412 "size=10240k,mode=755",
1413 root);
aae1f3c4
CS
1414 if (r < 0) {
1415 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
c476bdce 1416 return false;
aae1f3c4
CS
1417 }
1418
1419 /* now mount all the hierarchies we care about */
1420 for (info = base_info; info; info = info->next) {
1421 size_t subsystem_count, i;
1422 struct cgroup_mount_point *mp = info->designated_mount_point;
d3f99e96 1423 if (!mountpoint_is_accessible(mp))
aae1f3c4 1424 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
d3f99e96 1425
aae1f3c4
CS
1426 if (!mp) {
1427 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1428 goto out_error;
1429 }
1430
1431 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1432 parts = calloc(subsystem_count + 1, sizeof(char *));
1433 if (!parts)
1434 goto out_error;
1435
1436 for (i = 0; i < subsystem_count; i++) {
1437 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1438 parts[i] = info->hierarchy->subsystems[i] + 5;
1439 else
1440 parts[i] = info->hierarchy->subsystems[i];
1441 }
1442 dirname = lxc_string_join(",", (const char **)parts, false);
1443 if (!dirname)
1444 goto out_error;
1445
1446 /* create subsystem directory */
1447 abs_path = lxc_append_paths(path, dirname);
1448 if (!abs_path)
1449 goto out_error;
1450 r = mkdir_p(abs_path, 0755);
1451 if (r < 0 && errno != EEXIST) {
1452 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1453 goto out_error;
1454 }
1455
aae1f3c4
CS
1456 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1457 if (!abs_path2)
1458 goto out_error;
aae1f3c4 1459
7997d7da
CS
1460 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1461 /* bind-mount the cgroup entire filesystem there */
1462 if (strcmp(mp->mount_prefix, "/") != 0) {
1463 /* FIXME: maybe we should just try to remount the entire hierarchy
1464 * with a regular mount command? may that works? */
1465 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1466 goto out_error;
1467 }
1468 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1469 if (r < 0) {
1470 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1471 goto out_error;
1472 }
f8f3c3c0
SG
1473 /* main cgroup path should be read-only */
1474 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1475 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1476 if (r < 0) {
1477 SYSERROR("error re-mounting %s readonly", abs_path);
1478 goto out_error;
1479 }
1480 }
7997d7da
CS
1481 /* own cgroup should be read-write */
1482 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1483 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1484 if (r < 0) {
1485 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1486 goto out_error;
1487 }
1488 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1489 if (r < 0) {
1490 SYSERROR("error re-mounting %s readwrite", abs_path2);
1491 goto out_error;
1492 }
1493 }
1494 } else {
1495 /* create path for container's cgroup */
1496 r = mkdir_p(abs_path2, 0755);
1497 if (r < 0 && errno != EEXIST) {
1498 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1499 goto out_error;
1500 }
aae1f3c4 1501
b46f0553
CS
1502 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1503 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1504 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1505 * read-write (see comment below)
1506 */
1507 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1508 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1509 if (r < 0) {
1510 SYSERROR("error bind-mounting %s onto itself", abs_path);
1511 goto out_error;
1512 }
1513 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1514 if (r < 0) {
1515 SYSERROR("error re-mounting %s readonly", abs_path);
1516 goto out_error;
1517 }
1518 }
1519
7997d7da
CS
1520 free(abs_path);
1521 abs_path = NULL;
1522
1523 /* bind-mount container's cgroup to that directory */
1524 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1525 if (!abs_path)
1526 goto out_error;
1527 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
ea439aac 1528 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
7997d7da
CS
1529 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1530 goto out_error;
1531 }
1532 if (type == LXC_AUTO_CGROUP_RO) {
1533 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1534 if (r < 0) {
1535 SYSERROR("error re-mounting %s readonly", abs_path2);
1536 goto out_error;
1537 }
1538 }
aae1f3c4
CS
1539 }
1540
1541 free(abs_path);
1542 free(abs_path2);
1543 abs_path = NULL;
1544 abs_path2 = NULL;
1545
1546 /* add symlinks for every single subsystem */
1547 if (subsystem_count > 1) {
1548 for (i = 0; i < subsystem_count; i++) {
1549 abs_path = lxc_append_paths(path, parts[i]);
1550 if (!abs_path)
1551 goto out_error;
1552 r = symlink(dirname, abs_path);
1553 if (r < 0)
1554 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1555 free(abs_path);
1556 abs_path = NULL;
1557 }
1558 }
1559 free(dirname);
1560 free(parts);
1561 dirname = NULL;
1562 parts = NULL;
1563 }
1564
b46f0553
CS
1565 /* We used to remount the entire tmpfs readonly if any :ro or
1566 * :mixed mode was specified. However, Ubuntu's mountall has the
1567 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1568 * mounted read-only and cannot be remounted read-write.
1569 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1570 * these if they are not already mounted with the right options;
1571 * it contains an entry for /sys/fs/cgroup. In case it can't do
1572 * that, it prompts for the user to either manually fix it or
1573 * boot anyway. But without user input, booting of the container
1574 * hangs.)
1575 *
1576 * Instead of remounting the entire tmpfs readonly, we only
1577 * remount the paths readonly that are part of the cgroup
1578 * hierarchy.
f8f3c3c0 1579 */
f8f3c3c0 1580
aae1f3c4
CS
1581 free(path);
1582
c476bdce 1583 return true;
aae1f3c4
CS
1584
1585out_error:
1586 saved_errno = errno;
1587 free(path);
1588 free(dirname);
1589 free(parts);
1590 free(abs_path);
1591 free(abs_path2);
1592 errno = saved_errno;
c476bdce 1593 return false;
aae1f3c4
CS
1594}
1595
4fb3cba5 1596static int cgfs_nrtasks(void *hdata)
33ad9f1a 1597{
4fb3cba5
DE
1598 struct cgfs_data *d = hdata;
1599 struct cgroup_process_info *info;
33ad9f1a
CS
1600 struct cgroup_mount_point *mp = NULL;
1601 char *abs_path = NULL;
1602 int ret;
460a1cf0 1603
4fb3cba5
DE
1604 if (!d) {
1605 errno = ENOENT;
1606 return -1;
1607 }
1608
1609 info = d->info;
33ad9f1a
CS
1610 if (!info) {
1611 errno = ENOENT;
1612 return -1;
b98f7d6e 1613 }
c8f7c563 1614
33ad9f1a 1615 if (info->designated_mount_point) {
8900b9eb 1616 mp = info->designated_mount_point;
33ad9f1a
CS
1617 } else {
1618 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1619 if (!mp)
1620 return -1;
c8f7c563
CS
1621 }
1622
33ad9f1a
CS
1623 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1624 if (!abs_path)
1625 return -1;
1626
1627 ret = cgroup_recursive_task_count(abs_path);
1628 free(abs_path);
1629 return ret;
c8f7c563
CS
1630}
1631
574c4428
QH
1632static struct cgroup_process_info *
1633lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1634 struct cgroup_meta_data *meta)
d08ba6ec 1635{
33ad9f1a
CS
1636 struct cgroup_process_info *result = NULL;
1637 FILE *proc_pid_cgroup = NULL;
1638 char *line = NULL;
1639 size_t sz = 0;
1640 int saved_errno = 0;
1641 struct cgroup_process_info **cptr = &result;
1642 struct cgroup_process_info *entry = NULL;
1643
1644 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1645 if (!proc_pid_cgroup)
b98f7d6e 1646 return NULL;
1ac470c0 1647
33ad9f1a
CS
1648 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1649 /* file format: hierarchy:subsystems:group */
1650 char *colon1;
1651 char *colon2;
1652 char *endptr;
1653 int hierarchy_number;
1654 struct cgroup_hierarchy *h = NULL;
fd4f5a56 1655
33ad9f1a 1656 if (!line[0])
ae5c8b8e 1657 continue;
b98f7d6e 1658
33ad9f1a
CS
1659 if (line[strlen(line) - 1] == '\n')
1660 line[strlen(line) - 1] = '\0';
1661
1662 colon1 = strchr(line, ':');
1663 if (!colon1)
8900b9eb 1664 continue;
33ad9f1a
CS
1665 *colon1++ = '\0';
1666 colon2 = strchr(colon1, ':');
1667 if (!colon2)
ae5c8b8e 1668 continue;
33ad9f1a 1669 *colon2++ = '\0';
e4659536 1670
33ad9f1a
CS
1671 endptr = NULL;
1672 hierarchy_number = strtoul(line, &endptr, 10);
1673 if (!endptr || *endptr)
9a93d992 1674 continue;
9a93d992 1675
33ad9f1a
CS
1676 if (hierarchy_number > meta->maximum_hierarchy) {
1677 /* we encountered a hierarchy we didn't have before,
1678 * so probably somebody remounted some stuff in the
1679 * mean time...
1680 */
1681 errno = EAGAIN;
1682 goto out_error;
b98f7d6e 1683 }
33ad9f1a
CS
1684
1685 h = meta->hierarchies[hierarchy_number];
1686 if (!h) {
1687 /* we encountered a hierarchy that was thought to be
1688 * dead before, so probably somebody remounted some
1689 * stuff in the mean time...
1690 */
1691 errno = EAGAIN;
1692 goto out_error;
b98f7d6e 1693 }
33ad9f1a
CS
1694
1695 /* we are told that we should ignore this hierarchy */
1696 if (!h->used)
b98f7d6e 1697 continue;
5193cc3d 1698
33ad9f1a
CS
1699 entry = calloc(1, sizeof(struct cgroup_process_info));
1700 if (!entry)
1701 goto out_error;
fd4f5a56 1702
33ad9f1a
CS
1703 entry->meta_ref = lxc_cgroup_get_meta(meta);
1704 entry->hierarchy = h;
1705 entry->cgroup_path = strdup(colon2);
1706 if (!entry->cgroup_path)
1707 goto out_error;
3939a22a 1708 prune_init_scope(entry->cgroup_path);
d08ba6ec 1709
33ad9f1a
CS
1710 *cptr = entry;
1711 cptr = &entry->next;
1712 entry = NULL;
b98f7d6e 1713 }
b98f7d6e 1714
33ad9f1a
CS
1715 fclose(proc_pid_cgroup);
1716 free(line);
1717 return result;
1718
1719out_error:
1720 saved_errno = errno;
1721 if (proc_pid_cgroup)
1722 fclose(proc_pid_cgroup);
1723 lxc_cgroup_process_info_free(result);
1724 lxc_cgroup_process_info_free(entry);
1725 free(line);
1726 errno = saved_errno;
ae5c8b8e 1727 return NULL;
36b86299
DL
1728}
1729
574c4428
QH
/*
 * Turn a comma-separated mount option string into a NULL-terminated array of
 * newly allocated subsystem names.
 *
 * A token that already starts with "name=" or that appears in kernel_list is
 * copied verbatim; any other token is stored with a "name=" prefix.
 *
 * Returns the array (to be released with lxc_free_array()), or NULL with
 * errno preserved on allocation failure. NULL is also returned when
 * mount_options contains no token at all.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **list = NULL;
	size_t list_capacity = 0;
	size_t count = 0;
	char *state = NULL;
	char *opts, *tok;
	int keep_errno;

	/* strtok_r() modifies its input, so work on a stack copy */
	opts = alloca(strlen(mount_options) + 1);
	strcpy(opts, mount_options);

	tok = strtok_r(opts, ",", &state);
	while (tok) {
		if (lxc_grow_array((void ***)&list, &list_capacity, count + 1, 12) < 0)
			goto fail;
		/* keep the array terminated even before the slot is filled */
		list[count + 1] = NULL;

		if (strncmp(tok, "name=", 5) && !lxc_string_in_array(tok, (const char **)kernel_list)) {
			// this is eg 'systemd' but the mount will be 'name=systemd'
			list[count] = malloc(strlen(tok) + 6);
			if (list[count])
				sprintf(list[count], "name=%s", tok);
		} else {
			list[count] = strdup(tok);
		}

		if (!list[count])
			goto fail;
		count++;

		tok = strtok_r(NULL, ",", &state);
	}

	return list;

fail:
	keep_errno = errno;
	lxc_free_array((void **)list, free);
	errno = keep_errno;
	return NULL;
}
1771
574c4428 1772static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
b98f7d6e 1773{
33ad9f1a
CS
1774 if (!mp)
1775 return;
1776 free(mp->mount_point);
1777 free(mp->mount_prefix);
1778 free(mp);
bcbd102c
SH
1779}
1780
574c4428 1781static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
341a9bd8 1782{
33ad9f1a
CS
1783 if (!h)
1784 return;
1785 lxc_free_array((void **)h->subsystems, free);
8bfcb981 1786 free(h->all_mount_points);
33ad9f1a
CS
1787 free(h);
1788}
341a9bd8 1789
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * Rejected are "." and "..", and any name containing '/' or a character
 * outside the printable ASCII range 33..126. Space (32) is excluded as well
 * because it would break legacy lxc-ls. The empty string is accepted.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t pos = 0;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (name[pos] != '\0') {
		char ch = name[pos++];

		if (ch == '/' || !(ch > 32 && ch < 127))
			return false;
	}

	return true;
}
341a9bd8 1803
574c4428
QH
/*
 * Create or remove the directory that backs cgroup 'path' on mount point mp.
 *
 * With do_remove unset the directory is created with mode 0777. With
 * do_remove set it is removed, through cgroup_rmdir() when recurse is
 * non-zero and through plain rmdir() otherwise.
 *
 * Returns the result of the underlying call with its errno preserved, or -1
 * when the absolute path cannot be built.
 */
static int create_or_remove_cgroup(bool do_remove,
		struct cgroup_mount_point *mp, const char *path, int recurse)
{
	char *dir = cgroup_to_absolute_path(mp, path, NULL);
	int rc, keep_errno;

	if (!dir)
		return -1;

	if (!do_remove)
		rc = mkdir(dir, 0777);
	else if (recurse)
		rc = cgroup_rmdir(dir);
	else
		rc = rmdir(dir);

	/* free() must not disturb the errno of the call above */
	keep_errno = errno;
	free(dir);
	errno = keep_errno;

	return rc;
}
bcbd102c 1825
/* Create the directory for cgroup 'path' on mount point mp; thin wrapper
 * around create_or_remove_cgroup() whose result it returns.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const int recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse);
}
1830
574c4428
QH
/* Remove the directory of cgroup 'path' on mount point mp, recursively when
 * recurse is set; thin wrapper around create_or_remove_cgroup() whose result
 * it returns.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse);
}
576f946d 1836
574c4428
QH
1837static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1838 const char *path, const char *suffix)
33ad9f1a
CS
1839{
1840 /* first we have to make sure we subtract the mount point's prefix */
1841 char *prefix = mp->mount_prefix;
1842 char *buf;
1843 ssize_t len, rv;
1844
1845 /* we want to make sure only absolute paths to cgroups are passed to us */
1846 if (path[0] != '/') {
1847 errno = EINVAL;
1848 return NULL;
1849 }
b98f7d6e 1850
33ad9f1a
CS
1851 if (prefix && !strcmp(prefix, "/"))
1852 prefix = NULL;
b98f7d6e 1853
33ad9f1a
CS
1854 /* prefix doesn't match */
1855 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1856 errno = EINVAL;
1857 return NULL;
1858 }
1859 /* if prefix is /foo and path is /foobar */
1860 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1861 errno = EINVAL;
1862 return NULL;
1863 }
b98f7d6e 1864
33ad9f1a
CS
1865 /* remove prefix from path */
1866 path += prefix ? strlen(prefix) : 0;
b98f7d6e 1867
33ad9f1a
CS
1868 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1869 buf = calloc(len + 1, 1);
50266dc6
DE
1870 if (!buf)
1871 return NULL;
33ad9f1a 1872 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
8900b9eb 1873 if (rv > len) {
33ad9f1a
CS
1874 free(buf);
1875 errno = ENOMEM;
8900b9eb 1876 return NULL;
8b92dc3a 1877 }
576f946d 1878
33ad9f1a 1879 return buf;
e0f888d9 1880}
283678ed 1881
574c4428
QH
1882static struct cgroup_process_info *
1883find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
283678ed 1884{
33ad9f1a
CS
1885 struct cgroup_process_info *info_ptr;
1886 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1887 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1888 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1889 return info_ptr;
b98f7d6e 1890 }
33ad9f1a
CS
1891 errno = ENOENT;
1892 return NULL;
1893}
283678ed 1894
574c4428
QH
/*
 * Read the file @sub_filename located in directory @cgroup_path into @value,
 * passing @len through to lxc_read_from_file().
 *
 * Returns whatever lxc_read_from_file() returns, or -1 if the file name could
 * not be assembled. errno from the read is preserved across cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
		char *value, size_t len)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int rc, err;

	file_path = lxc_string_join("/", components, false);
	if (file_path == NULL)
		return -1;

	rc = lxc_read_from_file(file_path, value, len);

	/* keep the read's errno intact while releasing the path */
	err = errno;
	free(file_path);
	errno = err;
	return rc;
}
b113383b 1916
574c4428
QH
/*
 * Write the string @value (without its terminating NUL) to the file
 * @sub_filename located in directory @cgroup_path.
 *
 * Returns whatever lxc_write_to_file() returns, or -1 if the file name could
 * not be assembled. errno from the write is preserved across cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
		const char *value)
{
	const char *components[] = { cgroup_path, sub_filename, NULL };
	char *file_path;
	int rc, err;

	file_path = lxc_string_join("/", components, false);
	if (file_path == NULL)
		return -1;

	rc = lxc_write_to_file(file_path, value, strlen(value), false);

	/* keep the write's errno intact while releasing the path */
	err = errno;
	free(file_path);
	errno = err;
	return rc;
}
1938
4fb3cba5 1939static int do_setup_cgroup_limits(struct cgfs_data *d,
574c4428 1940 struct lxc_list *cgroup_settings, bool do_devices)
b98f7d6e 1941{
365d180a 1942 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
b98f7d6e
SH
1943 struct lxc_cgroup *cg;
1944 int ret = -1;
1945
33ad9f1a 1946 if (lxc_list_empty(cgroup_settings))
b98f7d6e
SH
1947 return 0;
1948
aaf26830 1949 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
fac7c663
KT
1950 if (!sorted_cgroup_settings) {
1951 return -1;
1952 }
aaf26830
KT
1953
1954 lxc_list_for_each(iterator, sorted_cgroup_settings) {
b98f7d6e
SH
1955 cg = iterator->elem;
1956
33ad9f1a 1957 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
b98f7d6e 1958 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
4fb3cba5 1959 cgroup_devices_has_allow_or_deny(d, cg->value, false))
b98f7d6e
SH
1960 continue;
1961 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
4fb3cba5 1962 cgroup_devices_has_allow_or_deny(d, cg->value, true))
b98f7d6e 1963 continue;
4fb3cba5 1964 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
dddf7c5b 1965 if (do_devices && (errno == EACCES || errno == EPERM)) {
4f875f70
SH
1966 WARN("Error setting %s to %s for %s",
1967 cg->subsystem, cg->value, d->name);
1968 continue;
1969 }
dddf7c5b 1970 SYSERROR("Error setting %s to %s for %s",
4fb3cba5 1971 cg->subsystem, cg->value, d->name);
b98f7d6e
SH
1972 goto out;
1973 }
b113383b 1974 }
b98f7d6e
SH
1975
1976 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
b113383b
SH
1977 }
1978
b98f7d6e
SH
1979 ret = 0;
1980 INFO("cgroup has been setup");
1981out:
365d180a 1982 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
aaf26830
KT
1983 lxc_list_del(iterator);
1984 free(iterator);
1985 }
365d180a 1986 free(sorted_cgroup_settings);
b113383b
SH
1987 return ret;
1988}
b98f7d6e 1989
/*
 * Inspect the container's devices.list to decide whether writing the
 * devices.allow / devices.deny value @v would be redundant.
 *
 * @for_allow true:  returns true if devices.list already contains the
 *                   allow-all entry "a *:* rwm" or a line equal to @v.
 * @for_allow false: only the deny-all forms "a" and "a *:* rwm" are
 *                   considered, any other @v yields false. Returns true if
 *                   devices.list contains no "a *:* rwm" entry.
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
		char *v, bool for_allow)
{
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	// XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	// not sure they ever do, but they *could*
	// right now, I'm assuming they do NOT
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	parts[0] = (const char *)lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!parts[0])
		return false;
	path = lxc_string_join("/", parts, false);
	/* The directory string is owned by us and only needed to build @path.
	 * Release it on every path, not only when the join failed, otherwise
	 * it leaks on each call.
	 */
	free((void *)parts[0]);
	parts[0] = NULL;
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		/* compare without the trailing newline */
		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			/* everything is allowed: an allow is redundant, a
			 * deny-all is not
			 */
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2044
/*
 * Count the lines of every file named "tasks" found in @cgroup_path and in
 * all directories below it.
 *
 * Returns the accumulated count. A directory that cannot be opened counts as
 * 0, and failures while counting a sub-directory or a tasks file are ignored.
 * Returns -1 if a path cannot be built or an entry cannot be stat()ed.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *d;
	struct dirent *dent;
	int n = 0, r;

	d = opendir(cgroup_path);
	if (!d)
		return 0;

	/* readdir() instead of the deprecated readdir_r(): the stream is
	 * private to this call, and no manually sized dirent buffer is needed.
	 * As before, the loop ends on end-of-directory and on read errors.
	 */
	while ((dent = readdir(d)) != NULL) {
		const char *parts[3] = {
			cgroup_path,
			dent->d_name,
			NULL
		};
		char *sub_path;
		struct stat st;

		if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, ".."))
			continue;
		sub_path = lxc_string_join("/", parts, false);
		if (!sub_path) {
			closedir(d);
			return -1;
		}
		r = stat(sub_path, &st);
		if (r < 0) {
			closedir(d);
			free(sub_path);
			return -1;
		}
		if (S_ISDIR(st.st_mode)) {
			r = cgroup_recursive_task_count(sub_path);
			if (r >= 0)
				n += r;
		} else if (!strcmp(dent->d_name, "tasks")) {
			r = count_lines(sub_path);
			if (r >= 0)
				n += r;
		}
		free(sub_path);
	}
	closedir(d);

	return n;
}
2107
/*
 * Count how many lines getline() can read from the file @fn.
 * Returns the line count, or -1 if the file cannot be opened.
 */
static int count_lines(const char *fn)
{
	char *buf = NULL;
	size_t cap = 0;
	int lines;
	FILE *fp = fopen_cloexec(fn, "r");

	if (fp == NULL)
		return -1;

	for (lines = 0; getline(&buf, &cap, fp) != -1; lines++)
		;

	free(buf);
	fclose(fp);
	return lines;
}
2126
574c4428
QH
2127static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2128 char *cgroup_path)
b98f7d6e 2129{
33ad9f1a 2130 int r, saved_errno = 0;
7e7243e1 2131 char buf[2];
1ea59ad2 2132
934b1673
SH
2133 mp->need_cpuset_init = false;
2134
1ea59ad2
SH
2135 /* If this is the memory cgroup, we want to enforce hierarchy.
2136 * But don't fail if for some reason we can't.
2137 */
2edb53c7
SH
2138 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2139 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2140 if (cc_path) {
2141 r = lxc_read_from_file(cc_path, buf, 1);
2142 if (r < 1 || buf[0] != '1') {
2143 r = lxc_write_to_file(cc_path, "1", 1, false);
2144 if (r < 0)
a8916143 2145 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2edb53c7 2146 }
1ea59ad2
SH
2147 free(cc_path);
2148 }
2edb53c7 2149 }
1ea59ad2 2150
33ad9f1a
CS
2151 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2152 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2153 * and cpuset.cpus and then
2154 */
2edb53c7
SH
2155 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2156 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
d703c2b1
RV
2157 struct stat sb;
2158
33ad9f1a 2159 if (!cc_path)
2edb53c7 2160 return -1;
d703c2b1
RV
2161 /* cgroup.clone_children is not available when running under
2162 * older kernel versions; in this case, we'll initialize
2163 * cpuset.cpus and cpuset.mems later, after the new cgroup
2164 * was created
2165 */
2166 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
934b1673 2167 mp->need_cpuset_init = true;
d703c2b1
RV
2168 free(cc_path);
2169 return 0;
2170 }
7e7243e1
SH
2171 r = lxc_read_from_file(cc_path, buf, 1);
2172 if (r == 1 && buf[0] == '1') {
2173 free(cc_path);
2edb53c7 2174 return 0;
7e7243e1 2175 }
33ad9f1a 2176 r = lxc_write_to_file(cc_path, "1", 1, false);
2edb53c7
SH
2177 saved_errno = errno;
2178 free(cc_path);
2179 errno = saved_errno;
2180 return r < 0 ? -1 : 0;
33ad9f1a
CS
2181 }
2182 return 0;
b98f7d6e 2183}
484ed030 2184
/*
 * Read the file @fn into @buf (capacity @bufsize) via lxc_read_from_file()
 * and NUL-terminate the result.
 *
 * If the read filled the whole buffer, the last byte is overwritten by the
 * terminator and the unmodified read count (== @bufsize) is returned, which
 * lets callers detect truncation.
 *
 * Returns the value reported by lxc_read_from_file(), a negative value on
 * read failure, or -1 when called with a zero @bufsize.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread = lxc_read_from_file(fn, buf, bufsize);

	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if (nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	/* Callers don't do this, but regression/sanity check */
	if (bufsize == 0) {
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: sacrifice the last byte */
	buf[bufsize - 1] = '\0';
	return nread;
}
2205
/*
 * Initialize the cpuset file @name (given with a leading '/', e.g.
 * "/cpuset.cpus") of cgroup @path from the same file in the parent cgroup.
 *
 * A value already present in the child is never overwritten; in that case
 * the function succeeds without writing.
 *
 * Returns true on success, false on any failure (including a parent value
 * that does not fit the local buffer).
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
		const char *path, const char *name)
{
	char value[1024];
	char *childfile, *parentfile = NULL;
	char *parent_cgroup, *slash;
	int nread;
	bool ok = false;

	childfile = cgroup_to_absolute_path(mp, path, name);
	if (!childfile)
		return false;

	/* don't overwrite a non-empty value in the file */
	if (cgroup_read_from_file(childfile, value, sizeof(value)) < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting @path at its last '/' */
	parent_cgroup = strdup(path);
	if (!parent_cgroup)
		goto out;

	slash = strrchr(parent_cgroup, '/');
	if (!slash) {
		free(parent_cgroup);
		goto out;
	}
	if (slash == parent_cgroup)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	parentfile = cgroup_to_absolute_path(mp, parent_cgroup, name);
	free(parent_cgroup);
	if (!parentfile)
		goto out;

	/* copy from parent to child cgroup */
	nread = cgroup_read_from_file(parentfile, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (nread == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	if (lxc_write_to_file(childfile, value, strlen(value), false) >= 0)
		ok = true;
	else
		SYSERROR("failed writing %s", childfile);

out:
	free(parentfile);
	free(childfile);
	return ok;
}
2262
2263static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2264 const char *path)
2265{
2266 /* the files we have to handle here are only in cpuset hierarchies */
2267 if (!lxc_string_in_array("cpuset",
2268 (const char **)mp->hierarchy->subsystems))
2269 return true;
2270
b1dad6f6
RV
2271 if (!mp->need_cpuset_init)
2272 return true;
2273
d703c2b1
RV
2274 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2275 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2276}
2277
4fb3cba5 2278struct cgroup_ops *cgfs_ops_init(void)
484ed030 2279{
4fb3cba5 2280 return &cgfs_ops;
d4ef7c50 2281}
484ed030 2282
4fb3cba5 2283static void *cgfs_init(const char *name)
d4ef7c50 2284{
4fb3cba5 2285 struct cgfs_data *d;
484ed030 2286
4fb3cba5
DE
2287 d = malloc(sizeof(*d));
2288 if (!d)
2289 return NULL;
484ed030 2290
4fb3cba5
DE
2291 memset(d, 0, sizeof(*d));
2292 d->name = strdup(name);
2293 if (!d->name)
2294 goto err1;
2295
5e1c5795 2296 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
4fb3cba5
DE
2297
2298 d->meta = lxc_cgroup_load_meta();
2299 if (!d->meta) {
2300 ERROR("cgroupfs failed to detect cgroup metadata");
2301 goto err2;
2302 }
2303 return d;
2304
2305err2:
2306 free(d->name);
2307err1:
2308 free(d);
2309 return NULL;
d4ef7c50 2310}
484ed030 2311
4fb3cba5 2312static void cgfs_destroy(void *hdata)
d4ef7c50 2313{
4fb3cba5
DE
2314 struct cgfs_data *d = hdata;
2315
d4ef7c50
SH
2316 if (!d)
2317 return;
f10fad2f 2318 free(d->name);
c55d4505
ME
2319 lxc_cgroup_process_info_free_and_remove(d->info);
2320 lxc_cgroup_put_meta(d->meta);
d4ef7c50 2321 free(d);
d4ef7c50 2322}
484ed030 2323
4fb3cba5 2324static inline bool cgfs_create(void *hdata)
d4ef7c50 2325{
4fb3cba5
DE
2326 struct cgfs_data *d = hdata;
2327 struct cgroup_process_info *i;
2328 struct cgroup_meta_data *md;
484ed030 2329
4fb3cba5 2330 if (!d)
d4ef7c50 2331 return false;
4fb3cba5
DE
2332 md = d->meta;
2333 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
d4ef7c50
SH
2334 if (!i)
2335 return false;
2336 d->info = i;
2337 return true;
2338}
484ed030 2339
4fb3cba5 2340static inline bool cgfs_enter(void *hdata, pid_t pid)
d4ef7c50 2341{
4fb3cba5
DE
2342 struct cgfs_data *d = hdata;
2343 struct cgroup_process_info *i;
d4ef7c50 2344 int ret;
4fb3cba5
DE
2345
2346 if (!d)
2347 return false;
2348 i = d->info;
2349 ret = lxc_cgroupfs_enter(i, pid, false);
484ed030 2350
d4ef7c50
SH
2351 return ret == 0;
2352}
2353
4fb3cba5 2354static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
d4ef7c50 2355{
4fb3cba5
DE
2356 struct cgfs_data *d = hdata;
2357 struct cgroup_process_info *i;
2358
2359 if (!d)
2360 return false;
2361 i = d->info;
2362 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2363 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
d4ef7c50 2364 return false;
484ed030 2365 }
d4ef7c50
SH
2366 return true;
2367}
484ed030 2368
/*
 * Look up the container's cgroup path for @subsystem by delegating to
 * lxc_cgroup_get_hierarchy_path_data(). Returns NULL for a NULL @hdata.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *d = hdata;

	return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2377
2ba7a429
TA
2378static const char *cgfs_canonical_path(void *hdata)
2379{
2380 struct cgfs_data *d = hdata;
2381 struct cgroup_process_info *info_ptr;
2382 char *path = NULL;
2383
2384 if (!d)
2385 return NULL;
2386
2387 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2388 if (!path)
2389 path = info_ptr->cgroup_path;
2390 else if (strcmp(path, info_ptr->cgroup_path) != 0) {
2391 ERROR("not all paths match %s, %s has path %s", path,
2392 info_ptr->hierarchy->subsystems[0], info_ptr->cgroup_path);
2393 return NULL;
2394 }
2395 }
2396
2397 return path;
2398}
2399
06078509
TA
2400static bool cgfs_escape(void)
2401{
2402 struct cgroup_meta_data *md;
2403 int i;
2404 bool ret = false;
2405
2406 md = lxc_cgroup_load_meta();
2407 if (!md)
2408 return false;
2409
2410 for (i = 1; i <= md->maximum_hierarchy; i++) {
2411 struct cgroup_hierarchy *h = md->hierarchies[i];
2412 struct cgroup_mount_point *mp;
2413 char *tasks;
2414 FILE *f;
2415 int written;
2416
2417 if (!h) {
2418 WARN("not escaping hierarchy %d", i);
2419 continue;
2420 }
2421
2422 mp = lxc_cgroup_find_mount_point(h, "/", true);
2423 if (!mp)
2424 goto out;
2425
2426 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2427 if (!tasks)
2428 goto out;
2429
2430 f = fopen(tasks, "a");
2431 free(tasks);
2432 if (!f)
2433 goto out;
2434
2435 written = fprintf(f, "%d\n", getpid());
2436 fclose(f);
2437 if (written < 0) {
2438 SYSERROR("writing tasks failed\n");
2439 goto out;
2440 }
2441 }
2442
2443 ret = true;
2444out:
2445 lxc_cgroup_put_meta(md);
2446 return ret;
2447}
2448
/*
 * Thaw the container by writing "THAWED" to freezer.state of its freezer
 * cgroup.
 *
 * Returns true if the write succeeded; false for a NULL @hdata, if the
 * absolute freezer path cannot be determined, or if the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *d = hdata;
	char *rel_path, *abs_path;
	bool thawed;

	if (!d)
		return false;

	/* NOTE(review): rel_path is handed on unchecked; presumably
	 * lxc_cgroup_find_abs_path() copes with NULL - confirm
	 */
	rel_path = lxc_cgroup_get_hierarchy_path_data("freezer", d);
	abs_path = lxc_cgroup_find_abs_path("freezer", rel_path, true, NULL);
	if (!abs_path)
		return false;

	thawed = do_cgroup_set(abs_path, "freezer.state", "THAWED") == 0;
	free(abs_path);
	return thawed;
}
2467
4fb3cba5
DE
/*
 * cgroup_ops adapter around do_setup_cgroup_limits(): applies the settings in
 * @cgroup_conf, with @with_devices selecting device or non-device settings.
 * Returns true on success, false on failure or NULL @hdata.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
		bool with_devices)
{
	struct cgfs_data *d = hdata;

	return d && do_setup_cgroup_limits(d, cgroup_conf, with_devices) == 0;
}
2477
/*
 * Move the attached process @pid into the cgroups of the running container
 * @name located in @lxcpath.
 *
 * The container's cgroup info is looked up with freshly loaded metadata,
 * entered via lxc_cgroupfs_enter() and released again afterwards.
 *
 * Returns true on success; on any failure logs an error and returns false.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_process_info *container_info;
	int ret;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		goto fail;

	container_info = lxc_cgroup_get_container_info(name, lxcpath, meta_data);
	/* the metadata reference is only needed for the lookup */
	lxc_cgroup_put_meta(meta_data);
	if (!container_info)
		goto fail;

	ret = lxc_cgroupfs_enter(container_info, pid, false);
	lxc_cgroup_process_info_free(container_info);
	if (ret < 0)
		goto fail;

	return true;

fail:
	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2505
8b276860
SH
/* Argument bundle handed to chown_cgroup_wrapper() through its void pointer. */
struct chown_data {
	/* cgroup directory whose ownership is changed */
	const char *cgroup_path;
	/* uid that chown_cgroup_wrapper() translates with get_ns_uid() */
	uid_t origuid;
};
2510
2511/*
2512 * TODO - someone should refactor this to unshare once passing all the paths
2513 * to be chowned in one go
2514 */
2515static int chown_cgroup_wrapper(void *data)
2516{
2517 struct chown_data *arg = data;
2518 uid_t destuid;
2519 char *fpath;
2520
2521
2522 if (setresgid(0,0,0) < 0)
2523 SYSERROR("Failed to setgid to 0");
2524 if (setresuid(0,0,0) < 0)
2525 SYSERROR("Failed to setuid to 0");
2526 if (setgroups(0, NULL) < 0)
2527 SYSERROR("Failed to clear groups");
2528 destuid = get_ns_uid(arg->origuid);
2529
2530 if (chown(arg->cgroup_path, destuid, 0) < 0)
2531 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2532
2533 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2534 if (!fpath)
2535 return -1;
2536 if (chown(fpath, destuid, 0) < 0)
2537 SYSERROR("Error chowning %s\n", fpath);
2538 free(fpath);
2539 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2540 if (!fpath)
2541 return -1;
2542 if (chown(fpath, destuid, 0) < 0)
2543 SYSERROR("Error chowning %s", fpath);
2544 free(fpath);
2545
2546 return 0;
2547}
2548
2549static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2550{
2551 struct chown_data data;
2552 char *fpath;
2553
2554 if (lxc_list_empty(&conf->id_map))
2555 /* If there's no mapping then we don't need to chown */
2556 return true;
2557
2558 data.cgroup_path = cgroup_path;
2559 data.origuid = geteuid();
2560
2561 /* Unpriv users can't chown it themselves, so chown from
2562 * a child namespace mapping both our own and the target uid
2563 */
2564 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
2565 ERROR("Error requesting cgroup chown in new namespace");
2566 return false;
2567 }
2568
2569 /*
2570 * Now chmod 775 the directory else the container cannot create cgroups.
2571 * This can't be done in the child namespace because it only group-owns
2572 * the cgroup
2573 */
2574 if (chmod(cgroup_path, 0775) < 0) {
2575 SYSERROR("Error chmoding %s\n", cgroup_path);
2576 return false;
2577 }
2578 fpath = lxc_append_paths(cgroup_path, "tasks");
2579 if (!fpath)
2580 return false;
2581 if (chmod(fpath, 0664) < 0)
2582 SYSERROR("Error chmoding %s\n", fpath);
2583 free(fpath);
2584 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2585 if (!fpath)
2586 return false;
2587 if (chmod(fpath, 0664) < 0)
2588 SYSERROR("Error chmoding %s\n", fpath);
2589 free(fpath);
2590
2591 return true;
2592}
2593
2594static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2595{
2596 struct cgfs_data *d = hdata;
2597 struct cgroup_process_info *info_ptr;
2598 char *cgpath;
2599 bool r = true;
2600
2601 if (!d)
2602 return false;
2603
2604 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2605 if (!info_ptr->designated_mount_point) {
2606 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2607 if (!info_ptr->designated_mount_point) {
2608 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2609 return false;
2610 }
2611 }
2612
2613 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2614 if (!cgpath) {
2615 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2616 continue;
2617 }
2618 r = do_cgfs_chown(cgpath, conf);
ea439aac 2619 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
8b276860
SH
2620 ERROR("Failed chowning %s\n", cgpath);
2621 free(cgpath);
2622 return false;
2623 }
2624 free(cgpath);
2625 }
2626
2627 return true;
2628}
2629
d4ef7c50 2630static struct cgroup_ops cgfs_ops = {
d4ef7c50 2631 .init = cgfs_init,
4fb3cba5 2632 .destroy = cgfs_destroy,
d4ef7c50
SH
2633 .create = cgfs_create,
2634 .enter = cgfs_enter,
2635 .create_legacy = cgfs_create_legacy,
2636 .get_cgroup = cgfs_get_cgroup,
2ba7a429 2637 .canonical_path = cgfs_canonical_path,
06078509 2638 .escape = cgfs_escape,
d4ef7c50
SH
2639 .get = lxc_cgroupfs_get,
2640 .set = lxc_cgroupfs_set,
4fb3cba5 2641 .unfreeze = cgfs_unfreeze,
9daf6f5d 2642 .setup_limits = cgroupfs_setup_limits,
d4ef7c50 2643 .name = "cgroupfs",
5d897655 2644 .attach = lxc_cgroupfs_attach,
8b276860 2645 .chown = cgfs_chown,
c476bdce 2646 .mount_cgroup = cgroupfs_mount_cgroup,
4fb3cba5 2647 .nrtasks = cgfs_nrtasks,
23befb18 2648 .driver = CGFS,
d4ef7c50 2649};