]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfs.c
Merge pull request #1578 from 0x0916/export-seccomp-filter-to-log
[mirror_lxc.git] / src / lxc / cgroups / cgfs.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #include "config.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <fcntl.h>
32 #include <grp.h>
33 #include <ctype.h>
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/param.h>
37 #include <sys/inotify.h>
38 #include <sys/mount.h>
39 #include <netinet/in.h>
40 #include <net/if.h>
41
42 #include "bdev.h"
43 #include "error.h"
44 #include "commands.h"
45 #include "list.h"
46 #include "conf.h"
47 #include "utils.h"
48 #include "log.h"
49 #include "cgroup.h"
50 #include "start.h"
51 #include "state.h"
52
53 #if IS_BIONIC
54 #include <../include/lxcmntent.h>
55 #else
56 #include <mntent.h>
57 #endif
58
/* Incomplete types, so the structures below can point at one another. */
struct cgroup_meta_data;
struct cgroup_hierarchy;
struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: description of the cgroup setup found on this host
 *
 * The object is shared through a reference count; see
 * lxc_cgroup_get_meta() and lxc_cgroup_put_meta().
 */
struct cgroup_meta_data {
	/* number of holders; the object is freed when it drops to zero */
	ptrdiff_t ref;
	/* hierarchies, indexed by hierarchy number (slots may be NULL) */
	struct cgroup_hierarchy **hierarchies;
	/* NULL-terminated list of every cgroup mount point that was found */
	struct cgroup_mount_point **mount_points;
	/* highest valid index into hierarchies */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: one cgroup hierarchy; a hierarchy can be mounted in
 * several places
 */
struct cgroup_hierarchy {
	/* hierarchy number as listed in /proc/<pid>/cgroup */
	int index;
	/* false if the hierarchy should be ignored by lxc */
	bool used;
	/* NULL-terminated list of the subsystems attached to the hierarchy */
	char **subsystems;
	/* first writable mount point whose mount prefix is "/" */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point whose mount prefix is "/" */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* NULL-terminated list of every mount point of the hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocated slots in all_mount_points */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: one place in the filesystem where a hierarchy is
 * mounted
 */
struct cgroup_mount_point {
	/* hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* directory the hierarchy is mounted on */
	char *mount_point;
	/* cgroup inside the hierarchy that is visible at mount_point */
	char *mount_prefix;
	/* true unless the per-mount options contain "rw" */
	bool read_only;
	/* NOTE(review): presumably set when cpuset values still have to be
	 * seeded for cgroups created below this mount - confirm with
	 * init_cpuset_if_needed() */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: where a process sits in the cgroup hierarchies;
 * one list element per hierarchy
 *
 * Note this is the per-process info tracked by the cgfs_ops.
 * This is not used with cgmanager.
 */
struct cgroup_process_info {
	/* next element of the singly linked list, NULL at the end */
	struct cgroup_process_info *next;
	/* NOTE(review): presumably a counted reference to the meta data this
	 * entry was built from - confirm against the getx/free helpers */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy described by this element (may be NULL) */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup of the process inside that hierarchy */
	char *cgroup_path;
	/* NOTE(review): looks like the path of an optional sub-cgroup -
	 * confirm with the users of this field */
	char *cgroup_path_sub;
	/* cgroups created on behalf of the container in this hierarchy */
	char **created_paths;
	/* allocated slots in created_paths */
	size_t created_paths_capacity;
	/* used slots in created_paths */
	size_t created_paths_count;
	/* mount point chosen for working on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/*
 * cgfs_data: state bundle handed to the cgfs helpers
 * (see the prototypes taking a struct cgfs_data pointer)
 */
struct cgfs_data {
	/* NOTE(review): presumably the container name - confirm at the
	 * place where this structure is filled in */
	char *name;
	/* NOTE(review): presumably the pattern the cgroup path is built
	 * from - confirm at the place where this structure is filled in */
	const char *cgroup_pattern;
	/* cgroup meta data of the host */
	struct cgroup_meta_data *meta;
	/* per-hierarchy cgroup membership list */
	struct cgroup_process_info *info;
};
126
127 lxc_log_define(lxc_cgfs, lxc);
128
129 static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130 static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133 static bool is_valid_cgroup(const char *name);
134 static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
135 static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
136 struct lxc_conf *conf);
137 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
138 static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
139 static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
140 static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
141 static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
142 static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
143 static int cgroup_recursive_task_count(const char *cgroup_path);
144 static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
145 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
146
147 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151 /* free process membership information */
152 static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
153 static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
154 struct lxc_conf *conf);
155
156 static struct cgroup_ops cgfs_ops;
157
/*
 * cgroup_rmdir: remove a cgroup directory together with all directories
 * below it
 *
 * @dirname: path of the directory to remove
 *
 * Every subdirectory is removed recursively before @dirname itself is
 * removed with rmdir(). Entries that are not directories are not touched.
 * A failure does not stop the walk: the remaining entries are still
 * processed and the failure is reported at the end.
 *
 * Returns 0 on success and -1 if anything went wrong. Except when
 * @dirname cannot be opened, errno is set on return to the first error
 * that was recorded (0 if there was none); it is always a regular,
 * positive errno value.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *direntp;
	int saved_errno = 0;
	DIR *dir;
	int ret, failed = 0;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("%s: failed to open %s", __func__, dirname);
		return -1;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			failed = 1;
			/* errno values are positive; the path did not fit */
			if (!saved_errno)
				saved_errno = ENAMETOOLONG;
			continue;
		}

		/* lstat() so that a symlink is never followed into a directory */
		ret = lstat(pathname, &mystat);
		if (ret) {
			SYSERROR("%s: failed to stat %s", __func__, pathname);
			failed = 1;
			if (!saved_errno)
				saved_errno = errno;
			continue;
		}

		if (S_ISDIR(mystat.st_mode)) {
			if (cgroup_rmdir(pathname) < 0) {
				if (!saved_errno)
					saved_errno = errno;
				failed = 1;
			}
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("%s: failed to delete %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	ret = closedir(dir);
	if (ret) {
		SYSERROR("%s: failed to close directory %s", __func__, dirname);
		if (!saved_errno)
			saved_errno = errno;
		failed = 1;
	}

	errno = saved_errno;
	return failed ? -1 : 0;
}
226
/*
 * rmdir_wrapper: cgroup_rmdir() in callback form
 *
 * @data: path of the directory to remove, passed as an untyped pointer
 *
 * Tries to switch all group and user ids to 0 and to drop the
 * supplementary groups before removing the directory. A failure of one
 * of those steps is logged and otherwise ignored.
 *
 * Returns whatever cgroup_rmdir() returns.
 */
static int rmdir_wrapper(void *data)
{
	char *target = data;
	int rc;

	rc = setresgid(0, 0, 0);
	if (rc < 0)
		SYSERROR("Failed to setgid to 0");

	rc = setresuid(0, 0, 0);
	if (rc < 0)
		SYSERROR("Failed to setuid to 0");

	rc = setgroups(0, NULL);
	if (rc < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(target);
}
240
/*
 * lxc_cgroup_load_meta: load the cgroup meta data, restricted to what the
 * global "lxc.cgroup.use" setting asks for
 *
 * The setting is read as a comma separated list and handed to
 * lxc_cgroup_load_meta2() as the subsystem whitelist. Without the
 * setting no whitelist is passed.
 *
 * Returns the meta data, or NULL on failure. errno is the one left behind
 * by lxc_cgroup_load_meta2(); freeing the list does not disturb it.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta()
{
	struct cgroup_meta_data *meta;
	char **whitelist = NULL;
	const char *configured;
	int err;

	/* errno tells "not configured" apart from "lookup failed" */
	errno = 0;
	configured = lxc_global_config_value("lxc.cgroup.use");
	if (configured) {
		whitelist = lxc_string_split_and_trim(configured, ',');
		if (!whitelist)
			return NULL;
	} else if (errno != 0) {
		return NULL;
	}

	meta = lxc_cgroup_load_meta2((const char **)whitelist);

	err = errno;
	lxc_free_array((void **)whitelist, free);
	errno = err;

	return meta;
}
264
/*
 * Step 1: determine all kernel subsystems
 *
 * @kernel_subsystems: array that receives one copy of the subsystem name
 *                     per accepted line of /proc/cgroups
 *
 * A line is accepted when it is not a '#' comment, has at least two tabs
 * and its second field consists of a number only. The first field is the
 * name that gets stored.
 *
 * Returns true if the whole file was processed, false if the file could
 * not be opened or memory ran out. Names collected before a failure stay
 * in @kernel_subsystems.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *f;
	char *buf = NULL;
	size_t buflen = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	f = fopen_cloexec("/proc/cgroups", "r");
	if (!f)
		return false;

	while (getline(&buf, &buflen, f) != -1) {
		char *number;
		char *number_end;
		char *parse_end = NULL;

		if (buf[0] == '#' || buf[0] == '\0')
			continue;

		/* cut the line into "<name>" and "<hierarchy number>" */
		number = strchr(buf, '\t');
		if (!number)
			continue;
		*number++ = '\0';

		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the value itself is not needed, only that it is a number */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(buf);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(f);
	free(buf);
	return ok;
}
320
321 /* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
322 * since mount points don't specify hierarchy number and
323 * /proc/cgroups does not contain named hierarchies
324 */
325 static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
326 bool all_kernel_subsystems, bool all_named_subsystems,
327 const char **subsystem_whitelist)
328 {
329 FILE *proc_self_cgroup;
330 char *line = NULL;
331 size_t sz = 0;
332 int r;
333 bool bret = false;
334 size_t hierarchy_capacity = 0;
335
336 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
337 /* if for some reason (because of setns() and pid namespace for example),
338 * /proc/self is not valid, we try /proc/1/cgroup... */
339 if (!proc_self_cgroup)
340 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
341 if (!proc_self_cgroup)
342 return false;
343
344 while (getline(&line, &sz, proc_self_cgroup) != -1) {
345 /* file format: hierarchy:subsystems:group,
346 * we only extract hierarchy and subsystems
347 * here */
348 char *colon1;
349 char *colon2;
350 int hierarchy_number;
351 struct cgroup_hierarchy *h = NULL;
352 char **p;
353
354 if (!line[0])
355 continue;
356
357 colon1 = strchr(line, ':');
358 if (!colon1)
359 continue;
360 *colon1++ = '\0';
361 colon2 = strchr(colon1, ':');
362 if (!colon2)
363 continue;
364 *colon2 = '\0';
365
366 colon2 = NULL;
367
368 /* With cgroupv2 /proc/self/cgroup can contain entries of the
369 * form: 0::/
370 * These entries need to be skipped.
371 */
372 if (!strcmp(colon1, ""))
373 continue;
374
375 hierarchy_number = strtoul(line, &colon2, 10);
376 if (!colon2 || *colon2)
377 continue;
378
379 if (hierarchy_number > meta_data->maximum_hierarchy) {
380 /* lxc_grow_array will never shrink, so even if we find a lower
381 * hierarchy number here, the array will never be smaller
382 */
383 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
384 if (r < 0)
385 goto out;
386
387 meta_data->maximum_hierarchy = hierarchy_number;
388 }
389
390 /* this shouldn't happen, we had this already */
391 if (meta_data->hierarchies[hierarchy_number])
392 goto out;
393
394 h = calloc(1, sizeof(struct cgroup_hierarchy));
395 if (!h)
396 goto out;
397
398 meta_data->hierarchies[hierarchy_number] = h;
399
400 h->index = hierarchy_number;
401 h->subsystems = lxc_string_split_and_trim(colon1, ',');
402 if (!h->subsystems)
403 goto out;
404 /* see if this hierarchy should be considered */
405 if (!all_kernel_subsystems || !all_named_subsystems) {
406 for (p = h->subsystems; *p; p++) {
407 if (!strncmp(*p, "name=", 5)) {
408 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
409 h->used = true;
410 break;
411 }
412 } else {
413 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
414 h->used = true;
415 break;
416 }
417 }
418 }
419 } else {
420 /* we want all hierarchy anyway */
421 h->used = true;
422 }
423 }
424 bret = true;
425
426 out:
427 fclose(proc_self_cgroup);
428 free(line);
429 return bret;
430 }
431
432 /* Step 3: determine all mount points of each hierarchy */
433 static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
434 {
435 bool bret = false;
436 FILE *proc_self_mountinfo;
437 char *line = NULL;
438 size_t sz = 0;
439 char **tokens = NULL;
440 size_t mount_point_count = 0;
441 size_t mount_point_capacity = 0;
442 size_t token_capacity = 0;
443 int r;
444 bool is_cgns = cgns_supported();
445
446 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
447 /* if for some reason (because of setns() and pid namespace for example),
448 * /proc/self is not valid, we try /proc/1/cgroup... */
449 if (!proc_self_mountinfo)
450 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
451 if (!proc_self_mountinfo)
452 return false;
453
454 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
455 char *token, *line_tok, *saveptr = NULL;
456 size_t i, j, k;
457 struct cgroup_mount_point *mount_point;
458 struct cgroup_hierarchy *h;
459 char **subsystems;
460 bool is_lxcfs = false;
461
462 if (line[0] && line[strlen(line) - 1] == '\n')
463 line[strlen(line) - 1] = '\0';
464
465 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
466 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
467 if (r < 0)
468 goto out;
469 tokens[i++] = token;
470 }
471
472 /* layout of /proc/self/mountinfo:
473 * 0: id
474 * 1: parent id
475 * 2: device major:minor
476 * 3: mount prefix
477 * 4: mount point
478 * 5: per-mount options
479 * [optional X]: additional data
480 * X+7: "-"
481 * X+8: type
482 * X+9: source
483 * X+10: per-superblock options
484 */
485 for (j = 6; j < i && tokens[j]; j++)
486 if (!strcmp(tokens[j], "-"))
487 break;
488
489 /* could not find separator */
490 if (j >= i || !tokens[j])
491 continue;
492 /* there should be exactly three fields after
493 * the separator
494 */
495 if (i != j + 4)
496 continue;
497
498 /* not a cgroup filesystem */
499 if (strcmp(tokens[j + 1], "cgroup") != 0) {
500 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
501 continue;
502 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
503 continue;
504 is_lxcfs = true;
505 char *curtok = tokens[4] + 15;
506 subsystems = subsystems_from_mount_options(curtok,
507 kernel_subsystems);
508 } else
509 subsystems = subsystems_from_mount_options(tokens[j + 3],
510 kernel_subsystems);
511 if (!subsystems)
512 goto out;
513
514 h = NULL;
515 for (k = 0; k <= meta_data->maximum_hierarchy; k++) {
516 if (meta_data->hierarchies[k] &&
517 meta_data->hierarchies[k]->subsystems[0] &&
518 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
519 /* TODO: we could also check if the lists really match completely,
520 * just to have an additional sanity check */
521 h = meta_data->hierarchies[k];
522 break;
523 }
524 }
525 lxc_free_array((void **)subsystems, free);
526
527 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
528 if (r < 0)
529 goto out;
530
531 /* create mount point object */
532 mount_point = calloc(1, sizeof(*mount_point));
533 if (!mount_point)
534 goto out;
535
536 meta_data->mount_points[mount_point_count++] = mount_point;
537
538 mount_point->hierarchy = h;
539 if (is_lxcfs || is_cgns)
540 mount_point->mount_prefix = strdup("/");
541 else
542 mount_point->mount_prefix = strdup(tokens[3]);
543 mount_point->mount_point = strdup(tokens[4]);
544 if (!mount_point->mount_point || !mount_point->mount_prefix)
545 goto out;
546 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
547
548 if (!strcmp(mount_point->mount_prefix, "/")) {
549 if (mount_point->read_only) {
550 if (!h->ro_absolute_mount_point)
551 h->ro_absolute_mount_point = mount_point;
552 } else {
553 if (!h->rw_absolute_mount_point)
554 h->rw_absolute_mount_point = mount_point;
555 }
556 }
557
558 k = lxc_array_len((void **)h->all_mount_points);
559 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
560 if (r < 0)
561 goto out;
562 h->all_mount_points[k] = mount_point;
563 }
564 bret = true;
565
566 out:
567 fclose(proc_self_mountinfo);
568 free(tokens);
569 free(line);
570 return bret;
571 }
572
573 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
574 {
575 bool all_kernel_subsystems = true;
576 bool all_named_subsystems = false;
577 struct cgroup_meta_data *meta_data = NULL;
578 char **kernel_subsystems = NULL;
579 int saved_errno = 0;
580
581 /* if the subsystem whitelist is not specified, include all
582 * hierarchies that contain kernel subsystems by default but
583 * no hierarchies that only contain named subsystems
584 *
585 * if it is specified, the specifier @all will select all
586 * hierarchies, @kernel will select all hierarchies with
587 * kernel subsystems and @named will select all named
588 * hierarchies
589 */
590 all_kernel_subsystems = subsystem_whitelist ?
591 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
592 true;
593 all_named_subsystems = subsystem_whitelist ?
594 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
595 true;
596
597 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
598 if (!meta_data)
599 return NULL;
600 meta_data->ref = 1;
601
602 if (!find_cgroup_subsystems(&kernel_subsystems))
603 goto out_error;
604
605 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
606 all_named_subsystems, subsystem_whitelist))
607 goto out_error;
608
609 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
610 goto out_error;
611
612 /* oops, we couldn't find anything */
613 if (!meta_data->hierarchies || !meta_data->mount_points) {
614 errno = EINVAL;
615 goto out_error;
616 }
617
618 lxc_free_array((void **)kernel_subsystems, free);
619 return meta_data;
620
621 out_error:
622 saved_errno = errno;
623 lxc_free_array((void **)kernel_subsystems, free);
624 lxc_cgroup_put_meta(meta_data);
625 errno = saved_errno;
626 return NULL;
627 }
628
629 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
630 {
631 meta_data->ref++;
632 return meta_data;
633 }
634
635 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
636 {
637 size_t i;
638 if (!meta_data)
639 return NULL;
640 if (--meta_data->ref > 0)
641 return meta_data;
642 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
643 if (meta_data->hierarchies)
644 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
645 if (meta_data->hierarchies[i])
646 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
647 free(meta_data->hierarchies);
648 free(meta_data);
649 return NULL;
650 }
651
652 static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
653 {
654 size_t i;
655 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
656 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
657 if (!h)
658 continue;
659 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
660 return h;
661 }
662 return NULL;
663 }
664
665 static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
666 {
667 return mp && access(mp->mount_point, F_OK) == 0;
668 }
669
670 static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
671 {
672 struct cgroup_mount_point **mps;
673 struct cgroup_mount_point *current_result = NULL;
674 ssize_t quality = -1;
675
676 /* trivial case */
677 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
678 return hierarchy->rw_absolute_mount_point;
679 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
680 return hierarchy->ro_absolute_mount_point;
681
682 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
683 struct cgroup_mount_point *mp = *mps;
684 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
685
686 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
687 prefix_len = 0;
688
689 if (!mountpoint_is_accessible(mp))
690 continue;
691
692 if (should_be_writable && mp->read_only)
693 continue;
694
695 if (!prefix_len ||
696 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
697 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
698 /* search for the best quality match, i.e. the match with the
699 * shortest prefix where this group is still contained
700 */
701 if (quality == -1 || prefix_len < quality) {
702 current_result = mp;
703 quality = prefix_len;
704 }
705 }
706 }
707
708 if (!current_result)
709 errno = ENOENT;
710 return current_result;
711 }
712
/*
 * lxc_cgroup_find_abs_path: translate a cgroup of a subsystem into a path
 * in the filesystem
 *
 * @subsystem:          subsystem whose hierarchy is used
 * @group:              path of the cgroup inside the hierarchy
 * @should_be_writable: only consider writable mount points
 * @suffix:             passed on to cgroup_to_absolute_path()
 *
 * The meta data is loaded for this call only and released again before
 * returning.
 *
 * Returns the string built by cgroup_to_absolute_path(), or NULL on
 * failure with errno preserved from the failing step.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta;
	struct cgroup_hierarchy *hier;
	struct cgroup_mount_point *mount = NULL;
	char *abs_path = NULL;
	int err;

	meta = lxc_cgroup_load_meta();
	if (!meta)
		return NULL;

	hier = lxc_cgroup_find_hierarchy(meta, subsystem);
	if (hier)
		mount = lxc_cgroup_find_mount_point(hier, group, should_be_writable);
	if (mount)
		abs_path = cgroup_to_absolute_path(mount, group, suffix);

	if (abs_path) {
		lxc_cgroup_put_meta(meta);
		return abs_path;
	}

	/* releasing the meta data must not hide the reason for the failure */
	err = errno;
	lxc_cgroup_put_meta(meta);
	errno = err;
	return NULL;
}
746
/*
 * lxc_cgroup_process_info_get: read the cgroup membership of process @pid
 * from /proc/<pid>/cgroup
 *
 * Returns whatever lxc_cgroup_process_info_getx() returns for that file.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	char proc_path[32];

	snprintf(proc_path, sizeof(proc_path), "/proc/%lu/cgroup", (unsigned long)pid);

	return lxc_cgroup_process_info_getx(proc_path, meta);
}
753
/*
 * lxc_cgroup_process_info_get_init: read the cgroup membership of the
 * process with pid 1
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
758
/*
 * lxc_cgroup_process_info_get_self: read the cgroup membership of the
 * calling process
 *
 * /proc/self/cgroup is tried first; if that yields nothing, the file is
 * addressed through the pid returned by getpid() instead.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *own;

	own = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (own)
		return own;

	return lxc_cgroup_process_info_get(getpid(), meta);
}
767
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid. Say the hierarchy is
 * mounted at /sys/fs/cgroup, the task is in cgroup lxc, the pid is 2975 and
 * the container name is c1. We want to rename the cgroup directory
 * /sys/fs/cgroup/lxc/2975 to /sys/fs/cgroup/lxc/c1 and return the string
 * lxc/c1.
 *
 * @mountpath: directory the hierarchy is mounted on
 * @oldname:   cgroup that contains the cgroup named after the pid
 * @pid:       pid the existing cgroup is named after
 * @name:      new name of the cgroup
 *
 * If the target directory exists already, an attempt is made to remove it
 * with rmdir() first.
 *
 * Returns a newly allocated "<oldname>/<name>" that the caller has to
 * free, or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath;
	char *newname, *fullnewpath;
	int len, newlen, ret;

	/*
	 * if cgroup is mounted at /cgroup and task is in cgroup ab, pid 2375 and
	 * name is c1,
	 * fulloldpath = /cgroup/ab/2375
	 * fullnewpath = /cgroup/ab/c1
	 * newname = ab/c1
	 */

	/* room for both strings, two slashes, the pid and the terminator;
	 * a result that does not fit is caught by the check below */
	len = strlen(oldname) + strlen(mountpath) + 22;
	fulloldpath = alloca(len);
	/* the pid is passed as unsigned long, so it is printed with %lu */
	ret = snprintf(fulloldpath, len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || ret >= len)
		return NULL;

	len = strlen(oldname) + strlen(name) + 2;
	newname = malloc(len);
	if (!newname) {
		SYSERROR("Out of memory");
		return NULL;
	}
	ret = snprintf(newname, len, "%s/%s", oldname, name);
	if (ret < 0 || ret >= len) {
		free(newname);
		return NULL;
	}

	newlen = strlen(mountpath) + len + 2;
	fullnewpath = alloca(newlen);
	ret = snprintf(fullnewpath, newlen, "%s/%s", mountpath, newname);
	if (ret < 0 || ret >= newlen) {
		free(newname);
		return NULL;
	}

	/* rmdir() fails on a directory that is not empty, which is treated
	 * as a name that is taken */
	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			free(newname);
			return NULL;
		}
	}
	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		free(newname);
		return NULL;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	return newname;
}
835
836 static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
837 {
838 char **p;
839
840 for (p = h->subsystems; *p; p++) {
841 if (is_crucial_cgroup_subsystem(*p))
842 return true;
843 }
844 return false;
845 }
846
847 /* create a new cgroup */
848 static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
849 {
850 char **cgroup_path_components = NULL;
851 char **p = NULL;
852 char *path_so_far = NULL;
853 char **new_cgroup_paths = NULL;
854 char **new_cgroup_paths_sub = NULL;
855 struct cgroup_mount_point *mp;
856 struct cgroup_hierarchy *h;
857 struct cgroup_process_info *base_info = NULL;
858 struct cgroup_process_info *info_ptr;
859 int saved_errno;
860 int r;
861 unsigned suffix = 0;
862 bool had_sub_pattern = false;
863 size_t i;
864
865 if (!is_valid_cgroup(name)) {
866 ERROR("Invalid cgroup name: '%s'", name);
867 errno = EINVAL;
868 return NULL;
869 }
870
871 if (!strstr(path_pattern, "%n")) {
872 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
873 errno = EINVAL;
874 return NULL;
875 }
876
877 /* we will modify the result of this operation directly,
878 * so we don't have to copy the data structure
879 */
880 base_info = (path_pattern[0] == '/') ?
881 lxc_cgroup_process_info_get_init(meta_data) :
882 lxc_cgroup_process_info_get_self(meta_data);
883 if (!base_info)
884 return NULL;
885
886 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
887 if (!new_cgroup_paths)
888 goto out_initial_error;
889
890 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
891 if (!new_cgroup_paths_sub)
892 goto out_initial_error;
893
894 /* find mount points we can use */
895 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
896 h = info_ptr->hierarchy;
897 if (!h)
898 continue;
899 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
900 if (!mp) {
901 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
902 goto out_initial_error;
903 }
904 info_ptr->designated_mount_point = mp;
905
906 if (lxc_string_in_array("ns", (const char **)h->subsystems))
907 continue;
908 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
909 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
910 goto out_initial_error;
911 }
912 }
913
914 /* normalize the path */
915 cgroup_path_components = lxc_normalize_path(path_pattern);
916 if (!cgroup_path_components)
917 goto out_initial_error;
918
919 /* go through the path components to see if we can create them */
920 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
921 /* we only want to create the same component with -1, -2, etc.
922 * if the component contains the container name itself, otherwise
923 * it's not an error if it already exists
924 */
925 char *p_eff = *p ? *p : (char *)sub_pattern;
926 bool contains_name = strstr(p_eff, "%n");
927 char *current_component = NULL;
928 char *current_subpath = NULL;
929 char *current_entire_path = NULL;
930 char *parts[3];
931 size_t j = 0;
932 i = 0;
933
934 /* if we are processing the subpattern, we want to make sure
935 * loop is ended the next time around
936 */
937 if (!*p) {
938 had_sub_pattern = true;
939 p--;
940 }
941
942 goto find_name_on_this_level;
943
944 cleanup_name_on_this_level:
945 /* This is reached if we found a name clash.
946 * In that case, remove the cgroup from all previous hierarchies
947 */
948 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
949 if (info_ptr->created_paths_count < 1)
950 continue;
951 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
952 if (r < 0)
953 WARN("could not clean up cgroup we created when trying to create container");
954 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
955 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
956 }
957 if (current_component != current_subpath)
958 free(current_subpath);
959 if (current_component != p_eff)
960 free(current_component);
961 current_component = current_subpath = NULL;
962 /* try again with another suffix */
963 ++suffix;
964
965 find_name_on_this_level:
966 /* determine name of the path component we should create */
967 if (contains_name && suffix > 0) {
968 char *buf = calloc(strlen(name) + 32, 1);
969 if (!buf)
970 goto out_initial_error;
971 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
972 current_component = lxc_string_replace("%n", buf, p_eff);
973 free(buf);
974 } else {
975 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
976 }
977 parts[0] = path_so_far;
978 parts[1] = current_component;
979 parts[2] = NULL;
980 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
981
982 /* Now go through each hierarchy and try to create the
983 * corresponding cgroup
984 */
985 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
986 char *parts2[3];
987
988 if (!info_ptr->hierarchy)
989 continue;
990
991 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
992 continue;
993 current_entire_path = NULL;
994
995 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
996 parts2[1] = current_subpath;
997 parts2[2] = NULL;
998 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
999
1000 if (!*p) {
1001 /* we are processing the subpath, so only update that one */
1002 free(new_cgroup_paths_sub[i]);
1003 new_cgroup_paths_sub[i] = strdup(current_entire_path);
1004 if (!new_cgroup_paths_sub[i])
1005 goto cleanup_from_error;
1006 } else {
1007 /* remember which path was used on this controller */
1008 free(new_cgroup_paths[i]);
1009 new_cgroup_paths[i] = strdup(current_entire_path);
1010 if (!new_cgroup_paths[i])
1011 goto cleanup_from_error;
1012 }
1013
1014 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1015 if (r < 0 && errno == EEXIST && contains_name) {
1016 /* name clash => try new name with new suffix */
1017 free(current_entire_path);
1018 current_entire_path = NULL;
1019 goto cleanup_name_on_this_level;
1020 } else if (r < 0 && errno != EEXIST) {
1021 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1022 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1023 goto cleanup_from_error;
1024 }
1025 goto skip;
1026 } else if (r == 0) {
1027 /* successfully created */
1028 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1029 if (r < 0)
1030 goto cleanup_from_error;
1031 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
1032 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1033 goto cleanup_from_error;
1034 }
1035 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1036 } else {
1037 /* if we didn't create the cgroup, then we have to make sure that
1038 * further cgroups will be created properly
1039 */
1040 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
1041 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
1042 goto cleanup_from_error;
1043 }
1044 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1045 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1046 goto cleanup_from_error;
1047 }
1048
1049 skip:
1050 /* already existed but path component of pattern didn't contain '%n',
1051 * so this is not an error; but then we don't need current_entire_path
1052 * anymore...
1053 */
1054 free(current_entire_path);
1055 current_entire_path = NULL;
1056 }
1057 }
1058
1059 /* save path so far */
1060 free(path_so_far);
1061 path_so_far = strdup(current_subpath);
1062 if (!path_so_far)
1063 goto cleanup_from_error;
1064
1065 /* cleanup */
1066 if (current_component != current_subpath)
1067 free(current_subpath);
1068 if (current_component != p_eff)
1069 free(current_component);
1070 current_component = current_subpath = NULL;
1071 continue;
1072
1073 cleanup_from_error:
1074 /* called if an error occurred in the loop, so we
1075 * do some additional cleanup here
1076 */
1077 saved_errno = errno;
1078 if (current_component != current_subpath)
1079 free(current_subpath);
1080 if (current_component != p_eff)
1081 free(current_component);
1082 free(current_entire_path);
1083 errno = saved_errno;
1084 goto out_initial_error;
1085 }
1086
1087 /* we're done, now update the paths */
1088 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
1089 if (!info_ptr->hierarchy)
1090 continue;
1091 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1092 * will take care of it
1093 * Since we do a continue in above loop, new_cgroup_paths[i] is
1094 * unset anyway, as is new_cgroup_paths_sub[i]
1095 */
1096 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1097 continue;
1098 free(info_ptr->cgroup_path);
1099 info_ptr->cgroup_path = new_cgroup_paths[i];
1100 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
1101 }
1102 /* don't use lxc_free_array since we used the array members
1103 * to store them in our result...
1104 */
1105 free(new_cgroup_paths);
1106 free(new_cgroup_paths_sub);
1107 free(path_so_far);
1108 lxc_free_array((void **)cgroup_path_components, free);
1109 return base_info;
1110
1111 out_initial_error:
1112 saved_errno = errno;
1113 free(path_so_far);
1114 lxc_cgroup_process_info_free_and_remove(base_info, NULL);
1115 lxc_free_array((void **)new_cgroup_paths, free);
1116 lxc_free_array((void **)new_cgroup_paths_sub, free);
1117 lxc_free_array((void **)cgroup_path_components, free);
1118 errno = saved_errno;
1119 return NULL;
1120 }
1121
1122 static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
1123 {
1124 struct cgroup_process_info *info_ptr;
1125 int r;
1126
1127 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1128 if (!info_ptr->hierarchy)
1129 continue;
1130
1131 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1132 continue;
1133 /*
1134 * For any path which has ns cgroup mounted, handler->pid is already
1135 * moved into a container called '%d % (handler->pid)'. Rename it to
1136 * the cgroup name and record that.
1137 */
1138 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1139 info_ptr->cgroup_path, pid, name);
1140 if (!tmp)
1141 return -1;
1142 free(info_ptr->cgroup_path);
1143 info_ptr->cgroup_path = tmp;
1144 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1145 if (r < 0)
1146 return -1;
1147 tmp = strdup(tmp);
1148 if (!tmp)
1149 return -1;
1150 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1151 }
1152 return 0;
1153 }
1154
1155 /* get the cgroup membership of a given container */
1156 static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
1157 {
1158 struct cgroup_process_info *result = NULL;
1159 int saved_errno = 0;
1160 size_t i;
1161 struct cgroup_process_info **cptr = &result;
1162 struct cgroup_process_info *entry = NULL;
1163 char *path = NULL;
1164
1165 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1166 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1167 if (!h || !h->used)
1168 continue;
1169
1170 /* use the command interface to look for the cgroup */
1171 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
1172 if (!path) {
1173 h->used = false;
1174 continue;
1175 }
1176
1177 entry = calloc(1, sizeof(struct cgroup_process_info));
1178 if (!entry)
1179 goto out_error;
1180 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1181 entry->hierarchy = h;
1182 entry->cgroup_path = path;
1183 path = NULL;
1184
1185 /* it is not an error if we don't find anything here,
1186 * it is up to the caller to decide what to do in that
1187 * case */
1188 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1189
1190 *cptr = entry;
1191 cptr = &entry->next;
1192 entry = NULL;
1193 }
1194
1195 return result;
1196 out_error:
1197 saved_errno = errno;
1198 free(path);
1199 lxc_cgroup_process_info_free(result);
1200 lxc_cgroup_process_info_free(entry);
1201 errno = saved_errno;
1202 return NULL;
1203 }
1204
1205 /* move a processs to the cgroups specified by the membership */
1206 static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
1207 {
1208 char pid_buf[32];
1209 char *cgroup_tasks_fn;
1210 int r;
1211 struct cgroup_process_info *info_ptr;
1212
1213 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1214 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1215 if (!info_ptr->hierarchy)
1216 continue;
1217
1218 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1219 info_ptr->cgroup_path_sub :
1220 info_ptr->cgroup_path;
1221
1222 if (!info_ptr->designated_mount_point) {
1223 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1224 if (!info_ptr->designated_mount_point) {
1225 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1226 return -1;
1227 }
1228 }
1229
1230 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1231 if (!cgroup_tasks_fn) {
1232 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1233 return -1;
1234 }
1235
1236 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
1237 free(cgroup_tasks_fn);
1238 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
1239 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1240 return -1;
1241 }
1242 }
1243
1244 return 0;
1245 }
1246
1247 /* free process membership information */
1248 void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
1249 {
1250 struct cgroup_process_info *next;
1251 if (!info)
1252 return;
1253 next = info->next;
1254 lxc_cgroup_put_meta(info->meta_ref);
1255 free(info->cgroup_path);
1256 free(info->cgroup_path_sub);
1257 lxc_free_array((void **)info->created_paths, free);
1258 free(info);
1259 lxc_cgroup_process_info_free(next);
1260 }
1261
1262 /* free process membership information and remove cgroups that were created */
1263 void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
1264 {
1265 struct cgroup_process_info *next;
1266 char **pp;
1267 if (!info)
1268 return;
1269 next = info->next;
1270 {
1271 struct cgroup_mount_point *mp = info->designated_mount_point;
1272 if (!mp)
1273 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1274 if (mp)
1275 /* ignore return value here, perhaps we created the
1276 * '/lxc' cgroup in this container but another container
1277 * is still running (for example)
1278 */
1279 (void)remove_cgroup(mp, info->cgroup_path, true, conf);
1280 }
1281 for (pp = info->created_paths; pp && *pp; pp++);
1282 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
1283 free(*pp);
1284 }
1285 free(info->created_paths);
1286 lxc_cgroup_put_meta(info->meta_ref);
1287 free(info->cgroup_path);
1288 free(info->cgroup_path_sub);
1289 free(info);
1290 lxc_cgroup_process_info_free_and_remove(next, conf);
1291 }
1292
1293 static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
1294 {
1295 struct cgroup_process_info *info = d->info;
1296 info = find_info_for_subsystem(info, subsystem);
1297 if (!info)
1298 return NULL;
1299 prune_init_scope(info->cgroup_path);
1300 return info->cgroup_path;
1301 }
1302
1303 static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
1304 {
1305 struct cgroup_process_info *info = d->info;
1306 struct cgroup_mount_point *mp = NULL;
1307
1308 info = find_info_for_subsystem(info, subsystem);
1309 if (!info)
1310 return NULL;
1311 if (info->designated_mount_point) {
1312 mp = info->designated_mount_point;
1313 } else {
1314 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1315 if (!mp)
1316 return NULL;
1317 }
1318 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1319 }
1320
1321 static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
1322 {
1323 struct cgroup_meta_data *meta;
1324 struct cgroup_process_info *base_info, *info;
1325 struct cgroup_mount_point *mp;
1326 char *result = NULL;
1327
1328 meta = lxc_cgroup_load_meta();
1329 if (!meta)
1330 return NULL;
1331 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1332 if (!base_info)
1333 goto out1;
1334 info = find_info_for_subsystem(base_info, subsystem);
1335 if (!info)
1336 goto out2;
1337 if (info->designated_mount_point) {
1338 mp = info->designated_mount_point;
1339 } else {
1340 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1341 if (!mp)
1342 goto out3;
1343 }
1344 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1345 out3:
1346 out2:
1347 lxc_cgroup_process_info_free(base_info);
1348 out1:
1349 lxc_cgroup_put_meta(meta);
1350 return result;
1351 }
1352
/*
 * Write value to the cgroup file filename (e.g. "memory.limit_in_bytes") of
 * the cgroup recorded in d.
 *
 * The subsystem is taken to be the part of filename before the first '.'.
 * errno is preset to ENOENT before the cgroup directory is looked up, and
 * the errno left by do_cgroup_set() is preserved across the cleanup.
 *
 * Returns the result of do_cgroup_set(), or -1 if the cgroup directory could
 * not be determined.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t name_len = strlen(filename);
	char *controller = alloca(name_len + 1);
	char *dot;
	char *dir;
	int rc;
	int saved_errno;

	/* controller = filename up to (not including) the first '.' */
	memcpy(controller, filename, name_len + 1);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	errno = ENOENT;
	dir = lxc_cgroup_get_hierarchy_abs_path_data(controller, d);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	saved_errno = errno;
	free(dir);
	errno = saved_errno;

	return rc;
}
1373
/*
 * Write value to the cgroup file filename of the container identified by
 * name and lxcpath.
 *
 * The subsystem is taken to be the part of filename before the first '.'.
 * Returns the result of do_cgroup_set(), or -1 if the container's cgroup
 * directory for that subsystem could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t name_len = strlen(filename);
	char *controller = alloca(name_len + 1);
	char *dot;
	char *dir;
	int rc;

	/* controller = filename up to (not including) the first '.' */
	memcpy(controller, filename, name_len + 1);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_set(dir, filename, value);
	free(dir);
	return rc;
}
1391
/*
 * Read the cgroup file filename of the container identified by name and
 * lxcpath into value (at most len bytes, as handled by do_cgroup_get()).
 *
 * The subsystem is taken to be the part of filename before the first '.'.
 * Returns the result of do_cgroup_get(), or -1 if the container's cgroup
 * directory for that subsystem could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t name_len = strlen(filename);
	char *controller = alloca(name_len + 1);
	char *dot;
	char *dir;
	int rc;

	/* controller = filename up to (not including) the first '.' */
	memcpy(controller, filename, name_len + 1);
	dot = strchr(controller, '.');
	if (dot)
		*dot = '\0';

	dir = lxc_cgroup_get_hierarchy_abs_path(controller, name, lxcpath);
	if (!dir)
		return -1;

	rc = do_cgroup_get(dir, filename, value, len);
	free(dir);
	return rc;
}
1409
1410 static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
1411 {
1412 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1413 char *path = NULL;
1414 char **parts = NULL;
1415 char *dirname = NULL;
1416 char *abs_path = NULL;
1417 char *abs_path2 = NULL;
1418 struct cgfs_data *cgfs_d;
1419 struct cgroup_process_info *info, *base_info;
1420 int r, saved_errno = 0;
1421
1422 if (cgns_supported())
1423 return true;
1424
1425 cgfs_d = hdata;
1426 if (!cgfs_d)
1427 return false;
1428 base_info = cgfs_d->info;
1429
1430 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1431 * have access to the lxc_conf object at this point. It really should be up
1432 * to the caller to fix this, but this doesn't really hurt.
1433 */
1434 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1435 type = LXC_AUTO_CGROUP_FULL_MIXED;
1436 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1437 type = LXC_AUTO_CGROUP_MIXED;
1438
1439 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1440 ERROR("could not mount cgroups into container: invalid type specified internally");
1441 errno = EINVAL;
1442 return false;
1443 }
1444
1445 path = calloc(1, bufsz);
1446 if (!path)
1447 return false;
1448 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
1449 r = safe_mount("cgroup_root", path, "tmpfs",
1450 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1451 "size=10240k,mode=755",
1452 root);
1453 if (r < 0) {
1454 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
1455 return false;
1456 }
1457
1458 /* now mount all the hierarchies we care about */
1459 for (info = base_info; info; info = info->next) {
1460 size_t subsystem_count, i;
1461 struct cgroup_mount_point *mp = info->designated_mount_point;
1462
1463 if (!info->hierarchy)
1464 continue;
1465
1466 if (!mountpoint_is_accessible(mp))
1467 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1468
1469 if (!mp) {
1470 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1471 goto out_error;
1472 }
1473
1474 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1475 parts = calloc(subsystem_count + 1, sizeof(char *));
1476 if (!parts)
1477 goto out_error;
1478
1479 for (i = 0; i < subsystem_count; i++) {
1480 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1481 parts[i] = info->hierarchy->subsystems[i] + 5;
1482 else
1483 parts[i] = info->hierarchy->subsystems[i];
1484 }
1485 dirname = lxc_string_join(",", (const char **)parts, false);
1486 if (!dirname)
1487 goto out_error;
1488
1489 /* create subsystem directory */
1490 abs_path = lxc_append_paths(path, dirname);
1491 if (!abs_path)
1492 goto out_error;
1493 r = mkdir_p(abs_path, 0755);
1494 if (r < 0 && errno != EEXIST) {
1495 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1496 goto out_error;
1497 }
1498
1499 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1500 if (!abs_path2)
1501 goto out_error;
1502
1503 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1504 /* bind-mount the cgroup entire filesystem there */
1505 if (strcmp(mp->mount_prefix, "/") != 0) {
1506 /* FIXME: maybe we should just try to remount the entire hierarchy
1507 * with a regular mount command? may that works? */
1508 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1509 goto out_error;
1510 }
1511 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1512 if (r < 0) {
1513 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1514 goto out_error;
1515 }
1516 /* main cgroup path should be read-only */
1517 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1518 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1519 if (r < 0) {
1520 SYSERROR("error re-mounting %s readonly", abs_path);
1521 goto out_error;
1522 }
1523 }
1524 /* own cgroup should be read-write */
1525 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1526 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1527 if (r < 0) {
1528 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1529 goto out_error;
1530 }
1531 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1532 if (r < 0) {
1533 SYSERROR("error re-mounting %s readwrite", abs_path2);
1534 goto out_error;
1535 }
1536 }
1537 } else {
1538 /* create path for container's cgroup */
1539 r = mkdir_p(abs_path2, 0755);
1540 if (r < 0 && errno != EEXIST) {
1541 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1542 goto out_error;
1543 }
1544
1545 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1546 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1547 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1548 * read-write (see comment below)
1549 */
1550 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1551 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1552 if (r < 0) {
1553 SYSERROR("error bind-mounting %s onto itself", abs_path);
1554 goto out_error;
1555 }
1556 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1557 if (r < 0) {
1558 SYSERROR("error re-mounting %s readonly", abs_path);
1559 goto out_error;
1560 }
1561 }
1562
1563 free(abs_path);
1564 abs_path = NULL;
1565
1566 /* bind-mount container's cgroup to that directory */
1567 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1568 if (!abs_path)
1569 goto out_error;
1570 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
1571 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
1572 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1573 goto out_error;
1574 }
1575 if (type == LXC_AUTO_CGROUP_RO) {
1576 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1577 if (r < 0) {
1578 SYSERROR("error re-mounting %s readonly", abs_path2);
1579 goto out_error;
1580 }
1581 }
1582 }
1583
1584 free(abs_path);
1585 free(abs_path2);
1586 abs_path = NULL;
1587 abs_path2 = NULL;
1588
1589 /* add symlinks for every single subsystem */
1590 if (subsystem_count > 1) {
1591 for (i = 0; i < subsystem_count; i++) {
1592 abs_path = lxc_append_paths(path, parts[i]);
1593 if (!abs_path)
1594 goto out_error;
1595 r = symlink(dirname, abs_path);
1596 if (r < 0)
1597 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1598 free(abs_path);
1599 abs_path = NULL;
1600 }
1601 }
1602 free(dirname);
1603 free(parts);
1604 dirname = NULL;
1605 parts = NULL;
1606 }
1607
1608 /* We used to remount the entire tmpfs readonly if any :ro or
1609 * :mixed mode was specified. However, Ubuntu's mountall has the
1610 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1611 * mounted read-only and cannot be remounted read-write.
1612 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1613 * these if they are not already mounted with the right options;
1614 * it contains an entry for /sys/fs/cgroup. In case it can't do
1615 * that, it prompts for the user to either manually fix it or
1616 * boot anyway. But without user input, booting of the container
1617 * hangs.)
1618 *
1619 * Instead of remounting the entire tmpfs readonly, we only
1620 * remount the paths readonly that are part of the cgroup
1621 * hierarchy.
1622 */
1623
1624 free(path);
1625
1626 return true;
1627
1628 out_error:
1629 saved_errno = errno;
1630 free(path);
1631 free(dirname);
1632 free(parts);
1633 free(abs_path);
1634 free(abs_path2);
1635 errno = saved_errno;
1636 return false;
1637 }
1638
1639 static int cgfs_nrtasks(void *hdata)
1640 {
1641 struct cgfs_data *d = hdata;
1642 struct cgroup_process_info *info;
1643 struct cgroup_mount_point *mp = NULL;
1644 char *abs_path = NULL;
1645 int ret;
1646
1647 if (!d) {
1648 errno = ENOENT;
1649 return -1;
1650 }
1651
1652 info = d->info;
1653 if (!info) {
1654 errno = ENOENT;
1655 return -1;
1656 }
1657
1658 if (info->designated_mount_point) {
1659 mp = info->designated_mount_point;
1660 } else {
1661 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1662 if (!mp)
1663 return -1;
1664 }
1665
1666 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1667 if (!abs_path)
1668 return -1;
1669
1670 ret = cgroup_recursive_task_count(abs_path);
1671 free(abs_path);
1672 return ret;
1673 }
1674
1675 static struct cgroup_process_info *
1676 lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1677 struct cgroup_meta_data *meta)
1678 {
1679 struct cgroup_process_info *result = NULL;
1680 FILE *proc_pid_cgroup = NULL;
1681 char *line = NULL;
1682 size_t sz = 0;
1683 int saved_errno = 0;
1684 struct cgroup_process_info **cptr = &result;
1685 struct cgroup_process_info *entry = NULL;
1686
1687 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1688 if (!proc_pid_cgroup)
1689 return NULL;
1690
1691 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1692 /* file format: hierarchy:subsystems:group */
1693 char *colon1;
1694 char *colon2;
1695 char *endptr;
1696 int hierarchy_number;
1697 struct cgroup_hierarchy *h = NULL;
1698
1699 if (!line[0])
1700 continue;
1701
1702 if (line[strlen(line) - 1] == '\n')
1703 line[strlen(line) - 1] = '\0';
1704
1705 colon1 = strchr(line, ':');
1706 if (!colon1)
1707 continue;
1708 *colon1++ = '\0';
1709 colon2 = strchr(colon1, ':');
1710 if (!colon2)
1711 continue;
1712 *colon2++ = '\0';
1713
1714 endptr = NULL;
1715
1716 /* With cgroupv2 /proc/self/cgroup can contain entries of the
1717 * form: 0::/
1718 * These entries need to be skipped.
1719 */
1720 if (!strcmp(colon1, ""))
1721 continue;
1722
1723 hierarchy_number = strtoul(line, &endptr, 10);
1724 if (!endptr || *endptr)
1725 continue;
1726
1727 if (hierarchy_number > meta->maximum_hierarchy) {
1728 /* we encountered a hierarchy we didn't have before,
1729 * so probably somebody remounted some stuff in the
1730 * mean time...
1731 */
1732 errno = EAGAIN;
1733 goto out_error;
1734 }
1735
1736 h = meta->hierarchies[hierarchy_number];
1737 if (!h) {
1738 /* we encountered a hierarchy that was thought to be
1739 * dead before, so probably somebody remounted some
1740 * stuff in the mean time...
1741 */
1742 errno = EAGAIN;
1743 goto out_error;
1744 }
1745
1746 /* we are told that we should ignore this hierarchy */
1747 if (!h->used)
1748 continue;
1749
1750 entry = calloc(1, sizeof(struct cgroup_process_info));
1751 if (!entry)
1752 goto out_error;
1753
1754 entry->meta_ref = lxc_cgroup_get_meta(meta);
1755 entry->hierarchy = h;
1756 entry->cgroup_path = strdup(colon2);
1757 if (!entry->cgroup_path)
1758 goto out_error;
1759 prune_init_scope(entry->cgroup_path);
1760
1761 *cptr = entry;
1762 cptr = &entry->next;
1763 entry = NULL;
1764 }
1765
1766 fclose(proc_pid_cgroup);
1767 free(line);
1768 return result;
1769
1770 out_error:
1771 saved_errno = errno;
1772 if (proc_pid_cgroup)
1773 fclose(proc_pid_cgroup);
1774 lxc_cgroup_process_info_free(result);
1775 lxc_cgroup_process_info_free(entry);
1776 free(line);
1777 errno = saved_errno;
1778 return NULL;
1779 }
1780
/*
 * Turn a comma separated mount option string into a NULL-terminated array
 * of subsystem names.
 *
 * Every token that already starts with "name=" or that is contained in
 * kernel_list is copied verbatim; every other token is stored with a
 * "name=" prefix (e.g. 'systemd' becomes 'name=systemd').
 *
 * Returns a newly allocated array the caller must free (elements included),
 * or NULL on allocation failure (errno preserved). NULL is also returned
 * when mount_options contains no token at all.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	size_t opts_size = strlen(mount_options) + 1;
	char *scratch = alloca(opts_size);
	char *tok;
	char *state = NULL;
	char **list = NULL;
	size_t capacity = 0;
	size_t count = 0;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a private copy */
	memcpy(scratch, mount_options, opts_size);

	tok = strtok_r(scratch, ",", &state);
	while (tok) {
		if (lxc_grow_array((void ***)&list, &capacity, count + 1, 12) < 0)
			goto fail;
		list[count + 1] = NULL;

		if (!strncmp(tok, "name=", 5) || lxc_string_in_array(tok, (const char **)kernel_list)) {
			list[count] = strdup(tok);
		} else {
			/* this is eg 'systemd' but the mount will be 'name=systemd' */
			size_t prefixed_size = strlen(tok) + 6;

			list[count] = malloc(prefixed_size);
			if (list[count])
				snprintf(list[count], prefixed_size, "name=%s", tok);
		}
		if (!list[count])
			goto fail;
		count++;

		tok = strtok_r(NULL, ",", &state);
	}

	return list;

fail:
	saved_errno = errno;
	lxc_free_array((void **)list, free);
	errno = saved_errno;
	return NULL;
}
1822
1823 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
1824 {
1825 if (!mp)
1826 return;
1827 free(mp->mount_point);
1828 free(mp->mount_prefix);
1829 free(mp);
1830 }
1831
1832 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
1833 {
1834 if (!h)
1835 return;
1836 if (h->subsystems) {
1837 lxc_free_array((void **)h->subsystems, free);
1838 h->subsystems = NULL;
1839 }
1840 if (h->all_mount_points) {
1841 free(h->all_mount_points);
1842 h->all_mount_points = NULL;
1843 }
1844 free(h);
1845 h = NULL;
1846 }
1847
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * Only ASCII characters in the range 33..126 are accepted: SPACE (32) is
 * rejected because it would break legacy lxc-ls, and '/' is rejected as
 * well. The special names "." and ".." are never valid.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t i;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (i = 0; name[i] != '\0'; i++) {
		char c = name[i];
		bool acceptable = c > 32 && c < 127 && c != '/';

		if (!acceptable)
			return false;
	}

	return true;
}
1861
1862 static int create_or_remove_cgroup(bool do_remove,
1863 struct cgroup_mount_point *mp, const char *path, int recurse,
1864 struct lxc_conf *conf)
1865 {
1866 int r, saved_errno = 0;
1867 char *buf = cgroup_to_absolute_path(mp, path, NULL);
1868 if (!buf)
1869 return -1;
1870
1871 /* create or remove directory */
1872 if (do_remove) {
1873 if (!dir_exists(buf))
1874 return 0;
1875 if (recurse) {
1876 if (conf && !lxc_list_empty(&conf->id_map))
1877 r = userns_exec_1(conf, rmdir_wrapper, buf,
1878 "rmdir_wrapper");
1879 else
1880 r = cgroup_rmdir(buf);
1881 } else
1882 r = rmdir(buf);
1883 } else
1884 r = mkdir_p(buf, 0777);
1885 saved_errno = errno;
1886 free(buf);
1887 errno = saved_errno;
1888 return r;
1889 }
1890
/*
 * Create the cgroup directory 'path' below mount point mp.
 * Returns the result of create_or_remove_cgroup().
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool removing = false;

	return create_or_remove_cgroup(removing, mp, path, false, NULL);
}
1895
/*
 * Remove the cgroup directory 'path' below mount point mp, recursively if
 * recurse is set. conf is passed through unchanged.
 * Returns the result of create_or_remove_cgroup().
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool removing = true;

	return create_or_remove_cgroup(removing, mp, path, recurse, conf);
}
1901
1902 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1903 const char *path, const char *suffix)
1904 {
1905 /* first we have to make sure we subtract the mount point's prefix */
1906 char *prefix = mp->mount_prefix;
1907 char *buf;
1908 ssize_t len, rv;
1909
1910 /* we want to make sure only absolute paths to cgroups are passed to us */
1911 if (path[0] != '/') {
1912 errno = EINVAL;
1913 return NULL;
1914 }
1915
1916 if (prefix && !strcmp(prefix, "/"))
1917 prefix = NULL;
1918
1919 /* prefix doesn't match */
1920 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1921 errno = EINVAL;
1922 return NULL;
1923 }
1924 /* if prefix is /foo and path is /foobar */
1925 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1926 errno = EINVAL;
1927 return NULL;
1928 }
1929
1930 /* remove prefix from path */
1931 path += prefix ? strlen(prefix) : 0;
1932
1933 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1934 buf = calloc(len + 1, 1);
1935 if (!buf)
1936 return NULL;
1937 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
1938 if (rv > len) {
1939 free(buf);
1940 errno = ENOMEM;
1941 return NULL;
1942 }
1943
1944 return buf;
1945 }
1946
1947 static struct cgroup_process_info *
1948 find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
1949 {
1950 struct cgroup_process_info *info_ptr;
1951 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1952 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1953 if (!h)
1954 continue;
1955 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1956 return info_ptr;
1957 }
1958 errno = ENOENT;
1959 return NULL;
1960 }
1961
/*
 * Read the file sub_filename inside the cgroup directory cgroup_path into
 * value (buffer of len bytes) using lxc_read_from_file().
 *
 * Returns the result of lxc_read_from_file(), or -1 if the file name cannot
 * be built. errno from the read is preserved across the cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *pieces[3];
	char *file;
	int rc, saved_errno;

	pieces[0] = cgroup_path;
	pieces[1] = sub_filename;
	pieces[2] = NULL;

	file = lxc_string_join("/", pieces, false);
	if (!file)
		return -1;

	rc = lxc_read_from_file(file, value, len);
	saved_errno = errno;
	free(file);
	errno = saved_errno;

	return rc;
}
1983
/*
 * Write the string value to the file sub_filename inside the cgroup
 * directory cgroup_path using lxc_write_to_file().
 *
 * Returns the result of lxc_write_to_file(), or -1 if the file name cannot
 * be built. errno from the write is preserved across the cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *pieces[3];
	char *file;
	int rc, saved_errno;

	pieces[0] = cgroup_path;
	pieces[1] = sub_filename;
	pieces[2] = NULL;

	file = lxc_string_join("/", pieces, false);
	if (!file)
		return -1;

	rc = lxc_write_to_file(file, value, strlen(value), false);
	saved_errno = errno;
	free(file);
	errno = saved_errno;

	return rc;
}
2005
2006 static int do_setup_cgroup_limits(struct cgfs_data *d,
2007 struct lxc_list *cgroup_settings, bool do_devices)
2008 {
2009 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2010 struct lxc_cgroup *cg;
2011 int ret = -1;
2012
2013 if (lxc_list_empty(cgroup_settings))
2014 return 0;
2015
2016 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2017 if (!sorted_cgroup_settings) {
2018 return -1;
2019 }
2020
2021 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2022 cg = iterator->elem;
2023
2024 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2025 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
2026 cgroup_devices_has_allow_or_deny(d, cg->value, false))
2027 continue;
2028 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
2029 cgroup_devices_has_allow_or_deny(d, cg->value, true))
2030 continue;
2031 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2032 if (do_devices && (errno == EACCES || errno == EPERM)) {
2033 WARN("Error setting %s to %s for %s",
2034 cg->subsystem, cg->value, d->name);
2035 continue;
2036 }
2037 SYSERROR("Error setting %s to %s for %s",
2038 cg->subsystem, cg->value, d->name);
2039 goto out;
2040 }
2041 }
2042
2043 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
2044 }
2045
2046 ret = 0;
2047 INFO("cgroup has been setup");
2048 out:
2049 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2050 lxc_list_del(iterator);
2051 free(iterator);
2052 }
2053 free(sorted_cgroup_settings);
2054 return ret;
2055 }
2056
/*
 * Check the container's devices.list to find out whether a devices.allow
 * (for_allow == true) or devices.deny (for_allow == false) entry v is
 * already in effect, so that writing it again can be skipped.
 *
 * for_allow == true:  returns true if the list contains "a *:* rwm" or a
 *                     line equal to v.
 * for_allow == false: only the "deny everything" values "a" and
 *                     "a *:* rwm" are considered; returns true if the list
 *                     does NOT contain "a *:* rwm".
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *dir;
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	/* XXX FIXME if users could use something other than 'lxc.devices.deny = a'.
	 * not sure they ever do, but they *could*
	 * right now, I'm assuming they do NOT
	 */
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	dir = lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!dir)
		return false;
	parts[0] = dir;
	path = lxc_string_join("/", parts, false);
	/* the directory string is only needed to build the file name */
	free(dir);
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	free(path);
	if (!devices_list)
		return false;

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len-1] == '\n')
			line[len-1] = '\0';
		if (strcmp(line, "a *:* rwm") == 0) {
			/* everything is allowed */
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	return ret;
}
2111
/*
 * Count the tasks in a cgroup directory and all of its sub-directories by
 * summing the line counts of every "tasks" file found below @cgroup_path.
 *
 * Returns the total; 0 if @cgroup_path cannot be opened; -1 if building a
 * child path or stat()ing a directory entry fails.  Negative results from
 * sub-directories or from counting lines are ignored.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	while ((entry = readdir(dir)) != NULL) {
		const char *parts[3] = {
			cgroup_path,
			entry->d_name,
			NULL
		};
		char *child;
		struct stat sb;
		int count = -1;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		child = lxc_string_join("/", parts, false);
		if (!child) {
			closedir(dir);
			return -1;
		}

		if (stat(child, &sb) < 0) {
			closedir(dir);
			free(child);
			return -1;
		}

		/* descend into child cgroups, count lines of "tasks" files */
		if (S_ISDIR(sb.st_mode))
			count = cgroup_recursive_task_count(child);
		else if (strcmp(entry->d_name, "tasks") == 0)
			count = lxc_count_file_lines(child);

		if (count >= 0)
			total += count;

		free(child);
	}

	closedir(dir);
	return total;
}
2159
2160 static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2161 char *cgroup_path)
2162 {
2163 int r, saved_errno = 0;
2164 char buf[2];
2165
2166 mp->need_cpuset_init = false;
2167
2168 /* If this is the memory cgroup, we want to enforce hierarchy.
2169 * But don't fail if for some reason we can't.
2170 */
2171 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2172 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2173 if (cc_path) {
2174 r = lxc_read_from_file(cc_path, buf, 1);
2175 if (r < 1 || buf[0] != '1') {
2176 r = lxc_write_to_file(cc_path, "1", 1, false);
2177 if (r < 0)
2178 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2179 }
2180 free(cc_path);
2181 }
2182 }
2183
2184 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2185 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2186 * and cpuset.cpus and then
2187 */
2188 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2189 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
2190 struct stat sb;
2191
2192 if (!cc_path)
2193 return -1;
2194 /* cgroup.clone_children is not available when running under
2195 * older kernel versions; in this case, we'll initialize
2196 * cpuset.cpus and cpuset.mems later, after the new cgroup
2197 * was created
2198 */
2199 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
2200 mp->need_cpuset_init = true;
2201 free(cc_path);
2202 return 0;
2203 }
2204 r = lxc_read_from_file(cc_path, buf, 1);
2205 if (r == 1 && buf[0] == '1') {
2206 free(cc_path);
2207 return 0;
2208 }
2209 r = lxc_write_to_file(cc_path, "1", 1, false);
2210 saved_errno = errno;
2211 free(cc_path);
2212 errno = saved_errno;
2213 return r < 0 ? -1 : 0;
2214 }
2215 return 0;
2216 }
2217
/*
 * Read @fn into @buf via lxc_read_from_file() and NUL-terminate the result.
 *
 * When the read fills the whole buffer, the last byte is overwritten with
 * the terminator and the unmodified byte count (== @bufsize) is returned, so
 * callers can detect the truncation.
 *
 * Returns the number of bytes read, or a negative value on error (including
 * the unexpected case of @bufsize being 0).
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread;

	nread = lxc_read_from_file(fn, buf, bufsize);
	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	/* common case: room is left for the terminator */
	if (nread != bufsize) {
		buf[nread] = '\0';
		return nread;
	}

	/* Callers don't do this, but regression/sanity check */
	if (bufsize == 0) {
		ERROR("%s: was not expecting 0 bufsize", __func__);
		return -1;
	}

	/* buffer completely filled: sacrifice the last byte */
	buf[bufsize - 1] = '\0';
	return nread;
}
2238
/*
 * Initialize the cpuset file @name (e.g. "/cpuset.cpus") of cgroup @path
 * with the value found in the same file of the parent cgroup.
 *
 * A value already present in the child's file is left untouched.
 *
 * Returns true if the file already had a value or was written successfully,
 * false otherwise.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char contents[1024];
	char *child, *parent_dir = NULL, *parent = NULL, *slash;
	int nread;
	bool ok = false;

	child = cgroup_to_absolute_path(mp, path, name);
	if (!child)
		return false;

	/* don't overwrite a non-empty value in the file */
	if (cgroup_read_from_file(child, contents, sizeof(contents)) < 0)
		goto out;
	if (contents[0] != '\0' && contents[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting off the last path component */
	parent_dir = strdup(path);
	if (!parent_dir)
		goto out;

	slash = strrchr(parent_dir, '/');
	if (!slash)
		goto out;
	if (slash == parent_dir)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	parent = cgroup_to_absolute_path(mp, parent_dir, name);
	if (!parent)
		goto out;

	/* copy from parent to child cgroup */
	nread = cgroup_read_from_file(parent, contents, sizeof(contents));
	if (nread < 0)
		goto out;
	if (nread == sizeof(contents)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	if (lxc_write_to_file(child, contents, strlen(contents), false) >= 0)
		ok = true;
	else
		SYSERROR("failed writing %s", child);

out:
	free(parent_dir);
	free(parent);
	free(child);
	return ok;
}
2295
2296 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2297 const char *path)
2298 {
2299 /* the files we have to handle here are only in cpuset hierarchies */
2300 if (!lxc_string_in_array("cpuset",
2301 (const char **)mp->hierarchy->subsystems))
2302 return true;
2303
2304 if (!mp->need_cpuset_init)
2305 return true;
2306
2307 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2308 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2309 }
2310
2311 static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2312 {
2313 int i;
2314
2315 if (!getenv("LXC_DEBUG_CGFS"))
2316 return;
2317
2318 DEBUG("Cgroup information:");
2319 DEBUG(" container name: %s", d->name);
2320 if (!d->meta || !d->meta->hierarchies) {
2321 DEBUG(" No hierarchies found.");
2322 return;
2323 }
2324 DEBUG(" Controllers:");
2325 for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2326 char **p;
2327 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2328 if (!h) {
2329 DEBUG(" Empty hierarchy number %d.", i);
2330 continue;
2331 }
2332 for (p = h->subsystems; p && *p; p++) {
2333 DEBUG(" %2d: %s", i, *p);
2334 }
2335 }
2336 }
2337
2338 struct cgroup_ops *cgfs_ops_init(void)
2339 {
2340 return &cgfs_ops;
2341 }
2342
2343 static void *cgfs_init(const char *name)
2344 {
2345 struct cgfs_data *d;
2346
2347 d = malloc(sizeof(*d));
2348 if (!d)
2349 return NULL;
2350
2351 memset(d, 0, sizeof(*d));
2352 d->name = strdup(name);
2353 if (!d->name)
2354 goto err1;
2355
2356 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2357
2358 d->meta = lxc_cgroup_load_meta();
2359 if (!d->meta) {
2360 ERROR("cgroupfs failed to detect cgroup metadata");
2361 goto err2;
2362 }
2363
2364 print_cgfs_init_debuginfo(d);
2365
2366 return d;
2367
2368 err2:
2369 free(d->name);
2370 err1:
2371 free(d);
2372 return NULL;
2373 }
2374
2375 static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
2376 {
2377 struct cgfs_data *d = hdata;
2378
2379 if (!d)
2380 return;
2381 free(d->name);
2382 lxc_cgroup_process_info_free_and_remove(d->info, conf);
2383 lxc_cgroup_put_meta(d->meta);
2384 free(d);
2385 }
2386
2387 static inline bool cgfs_create(void *hdata)
2388 {
2389 struct cgfs_data *d = hdata;
2390 struct cgroup_process_info *i;
2391 struct cgroup_meta_data *md;
2392
2393 if (!d)
2394 return false;
2395 md = d->meta;
2396 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
2397 if (!i)
2398 return false;
2399 d->info = i;
2400 return true;
2401 }
2402
2403 static inline bool cgfs_enter(void *hdata, pid_t pid)
2404 {
2405 struct cgfs_data *d = hdata;
2406 struct cgroup_process_info *i;
2407 int ret;
2408
2409 if (!d)
2410 return false;
2411 i = d->info;
2412 ret = lxc_cgroupfs_enter(i, pid, false);
2413
2414 return ret == 0;
2415 }
2416
2417 static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
2418 {
2419 struct cgfs_data *d = hdata;
2420 struct cgroup_process_info *i;
2421
2422 if (!d)
2423 return false;
2424 i = d->info;
2425 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2426 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
2427 return false;
2428 }
2429 return true;
2430 }
2431
/*
 * Look up the container's cgroup path for @subsystem through
 * lxc_cgroup_get_hierarchy_path_data().  Returns NULL when @hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *d = hdata;

	return d ? lxc_cgroup_get_hierarchy_path_data(subsystem, d) : NULL;
}
2440
2441 static bool cgfs_escape(void *hdata)
2442 {
2443 struct cgroup_meta_data *md;
2444 int i;
2445 bool ret = false;
2446
2447 md = lxc_cgroup_load_meta();
2448 if (!md)
2449 return false;
2450
2451 for (i = 0; i <= md->maximum_hierarchy; i++) {
2452 struct cgroup_hierarchy *h = md->hierarchies[i];
2453 struct cgroup_mount_point *mp;
2454 char *tasks;
2455 FILE *f;
2456 int written;
2457
2458 if (!h) {
2459 WARN("not escaping hierarchy %d", i);
2460 continue;
2461 }
2462
2463 mp = lxc_cgroup_find_mount_point(h, "/", true);
2464 if (!mp)
2465 goto out;
2466
2467 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2468 if (!tasks)
2469 goto out;
2470
2471 f = fopen(tasks, "a");
2472 free(tasks);
2473 if (!f)
2474 goto out;
2475
2476 written = fprintf(f, "%d\n", getpid());
2477 fclose(f);
2478 if (written < 0) {
2479 SYSERROR("writing tasks failed\n");
2480 goto out;
2481 }
2482 }
2483
2484 ret = true;
2485 out:
2486 lxc_cgroup_put_meta(md);
2487 return ret;
2488 }
2489
/* Hierarchy counting is not implemented by this driver; always returns -1. */
static int cgfs_num_hierarchies(void)
{
	const int not_implemented = -1;

	return not_implemented;
}
2495
/*
 * Hierarchy enumeration is not implemented by this driver: both arguments
 * are ignored, @out is never written and the result is always false.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
	(void)i;
	(void)out;

	return false;
}
2501
/*
 * Thaw the container by writing "THAWED" to freezer.state of its freezer
 * cgroup.
 *
 * Returns true on success; false if @hdata is NULL, the absolute freezer
 * path cannot be determined, or the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *d = hdata;
	char *rel_path, *abs_path;
	bool thawed;

	if (!d)
		return false;

	rel_path = lxc_cgroup_get_hierarchy_path_data("freezer", d);
	abs_path = lxc_cgroup_find_abs_path("freezer", rel_path, true, NULL);
	if (!abs_path)
		return false;

	thawed = (do_cgroup_set(abs_path, "freezer.state", "THAWED") == 0);
	free(abs_path);
	return thawed;
}
2520
/*
 * Driver entry point for applying cgroup limits: forwards to
 * do_setup_cgroup_limits() and maps its 0 / non-zero result to true / false.
 * Returns false when @hdata is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
				  bool with_devices)
{
	struct cgfs_data *d = hdata;

	return d != NULL &&
	       do_setup_cgroup_limits(d, cgroup_conf, with_devices) == 0;
}
2530
/*
 * Move the attached process @pid into the cgroups of the running container
 * identified by @name and @lxcpath.
 *
 * The container's cgroup info is looked up with freshly loaded metadata,
 * which is released again as soon as the lookup is done.
 *
 * Returns true on success, false (after logging an error) otherwise.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *md;
	struct cgroup_process_info *info;
	int rc;

	md = lxc_cgroup_load_meta();
	if (md == NULL) {
		ERROR("could not move attached process %d to cgroup of container", pid);
		return false;
	}

	info = lxc_cgroup_get_container_info(name, lxcpath, md);
	lxc_cgroup_put_meta(md);
	if (info == NULL) {
		ERROR("could not move attached process %d to cgroup of container", pid);
		return false;
	}

	rc = lxc_cgroupfs_enter(info, pid, false);
	lxc_cgroup_process_info_free(info);
	if (rc >= 0)
		return true;

	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2558
/*
 * Argument block handed to chown_cgroup_wrapper() through userns_exec_1().
 */
struct chown_data {
	const char *cgroup_path;	/* cgroup directory whose ownership is changed */
	uid_t origuid;			/* caller's effective uid; mapped with get_ns_uid() */
};
2563
2564 /*
2565 * TODO - someone should refactor this to unshare once passing all the paths
2566 * to be chowned in one go
2567 */
2568 static int chown_cgroup_wrapper(void *data)
2569 {
2570 struct chown_data *arg = data;
2571 uid_t destuid;
2572 char *fpath;
2573
2574 if (setresgid(0,0,0) < 0)
2575 SYSERROR("Failed to setgid to 0");
2576 if (setresuid(0,0,0) < 0)
2577 SYSERROR("Failed to setuid to 0");
2578 if (setgroups(0, NULL) < 0)
2579 SYSERROR("Failed to clear groups");
2580 destuid = get_ns_uid(arg->origuid);
2581
2582 if (chown(arg->cgroup_path, destuid, 0) < 0)
2583 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2584
2585 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2586 if (!fpath)
2587 return -1;
2588 if (chown(fpath, destuid, 0) < 0)
2589 SYSERROR("Error chowning %s\n", fpath);
2590 free(fpath);
2591
2592 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2593 if (!fpath)
2594 return -1;
2595 if (chown(fpath, destuid, 0) < 0)
2596 SYSERROR("Error chowning %s", fpath);
2597 free(fpath);
2598
2599 return 0;
2600 }
2601
2602 static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2603 {
2604 struct chown_data data;
2605 char *fpath;
2606
2607 if (!dir_exists(cgroup_path))
2608 return true;
2609
2610 if (lxc_list_empty(&conf->id_map))
2611 /* If there's no mapping then we don't need to chown */
2612 return true;
2613
2614 data.cgroup_path = cgroup_path;
2615 data.origuid = geteuid();
2616
2617 /* Unpriv users can't chown it themselves, so chown from
2618 * a child namespace mapping both our own and the target uid
2619 */
2620 if (userns_exec_1(conf, chown_cgroup_wrapper, &data,
2621 "chown_cgroup_wrapper") < 0) {
2622 ERROR("Error requesting cgroup chown in new namespace");
2623 return false;
2624 }
2625
2626 /*
2627 * Now chmod 775 the directory else the container cannot create cgroups.
2628 * This can't be done in the child namespace because it only group-owns
2629 * the cgroup
2630 */
2631 if (chmod(cgroup_path, 0775) < 0) {
2632 SYSERROR("Error chmoding %s\n", cgroup_path);
2633 return false;
2634 }
2635 fpath = lxc_append_paths(cgroup_path, "tasks");
2636 if (!fpath)
2637 return false;
2638 if (chmod(fpath, 0664) < 0)
2639 SYSERROR("Error chmoding %s\n", fpath);
2640 free(fpath);
2641 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2642 if (!fpath)
2643 return false;
2644 if (chmod(fpath, 0664) < 0)
2645 SYSERROR("Error chmoding %s\n", fpath);
2646 free(fpath);
2647
2648 return true;
2649 }
2650
2651 static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2652 {
2653 struct cgfs_data *d = hdata;
2654 struct cgroup_process_info *info_ptr;
2655 char *cgpath;
2656 bool r = true;
2657
2658 if (!d)
2659 return false;
2660
2661 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2662 if (!info_ptr->hierarchy)
2663 continue;
2664
2665 if (!info_ptr->designated_mount_point) {
2666 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2667 if (!info_ptr->designated_mount_point) {
2668 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2669 return false;
2670 }
2671 }
2672
2673 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2674 if (!cgpath) {
2675 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2676 continue;
2677 }
2678 r = do_cgfs_chown(cgpath, conf);
2679 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
2680 ERROR("Failed chowning %s\n", cgpath);
2681 free(cgpath);
2682 return false;
2683 }
2684 free(cgpath);
2685 }
2686
2687 return true;
2688 }
2689
2690 static struct cgroup_ops cgfs_ops = {
2691 .init = cgfs_init,
2692 .destroy = cgfs_destroy,
2693 .create = cgfs_create,
2694 .enter = cgfs_enter,
2695 .create_legacy = cgfs_create_legacy,
2696 .get_cgroup = cgfs_get_cgroup,
2697 .escape = cgfs_escape,
2698 .num_hierarchies = cgfs_num_hierarchies,
2699 .get_hierarchies = cgfs_get_hierarchies,
2700 .get = lxc_cgroupfs_get,
2701 .set = lxc_cgroupfs_set,
2702 .unfreeze = cgfs_unfreeze,
2703 .setup_limits = cgroupfs_setup_limits,
2704 .name = "cgroupfs",
2705 .attach = lxc_cgroupfs_attach,
2706 .chown = cgfs_chown,
2707 .mount_cgroup = cgroupfs_mount_cgroup,
2708 .nrtasks = cgfs_nrtasks,
2709 .driver = CGFS,
2710 };