]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroups/cgfs.c
tree-wide: s/getpid()/lxc_raw_getpid()/g
[mirror_lxc.git] / src / lxc / cgroups / cgfs.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #include "config.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <fcntl.h>
32 #include <grp.h>
33 #include <ctype.h>
34 #include <sys/types.h>
35 #include <sys/stat.h>
36 #include <sys/param.h>
37 #include <sys/inotify.h>
38 #include <sys/mount.h>
39 #include <netinet/in.h>
40 #include <net/if.h>
41
42 #include "error.h"
43 #include "commands.h"
44 #include "list.h"
45 #include "conf.h"
46 #include "utils.h"
47 #include "log.h"
48 #include "cgroup.h"
49 #include "start.h"
50 #include "state.h"
51 #include "storage.h"
52
53 #if IS_BIONIC
54 #include <../include/lxcmntent.h>
55 #else
56 #include <mntent.h>
57 #endif
58
/* Forward declarations: the three types below point at one another. */
struct cgroup_meta_data;
struct cgroup_hierarchy;
struct cgroup_mount_point;
62
/*
 * cgroup_meta_data: what lxc has learned about the cgroup setup of this
 *                   host (hierarchies and where they are mounted)
 */
struct cgroup_meta_data {
	/* reference count, managed by lxc_cgroup_get_meta()/lxc_cgroup_put_meta() */
	ptrdiff_t ref;
	/* table indexed by hierarchy number; slots without a hierarchy are NULL */
	struct cgroup_hierarchy **hierarchies;
	/* all cgroup mount points that were discovered */
	struct cgroup_mount_point **mount_points;
	/* highest valid index into 'hierarchies' */
	int maximum_hierarchy;
};
73
/*
 * cgroup_hierarchy: one cgroup hierarchy; the same hierarchy can be
 *                   mounted in several places
 */
struct cgroup_hierarchy {
	/* hierarchy number, also the slot in cgroup_meta_data.hierarchies */
	int index;
	/* when false, lxc is supposed to leave this hierarchy alone */
	bool used;
	/* NULL-terminated list of the subsystems bound to this hierarchy */
	char **subsystems;
	/* first writable mount point whose mount prefix is "/" (or NULL) */
	struct cgroup_mount_point *rw_absolute_mount_point;
	/* first read-only mount point whose mount prefix is "/" (or NULL) */
	struct cgroup_mount_point *ro_absolute_mount_point;
	/* every mount point known for this hierarchy */
	struct cgroup_mount_point **all_mount_points;
	/* allocation bookkeeping for 'all_mount_points' (lxc_grow_array) */
	size_t all_mount_point_capacity;
};
87
/*
 * cgroup_mount_point: one place in the filesystem where a cgroup
 *                     hierarchy is mounted
 */
struct cgroup_mount_point {
	/* the hierarchy that is mounted here */
	struct cgroup_hierarchy *hierarchy;
	/* directory the hierarchy is mounted on */
	char *mount_point;
	/* cgroup within the hierarchy that appears at 'mount_point';
	 * "/" when the whole hierarchy is visible
	 */
	char *mount_prefix;
	/* true unless the mount carries the "rw" option */
	bool read_only;
	/* NOTE(review): presumably set when cpuset files still have to be
	 * initialised for cgroups created here - confirm against the users
	 * of this flag
	 */
	bool need_cpuset_init;
};
99
/*
 * cgroup_process_info: records which cgroup a process belongs to in
 *                      the individual hierarchies; the entries are
 *                      chained through 'next'
 *
 * This is the per-process state kept by cgfs_ops. It is not used when
 * cgmanager is in charge.
 */
struct cgroup_process_info {
	/* next entry of the list, NULL at the end */
	struct cgroup_process_info *next;
	/* meta data this entry belongs to */
	struct cgroup_meta_data *meta_ref;
	/* hierarchy this entry describes; may be NULL */
	struct cgroup_hierarchy *hierarchy;
	/* cgroup of the process inside 'hierarchy' */
	char *cgroup_path;
	/* NOTE(review): presumably the cgroup produced from the optional
	 * sub pattern - confirm against the code that fills it in
	 */
	char *cgroup_path_sub;
	/* cgroups that lxc created itself, in creation order */
	char **created_paths;
	/* allocation bookkeeping for 'created_paths' (lxc_grow_array) */
	size_t created_paths_capacity;
	/* number of valid entries in 'created_paths' */
	size_t created_paths_count;
	/* mount point selected for working on this hierarchy */
	struct cgroup_mount_point *designated_mount_point;
};
119
/*
 * cgfs_data: state bundle of the cgfs backend
 * NOTE(review): presumably one instance per container - confirm against
 * the code that allocates it.
 */
struct cgfs_data {
	/* NOTE(review): presumably the container name - confirm */
	char *name;
	/* pattern used to build cgroup paths; not owned (const) */
	const char *cgroup_pattern;
	/* host wide cgroup meta data */
	struct cgroup_meta_data *meta;
	/* list with one entry per hierarchy, see struct cgroup_process_info */
	struct cgroup_process_info *info;
};
126
127 lxc_log_define(lxc_cgfs, lxc);
128
129 static struct cgroup_process_info *lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str, struct cgroup_meta_data *meta);
130 static char **subsystems_from_mount_options(const char *mount_options, char **kernel_list);
131 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp);
132 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h);
133 static bool is_valid_cgroup(const char *name);
134 static int create_cgroup(struct cgroup_mount_point *mp, const char *path);
135 static int remove_cgroup(struct cgroup_mount_point *mp, const char *path, bool recurse,
136 struct lxc_conf *conf);
137 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp, const char *path, const char *suffix);
138 static struct cgroup_process_info *find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem);
139 static int do_cgroup_get(const char *cgroup_path, const char *sub_filename, char *value, size_t len);
140 static int do_cgroup_set(const char *cgroup_path, const char *sub_filename, const char *value);
141 static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d, char *v, bool for_allow);
142 static int do_setup_cgroup_limits(struct cgfs_data *d, struct lxc_list *cgroup_settings, bool do_devices);
143 static int cgroup_recursive_task_count(const char *cgroup_path);
144 static int handle_cgroup_settings(struct cgroup_mount_point *mp, char *cgroup_path);
145 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp, const char *path);
146
147 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist);
148 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data);
149 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data);
150
151 /* free process membership information */
152 static void lxc_cgroup_process_info_free(struct cgroup_process_info *info);
153 static void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info,
154 struct lxc_conf *conf);
155
156 static struct cgroup_ops cgfs_ops;
157
/*
 * cgroup_rmdir: remove the directory @dirname together with all
 *               directories below it
 *
 * Sub-directories are removed depth first, then @dirname itself. Entries
 * that are not directories are not touched (presumably the control files
 * of the cgroup, which cannot be unlinked anyway - confirm).
 *
 * A failure does not stop the walk: the remaining entries are still
 * processed and the first error that was seen is reported.
 *
 * Returns 0 when everything was removed, -1 otherwise. On return errno
 * holds the first recorded error (0 if there was none); a path that does
 * not fit into MAXPATHLEN is reported as ENAMETOOLONG. If @dirname cannot
 * be opened, errno is whatever opendir() left behind.
 */
static int cgroup_rmdir(char *dirname)
{
	struct dirent *entry;
	DIR *dir;
	int first_errno = 0;
	bool failed = false;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		ERROR("Failed to open %s", dirname);
		return -1;
	}

	while ((entry = readdir(dir))) {
		struct stat st;
		int len;

		if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
			continue;

		len = snprintf(pathname, sizeof(pathname), "%s/%s", dirname, entry->d_name);
		if (len < 0 || (size_t)len >= sizeof(pathname)) {
			ERROR("pathname too long");
			failed = true;
			/* errno values are positive; never store a negated code */
			if (!first_errno)
				first_errno = ENAMETOOLONG;
			continue;
		}

		if (lstat(pathname, &st)) {
			SYSERROR("Failed to stat %s", pathname);
			failed = true;
			if (!first_errno)
				first_errno = errno;
			continue;
		}

		if (!S_ISDIR(st.st_mode))
			continue;

		/* the recursive call leaves its own first error in errno */
		if (cgroup_rmdir(pathname) < 0) {
			failed = true;
			if (!first_errno)
				first_errno = errno;
		}
	}

	if (rmdir(dirname) < 0) {
		SYSERROR("Failed to delete %s", dirname);
		failed = true;
		if (!first_errno)
			first_errno = errno;
	}

	if (closedir(dir)) {
		SYSERROR("Failed to close directory %s", dirname);
		failed = true;
		if (!first_errno)
			first_errno = errno;
	}

	errno = first_errno;
	return failed ? -1 : 0;
}
226
/*
 * rmdir_wrapper: cgroup_rmdir() with a void * argument
 *
 * @data is the directory to remove, passed as a char *. Before the
 * removal the process tries to switch to gid 0, uid 0 and an empty
 * supplementary group list; a failure of any of these steps is only
 * logged and the removal is attempted regardless.
 *
 * Returns whatever cgroup_rmdir() returns.
 */
static int rmdir_wrapper(void *data)
{
	char *cgroup_dir = data;
	int rc;

	rc = setresgid(0,0,0);
	if (rc < 0)
		SYSERROR("Failed to setgid to 0");

	rc = setresuid(0,0,0);
	if (rc < 0)
		SYSERROR("Failed to setuid to 0");

	rc = setgroups(0, NULL);
	if (rc < 0)
		SYSERROR("Failed to clear groups");

	return cgroup_rmdir(cgroup_dir);
}
240
/*
 * lxc_cgroup_load_meta: load the cgroup meta data, restricted to the
 *                       subsystems named in the global configuration key
 *                       "lxc.cgroup.use"
 *
 * The configuration value is split at ',' and handed to
 * lxc_cgroup_load_meta2() as whitelist. Without a value no whitelist is
 * passed.
 *
 * Returns the new meta data (the caller holds the reference and drops it
 * with lxc_cgroup_put_meta()), or NULL on failure. errno is kept as it
 * was when the failure occurred.
 */
static struct cgroup_meta_data *lxc_cgroup_load_meta(void)
{
	const char *cgroup_use;
	char **whitelist = NULL;
	struct cgroup_meta_data *md;
	int saved_errno;

	/* a NULL result is only an error if errno was set by the lookup */
	errno = 0;
	cgroup_use = lxc_global_config_value("lxc.cgroup.use");
	if (!cgroup_use && errno != 0)
		return NULL;

	if (cgroup_use) {
		whitelist = lxc_string_split_and_trim(cgroup_use, ',');
		if (!whitelist)
			return NULL;
	}

	md = lxc_cgroup_load_meta2((const char **)whitelist);

	/* releasing the list must not disturb the errno of the load */
	saved_errno = errno;
	lxc_free_array((void **)whitelist, free);
	errno = saved_errno;

	return md;
}
264
/* Step 1: determine all kernel subsystems
 *
 * Reads /proc/cgroups and appends the name of every listed subsystem to
 * *kernel_subsystems. A line is used only if it is not a '#' comment and
 * consists of a name, a tab, a non-empty decimal hierarchy number and
 * another tab; anything else is skipped.
 *
 * Returns true on success. On failure false is returned and the entries
 * appended so far are left in *kernel_subsystems for the caller to free.
 * NOTE(review): *kernel_subsystems is expected to be NULL on entry since
 * the capacity is tracked from zero here - confirm against lxc_grow_array.
 */
static bool find_cgroup_subsystems(char ***kernel_subsystems)
{
	FILE *proc_cgroups;
	char *line = NULL;
	size_t line_cap = 0;
	size_t count = 0;
	size_t capacity = 0;
	bool ok = false;

	proc_cgroups = fopen_cloexec("/proc/cgroups", "r");
	if (!proc_cgroups)
		return false;

	while (getline(&line, &line_cap, proc_cgroups) != -1) {
		char *name_end;
		char *number;
		char *number_end;
		char *parse_end = NULL;

		if (line[0] == '#' || line[0] == '\0')
			continue;

		/* first column: subsystem name */
		name_end = strchr(line, '\t');
		if (!name_end)
			continue;
		*name_end = '\0';
		number = name_end + 1;

		/* second column: hierarchy number */
		number_end = strchr(number, '\t');
		if (!number_end)
			continue;
		*number_end = '\0';

		/* the value itself is not needed, the column only has to be
		 * a complete, non-empty number
		 */
		(void)strtoul(number, &parse_end, 10);
		if (!parse_end || parse_end == number || *parse_end)
			continue;

		if (lxc_grow_array((void ***)kernel_subsystems, &capacity, count + 1, 12) < 0)
			goto out;

		(*kernel_subsystems)[count] = strdup(line);
		if (!(*kernel_subsystems)[count])
			goto out;
		count++;
	}
	ok = true;

out:
	fclose(proc_cgroups);
	free(line);
	return ok;
}
320
321 /* Step 2: determine all hierarchies (by reading /proc/self/cgroup),
322 * since mount points don't specify hierarchy number and
323 * /proc/cgroups does not contain named hierarchies
324 */
325 static bool find_cgroup_hierarchies(struct cgroup_meta_data *meta_data,
326 bool all_kernel_subsystems, bool all_named_subsystems,
327 const char **subsystem_whitelist)
328 {
329 FILE *proc_self_cgroup;
330 char *line = NULL;
331 size_t sz = 0;
332 int r;
333 bool bret = false;
334 size_t hierarchy_capacity = 0;
335
336 proc_self_cgroup = fopen_cloexec("/proc/self/cgroup", "r");
337 /* if for some reason (because of setns() and pid namespace for example),
338 * /proc/self is not valid, we try /proc/1/cgroup... */
339 if (!proc_self_cgroup)
340 proc_self_cgroup = fopen_cloexec("/proc/1/cgroup", "r");
341 if (!proc_self_cgroup)
342 return false;
343
344 while (getline(&line, &sz, proc_self_cgroup) != -1) {
345 /* file format: hierarchy:subsystems:group,
346 * we only extract hierarchy and subsystems
347 * here */
348 char *colon1;
349 char *colon2;
350 int hierarchy_number;
351 struct cgroup_hierarchy *h = NULL;
352 char **p;
353
354 if (!line[0])
355 continue;
356
357 colon1 = strchr(line, ':');
358 if (!colon1)
359 continue;
360 *colon1++ = '\0';
361 colon2 = strchr(colon1, ':');
362 if (!colon2)
363 continue;
364 *colon2 = '\0';
365
366 colon2 = NULL;
367
368 /* With cgroupv2 /proc/self/cgroup can contain entries of the
369 * form: 0::/
370 * These entries need to be skipped.
371 */
372 if (!strcmp(colon1, ""))
373 continue;
374
375 hierarchy_number = strtoul(line, &colon2, 10);
376 if (!colon2 || *colon2)
377 continue;
378
379 if (hierarchy_number > meta_data->maximum_hierarchy) {
380 /* lxc_grow_array will never shrink, so even if we find a lower
381 * hierarchy number here, the array will never be smaller
382 */
383 r = lxc_grow_array((void ***)&meta_data->hierarchies, &hierarchy_capacity, hierarchy_number + 1, 12);
384 if (r < 0)
385 goto out;
386
387 meta_data->maximum_hierarchy = hierarchy_number;
388 }
389
390 /* this shouldn't happen, we had this already */
391 if (meta_data->hierarchies[hierarchy_number])
392 goto out;
393
394 h = calloc(1, sizeof(struct cgroup_hierarchy));
395 if (!h)
396 goto out;
397
398 meta_data->hierarchies[hierarchy_number] = h;
399
400 h->index = hierarchy_number;
401 h->subsystems = lxc_string_split_and_trim(colon1, ',');
402 if (!h->subsystems)
403 goto out;
404 /* see if this hierarchy should be considered */
405 if (!all_kernel_subsystems || !all_named_subsystems) {
406 for (p = h->subsystems; *p; p++) {
407 if (!strncmp(*p, "name=", 5)) {
408 if (all_named_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
409 h->used = true;
410 break;
411 }
412 } else {
413 if (all_kernel_subsystems || (subsystem_whitelist && lxc_string_in_array(*p, subsystem_whitelist))) {
414 h->used = true;
415 break;
416 }
417 }
418 }
419 } else {
420 /* we want all hierarchy anyway */
421 h->used = true;
422 }
423 }
424 bret = true;
425
426 out:
427 fclose(proc_self_cgroup);
428 free(line);
429 return bret;
430 }
431
432 /* Step 3: determine all mount points of each hierarchy */
433 static bool find_hierarchy_mountpts( struct cgroup_meta_data *meta_data, char **kernel_subsystems)
434 {
435 bool bret = false;
436 FILE *proc_self_mountinfo;
437 char *line = NULL;
438 size_t sz = 0;
439 char **tokens = NULL;
440 size_t mount_point_count = 0;
441 size_t mount_point_capacity = 0;
442 size_t token_capacity = 0;
443 int r;
444 bool is_cgns = cgns_supported();
445
446 proc_self_mountinfo = fopen_cloexec("/proc/self/mountinfo", "r");
447 /* if for some reason (because of setns() and pid namespace for example),
448 * /proc/self is not valid, we try /proc/1/cgroup... */
449 if (!proc_self_mountinfo)
450 proc_self_mountinfo = fopen_cloexec("/proc/1/mountinfo", "r");
451 if (!proc_self_mountinfo)
452 return false;
453
454 while (getline(&line, &sz, proc_self_mountinfo) != -1) {
455 char *token, *line_tok, *saveptr = NULL;
456 size_t i, j, k;
457 struct cgroup_mount_point *mount_point;
458 struct cgroup_hierarchy *h;
459 char **subsystems;
460 bool is_lxcfs = false;
461
462 if (line[0] && line[strlen(line) - 1] == '\n')
463 line[strlen(line) - 1] = '\0';
464
465 for (i = 0, line_tok = line; (token = strtok_r(line_tok, " ", &saveptr)); line_tok = NULL) {
466 r = lxc_grow_array((void ***)&tokens, &token_capacity, i + 1, 64);
467 if (r < 0)
468 goto out;
469 tokens[i++] = token;
470 }
471
472 /* layout of /proc/self/mountinfo:
473 * 0: id
474 * 1: parent id
475 * 2: device major:minor
476 * 3: mount prefix
477 * 4: mount point
478 * 5: per-mount options
479 * [optional X]: additional data
480 * X+7: "-"
481 * X+8: type
482 * X+9: source
483 * X+10: per-superblock options
484 */
485 for (j = 6; j < i && tokens[j]; j++)
486 if (!strcmp(tokens[j], "-"))
487 break;
488
489 /* could not find separator */
490 if (j >= i || !tokens[j])
491 continue;
492 /* there should be exactly three fields after
493 * the separator
494 */
495 if (i != j + 4)
496 continue;
497
498 /* not a cgroup filesystem */
499 if (strcmp(tokens[j + 1], "cgroup") != 0) {
500 if (strcmp(tokens[j + 1], "fuse.lxcfs") != 0)
501 continue;
502 if (strncmp(tokens[4], "/sys/fs/cgroup/", 15) != 0)
503 continue;
504 is_lxcfs = true;
505 char *curtok = tokens[4] + 15;
506 subsystems = subsystems_from_mount_options(curtok,
507 kernel_subsystems);
508 } else
509 subsystems = subsystems_from_mount_options(tokens[j + 3],
510 kernel_subsystems);
511 if (!subsystems)
512 goto out;
513
514 h = NULL;
515 for (k = 0; k <= meta_data->maximum_hierarchy; k++) {
516 if (meta_data->hierarchies[k] &&
517 meta_data->hierarchies[k]->subsystems[0] &&
518 lxc_string_in_array(meta_data->hierarchies[k]->subsystems[0], (const char **)subsystems)) {
519 /* TODO: we could also check if the lists really match completely,
520 * just to have an additional sanity check */
521 h = meta_data->hierarchies[k];
522 break;
523 }
524 }
525 lxc_free_array((void **)subsystems, free);
526
527 r = lxc_grow_array((void ***)&meta_data->mount_points, &mount_point_capacity, mount_point_count + 1, 12);
528 if (r < 0)
529 goto out;
530
531 /* create mount point object */
532 mount_point = calloc(1, sizeof(*mount_point));
533 if (!mount_point)
534 goto out;
535
536 meta_data->mount_points[mount_point_count++] = mount_point;
537
538 mount_point->hierarchy = h;
539 if (is_lxcfs || is_cgns)
540 mount_point->mount_prefix = strdup("/");
541 else
542 mount_point->mount_prefix = strdup(tokens[3]);
543 mount_point->mount_point = strdup(tokens[4]);
544 if (!mount_point->mount_point || !mount_point->mount_prefix)
545 goto out;
546 mount_point->read_only = !lxc_string_in_list("rw", tokens[5], ',');
547
548 if (!strcmp(mount_point->mount_prefix, "/")) {
549 if (mount_point->read_only) {
550 if (!h->ro_absolute_mount_point)
551 h->ro_absolute_mount_point = mount_point;
552 } else {
553 if (!h->rw_absolute_mount_point)
554 h->rw_absolute_mount_point = mount_point;
555 }
556 }
557
558 if (h)
559 k = lxc_array_len((void **)h->all_mount_points);
560 else
561 k = 0;
562 r = lxc_grow_array((void ***)&h->all_mount_points, &h->all_mount_point_capacity, k + 1, 4);
563 if (r < 0)
564 goto out;
565 h->all_mount_points[k] = mount_point;
566 }
567 bret = true;
568
569 out:
570 fclose(proc_self_mountinfo);
571 free(tokens);
572 free(line);
573 return bret;
574 }
575
576 static struct cgroup_meta_data *lxc_cgroup_load_meta2(const char **subsystem_whitelist)
577 {
578 bool all_kernel_subsystems = true;
579 bool all_named_subsystems = false;
580 struct cgroup_meta_data *meta_data = NULL;
581 char **kernel_subsystems = NULL;
582 int saved_errno = 0;
583
584 /* if the subsystem whitelist is not specified, include all
585 * hierarchies that contain kernel subsystems by default but
586 * no hierarchies that only contain named subsystems
587 *
588 * if it is specified, the specifier @all will select all
589 * hierarchies, @kernel will select all hierarchies with
590 * kernel subsystems and @named will select all named
591 * hierarchies
592 */
593 all_kernel_subsystems = subsystem_whitelist ?
594 (lxc_string_in_array("@kernel", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
595 true;
596 all_named_subsystems = subsystem_whitelist ?
597 (lxc_string_in_array("@named", subsystem_whitelist) || lxc_string_in_array("@all", subsystem_whitelist)) :
598 true;
599
600 meta_data = calloc(1, sizeof(struct cgroup_meta_data));
601 if (!meta_data)
602 return NULL;
603 meta_data->ref = 1;
604
605 if (!find_cgroup_subsystems(&kernel_subsystems))
606 goto out_error;
607
608 if (!find_cgroup_hierarchies(meta_data, all_kernel_subsystems,
609 all_named_subsystems, subsystem_whitelist))
610 goto out_error;
611
612 if (!find_hierarchy_mountpts(meta_data, kernel_subsystems))
613 goto out_error;
614
615 /* oops, we couldn't find anything */
616 if (!meta_data->hierarchies || !meta_data->mount_points) {
617 errno = EINVAL;
618 goto out_error;
619 }
620
621 lxc_free_array((void **)kernel_subsystems, free);
622 return meta_data;
623
624 out_error:
625 saved_errno = errno;
626 lxc_free_array((void **)kernel_subsystems, free);
627 lxc_cgroup_put_meta(meta_data);
628 errno = saved_errno;
629 return NULL;
630 }
631
632 static struct cgroup_meta_data *lxc_cgroup_get_meta(struct cgroup_meta_data *meta_data)
633 {
634 meta_data->ref++;
635 return meta_data;
636 }
637
638 static struct cgroup_meta_data *lxc_cgroup_put_meta(struct cgroup_meta_data *meta_data)
639 {
640 size_t i;
641 if (!meta_data)
642 return NULL;
643 if (--meta_data->ref > 0)
644 return meta_data;
645 lxc_free_array((void **)meta_data->mount_points, (lxc_free_fn)lxc_cgroup_mount_point_free);
646 if (meta_data->hierarchies)
647 for (i = 0; i <= meta_data->maximum_hierarchy; i++)
648 if (meta_data->hierarchies[i])
649 lxc_cgroup_hierarchy_free(meta_data->hierarchies[i]);
650 free(meta_data->hierarchies);
651 free(meta_data);
652 return NULL;
653 }
654
655 static struct cgroup_hierarchy *lxc_cgroup_find_hierarchy(struct cgroup_meta_data *meta_data, const char *subsystem)
656 {
657 size_t i;
658 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
659 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
660 if (!h)
661 continue;
662 if (h && lxc_string_in_array(subsystem, (const char **)h->subsystems))
663 return h;
664 }
665 return NULL;
666 }
667
668 static bool mountpoint_is_accessible(struct cgroup_mount_point *mp)
669 {
670 return mp && access(mp->mount_point, F_OK) == 0;
671 }
672
673 static struct cgroup_mount_point *lxc_cgroup_find_mount_point(struct cgroup_hierarchy *hierarchy, const char *group, bool should_be_writable)
674 {
675 struct cgroup_mount_point **mps;
676 struct cgroup_mount_point *current_result = NULL;
677 ssize_t quality = -1;
678
679 /* trivial case */
680 if (mountpoint_is_accessible(hierarchy->rw_absolute_mount_point))
681 return hierarchy->rw_absolute_mount_point;
682 if (!should_be_writable && mountpoint_is_accessible(hierarchy->ro_absolute_mount_point))
683 return hierarchy->ro_absolute_mount_point;
684
685 for (mps = hierarchy->all_mount_points; mps && *mps; mps++) {
686 struct cgroup_mount_point *mp = *mps;
687 size_t prefix_len = mp->mount_prefix ? strlen(mp->mount_prefix) : 0;
688
689 if (prefix_len == 1 && mp->mount_prefix[0] == '/')
690 prefix_len = 0;
691
692 if (!mountpoint_is_accessible(mp))
693 continue;
694
695 if (should_be_writable && mp->read_only)
696 continue;
697
698 if (!prefix_len ||
699 (strncmp(group, mp->mount_prefix, prefix_len) == 0 &&
700 (group[prefix_len] == '\0' || group[prefix_len] == '/'))) {
701 /* search for the best quality match, i.e. the match with the
702 * shortest prefix where this group is still contained
703 */
704 if (quality == -1 || prefix_len < quality) {
705 current_result = mp;
706 quality = prefix_len;
707 }
708 }
709 }
710
711 if (!current_result)
712 errno = ENOENT;
713 return current_result;
714 }
715
/*
 * lxc_cgroup_find_abs_path: translate a cgroup of a subsystem into a
 *                           path in the filesystem
 *
 * Loads the cgroup meta data, looks up the hierarchy of subsystem,
 * selects a mount point for group (a writable one if should_be_writable
 * is set) and lets cgroup_to_absolute_path() build the result from the
 * mount point, group and suffix.
 *
 * Returns the value of cgroup_to_absolute_path(), or NULL if any step
 * failed; in that case errno is the same as right after the failing step.
 */
static char *lxc_cgroup_find_abs_path(const char *subsystem, const char *group, bool should_be_writable, const char *suffix)
{
	struct cgroup_meta_data *meta_data;
	struct cgroup_hierarchy *h;
	struct cgroup_mount_point *mp;
	char *abs_path = NULL;
	int saved_errno;

	meta_data = lxc_cgroup_load_meta();
	if (!meta_data)
		return NULL;

	h = lxc_cgroup_find_hierarchy(meta_data, subsystem);
	if (h) {
		mp = lxc_cgroup_find_mount_point(h, group, should_be_writable);
		if (mp)
			abs_path = cgroup_to_absolute_path(mp, group, suffix);
	}

	/* dropping the meta data must not hide the reason of a failure */
	saved_errno = errno;
	lxc_cgroup_put_meta(meta_data);
	if (!abs_path)
		errno = saved_errno;

	return abs_path;
}
749
/*
 * lxc_cgroup_process_info_get: cgroup membership of the process pid
 *
 * Builds the path "/proc/<pid>/cgroup" and passes it, together with
 * meta, to lxc_cgroup_process_info_getx().
 *
 * Returns the result of lxc_cgroup_process_info_getx(), or NULL with
 * errno set to ENAMETOOLONG if the path could not be built.
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get(pid_t pid, struct cgroup_meta_data *meta)
{
	/* "/proc/" + at most 20 digits of a 64 bit value + "/cgroup" + NUL
	 * needs 34 bytes
	 */
	char pid_buf[64];
	int ret;

	ret = snprintf(pid_buf, sizeof(pid_buf), "/proc/%lu/cgroup", (unsigned long)pid);
	if (ret < 0 || (size_t)ret >= sizeof(pid_buf)) {
		errno = ENAMETOOLONG;
		return NULL;
	}

	return lxc_cgroup_process_info_getx(pid_buf, meta);
}
756
/*
 * lxc_cgroup_process_info_get_init: cgroup membership of pid 1
 *
 * Thin wrapper around lxc_cgroup_process_info_get().
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_init(struct cgroup_meta_data *meta)
{
	const pid_t init_pid = 1;

	return lxc_cgroup_process_info_get(init_pid, meta);
}
761
/*
 * lxc_cgroup_process_info_get_self: cgroup membership of the calling
 *                                   process
 *
 * /proc/self/cgroup is tried first; if that yields nothing, the lookup
 * is repeated through the pid returned by lxc_raw_getpid().
 */
static struct cgroup_process_info *lxc_cgroup_process_info_get_self(struct cgroup_meta_data *meta)
{
	struct cgroup_process_info *info;

	info = lxc_cgroup_process_info_getx("/proc/self/cgroup", meta);
	if (info)
		return info;

	return lxc_cgroup_process_info_get(lxc_raw_getpid(), meta);
}
770
/*
 * If a controller has ns cgroup mounted, then in that cgroup the handler->pid
 * is already in a new cgroup named after the pid. 'mountpath' is the mount
 * point of the hierarchy, 'oldname' the cgroup (relative to the mount point)
 * that contains the pid-named cgroup, and 'name' the container name.
 *
 * Example: the hierarchy is mounted at /cgroup, the task is in cgroup /ab,
 * pid is 2375 and name is c1. Then
 *    fulloldpath = /cgroup//ab/2375
 *    newname     = /ab/c1
 *    fullnewpath = /cgroup//ab/c1
 * fulloldpath is renamed to fullnewpath. If fullnewpath already exists, it
 * is removed with rmdir() first; if that fails, nothing is renamed.
 *
 * Returns newname as a freshly allocated string that the caller has to
 * free, or NULL on failure.
 */
static char *cgroup_rename_nsgroup(const char *mountpath, const char *oldname, pid_t pid, const char *name)
{
	char *fulloldpath = NULL;
	char *fullnewpath = NULL;
	char *newname = NULL;
	char *result = NULL;
	size_t oldpath_len, newname_len, newpath_len;
	int ret, saved_errno;

	/* two separators, the digits of the pid and the terminating NUL */
	oldpath_len = strlen(oldname) + strlen(mountpath) + 22;
	/* one separator and the terminating NUL */
	newname_len = strlen(oldname) + strlen(name) + 2;
	newpath_len = strlen(mountpath) + newname_len + 2;

	/* the lengths depend on caller supplied strings, so the buffers
	 * live on the heap rather than on the stack
	 */
	fulloldpath = malloc(oldpath_len);
	newname = malloc(newname_len);
	fullnewpath = malloc(newpath_len);
	if (!fulloldpath || !newname || !fullnewpath) {
		SYSERROR("Out of memory");
		goto out;
	}

	ret = snprintf(fulloldpath, oldpath_len, "%s/%s/%lu", mountpath, oldname, (unsigned long)pid);
	if (ret < 0 || (size_t)ret >= oldpath_len)
		goto out;

	ret = snprintf(newname, newname_len, "%s/%s", oldname, name);
	if (ret < 0 || (size_t)ret >= newname_len)
		goto out;

	ret = snprintf(fullnewpath, newpath_len, "%s/%s", mountpath, newname);
	if (ret < 0 || (size_t)ret >= newpath_len)
		goto out;

	/* a left-over cgroup of that name may only be replaced if it can
	 * be removed, i.e. if it is unused
	 */
	if (access(fullnewpath, F_OK) == 0) {
		if (rmdir(fullnewpath) != 0) {
			SYSERROR("container cgroup %s already exists.", fullnewpath);
			goto out;
		}
	}

	if (rename(fulloldpath, fullnewpath)) {
		SYSERROR("failed to rename cgroup %s->%s", fulloldpath, fullnewpath);
		goto out;
	}

	DEBUG("'%s' renamed to '%s'", oldname, newname);

	/* ownership of newname passes to the caller */
	result = newname;
	newname = NULL;

out:
	saved_errno = errno;
	free(newname);
	free(fullnewpath);
	free(fulloldpath);
	errno = saved_errno;
	return result;
}
838
839 static bool is_crucial_hierarchy(struct cgroup_hierarchy *h)
840 {
841 char **p;
842
843 for (p = h->subsystems; *p; p++) {
844 if (is_crucial_cgroup_subsystem(*p))
845 return true;
846 }
847 return false;
848 }
849
850 /* create a new cgroup */
851 static struct cgroup_process_info *lxc_cgroupfs_create(const char *name, const char *path_pattern, struct cgroup_meta_data *meta_data, const char *sub_pattern)
852 {
853 char **cgroup_path_components = NULL;
854 char **p = NULL;
855 char *path_so_far = NULL;
856 char **new_cgroup_paths = NULL;
857 char **new_cgroup_paths_sub = NULL;
858 struct cgroup_mount_point *mp;
859 struct cgroup_hierarchy *h;
860 struct cgroup_process_info *base_info = NULL;
861 struct cgroup_process_info *info_ptr;
862 int saved_errno;
863 int r;
864 unsigned suffix = 0;
865 bool had_sub_pattern = false;
866 size_t i;
867
868 if (!is_valid_cgroup(name)) {
869 ERROR("Invalid cgroup name: '%s'", name);
870 errno = EINVAL;
871 return NULL;
872 }
873
874 if (!strstr(path_pattern, "%n")) {
875 ERROR("Invalid cgroup path pattern: '%s'; contains no %%n for specifying container name", path_pattern);
876 errno = EINVAL;
877 return NULL;
878 }
879
880 /* we will modify the result of this operation directly,
881 * so we don't have to copy the data structure
882 */
883 base_info = (path_pattern[0] == '/') ?
884 lxc_cgroup_process_info_get_init(meta_data) :
885 lxc_cgroup_process_info_get_self(meta_data);
886 if (!base_info)
887 return NULL;
888
889 new_cgroup_paths = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
890 if (!new_cgroup_paths)
891 goto out_initial_error;
892
893 new_cgroup_paths_sub = calloc(meta_data->maximum_hierarchy + 1, sizeof(char *));
894 if (!new_cgroup_paths_sub)
895 goto out_initial_error;
896
897 /* find mount points we can use */
898 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
899 h = info_ptr->hierarchy;
900 if (!h)
901 continue;
902 mp = lxc_cgroup_find_mount_point(h, info_ptr->cgroup_path, true);
903 if (!mp) {
904 ERROR("Could not find writable mount point for cgroup hierarchy %d while trying to create cgroup.", h->index);
905 goto out_initial_error;
906 }
907 info_ptr->designated_mount_point = mp;
908
909 if (lxc_string_in_array("ns", (const char **)h->subsystems))
910 continue;
911 if (handle_cgroup_settings(mp, info_ptr->cgroup_path) < 0) {
912 ERROR("Could not set clone_children to 1 for cpuset hierarchy in parent cgroup.");
913 goto out_initial_error;
914 }
915 }
916
917 /* normalize the path */
918 cgroup_path_components = lxc_normalize_path(path_pattern);
919 if (!cgroup_path_components)
920 goto out_initial_error;
921
922 /* go through the path components to see if we can create them */
923 for (p = cgroup_path_components; *p || (sub_pattern && !had_sub_pattern); p++) {
924 /* we only want to create the same component with -1, -2, etc.
925 * if the component contains the container name itself, otherwise
926 * it's not an error if it already exists
927 */
928 char *p_eff = *p ? *p : (char *)sub_pattern;
929 bool contains_name = strstr(p_eff, "%n");
930 char *current_component = NULL;
931 char *current_subpath = NULL;
932 char *current_entire_path = NULL;
933 char *parts[3];
934 size_t j = 0;
935 i = 0;
936
937 /* if we are processing the subpattern, we want to make sure
938 * loop is ended the next time around
939 */
940 if (!*p) {
941 had_sub_pattern = true;
942 p--;
943 }
944
945 goto find_name_on_this_level;
946
947 cleanup_name_on_this_level:
948 /* This is reached if we found a name clash.
949 * In that case, remove the cgroup from all previous hierarchies
950 */
951 for (j = 0, info_ptr = base_info; j < i && info_ptr; info_ptr = info_ptr->next, j++) {
952 if (info_ptr->created_paths_count < 1)
953 continue;
954 r = remove_cgroup(info_ptr->designated_mount_point, info_ptr->created_paths[info_ptr->created_paths_count - 1], false, NULL);
955 if (r < 0)
956 WARN("could not clean up cgroup we created when trying to create container");
957 free(info_ptr->created_paths[info_ptr->created_paths_count - 1]);
958 info_ptr->created_paths[--info_ptr->created_paths_count] = NULL;
959 }
960 if (current_component != current_subpath)
961 free(current_subpath);
962 if (current_component != p_eff)
963 free(current_component);
964 current_component = current_subpath = NULL;
965 /* try again with another suffix */
966 ++suffix;
967
968 find_name_on_this_level:
969 /* determine name of the path component we should create */
970 if (contains_name && suffix > 0) {
971 char *buf = calloc(strlen(name) + 32, 1);
972 if (!buf)
973 goto out_initial_error;
974 snprintf(buf, strlen(name) + 32, "%s-%u", name, suffix);
975 current_component = lxc_string_replace("%n", buf, p_eff);
976 free(buf);
977 } else {
978 current_component = contains_name ? lxc_string_replace("%n", name, p_eff) : p_eff;
979 }
980 parts[0] = path_so_far;
981 parts[1] = current_component;
982 parts[2] = NULL;
983 current_subpath = path_so_far ? lxc_string_join("/", (const char **)parts, false) : current_component;
984
985 /* Now go through each hierarchy and try to create the
986 * corresponding cgroup
987 */
988 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
989 char *parts2[3];
990
991 if (!info_ptr->hierarchy)
992 continue;
993
994 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
995 continue;
996 current_entire_path = NULL;
997
998 parts2[0] = !strcmp(info_ptr->cgroup_path, "/") ? "" : info_ptr->cgroup_path;
999 parts2[1] = current_subpath;
1000 parts2[2] = NULL;
1001 current_entire_path = lxc_string_join("/", (const char **)parts2, false);
1002
1003 if (!*p) {
1004 /* we are processing the subpath, so only update that one */
1005 free(new_cgroup_paths_sub[i]);
1006 new_cgroup_paths_sub[i] = strdup(current_entire_path);
1007 if (!new_cgroup_paths_sub[i])
1008 goto cleanup_from_error;
1009 } else {
1010 /* remember which path was used on this controller */
1011 free(new_cgroup_paths[i]);
1012 new_cgroup_paths[i] = strdup(current_entire_path);
1013 if (!new_cgroup_paths[i])
1014 goto cleanup_from_error;
1015 }
1016
1017 r = create_cgroup(info_ptr->designated_mount_point, current_entire_path);
1018 if (r < 0 && errno == EEXIST && contains_name) {
1019 /* name clash => try new name with new suffix */
1020 free(current_entire_path);
1021 current_entire_path = NULL;
1022 goto cleanup_name_on_this_level;
1023 } else if (r < 0 && errno != EEXIST) {
1024 if (is_crucial_hierarchy(info_ptr->hierarchy)) {
1025 SYSERROR("Could not create cgroup '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1026 goto cleanup_from_error;
1027 }
1028 goto skip;
1029 } else if (r == 0) {
1030 /* successfully created */
1031 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1032 if (r < 0)
1033 goto cleanup_from_error;
1034 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, current_entire_path)) {
1035 ERROR("Failed to initialize cpuset for '%s' in '%s'.", current_entire_path, info_ptr->designated_mount_point->mount_point);
1036 goto cleanup_from_error;
1037 }
1038 info_ptr->created_paths[info_ptr->created_paths_count++] = current_entire_path;
1039 } else {
1040 /* if we didn't create the cgroup, then we have to make sure that
1041 * further cgroups will be created properly
1042 */
1043 if (handle_cgroup_settings(info_ptr->designated_mount_point, info_ptr->cgroup_path) < 0) {
1044 ERROR("Could not set clone_children to 1 for cpuset hierarchy in pre-existing cgroup.");
1045 goto cleanup_from_error;
1046 }
1047 if (!init_cpuset_if_needed(info_ptr->designated_mount_point, info_ptr->cgroup_path)) {
1048 ERROR("Failed to initialize cpuset in pre-existing '%s'.", info_ptr->cgroup_path);
1049 goto cleanup_from_error;
1050 }
1051
1052 skip:
1053 /* already existed but path component of pattern didn't contain '%n',
1054 * so this is not an error; but then we don't need current_entire_path
1055 * anymore...
1056 */
1057 free(current_entire_path);
1058 current_entire_path = NULL;
1059 }
1060 }
1061
1062 /* save path so far */
1063 free(path_so_far);
1064 path_so_far = strdup(current_subpath);
1065 if (!path_so_far)
1066 goto cleanup_from_error;
1067
1068 /* cleanup */
1069 if (current_component != current_subpath)
1070 free(current_subpath);
1071 if (current_component != p_eff)
1072 free(current_component);
1073 current_component = current_subpath = NULL;
1074 continue;
1075
1076 cleanup_from_error:
1077 /* called if an error occurred in the loop, so we
1078 * do some additional cleanup here
1079 */
1080 saved_errno = errno;
1081 if (current_component != current_subpath)
1082 free(current_subpath);
1083 if (current_component != p_eff)
1084 free(current_component);
1085 free(current_entire_path);
1086 errno = saved_errno;
1087 goto out_initial_error;
1088 }
1089
1090 /* we're done, now update the paths */
1091 for (i = 0, info_ptr = base_info; info_ptr; info_ptr = info_ptr->next, i++) {
1092 if (!info_ptr->hierarchy)
1093 continue;
1094 /* ignore legacy 'ns' subsystem here, lxc_cgroup_create_legacy
1095 * will take care of it
1096 * Since we do a continue in above loop, new_cgroup_paths[i] is
1097 * unset anyway, as is new_cgroup_paths_sub[i]
1098 */
1099 if (lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1100 continue;
1101 free(info_ptr->cgroup_path);
1102 info_ptr->cgroup_path = new_cgroup_paths[i];
1103 info_ptr->cgroup_path_sub = new_cgroup_paths_sub[i];
1104 }
1105 /* don't use lxc_free_array since we used the array members
1106 * to store them in our result...
1107 */
1108 free(new_cgroup_paths);
1109 free(new_cgroup_paths_sub);
1110 free(path_so_far);
1111 lxc_free_array((void **)cgroup_path_components, free);
1112 return base_info;
1113
1114 out_initial_error:
1115 saved_errno = errno;
1116 free(path_so_far);
1117 lxc_cgroup_process_info_free_and_remove(base_info, NULL);
1118 lxc_free_array((void **)new_cgroup_paths, free);
1119 lxc_free_array((void **)new_cgroup_paths_sub, free);
1120 lxc_free_array((void **)cgroup_path_components, free);
1121 errno = saved_errno;
1122 return NULL;
1123 }
1124
1125 static int lxc_cgroup_create_legacy(struct cgroup_process_info *base_info, const char *name, pid_t pid)
1126 {
1127 struct cgroup_process_info *info_ptr;
1128 int r;
1129
1130 for (info_ptr = base_info; info_ptr; info_ptr = info_ptr->next) {
1131 if (!info_ptr->hierarchy)
1132 continue;
1133
1134 if (!lxc_string_in_array("ns", (const char **)info_ptr->hierarchy->subsystems))
1135 continue;
1136 /*
1137 * For any path which has ns cgroup mounted, handler->pid is already
1138 * moved into a container called '%d % (handler->pid)'. Rename it to
1139 * the cgroup name and record that.
1140 */
1141 char *tmp = cgroup_rename_nsgroup((const char *)info_ptr->designated_mount_point->mount_point,
1142 info_ptr->cgroup_path, pid, name);
1143 if (!tmp)
1144 return -1;
1145 free(info_ptr->cgroup_path);
1146 info_ptr->cgroup_path = tmp;
1147 r = lxc_grow_array((void ***)&info_ptr->created_paths, &info_ptr->created_paths_capacity, info_ptr->created_paths_count + 1, 8);
1148 if (r < 0)
1149 return -1;
1150 tmp = strdup(tmp);
1151 if (!tmp)
1152 return -1;
1153 info_ptr->created_paths[info_ptr->created_paths_count++] = tmp;
1154 }
1155 return 0;
1156 }
1157
1158 /* get the cgroup membership of a given container */
1159 static struct cgroup_process_info *lxc_cgroup_get_container_info(const char *name, const char *lxcpath, struct cgroup_meta_data *meta_data)
1160 {
1161 struct cgroup_process_info *result = NULL;
1162 int saved_errno = 0;
1163 size_t i;
1164 struct cgroup_process_info **cptr = &result;
1165 struct cgroup_process_info *entry = NULL;
1166 char *path = NULL;
1167
1168 for (i = 0; i <= meta_data->maximum_hierarchy; i++) {
1169 struct cgroup_hierarchy *h = meta_data->hierarchies[i];
1170 if (!h || !h->used)
1171 continue;
1172
1173 /* use the command interface to look for the cgroup */
1174 path = lxc_cmd_get_cgroup_path(name, lxcpath, h->subsystems[0]);
1175 if (!path) {
1176 h->used = false;
1177 continue;
1178 }
1179
1180 entry = calloc(1, sizeof(struct cgroup_process_info));
1181 if (!entry)
1182 goto out_error;
1183 entry->meta_ref = lxc_cgroup_get_meta(meta_data);
1184 entry->hierarchy = h;
1185 entry->cgroup_path = path;
1186 path = NULL;
1187
1188 /* it is not an error if we don't find anything here,
1189 * it is up to the caller to decide what to do in that
1190 * case */
1191 entry->designated_mount_point = lxc_cgroup_find_mount_point(h, entry->cgroup_path, true);
1192
1193 *cptr = entry;
1194 cptr = &entry->next;
1195 entry = NULL;
1196 }
1197
1198 return result;
1199 out_error:
1200 saved_errno = errno;
1201 free(path);
1202 lxc_cgroup_process_info_free(result);
1203 lxc_cgroup_process_info_free(entry);
1204 errno = saved_errno;
1205 return NULL;
1206 }
1207
1208 /* move a processs to the cgroups specified by the membership */
1209 static int lxc_cgroupfs_enter(struct cgroup_process_info *info, pid_t pid, bool enter_sub)
1210 {
1211 char pid_buf[32];
1212 char *cgroup_tasks_fn;
1213 int r;
1214 struct cgroup_process_info *info_ptr;
1215
1216 snprintf(pid_buf, 32, "%lu", (unsigned long)pid);
1217 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1218 if (!info_ptr->hierarchy)
1219 continue;
1220
1221 char *cgroup_path = (enter_sub && info_ptr->cgroup_path_sub) ?
1222 info_ptr->cgroup_path_sub :
1223 info_ptr->cgroup_path;
1224
1225 if (!info_ptr->designated_mount_point) {
1226 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, cgroup_path, true);
1227 if (!info_ptr->designated_mount_point) {
1228 SYSERROR("Could not add pid %lu to cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", (unsigned long)pid, cgroup_path);
1229 return -1;
1230 }
1231 }
1232
1233 cgroup_tasks_fn = cgroup_to_absolute_path(info_ptr->designated_mount_point, cgroup_path, "/tasks");
1234 if (!cgroup_tasks_fn) {
1235 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1236 return -1;
1237 }
1238
1239 r = lxc_write_to_file(cgroup_tasks_fn, pid_buf, strlen(pid_buf), false);
1240 free(cgroup_tasks_fn);
1241 if (r < 0 && is_crucial_hierarchy(info_ptr->hierarchy)) {
1242 SYSERROR("Could not add pid %lu to cgroup %s: internal error", (unsigned long)pid, cgroup_path);
1243 return -1;
1244 }
1245 }
1246
1247 return 0;
1248 }
1249
1250 /* free process membership information */
1251 void lxc_cgroup_process_info_free(struct cgroup_process_info *info)
1252 {
1253 struct cgroup_process_info *next;
1254 if (!info)
1255 return;
1256 next = info->next;
1257 lxc_cgroup_put_meta(info->meta_ref);
1258 free(info->cgroup_path);
1259 free(info->cgroup_path_sub);
1260 lxc_free_array((void **)info->created_paths, free);
1261 free(info);
1262 lxc_cgroup_process_info_free(next);
1263 }
1264
1265 /* free process membership information and remove cgroups that were created */
1266 void lxc_cgroup_process_info_free_and_remove(struct cgroup_process_info *info, struct lxc_conf *conf)
1267 {
1268 struct cgroup_process_info *next;
1269 char **pp;
1270 if (!info)
1271 return;
1272 next = info->next;
1273 {
1274 struct cgroup_mount_point *mp = info->designated_mount_point;
1275 if (!mp)
1276 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1277 if (mp)
1278 /* ignore return value here, perhaps we created the
1279 * '/lxc' cgroup in this container but another container
1280 * is still running (for example)
1281 */
1282 (void)remove_cgroup(mp, info->cgroup_path, true, conf);
1283 }
1284 for (pp = info->created_paths; pp && *pp; pp++);
1285 for ((void)(pp && --pp); info->created_paths && pp >= info->created_paths; --pp) {
1286 free(*pp);
1287 }
1288 free(info->created_paths);
1289 lxc_cgroup_put_meta(info->meta_ref);
1290 free(info->cgroup_path);
1291 free(info->cgroup_path_sub);
1292 free(info);
1293 lxc_cgroup_process_info_free_and_remove(next, conf);
1294 }
1295
1296 static char *lxc_cgroup_get_hierarchy_path_data(const char *subsystem, struct cgfs_data *d)
1297 {
1298 struct cgroup_process_info *info = d->info;
1299 info = find_info_for_subsystem(info, subsystem);
1300 if (!info)
1301 return NULL;
1302 prune_init_scope(info->cgroup_path);
1303 return info->cgroup_path;
1304 }
1305
1306 static char *lxc_cgroup_get_hierarchy_abs_path_data(const char *subsystem, struct cgfs_data *d)
1307 {
1308 struct cgroup_process_info *info = d->info;
1309 struct cgroup_mount_point *mp = NULL;
1310
1311 info = find_info_for_subsystem(info, subsystem);
1312 if (!info)
1313 return NULL;
1314 if (info->designated_mount_point) {
1315 mp = info->designated_mount_point;
1316 } else {
1317 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1318 if (!mp)
1319 return NULL;
1320 }
1321 return cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1322 }
1323
1324 static char *lxc_cgroup_get_hierarchy_abs_path(const char *subsystem, const char *name, const char *lxcpath)
1325 {
1326 struct cgroup_meta_data *meta;
1327 struct cgroup_process_info *base_info, *info;
1328 struct cgroup_mount_point *mp;
1329 char *result = NULL;
1330
1331 meta = lxc_cgroup_load_meta();
1332 if (!meta)
1333 return NULL;
1334 base_info = lxc_cgroup_get_container_info(name, lxcpath, meta);
1335 if (!base_info)
1336 goto out1;
1337 info = find_info_for_subsystem(base_info, subsystem);
1338 if (!info)
1339 goto out2;
1340 if (info->designated_mount_point) {
1341 mp = info->designated_mount_point;
1342 } else {
1343 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1344 if (!mp)
1345 goto out3;
1346 }
1347 result = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1348 out3:
1349 out2:
1350 lxc_cgroup_process_info_free(base_info);
1351 out1:
1352 lxc_cgroup_put_meta(meta);
1353 return result;
1354 }
1355
/*
 * Write value to the cgroup control file `filename` of the cgroup recorded
 * in d.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no '.'). errno is preset to ENOENT before the lookup,
 * and the errno left by do_cgroup_set() is preserved across the cleanup.
 *
 * Returns the result of do_cgroup_set(), or -1 if the cgroup path could
 * not be determined.
 */
static int lxc_cgroup_set_data(const char *filename, const char *value, struct cgfs_data *d)
{
	size_t subsys_len = strcspn(filename, ".");
	char *subsys = alloca(subsys_len + 1);
	char *abs_path;
	int saved_errno;
	int rc;

	memcpy(subsys, filename, subsys_len);
	subsys[subsys_len] = '\0';

	errno = ENOENT;
	abs_path = lxc_cgroup_get_hierarchy_abs_path_data(subsys, d);
	if (!abs_path)
		return -1;

	rc = do_cgroup_set(abs_path, filename, value);
	saved_errno = errno;
	free(abs_path);
	errno = saved_errno;

	return rc;
}
1376
/*
 * Write value to the cgroup control file `filename` of the container
 * identified by name and lxcpath.
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no '.').
 *
 * Returns the result of do_cgroup_set(), or -1 if the container's cgroup
 * path for that subsystem could not be determined.
 */
static int lxc_cgroupfs_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	size_t subsys_len = strcspn(filename, ".");
	char *subsys = alloca(subsys_len + 1);
	char *abs_path;
	int rc;

	memcpy(subsys, filename, subsys_len);
	subsys[subsys_len] = '\0';

	abs_path = lxc_cgroup_get_hierarchy_abs_path(subsys, name, lxcpath);
	if (!abs_path)
		return -1;

	rc = do_cgroup_set(abs_path, filename, value);
	free(abs_path);

	return rc;
}
1394
/*
 * Read the cgroup control file `filename` of the container identified by
 * name and lxcpath into value (a buffer of len bytes).
 *
 * The subsystem is the part of filename before the first '.' (the whole
 * name if there is no '.').
 *
 * Returns the result of do_cgroup_get(), or -1 if the container's cgroup
 * path for that subsystem could not be determined.
 */
static int lxc_cgroupfs_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	size_t subsys_len = strcspn(filename, ".");
	char *subsys = alloca(subsys_len + 1);
	char *abs_path;
	int rc;

	memcpy(subsys, filename, subsys_len);
	subsys[subsys_len] = '\0';

	abs_path = lxc_cgroup_get_hierarchy_abs_path(subsys, name, lxcpath);
	if (!abs_path)
		return -1;

	rc = do_cgroup_get(abs_path, filename, value, len);
	free(abs_path);

	return rc;
}
1412
1413 static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type)
1414 {
1415 size_t bufsz = strlen(root) + sizeof("/sys/fs/cgroup");
1416 char *path = NULL;
1417 char **parts = NULL;
1418 char *dirname = NULL;
1419 char *abs_path = NULL;
1420 char *abs_path2 = NULL;
1421 struct cgfs_data *cgfs_d;
1422 struct cgroup_process_info *info, *base_info;
1423 int r, saved_errno = 0;
1424 struct lxc_handler *handler = hdata;
1425
1426 if (cgns_supported())
1427 return true;
1428
1429 cgfs_d = handler->cgroup_data;
1430 if (!cgfs_d)
1431 return false;
1432 base_info = cgfs_d->info;
1433
1434 /* If we get passed the _NOSPEC types, we default to _MIXED, since we don't
1435 * have access to the lxc_conf object at this point. It really should be up
1436 * to the caller to fix this, but this doesn't really hurt.
1437 */
1438 if (type == LXC_AUTO_CGROUP_FULL_NOSPEC)
1439 type = LXC_AUTO_CGROUP_FULL_MIXED;
1440 else if (type == LXC_AUTO_CGROUP_NOSPEC)
1441 type = LXC_AUTO_CGROUP_MIXED;
1442
1443 if (type < LXC_AUTO_CGROUP_RO || type > LXC_AUTO_CGROUP_FULL_MIXED) {
1444 ERROR("could not mount cgroups into container: invalid type specified internally");
1445 errno = EINVAL;
1446 return false;
1447 }
1448
1449 path = calloc(1, bufsz);
1450 if (!path)
1451 return false;
1452 snprintf(path, bufsz, "%s/sys/fs/cgroup", root);
1453 r = safe_mount("cgroup_root", path, "tmpfs",
1454 MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME,
1455 "size=10240k,mode=755",
1456 root);
1457 if (r < 0) {
1458 SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container");
1459 return false;
1460 }
1461
1462 /* now mount all the hierarchies we care about */
1463 for (info = base_info; info; info = info->next) {
1464 size_t subsystem_count, i;
1465 struct cgroup_mount_point *mp = info->designated_mount_point;
1466
1467 if (!info->hierarchy)
1468 continue;
1469
1470 if (!mountpoint_is_accessible(mp))
1471 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, true);
1472
1473 if (!mp) {
1474 SYSERROR("could not find original mount point for cgroup hierarchy while trying to mount cgroup filesystem");
1475 goto out_error;
1476 }
1477
1478 subsystem_count = lxc_array_len((void **)info->hierarchy->subsystems);
1479 parts = calloc(subsystem_count + 1, sizeof(char *));
1480 if (!parts)
1481 goto out_error;
1482
1483 for (i = 0; i < subsystem_count; i++) {
1484 if (!strncmp(info->hierarchy->subsystems[i], "name=", 5))
1485 parts[i] = info->hierarchy->subsystems[i] + 5;
1486 else
1487 parts[i] = info->hierarchy->subsystems[i];
1488 }
1489 dirname = lxc_string_join(",", (const char **)parts, false);
1490 if (!dirname)
1491 goto out_error;
1492
1493 /* create subsystem directory */
1494 abs_path = lxc_append_paths(path, dirname);
1495 if (!abs_path)
1496 goto out_error;
1497 r = mkdir_p(abs_path, 0755);
1498 if (r < 0 && errno != EEXIST) {
1499 SYSERROR("could not create cgroup subsystem directory /sys/fs/cgroup/%s", dirname);
1500 goto out_error;
1501 }
1502
1503 abs_path2 = lxc_append_paths(abs_path, info->cgroup_path);
1504 if (!abs_path2)
1505 goto out_error;
1506
1507 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_RW || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1508 /* bind-mount the cgroup entire filesystem there */
1509 if (strcmp(mp->mount_prefix, "/") != 0) {
1510 /* FIXME: maybe we should just try to remount the entire hierarchy
1511 * with a regular mount command? may that works? */
1512 ERROR("could not automatically mount cgroup-full to /sys/fs/cgroup/%s: host has no mount point for this cgroup filesystem that has access to the root cgroup", dirname);
1513 goto out_error;
1514 }
1515 r = mount(mp->mount_point, abs_path, "none", MS_BIND, 0);
1516 if (r < 0) {
1517 SYSERROR("error bind-mounting %s to %s", mp->mount_point, abs_path);
1518 goto out_error;
1519 }
1520 /* main cgroup path should be read-only */
1521 if (type == LXC_AUTO_CGROUP_FULL_RO || type == LXC_AUTO_CGROUP_FULL_MIXED) {
1522 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1523 if (r < 0) {
1524 SYSERROR("error re-mounting %s readonly", abs_path);
1525 goto out_error;
1526 }
1527 }
1528 /* own cgroup should be read-write */
1529 if (type == LXC_AUTO_CGROUP_FULL_MIXED) {
1530 r = mount(abs_path2, abs_path2, NULL, MS_BIND, NULL);
1531 if (r < 0) {
1532 SYSERROR("error bind-mounting %s onto itself", abs_path2);
1533 goto out_error;
1534 }
1535 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND, NULL);
1536 if (r < 0) {
1537 SYSERROR("error re-mounting %s readwrite", abs_path2);
1538 goto out_error;
1539 }
1540 }
1541 } else {
1542 /* create path for container's cgroup */
1543 r = mkdir_p(abs_path2, 0755);
1544 if (r < 0 && errno != EEXIST) {
1545 SYSERROR("could not create cgroup directory /sys/fs/cgroup/%s%s", dirname, info->cgroup_path);
1546 goto out_error;
1547 }
1548
1549 /* for read-only and mixed cases, we have to bind-mount the tmpfs directory
1550 * that points to the hierarchy itself (i.e. /sys/fs/cgroup/cpu etc.) onto
1551 * itself and then bind-mount it read-only, since we keep the tmpfs itself
1552 * read-write (see comment below)
1553 */
1554 if (type == LXC_AUTO_CGROUP_MIXED || type == LXC_AUTO_CGROUP_RO) {
1555 r = mount(abs_path, abs_path, NULL, MS_BIND, NULL);
1556 if (r < 0) {
1557 SYSERROR("error bind-mounting %s onto itself", abs_path);
1558 goto out_error;
1559 }
1560 r = mount(NULL, abs_path, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1561 if (r < 0) {
1562 SYSERROR("error re-mounting %s readonly", abs_path);
1563 goto out_error;
1564 }
1565 }
1566
1567 free(abs_path);
1568 abs_path = NULL;
1569
1570 /* bind-mount container's cgroup to that directory */
1571 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1572 if (!abs_path)
1573 goto out_error;
1574 r = mount(abs_path, abs_path2, "none", MS_BIND, 0);
1575 if (r < 0 && is_crucial_hierarchy(info->hierarchy)) {
1576 SYSERROR("error bind-mounting %s to %s", abs_path, abs_path2);
1577 goto out_error;
1578 }
1579 if (type == LXC_AUTO_CGROUP_RO) {
1580 r = mount(NULL, abs_path2, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL);
1581 if (r < 0) {
1582 SYSERROR("error re-mounting %s readonly", abs_path2);
1583 goto out_error;
1584 }
1585 }
1586 }
1587
1588 free(abs_path);
1589 free(abs_path2);
1590 abs_path = NULL;
1591 abs_path2 = NULL;
1592
1593 /* add symlinks for every single subsystem */
1594 if (subsystem_count > 1) {
1595 for (i = 0; i < subsystem_count; i++) {
1596 abs_path = lxc_append_paths(path, parts[i]);
1597 if (!abs_path)
1598 goto out_error;
1599 r = symlink(dirname, abs_path);
1600 if (r < 0)
1601 WARN("could not create symlink %s -> %s in /sys/fs/cgroup of container", parts[i], dirname);
1602 free(abs_path);
1603 abs_path = NULL;
1604 }
1605 }
1606 free(dirname);
1607 free(parts);
1608 dirname = NULL;
1609 parts = NULL;
1610 }
1611
1612 /* We used to remount the entire tmpfs readonly if any :ro or
1613 * :mixed mode was specified. However, Ubuntu's mountall has the
1614 * unfortunate behavior to block bootup if /sys/fs/cgroup is
1615 * mounted read-only and cannot be remounted read-write.
1616 * (mountall reads /lib/init/fstab and tries to (re-)mount all of
1617 * these if they are not already mounted with the right options;
1618 * it contains an entry for /sys/fs/cgroup. In case it can't do
1619 * that, it prompts for the user to either manually fix it or
1620 * boot anyway. But without user input, booting of the container
1621 * hangs.)
1622 *
1623 * Instead of remounting the entire tmpfs readonly, we only
1624 * remount the paths readonly that are part of the cgroup
1625 * hierarchy.
1626 */
1627
1628 free(path);
1629
1630 return true;
1631
1632 out_error:
1633 saved_errno = errno;
1634 free(path);
1635 free(dirname);
1636 free(parts);
1637 free(abs_path);
1638 free(abs_path2);
1639 errno = saved_errno;
1640 return false;
1641 }
1642
1643 static int cgfs_nrtasks(void *hdata)
1644 {
1645 struct cgfs_data *d = hdata;
1646 struct cgroup_process_info *info;
1647 struct cgroup_mount_point *mp = NULL;
1648 char *abs_path = NULL;
1649 int ret;
1650
1651 if (!d) {
1652 errno = ENOENT;
1653 return -1;
1654 }
1655
1656 info = d->info;
1657 if (!info) {
1658 errno = ENOENT;
1659 return -1;
1660 }
1661
1662 if (info->designated_mount_point) {
1663 mp = info->designated_mount_point;
1664 } else {
1665 mp = lxc_cgroup_find_mount_point(info->hierarchy, info->cgroup_path, false);
1666 if (!mp)
1667 return -1;
1668 }
1669
1670 abs_path = cgroup_to_absolute_path(mp, info->cgroup_path, NULL);
1671 if (!abs_path)
1672 return -1;
1673
1674 ret = cgroup_recursive_task_count(abs_path);
1675 free(abs_path);
1676 return ret;
1677 }
1678
1679 static struct cgroup_process_info *
1680 lxc_cgroup_process_info_getx(const char *proc_pid_cgroup_str,
1681 struct cgroup_meta_data *meta)
1682 {
1683 struct cgroup_process_info *result = NULL;
1684 FILE *proc_pid_cgroup = NULL;
1685 char *line = NULL;
1686 size_t sz = 0;
1687 int saved_errno = 0;
1688 struct cgroup_process_info **cptr = &result;
1689 struct cgroup_process_info *entry = NULL;
1690
1691 proc_pid_cgroup = fopen_cloexec(proc_pid_cgroup_str, "r");
1692 if (!proc_pid_cgroup)
1693 return NULL;
1694
1695 while (getline(&line, &sz, proc_pid_cgroup) != -1) {
1696 /* file format: hierarchy:subsystems:group */
1697 char *colon1;
1698 char *colon2;
1699 char *endptr;
1700 int hierarchy_number;
1701 struct cgroup_hierarchy *h = NULL;
1702
1703 if (!line[0])
1704 continue;
1705
1706 if (line[strlen(line) - 1] == '\n')
1707 line[strlen(line) - 1] = '\0';
1708
1709 colon1 = strchr(line, ':');
1710 if (!colon1)
1711 continue;
1712 *colon1++ = '\0';
1713 colon2 = strchr(colon1, ':');
1714 if (!colon2)
1715 continue;
1716 *colon2++ = '\0';
1717
1718 endptr = NULL;
1719
1720 /* With cgroupv2 /proc/self/cgroup can contain entries of the
1721 * form: 0::/
1722 * These entries need to be skipped.
1723 */
1724 if (!strcmp(colon1, ""))
1725 continue;
1726
1727 hierarchy_number = strtoul(line, &endptr, 10);
1728 if (!endptr || *endptr)
1729 continue;
1730
1731 if (hierarchy_number > meta->maximum_hierarchy) {
1732 /* we encountered a hierarchy we didn't have before,
1733 * so probably somebody remounted some stuff in the
1734 * mean time...
1735 */
1736 errno = EAGAIN;
1737 goto out_error;
1738 }
1739
1740 h = meta->hierarchies[hierarchy_number];
1741 if (!h) {
1742 /* we encountered a hierarchy that was thought to be
1743 * dead before, so probably somebody remounted some
1744 * stuff in the mean time...
1745 */
1746 errno = EAGAIN;
1747 goto out_error;
1748 }
1749
1750 /* we are told that we should ignore this hierarchy */
1751 if (!h->used)
1752 continue;
1753
1754 entry = calloc(1, sizeof(struct cgroup_process_info));
1755 if (!entry)
1756 goto out_error;
1757
1758 entry->meta_ref = lxc_cgroup_get_meta(meta);
1759 entry->hierarchy = h;
1760 entry->cgroup_path = strdup(colon2);
1761 if (!entry->cgroup_path)
1762 goto out_error;
1763 prune_init_scope(entry->cgroup_path);
1764
1765 *cptr = entry;
1766 cptr = &entry->next;
1767 entry = NULL;
1768 }
1769
1770 fclose(proc_pid_cgroup);
1771 free(line);
1772 return result;
1773
1774 out_error:
1775 saved_errno = errno;
1776 if (proc_pid_cgroup)
1777 fclose(proc_pid_cgroup);
1778 lxc_cgroup_process_info_free(result);
1779 lxc_cgroup_process_info_free(entry);
1780 free(line);
1781 errno = saved_errno;
1782 return NULL;
1783 }
1784
/*
 * Turn a comma-separated mount option string into a NULL-terminated,
 * heap-allocated array of subsystem names.
 *
 * A token that starts with "name=" or that is listed in kernel_list is
 * copied as-is. Any other token is stored with "name=" prepended (e.g.
 * 'systemd' becomes 'name=systemd', matching how the mount is named).
 *
 * Returns the array (NULL if mount_options contains no token), or NULL on
 * allocation failure with errno preserved across the cleanup.
 */
static char **subsystems_from_mount_options(const char *mount_options,
					    char **kernel_list)
{
	char **list = NULL;
	size_t list_capacity = 0;
	size_t list_count = 0;
	char *state = NULL;
	char *scratch;
	char *tok;
	int saved_errno;

	/* strtok_r() modifies its input, so work on a private copy */
	scratch = alloca(strlen(mount_options) + 1);
	strcpy(scratch, mount_options);

	for (tok = strtok_r(scratch, ",", &state); tok; tok = strtok_r(NULL, ",", &state)) {
		char *entry;

		if (lxc_grow_array((void ***)&list, &list_capacity, list_count + 1, 12) < 0)
			goto fail;
		list[list_count + 1] = NULL;

		if (!strncmp(tok, "name=", 5) || lxc_string_in_array(tok, (const char **)kernel_list)) {
			entry = strdup(tok);
		} else {
			/* 5 bytes for "name=" plus the terminating NUL */
			entry = malloc(strlen(tok) + 6);
			if (entry)
				sprintf(entry, "name=%s", tok);
		}

		list[list_count] = entry;
		if (!entry)
			goto fail;
		list_count++;
	}

	return list;

fail:
	saved_errno = errno;
	lxc_free_array((void **)list, free);
	errno = saved_errno;
	return NULL;
}
1828
1829 static void lxc_cgroup_mount_point_free(struct cgroup_mount_point *mp)
1830 {
1831 if (!mp)
1832 return;
1833 free(mp->mount_point);
1834 free(mp->mount_prefix);
1835 free(mp);
1836 }
1837
1838 static void lxc_cgroup_hierarchy_free(struct cgroup_hierarchy *h)
1839 {
1840 if (!h)
1841 return;
1842 if (h->subsystems) {
1843 lxc_free_array((void **)h->subsystems, free);
1844 h->subsystems = NULL;
1845 }
1846 if (h->all_mount_points) {
1847 free(h->all_mount_points);
1848 h->all_mount_points = NULL;
1849 }
1850 free(h);
1851 h = NULL;
1852 }
1853
/*
 * Check whether name is acceptable as a single cgroup path component.
 *
 * The names "." and ".." are rejected, as is any name containing a '/' or
 * a character outside the printable ASCII range 33..126. Space (32) is
 * excluded as well because it would break legacy lxc-ls.
 */
static bool is_valid_cgroup(const char *name)
{
	size_t i;

	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (i = 0; name[i] != '\0'; i++) {
		char c = name[i];

		if (c <= 32 || c >= 127 || c == '/')
			return false;
	}

	return true;
}
1867
1868 static int create_or_remove_cgroup(bool do_remove,
1869 struct cgroup_mount_point *mp, const char *path, int recurse,
1870 struct lxc_conf *conf)
1871 {
1872 int r, saved_errno = 0;
1873 char *buf = cgroup_to_absolute_path(mp, path, NULL);
1874 if (!buf)
1875 return -1;
1876
1877 /* create or remove directory */
1878 if (do_remove) {
1879 if (!dir_exists(buf))
1880 return 0;
1881 if (recurse) {
1882 if (conf && !lxc_list_empty(&conf->id_map))
1883 r = userns_exec_1(conf, rmdir_wrapper, buf,
1884 "rmdir_wrapper");
1885 else
1886 r = cgroup_rmdir(buf);
1887 } else
1888 r = rmdir(buf);
1889 } else
1890 r = mkdir_p(buf, 0777);
1891 saved_errno = errno;
1892 free(buf);
1893 errno = saved_errno;
1894 return r;
1895 }
1896
/*
 * Create the cgroup `path` below mount point mp. Thin wrapper around
 * create_or_remove_cgroup() in "create" mode, without recursion and
 * without a container configuration.
 */
static int create_cgroup(struct cgroup_mount_point *mp, const char *path)
{
	const bool do_remove = false;
	const int recurse = false;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, NULL);
}
1901
/*
 * Remove the cgroup `path` below mount point mp. Thin wrapper around
 * create_or_remove_cgroup() in "remove" mode; recurse and conf are passed
 * through unchanged.
 */
static int remove_cgroup(struct cgroup_mount_point *mp,
			 const char *path, bool recurse, struct lxc_conf *conf)
{
	const bool do_remove = true;

	return create_or_remove_cgroup(do_remove, mp, path, recurse, conf);
}
1907
1908 static char *cgroup_to_absolute_path(struct cgroup_mount_point *mp,
1909 const char *path, const char *suffix)
1910 {
1911 /* first we have to make sure we subtract the mount point's prefix */
1912 char *prefix = mp->mount_prefix;
1913 char *buf;
1914 ssize_t len, rv;
1915
1916 /* we want to make sure only absolute paths to cgroups are passed to us */
1917 if (path[0] != '/') {
1918 errno = EINVAL;
1919 return NULL;
1920 }
1921
1922 if (prefix && !strcmp(prefix, "/"))
1923 prefix = NULL;
1924
1925 /* prefix doesn't match */
1926 if (prefix && strncmp(prefix, path, strlen(prefix)) != 0) {
1927 errno = EINVAL;
1928 return NULL;
1929 }
1930 /* if prefix is /foo and path is /foobar */
1931 if (prefix && path[strlen(prefix)] != '/' && path[strlen(prefix)] != '\0') {
1932 errno = EINVAL;
1933 return NULL;
1934 }
1935
1936 /* remove prefix from path */
1937 path += prefix ? strlen(prefix) : 0;
1938
1939 len = strlen(mp->mount_point) + strlen(path) + (suffix ? strlen(suffix) : 0);
1940 buf = calloc(len + 1, 1);
1941 if (!buf)
1942 return NULL;
1943 rv = snprintf(buf, len + 1, "%s%s%s", mp->mount_point, path, suffix ? suffix : "");
1944 if (rv > len) {
1945 free(buf);
1946 errno = ENOMEM;
1947 return NULL;
1948 }
1949
1950 return buf;
1951 }
1952
1953 static struct cgroup_process_info *
1954 find_info_for_subsystem(struct cgroup_process_info *info, const char *subsystem)
1955 {
1956 struct cgroup_process_info *info_ptr;
1957 for (info_ptr = info; info_ptr; info_ptr = info_ptr->next) {
1958 struct cgroup_hierarchy *h = info_ptr->hierarchy;
1959 if (!h)
1960 continue;
1961 if (lxc_string_in_array(subsystem, (const char **)h->subsystems))
1962 return info_ptr;
1963 }
1964 errno = ENOENT;
1965 return NULL;
1966 }
1967
/*
 * Read the file sub_filename inside the directory cgroup_path into value
 * (a buffer of len bytes).
 *
 * Returns the result of lxc_read_from_file(), or -1 if the file name could
 * not be assembled. errno from the read is preserved across the cleanup.
 */
static int do_cgroup_get(const char *cgroup_path, const char *sub_filename,
			 char *value, size_t len)
{
	const char *components[3];
	char *file;
	int saved_errno;
	int rc;

	components[0] = cgroup_path;
	components[1] = sub_filename;
	components[2] = NULL;

	file = lxc_string_join("/", components, false);
	if (!file)
		return -1;

	rc = lxc_read_from_file(file, value, len);

	saved_errno = errno;
	free(file);
	errno = saved_errno;

	return rc;
}
1989
/*
 * Write the string value to the file sub_filename inside the directory
 * cgroup_path.
 *
 * Returns the result of lxc_write_to_file(), or -1 if the file name could
 * not be assembled. errno from the write is preserved across the cleanup.
 */
static int do_cgroup_set(const char *cgroup_path, const char *sub_filename,
			 const char *value)
{
	const char *components[3];
	char *file;
	int saved_errno;
	int rc;

	components[0] = cgroup_path;
	components[1] = sub_filename;
	components[2] = NULL;

	file = lxc_string_join("/", components, false);
	if (!file)
		return -1;

	rc = lxc_write_to_file(file, value, strlen(value), false);

	saved_errno = errno;
	free(file);
	errno = saved_errno;

	return rc;
}
2011
2012 static int do_setup_cgroup_limits(struct cgfs_data *d,
2013 struct lxc_list *cgroup_settings, bool do_devices)
2014 {
2015 struct lxc_list *iterator, *sorted_cgroup_settings, *next;
2016 struct lxc_cgroup *cg;
2017 int ret = -1;
2018
2019 if (lxc_list_empty(cgroup_settings))
2020 return 0;
2021
2022 sorted_cgroup_settings = sort_cgroup_settings(cgroup_settings);
2023 if (!sorted_cgroup_settings) {
2024 return -1;
2025 }
2026
2027 lxc_list_for_each(iterator, sorted_cgroup_settings) {
2028 cg = iterator->elem;
2029
2030 if (do_devices == !strncmp("devices", cg->subsystem, 7)) {
2031 if (strcmp(cg->subsystem, "devices.deny") == 0 &&
2032 cgroup_devices_has_allow_or_deny(d, cg->value, false))
2033 continue;
2034 if (strcmp(cg->subsystem, "devices.allow") == 0 &&
2035 cgroup_devices_has_allow_or_deny(d, cg->value, true))
2036 continue;
2037 if (lxc_cgroup_set_data(cg->subsystem, cg->value, d)) {
2038 if (do_devices && (errno == EACCES || errno == EPERM)) {
2039 WARN("Error setting %s to %s for %s",
2040 cg->subsystem, cg->value, d->name);
2041 continue;
2042 }
2043 SYSERROR("Error setting %s to %s for %s",
2044 cg->subsystem, cg->value, d->name);
2045 goto out;
2046 }
2047 }
2048
2049 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
2050 }
2051
2052 ret = 0;
2053 INFO("cgroup has been setup");
2054 out:
2055 lxc_list_for_each_safe(iterator, sorted_cgroup_settings, next) {
2056 lxc_list_del(iterator);
2057 free(iterator);
2058 }
2059 free(sorted_cgroup_settings);
2060 return ret;
2061 }
2062
/*
 * Check the container's devices.list to decide whether a devices.allow or
 * devices.deny setting can be skipped.
 *
 * @d:         cgroup data of the container
 * @v:         the value about to be written
 * @for_allow: true when v is meant for devices.allow, false for devices.deny
 *
 * For a deny (for_allow == false) only the values "a" and "a *:* rwm" are
 * examined; the result is true unless devices.list contains the line
 * "a *:* rwm". For an allow the result is true when devices.list contains
 * "a *:* rwm" or a line equal to v.
 *
 * Returns false whenever devices.list cannot be located or opened.
 */
static bool cgroup_devices_has_allow_or_deny(struct cgfs_data *d,
					     char *v, bool for_allow)
{
	char *cgroup_dir;
	char *path;
	FILE *devices_list;
	char *line = NULL;
	size_t sz = 0;
	bool ret = !for_allow;
	const char *parts[3] = {
		NULL,
		"devices.list",
		NULL
	};

	/* XXX FIXME if users could use something other than 'lxc.devices.deny =
	 * a'.  not sure they ever do, but they *could* right now, I'm assuming
	 * they do NOT
	 */
	if (!for_allow && strcmp(v, "a") != 0 && strcmp(v, "a *:* rwm") != 0)
		return false;

	cgroup_dir = lxc_cgroup_get_hierarchy_abs_path_data("devices", d);
	if (!cgroup_dir)
		return false;

	parts[0] = cgroup_dir;
	path = lxc_string_join("/", parts, false);
	/* the directory string was only needed to build 'path'; release it
	 * here so that no later exit path can leak it
	 */
	free(cgroup_dir);
	if (!path)
		return false;

	devices_list = fopen_cloexec(path, "r");
	if (!devices_list) {
		free(path);
		return false;
	}

	while (getline(&line, &sz, devices_list) != -1) {
		size_t len = strlen(line);

		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';

		if (strcmp(line, "a *:* rwm") == 0) {
			ret = for_allow;
			break;
		}
		if (for_allow && strcmp(line, v) == 0) {
			ret = true;
			break;
		}
	}

	fclose(devices_list);
	free(line);
	free(path);
	return ret;
}
2118
/*
 * Sum the line counts of every file named "tasks" found in @cgroup_path
 * and, recursively, in all of its subdirectories.
 *
 * Returns the total, 0 when @cgroup_path cannot be opened as a directory,
 * or -1 when building or stat()ing an entry path fails. Negative results
 * from a subdirectory or from the line counter are ignored.
 */
static int cgroup_recursive_task_count(const char *cgroup_path)
{
	DIR *dir;
	struct dirent *entry;
	int total = 0;

	dir = opendir(cgroup_path);
	if (!dir)
		return 0;

	for (entry = readdir(dir); entry; entry = readdir(dir)) {
		const char *parts[3] = { cgroup_path, entry->d_name, NULL };
		struct stat st;
		char *child;
		int count = 0;

		if (strcmp(entry->d_name, ".") == 0 ||
		    strcmp(entry->d_name, "..") == 0)
			continue;

		child = lxc_string_join("/", parts, false);
		if (!child) {
			total = -1;
			break;
		}

		if (stat(child, &st) < 0) {
			free(child);
			total = -1;
			break;
		}

		if (S_ISDIR(st.st_mode))
			count = cgroup_recursive_task_count(child);
		else if (strcmp(entry->d_name, "tasks") == 0)
			count = lxc_count_file_lines(child);

		/* a failed sub-count simply contributes nothing */
		if (count >= 0)
			total += count;

		free(child);
	}

	closedir(dir);
	return total;
}
2166
2167 static int handle_cgroup_settings(struct cgroup_mount_point *mp,
2168 char *cgroup_path)
2169 {
2170 int r, saved_errno = 0;
2171 char buf[2];
2172
2173 mp->need_cpuset_init = false;
2174
2175 /* If this is the memory cgroup, we want to enforce hierarchy.
2176 * But don't fail if for some reason we can't.
2177 */
2178 if (lxc_string_in_array("memory", (const char **)mp->hierarchy->subsystems)) {
2179 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/memory.use_hierarchy");
2180 if (cc_path) {
2181 r = lxc_read_from_file(cc_path, buf, 1);
2182 if (r < 1 || buf[0] != '1') {
2183 r = lxc_write_to_file(cc_path, "1", 1, false);
2184 if (r < 0)
2185 SYSERROR("failed to set memory.use_hierarchy to 1; continuing");
2186 }
2187 free(cc_path);
2188 }
2189 }
2190
2191 /* if this is a cpuset hierarchy, we have to set cgroup.clone_children in
2192 * the base cgroup, otherwise containers will start with an empty cpuset.mems
2193 * and cpuset.cpus and then
2194 */
2195 if (lxc_string_in_array("cpuset", (const char **)mp->hierarchy->subsystems)) {
2196 char *cc_path = cgroup_to_absolute_path(mp, cgroup_path, "/cgroup.clone_children");
2197 struct stat sb;
2198
2199 if (!cc_path)
2200 return -1;
2201 /* cgroup.clone_children is not available when running under
2202 * older kernel versions; in this case, we'll initialize
2203 * cpuset.cpus and cpuset.mems later, after the new cgroup
2204 * was created
2205 */
2206 if (stat(cc_path, &sb) != 0 && errno == ENOENT) {
2207 mp->need_cpuset_init = true;
2208 free(cc_path);
2209 return 0;
2210 }
2211 r = lxc_read_from_file(cc_path, buf, 1);
2212 if (r == 1 && buf[0] == '1') {
2213 free(cc_path);
2214 return 0;
2215 }
2216 r = lxc_write_to_file(cc_path, "1", 1, false);
2217 saved_errno = errno;
2218 free(cc_path);
2219 errno = saved_errno;
2220 return r < 0 ? -1 : 0;
2221 }
2222 return 0;
2223 }
2224
/*
 * Read file @fn into @buf (capacity @bufsize) and NUL-terminate the result.
 *
 * Returns the value reported by lxc_read_from_file(). When that value
 * equals @bufsize the last byte of the buffer is overwritten with the
 * terminator, so callers can detect truncation by comparing the return
 * value with @bufsize. Returns a negative value on read failure and -1 for
 * a zero-sized buffer.
 */
static int cgroup_read_from_file(const char *fn, char buf[], size_t bufsize)
{
	int nread;

	nread = lxc_read_from_file(fn, buf, bufsize);
	if (nread < 0) {
		SYSERROR("failed to read %s", fn);
		return nread;
	}

	if (nread != bufsize) {
		/* common case: there is room left for the terminator */
		buf[nread] = '\0';
		return nread;
	}

	if (bufsize == 0) {
		/* Callers don't do this, but regression/sanity check */
		ERROR("was not expecting 0 bufsize");
		return -1;
	}

	/* buffer completely filled, so obviously this wasn't empty */
	buf[bufsize - 1] = '\0';
	return nread;
}
2245
/*
 * Initialize the cpuset file @name (given with a leading '/', e.g.
 * "/cpuset.cpus") of cgroup @path with the value found in the file of the
 * same name in the parent cgroup.
 *
 * A file that already holds a non-empty value is left untouched.
 *
 * Returns true when the file had a value already or was written
 * successfully, false otherwise.
 */
static bool do_init_cpuset_file(struct cgroup_mount_point *mp,
				const char *path, const char *name)
{
	char value[1024];
	char *child_file, *parent_dir = NULL, *parent_file = NULL, *slash;
	int nread;
	bool ok = false;

	child_file = cgroup_to_absolute_path(mp, path, name);
	if (!child_file)
		return false;

	/* don't overwrite a non-empty value in the file */
	nread = cgroup_read_from_file(child_file, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (value[0] != '\0' && value[0] != '\n') {
		ok = true;
		goto out;
	}

	/* derive the parent cgroup by cutting off the last path component */
	parent_dir = strdup(path);
	if (!parent_dir)
		goto out;

	slash = strrchr(parent_dir, '/');
	if (!slash)
		goto out;
	if (slash == parent_dir)
		slash++; /* keep the '/' at the start */
	*slash = '\0';

	/* path to the same name in the parent cgroup */
	parent_file = cgroup_to_absolute_path(mp, parent_dir, name);
	if (!parent_file)
		goto out;

	/* copy from parent to child cgroup */
	nread = cgroup_read_from_file(parent_file, value, sizeof(value));
	if (nread < 0)
		goto out;
	if (nread == sizeof(value)) {
		/* If anyone actually sees this error, we can address it */
		ERROR("parent cpuset value too long");
		goto out;
	}

	ok = lxc_write_to_file(child_file, value, strlen(value), false) >= 0;
	if (!ok)
		SYSERROR("failed writing %s", child_file);

out:
	free(parent_dir);
	free(parent_file);
	free(child_file);
	return ok;
}
2302
2303 static bool init_cpuset_if_needed(struct cgroup_mount_point *mp,
2304 const char *path)
2305 {
2306 /* the files we have to handle here are only in cpuset hierarchies */
2307 if (!lxc_string_in_array("cpuset",
2308 (const char **)mp->hierarchy->subsystems))
2309 return true;
2310
2311 if (!mp->need_cpuset_init)
2312 return true;
2313
2314 return (do_init_cpuset_file(mp, path, "/cpuset.cpus") &&
2315 do_init_cpuset_file(mp, path, "/cpuset.mems") );
2316 }
2317
2318 static void print_cgfs_init_debuginfo(struct cgfs_data *d)
2319 {
2320 int i;
2321
2322 if (!getenv("LXC_DEBUG_CGFS"))
2323 return;
2324
2325 DEBUG("Cgroup information:");
2326 DEBUG(" container name: %s", d->name);
2327 if (!d->meta || !d->meta->hierarchies) {
2328 DEBUG(" No hierarchies found.");
2329 return;
2330 }
2331 DEBUG(" Controllers:");
2332 for (i = 0; i <= d->meta->maximum_hierarchy; i++) {
2333 char **p;
2334 struct cgroup_hierarchy *h = d->meta->hierarchies[i];
2335 if (!h) {
2336 DEBUG(" Empty hierarchy number %d.", i);
2337 continue;
2338 }
2339 for (p = h->subsystems; p && *p; p++) {
2340 DEBUG(" %2d: %s", i, *p);
2341 }
2342 }
2343 }
2344
2345 struct cgroup_ops *cgfs_ops_init(void)
2346 {
2347 return &cgfs_ops;
2348 }
2349
2350 static void *cgfs_init(struct lxc_handler *handler)
2351 {
2352 struct cgfs_data *d;
2353
2354 d = malloc(sizeof(*d));
2355 if (!d)
2356 return NULL;
2357
2358 memset(d, 0, sizeof(*d));
2359 d->name = strdup(handler->name);
2360 if (!d->name)
2361 goto err1;
2362
2363 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
2364
2365 d->meta = lxc_cgroup_load_meta();
2366 if (!d->meta) {
2367 ERROR("cgroupfs failed to detect cgroup metadata");
2368 goto err2;
2369 }
2370
2371 print_cgfs_init_debuginfo(d);
2372
2373 return d;
2374
2375 err2:
2376 free(d->name);
2377 err1:
2378 free(d);
2379 return NULL;
2380 }
2381
2382 static void cgfs_destroy(void *hdata, struct lxc_conf *conf)
2383 {
2384 struct cgfs_data *d = hdata;
2385
2386 if (!d)
2387 return;
2388 free(d->name);
2389 lxc_cgroup_process_info_free_and_remove(d->info, conf);
2390 lxc_cgroup_put_meta(d->meta);
2391 free(d);
2392 }
2393
2394 static inline bool cgfs_create(void *hdata)
2395 {
2396 struct cgfs_data *d = hdata;
2397 struct cgroup_process_info *i;
2398 struct cgroup_meta_data *md;
2399
2400 if (!d)
2401 return false;
2402 md = d->meta;
2403 i = lxc_cgroupfs_create(d->name, d->cgroup_pattern, md, NULL);
2404 if (!i)
2405 return false;
2406 d->info = i;
2407 return true;
2408 }
2409
2410 static inline bool cgfs_enter(void *hdata, pid_t pid)
2411 {
2412 struct cgfs_data *d = hdata;
2413 struct cgroup_process_info *i;
2414 int ret;
2415
2416 if (!d)
2417 return false;
2418 i = d->info;
2419 ret = lxc_cgroupfs_enter(i, pid, false);
2420
2421 return ret == 0;
2422 }
2423
2424 static inline bool cgfs_create_legacy(void *hdata, pid_t pid)
2425 {
2426 struct cgfs_data *d = hdata;
2427 struct cgroup_process_info *i;
2428
2429 if (!d)
2430 return false;
2431 i = d->info;
2432 if (lxc_cgroup_create_legacy(i, d->name, pid) < 0) {
2433 ERROR("failed to create legacy ns cgroups for '%s'", d->name);
2434 return false;
2435 }
2436 return true;
2437 }
2438
/*
 * Look up the container's cgroup path for @subsystem via
 * lxc_cgroup_get_hierarchy_path_data().
 *
 * Returns that function's result, or NULL when @hdata is NULL.
 */
static const char *cgfs_get_cgroup(void *hdata, const char *subsystem)
{
	struct cgfs_data *data = hdata;

	return data ? lxc_cgroup_get_hierarchy_path_data(subsystem, data) : NULL;
}
2447
2448 static bool cgfs_escape(void *hdata)
2449 {
2450 struct cgroup_meta_data *md;
2451 int i;
2452 bool ret = false;
2453
2454 md = lxc_cgroup_load_meta();
2455 if (!md)
2456 return false;
2457
2458 for (i = 0; i <= md->maximum_hierarchy; i++) {
2459 struct cgroup_hierarchy *h = md->hierarchies[i];
2460 struct cgroup_mount_point *mp;
2461 char *tasks;
2462 FILE *f;
2463 int written;
2464
2465 if (!h) {
2466 WARN("not escaping hierarchy %d", i);
2467 continue;
2468 }
2469
2470 mp = lxc_cgroup_find_mount_point(h, "/", true);
2471 if (!mp)
2472 goto out;
2473
2474 tasks = cgroup_to_absolute_path(mp, "/", "tasks");
2475 if (!tasks)
2476 goto out;
2477
2478 f = fopen(tasks, "a");
2479 free(tasks);
2480 if (!f)
2481 goto out;
2482
2483 written = fprintf(f, "%d\n", lxc_raw_getpid());
2484 fclose(f);
2485 if (written < 0) {
2486 SYSERROR("writing tasks failed\n");
2487 goto out;
2488 }
2489 }
2490
2491 ret = true;
2492 out:
2493 lxc_cgroup_put_meta(md);
2494 return ret;
2495 }
2496
/*
 * Hierarchy enumeration is not implemented by this driver.
 *
 * Always returns -1.
 */
static int cgfs_num_hierarchies(void)
{
	const int not_implemented = -1;

	return not_implemented;
}
2502
/*
 * Hierarchy enumeration is not implemented by this driver; both arguments
 * are ignored and @out is never written.
 *
 * Always returns false.
 */
static bool cgfs_get_hierarchies(int i, char ***out)
{
	(void)i;
	(void)out;

	return false;
}
2508
/*
 * Thaw the container by writing "THAWED" to freezer.state of its freezer
 * cgroup.
 *
 * Returns true when the write succeeds; false when @hdata is NULL, the
 * absolute cgroup path cannot be determined, or the write fails.
 */
static bool cgfs_unfreeze(void *hdata)
{
	struct cgfs_data *data = hdata;
	char *rel_path, *abs_path;
	int rc;

	if (data == NULL)
		return false;

	/* NOTE(review): rel_path is passed on unchecked; confirm that
	 * lxc_cgroup_find_abs_path() copes with a NULL group.
	 */
	rel_path = lxc_cgroup_get_hierarchy_path_data("freezer", data);
	abs_path = lxc_cgroup_find_abs_path("freezer", rel_path, true, NULL);
	if (abs_path == NULL)
		return false;

	rc = do_cgroup_set(abs_path, "freezer.state", "THAWED");
	free(abs_path);

	return rc == 0;
}
2527
/*
 * cgroup_ops adapter around do_setup_cgroup_limits().
 *
 * Returns true when the limits were applied (that call returned 0); false
 * otherwise or when @hdata is NULL.
 */
static bool cgroupfs_setup_limits(void *hdata, struct lxc_list *cgroup_conf,
				  bool with_devices)
{
	struct cgfs_data *data = hdata;
	int rc;

	if (data == NULL)
		return false;

	rc = do_setup_cgroup_limits(data, cgroup_conf, with_devices);
	return rc == 0;
}
2537
/*
 * Move the attached process @pid into the cgroups of the container
 * identified by @name and @lxcpath.
 *
 * Returns true on success; false, after logging an error, when the cgroup
 * metadata or the container's cgroup info cannot be obtained or the
 * process cannot be moved.
 */
static bool lxc_cgroupfs_attach(const char *name, const char *lxcpath, pid_t pid)
{
	struct cgroup_meta_data *meta;
	struct cgroup_process_info *info;
	int rc;

	meta = lxc_cgroup_load_meta();
	if (!meta)
		goto fail;

	/* our metadata reference is dropped right after the lookup */
	info = lxc_cgroup_get_container_info(name, lxcpath, meta);
	lxc_cgroup_put_meta(meta);
	if (!info)
		goto fail;

	rc = lxc_cgroupfs_enter(info, pid, false);
	lxc_cgroup_process_info_free(info);
	if (rc < 0)
		goto fail;

	return true;

fail:
	ERROR("could not move attached process %d to cgroup of container", pid);
	return false;
}
2565
/* Arguments handed to chown_cgroup_wrapper() through its void pointer. */
struct chown_data {
	/* absolute path of the cgroup directory to chown */
	const char *cgroup_path;
	/* effective uid of the caller, recorded before entering the new namespace */
	uid_t origuid;
};
2570
2571 /*
2572 * TODO - someone should refactor this to unshare once passing all the paths
2573 * to be chowned in one go
2574 */
2575 static int chown_cgroup_wrapper(void *data)
2576 {
2577 struct chown_data *arg = data;
2578 uid_t destuid;
2579 char *fpath;
2580
2581 if (setresgid(0,0,0) < 0)
2582 SYSERROR("Failed to setgid to 0");
2583 if (setresuid(0,0,0) < 0)
2584 SYSERROR("Failed to setuid to 0");
2585 if (setgroups(0, NULL) < 0)
2586 SYSERROR("Failed to clear groups");
2587 destuid = get_ns_uid(arg->origuid);
2588
2589 if (chown(arg->cgroup_path, destuid, 0) < 0)
2590 SYSERROR("Failed chowning %s to %d", arg->cgroup_path, (int)destuid);
2591
2592 fpath = lxc_append_paths(arg->cgroup_path, "tasks");
2593 if (!fpath)
2594 return -1;
2595 if (chown(fpath, destuid, 0) < 0)
2596 SYSERROR("Error chowning %s\n", fpath);
2597 free(fpath);
2598
2599 fpath = lxc_append_paths(arg->cgroup_path, "cgroup.procs");
2600 if (!fpath)
2601 return -1;
2602 if (chown(fpath, destuid, 0) < 0)
2603 SYSERROR("Error chowning %s", fpath);
2604 free(fpath);
2605
2606 return 0;
2607 }
2608
2609 static bool do_cgfs_chown(char *cgroup_path, struct lxc_conf *conf)
2610 {
2611 struct chown_data data;
2612 char *fpath;
2613
2614 if (!dir_exists(cgroup_path))
2615 return true;
2616
2617 if (lxc_list_empty(&conf->id_map))
2618 /* If there's no mapping then we don't need to chown */
2619 return true;
2620
2621 data.cgroup_path = cgroup_path;
2622 data.origuid = geteuid();
2623
2624 /* Unpriv users can't chown it themselves, so chown from
2625 * a child namespace mapping both our own and the target uid
2626 */
2627 if (userns_exec_1(conf, chown_cgroup_wrapper, &data,
2628 "chown_cgroup_wrapper") < 0) {
2629 ERROR("Error requesting cgroup chown in new namespace");
2630 return false;
2631 }
2632
2633 /*
2634 * Now chmod 775 the directory else the container cannot create cgroups.
2635 * This can't be done in the child namespace because it only group-owns
2636 * the cgroup
2637 */
2638 if (chmod(cgroup_path, 0775) < 0) {
2639 SYSERROR("Error chmoding %s\n", cgroup_path);
2640 return false;
2641 }
2642 fpath = lxc_append_paths(cgroup_path, "tasks");
2643 if (!fpath)
2644 return false;
2645 if (chmod(fpath, 0664) < 0)
2646 SYSERROR("Error chmoding %s\n", fpath);
2647 free(fpath);
2648 fpath = lxc_append_paths(cgroup_path, "cgroup.procs");
2649 if (!fpath)
2650 return false;
2651 if (chmod(fpath, 0664) < 0)
2652 SYSERROR("Error chmoding %s\n", fpath);
2653 free(fpath);
2654
2655 return true;
2656 }
2657
2658 static bool cgfs_chown(void *hdata, struct lxc_conf *conf)
2659 {
2660 struct cgfs_data *d = hdata;
2661 struct cgroup_process_info *info_ptr;
2662 char *cgpath;
2663 bool r = true;
2664
2665 if (!d)
2666 return false;
2667
2668 for (info_ptr = d->info; info_ptr; info_ptr = info_ptr->next) {
2669 if (!info_ptr->hierarchy)
2670 continue;
2671
2672 if (!info_ptr->designated_mount_point) {
2673 info_ptr->designated_mount_point = lxc_cgroup_find_mount_point(info_ptr->hierarchy, info_ptr->cgroup_path, true);
2674 if (!info_ptr->designated_mount_point) {
2675 SYSERROR("Could not chown cgroup %s: internal error (couldn't find any writable mountpoint to cgroup filesystem)", info_ptr->cgroup_path);
2676 return false;
2677 }
2678 }
2679
2680 cgpath = cgroup_to_absolute_path(info_ptr->designated_mount_point, info_ptr->cgroup_path, NULL);
2681 if (!cgpath) {
2682 SYSERROR("Could not chown cgroup %s: internal error", info_ptr->cgroup_path);
2683 continue;
2684 }
2685 r = do_cgfs_chown(cgpath, conf);
2686 if (!r && is_crucial_hierarchy(info_ptr->hierarchy)) {
2687 ERROR("Failed chowning %s\n", cgpath);
2688 free(cgpath);
2689 return false;
2690 }
2691 free(cgpath);
2692 }
2693
2694 return true;
2695 }
2696
2697 static struct cgroup_ops cgfs_ops = {
2698 .init = cgfs_init,
2699 .destroy = cgfs_destroy,
2700 .create = cgfs_create,
2701 .enter = cgfs_enter,
2702 .create_legacy = cgfs_create_legacy,
2703 .get_cgroup = cgfs_get_cgroup,
2704 .escape = cgfs_escape,
2705 .num_hierarchies = cgfs_num_hierarchies,
2706 .get_hierarchies = cgfs_get_hierarchies,
2707 .get = lxc_cgroupfs_get,
2708 .set = lxc_cgroupfs_set,
2709 .unfreeze = cgfs_unfreeze,
2710 .setup_limits = cgroupfs_setup_limits,
2711 .name = "cgroupfs",
2712 .attach = lxc_cgroupfs_attach,
2713 .chown = cgfs_chown,
2714 .mount_cgroup = cgroupfs_mount_cgroup,
2715 .nrtasks = cgfs_nrtasks,
2716 .driver = CGFS,
2717 };