]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgroup.c
cgroups: don't mount under init's cgroup
[mirror_lxc.git] / src / lxc / cgroup.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <dlezcano at fr.ibm.com>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23 #define _GNU_SOURCE
24 #include <stdio.h>
25 #undef _GNU_SOURCE
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <fcntl.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <sys/param.h>
35 #include <sys/inotify.h>
36 #include <netinet/in.h>
37 #include <net/if.h>
38
39 #include "error.h"
40 #include "config.h"
41 #include "commands.h"
42
43 #include <lxc/log.h>
44 #include <lxc/cgroup.h>
45 #include <lxc/start.h>
46
47 #if IS_BIONIC
48 #include <../include/lxcmntent.h>
49 #else
50 #include <mntent.h>
51 #endif
52
/* NOTE(review): presumably sets up the log category used by the ERROR/WARN/INFO/DEBUG macros in this file - confirm against lxc/log.h. */
53 lxc_log_define(lxc_cgroup, lxc);
54
/* Mount table that this file opens with setmntent() to find mounted cgroup hierarchies. */
55 #define MTAB "/proc/mounts"
56
/*
 * Check if a mount is a cgroup hierarchy for any subsystem.
 *
 * @mntent: mount table entry whose mount options are inspected.
 *
 * The subsystem names are read from the first (tab-terminated) column of
 * /proc/cgroups and each one is looked up in the options of @mntent.
 *
 * Returns the hasmntopt() result for the first subsystem found, or NULL if
 * none matched or /proc/cgroups could not be read.
 */
static char *mount_has_subsystem(const struct mntent *mntent)
{
	FILE *f;
	char *c, *ret = NULL;
	char line[MAXPATHLEN];

	/* read the list of subsystems from the kernel */
	f = fopen("/proc/cgroups", "r");
	if (!f)
		return NULL;

	/* skip the first line, which contains column headings */
	if (!fgets(line, MAXPATHLEN, f))
		goto out;	/* still have to close the stream */

	while (fgets(line, MAXPATHLEN, f)) {
		/* the subsystem name is everything up to the first tab */
		c = strchr(line, '\t');
		if (!c)
			continue;
		*c = '\0';

		ret = hasmntopt(mntent, line);
		if (ret)
			break;
	}

out:
	fclose(f);
	return ret;
}
89
90 /*
91 * Determine mountpoint for a cgroup subsystem.
92 * @subsystem: cgroup subsystem (i.e. freezer). If this is NULL, the first
93 * cgroup mountpoint with any subsystems is used.
94 * @mnt: a passed-in buffer of at least size MAXPATHLEN into which the path
95 * is copied.
96 *
97 * Returns 0 on success, -1 on error.
98 */
99 static int get_cgroup_mount(const char *subsystem, char *mnt)
100 {
101 struct mntent *mntent;
102 FILE *file = NULL;
103 int ret, err = -1;
104
105 file = setmntent(MTAB, "r");
106 if (!file) {
107 SYSERROR("failed to open %s", MTAB);
108 return -1;
109 }
110
111 while ((mntent = getmntent(file))) {
112 if (strcmp(mntent->mnt_type, "cgroup"))
113 continue;
114
115 if (subsystem) {
116 if (!hasmntopt(mntent, subsystem))
117 continue;
118 } else {
119 if (!mount_has_subsystem(mntent))
120 continue;
121 }
122
123 ret = snprintf(mnt, MAXPATHLEN, "%s", mntent->mnt_dir);
124 if (ret < 0 || ret >= MAXPATHLEN)
125 goto fail;
126
127 DEBUG("using cgroup mounted at '%s'", mnt);
128 err = 0;
129 goto out;
130 };
131
132 fail:
133 DEBUG("Failed to find cgroup for %s\n",
134 subsystem ? subsystem : "(NULL)");
135 out:
136 endmntent(file);
137 return err;
138 }
139
/*
 * cgroup_path_get: Calculate the full path for a particular subsystem, plus
 * a passed-in (to be appended) relative cgpath for a container.
 * @path: a char** into which a pointer to the answer is copied
 * @subsystem: subsystem of interest (i.e. freezer).  A cgroup file name such
 *             as "freezer.state" is accepted too: everything from the first
 *             '.' on is ignored.  May be NULL, in which case the first cgroup
 *             mount carrying any subsystem is used.
 * @cgpath: a container's (relative) cgroup path, i.e. "/lxc/c1".
 *
 * Returns 0 on success, -1 on error.
 *
 * The answer is written in a static char[MAXPATHLEN] in this function and
 * should not be freed.  A later call overwrites the previous answer.
 */
extern int cgroup_path_get(char **path, const char *subsystem, const char *cgpath)
{
	static char buf[MAXPATHLEN];
	static char retbuf[MAXPATHLEN];
	int rc;

	/* lxc_cgroup_set passes a state object for the subsystem,
	 * so trim it to just the subsystem part */
	if (subsystem) {
		char *s;

		rc = snprintf(retbuf, MAXPATHLEN, "%s", subsystem);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("subsystem name too long");
			return -1;
		}
		s = strchr(retbuf, '.');
		if (s)
			*s = '\0';
		DEBUG("%s: called for subsys %s name %s\n", __func__, retbuf, cgpath);
	}

	/* retbuf temporarily holds the trimmed subsystem name */
	if (get_cgroup_mount(subsystem ? retbuf : NULL, buf)) {
		ERROR("cgroup is not mounted");
		return -1;
	}

	rc = snprintf(retbuf, MAXPATHLEN, "%s/%s", buf, cgpath);
	if (rc < 0 || rc >= MAXPATHLEN) {
		ERROR("name too long");
		return -1;
	}

	/* subsystem may legitimately be NULL; never hand NULL to %s */
	DEBUG("%s: returning %s for subsystem %s", __func__, retbuf,
	      subsystem ? subsystem : "(NULL)");

	*path = retbuf;
	return 0;
}
187
188 /*
189 * Calculate a container's cgroup path for a particular subsystem. This
190 * is the cgroup path relative to the root of the cgroup filesystem.
191 * @path: A char ** into which we copy the char* containing the answer
192 * @subsystem: the cgroup subsystem of interest (i.e. freezer)
193 * @name: container name
194 * @lxcpath: the lxcpath in which the container is running.
195 *
196 * Returns 0 on success, -1 on error.
197 *
198 * Note that the char* copied into *path is a static char[MAXPATHLEN] in
199 * commands.c:receive_answer(). It should not be freed.
200 */
201 extern int lxc_get_cgpath(const char **path, const char *subsystem, const char *name, const char *lxcpath)
202 {
203 struct lxc_command command = {
204 .request = { .type = LXC_COMMAND_CGROUP },
205 };
206
207 int ret, stopped = 0;
208
209 ret = lxc_command(name, &command, &stopped, lxcpath);
210 if (ret < 0) {
211 if (!stopped)
212 ERROR("failed to send command");
213 return -1;
214 }
215
216 if (!ret) {
217 WARN("'%s' has stopped before sending its state", name);
218 return -1;
219 }
220
221 if (command.answer.ret < 0 || command.answer.pathlen < 0) {
222 ERROR("failed to get state for '%s': %s",
223 name, strerror(-command.answer.ret));
224 return -1;
225 }
226
227 *path = command.answer.path;
228
229 return 0;
230 }
231
/*
 * lxc_cgroup_path_get: determine the full pathname of a container's cgroup
 * directory for a given subsystem.
 * @path: char ** used to return the answer.  The char * will point into the
 *        static buffer of cgroup_path_get() (so no need to free it).
 * @subsystem: cgroup subsystem (i.e. "freezer") for which to return an
 *             answer.  If NULL, then the first cgroup entry in mtab will be
 *             used.
 * @name: name of the container
 * @lxcpath: the lxcpath under which the container's monitor is running
 *
 * This is the exported function, which determines cgpath from the monitor
 * running in lxcpath.
 *
 * Returns 0 on success, < 0 on error.
 */
int lxc_cgroup_path_get(char **path, const char *subsystem, const char *name, const char *lxcpath)
{
	const char *relpath;
	int rc;

	/* first the relative path from the monitor ... */
	rc = lxc_get_cgpath(&relpath, subsystem, name, lxcpath);
	if (rc >= 0)
		/* ... then prefix it with the subsystem's mountpoint */
		return cgroup_path_get(path, subsystem, relpath);

	return -1;
}
256
/*
 * small helper which simply writes a value into a (cgroup) file
 * @path: full path of the file to open for writing
 * @value: NUL-terminated string to write (without the terminator)
 *
 * Returns 0 on success, < 0 if the file cannot be opened, written or closed.
 */
static int do_cgroup_set(const char *path, const char *value)
{
	int fd, ret;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		SYSERROR("open %s : %s", path, strerror(errno));
		return -1;
	}

	ret = write(fd, value, strlen(value));
	if (ret < 0) {
		/* report first: close() may overwrite errno */
		SYSERROR("write %s : %s", path, strerror(errno));
		close(fd);
		return ret;
	}

	ret = close(fd);
	if (ret < 0) {
		SYSERROR("close %s : %s", path, strerror(errno));
		return ret;
	}

	return 0;
}
281
/*
 * Write a value into a cgroup file, addressing the cgroup by its relative
 * path.
 * @cgpath: the (relative) cgroup directory in which to find the file
 * @filename: the file (under cgpath) to which to write; it is also handed
 *            to cgroup_path_get() to select the subsystem
 * @value: what to write
 *
 * Returns 0 on success, < 0 on error.
 */
int lxc_cgroup_set_bypath(const char *cgpath, const char *filename, const char *value)
{
	char target[MAXPATHLEN];
	char *dir;
	int len;

	if (cgroup_path_get(&dir, filename, cgpath) != 0)
		return -1;

	len = snprintf(target, MAXPATHLEN, "%s/%s", dir, filename);
	if (len < 0 || len >= MAXPATHLEN) {
		ERROR("pathname too long");
		return -1;
	}

	return do_cgroup_set(target, value);
}
308
/*
 * Change a cgroup setting of a container.
 *
 * @name: name of the container
 * @filename: the cgroup file (i.e. freezer.state) whose value to change
 * @value: the value to write to the file
 * @lxcpath: the lxcpath under which the container is running.
 *
 * Returns 0 on success, < 0 on error.
 */
int lxc_cgroup_set(const char *name, const char *filename, const char *value,
		   const char *lxcpath)
{
	char target[MAXPATHLEN];
	char *dir;
	int len;

	/* resolve the container's cgroup directory via its monitor */
	if (lxc_cgroup_path_get(&dir, filename, name, lxcpath) != 0)
		return -1;

	len = snprintf(target, MAXPATHLEN, "%s/%s", dir, filename);
	if (len < 0 || len >= MAXPATHLEN) {
		ERROR("pathname too long");
		return -1;
	}

	return do_cgroup_set(target, value);
}
339
/*
 * Read a cgroup setting of a container.
 *
 * @name: name of the container
 * @filename: the cgroup file to read (i.e. 'freezer.state')
 * @value: a preallocated buffer into which to copy the answer
 * @len: the size of the preallocated @value
 * @lxcpath: the lxcpath in which the container is running (i.e.
 *           /var/lib/lxc)
 *
 * Returns < 0 on error, or the number of bytes read.
 *
 * Passing a NULL @value or a zero @len asks for the size of the file: the
 * whole file is read and discarded, and the byte count is returned.
 *
 * Note that we can't get the file size quickly through stat or lseek.
 * Therefore if you pass in len > 0 but less than the file size, your only
 * indication will be that the return value will be equal to the passed-in
 * len.  We will not return the actual full file size.
 */
int lxc_cgroup_get(const char *name, const char *filename, char *value,
		   size_t len, const char *lxcpath)
{
	char target[MAXPATHLEN];
	char *dir;
	int fd, plen, nread;

	if (lxc_cgroup_path_get(&dir, filename, name, lxcpath) != 0)
		return -1;

	plen = snprintf(target, MAXPATHLEN, "%s/%s", dir, filename);
	if (plen < 0 || plen >= MAXPATHLEN) {
		ERROR("pathname too long");
		return -1;
	}

	fd = open(target, O_RDONLY);
	if (fd < 0) {
		ERROR("open %s : %s", target, strerror(errno));
		return -1;
	}

	if (value && len) {
		/* a single read into the zeroed caller buffer */
		memset(value, 0, len);
		nread = read(fd, value, len);
	} else {
		/* size query: drain the file and count the bytes */
		char scratch[100];
		int total = 0;

		for (;;) {
			nread = read(fd, scratch, 100);
			if (nread <= 0)
				break;
			total += nread;
		}
		if (nread == 0)
			nread = total;
	}

	if (nread < 0)
		ERROR("read %s : %s", target, strerror(errno));

	close(fd);
	return nread;
}
402
/*
 * Count the tasks in a cgroup.
 * @cgpath: the (relative) cgroup path, looked up under the first cgroup
 *          mount carrying any subsystem.
 *
 * The count is the number of integers that can be read from the cgroup's
 * 'tasks' file.
 *
 * Returns the number of tasks, or -1 on error.
 */
int lxc_cgroup_nrtasks(const char *cgpath)
{
	char *dpath;
	char path[MAXPATHLEN];
	int pid, ret, count = 0;
	FILE *file;
	int rc;

	ret = cgroup_path_get(&dpath, NULL, cgpath);
	if (ret)
		return -1;

	rc = snprintf(path, MAXPATHLEN, "%s/tasks", dpath);
	if (rc < 0 || rc >= MAXPATHLEN) {
		ERROR("pathname too long");
		return -1;
	}

	file = fopen(path, "r");
	if (!file) {
		SYSERROR("fopen '%s' failed", path);
		return -1;
	}

	/*
	 * Stop on EOF *and* on a matching failure: fscanf() returns 0 without
	 * consuming input when it meets a non-numeric token, so testing only
	 * against EOF would spin forever.
	 */
	while (fscanf(file, "%d", &pid) == 1)
		count++;

	fclose(file);

	return count;
}
434
/*
 * If first creating the /sys/fs/cgroup/$subsys/lxc container, then
 * try to set clone_children to 1.  Some kernels don't support
 * clone_children, and cgroup maintainer wants to deprecate it.  So
 * XXX TODO we should instead after each cgroup mkdir (here and in
 * hooks/mountcgroup) check if cpuset is in the subsystems, and if so
 * manually copy over mems and cpus.
 *
 * @mntdir: mountpoint of the cgroup hierarchy.
 *
 * Best effort: nothing is reported to the caller if the file cannot be
 * opened or the path does not fit.
 */
static void set_clone_children(const char *mntdir)
{
	char path[MAXPATHLEN];
	FILE *fout;
	int ret;

	ret = snprintf(path, MAXPATHLEN, "%s/cgroup.clone_children", mntdir);
	/* ret == MAXPATHLEN already means the path was truncated */
	if (ret < 0 || ret >= MAXPATHLEN)
		return;

	/* only log a path that is known to be complete */
	INFO("writing to %s\n", path);

	fout = fopen(path, "w");
	if (!fout)
		return;
	fprintf(fout, "1\n");
	fclose(fout);
}
459
460 /*
461 * Make sure the 'cgroup group' exists, so that we don't have to worry about
462 * that later.
463 *
464 * @lxcgroup: the cgroup group, i.e. 'lxc' by default.
465 *
466 * See detailed comments at lxc_cgroup_path_create for more information.
467 *
468 * Returns 0 on success, -1 on error.
469 */
470 static int create_lxcgroups(const char *lxcgroup)
471 {
472 FILE *file = NULL;
473 struct mntent *mntent;
474 int ret, retv = -1;
475 char path[MAXPATHLEN];
476
477 file = setmntent(MTAB, "r");
478 if (!file) {
479 SYSERROR("failed to open %s", MTAB);
480 return -1;
481 }
482
483 while ((mntent = getmntent(file))) {
484
485 if (strcmp(mntent->mnt_type, "cgroup"))
486 continue;
487 if (!mount_has_subsystem(mntent))
488 continue;
489
490 /*
491 * TODO - handle case where lxcgroup has subdirs? (i.e. build/l1)
492 * We probably only want to support that for /users/joe
493 */
494 ret = snprintf(path, MAXPATHLEN, "%s/%s",
495 mntent->mnt_dir, lxcgroup ? lxcgroup : "lxc");
496 if (ret < 0 || ret >= MAXPATHLEN)
497 goto fail;
498 if (access(path, F_OK)) {
499 set_clone_children(mntent->mnt_dir);
500 ret = mkdir(path, 0755);
501 if (ret == -1 && errno != EEXIST) {
502 SYSERROR("failed to create '%s' directory", path);
503 goto fail;
504 }
505 }
506
507 }
508
509 retv = 0;
510 fail:
511 endmntent(file);
512 return retv;
513 }
514
515 /*
516 * For a new container, find a cgroup path which is unique in all cgroup mounts.
517 * I.e. if r1 is already running, then /lxc/r1-1 may be used.
518 *
519 * @lxcgroup: the cgroup 'group' the contaienr should run in. By default, this
520 * is just 'lxc'. Admins may wish to group some containers into other groups,
521 * i.e. 'build', to take advantage of cgroup hierarchy to simplify group
522 * administration. Also, unprivileged users who are placed into a cgroup by
523 * libcgroup_pam will be using that cgroup rather than the system-wide 'lxc'
524 * group.
525 * @name: the name of the container
526 *
527 * The chosen cgpath is returned as a strdup'd string. The caller will have to
528 * free that eventually, however the lxc monitor will keep that string so as to
529 * return it in response to a LXC_COMMAND_CGROUP query.
530 *
531 * Note the path is relative to cgroup mounts. I.e. if the freezer subsystem
532 * is at /sys/fs/cgroup/freezer, and this fn returns '/lxc/r1', then the
533 * freezer cgroup's full path will be /sys/fs/cgroup/freezer/lxc/r1/.
534 *
535 * XXX This should probably be locked globally
536 *
537 * Races won't be determintal, you'll just end up with leftover unused cgroups
538 */
539 char *lxc_cgroup_path_create(const char *lxcgroup, const char *name)
540 {
541 int i = 0, ret;
542 char *retpath, path[MAXPATHLEN];
543 char tail[12];
544 FILE *file = NULL;
545 struct mntent *mntent;
546
547 if (create_lxcgroups(lxcgroup) < 0)
548 return NULL;
549
550 again:
551 file = setmntent(MTAB, "r");
552 if (!file) {
553 SYSERROR("failed to open %s", MTAB);
554 return NULL;
555 }
556
557 if (i)
558 snprintf(tail, 12, "-%d", i);
559 else
560 *tail = '\0';
561
562 while ((mntent = getmntent(file))) {
563
564 if (strcmp(mntent->mnt_type, "cgroup"))
565 continue;
566 if (!mount_has_subsystem(mntent))
567 continue;
568
569 /* find unused mnt_dir + lxcgroup + name + -$i */
570 ret = snprintf(path, MAXPATHLEN, "%s/%s/%s%s", mntent->mnt_dir,
571 lxcgroup ? lxcgroup : "lxc", name, tail);
572 if (ret < 0 || ret >= MAXPATHLEN)
573 goto fail;
574
575 if (access(path, F_OK) == 0) goto next;
576
577 if (mkdir(path, 0755)) {
578 ERROR("Error creating cgroups");
579 goto fail;
580 }
581
582 }
583
584 endmntent(file);
585
586 // print out the cgpath part
587 ret = snprintf(path, MAXPATHLEN, "%s/%s%s",
588 lxcgroup ? lxcgroup : "lxc", name, tail);
589 if (ret < 0 || ret >= MAXPATHLEN) // can't happen
590 goto fail;
591
592 retpath = strdup(path);
593
594 return retpath;
595
596 next:
597 endmntent(file);
598 i++;
599 goto again;
600
601 fail:
602 endmntent(file);
603 return NULL;
604 }
605
606 int lxc_cgroup_enter(const char *cgpath, pid_t pid)
607 {
608 char path[MAXPATHLEN];
609 FILE *file = NULL, *fout;
610 struct mntent *mntent;
611 int ret, retv = -1;
612
613 file = setmntent(MTAB, "r");
614 if (!file) {
615 SYSERROR("failed to open %s", MTAB);
616 return -1;
617 }
618
619 while ((mntent = getmntent(file))) {
620 if (strcmp(mntent->mnt_type, "cgroup"))
621 continue;
622 if (!mount_has_subsystem(mntent))
623 continue;
624 ret = snprintf(path, MAXPATHLEN, "%s/%s/tasks",
625 mntent->mnt_dir, cgpath);
626 if (ret < 0 || ret >= MAXPATHLEN) {
627 ERROR("entering cgroup");
628 goto out;
629 }
630 fout = fopen(path, "w");
631 if (!fout) {
632 ERROR("entering cgroup");
633 goto out;
634 }
635 fprintf(fout, "%d\n", (int)pid);
636 fclose(fout);
637 }
638 retv = 0;
639
640 out:
641 endmntent(file);
642 return retv;
643 }
644
/*
 * Remove a directory and all the directories below it.
 * @dirname: path of the directory to remove
 *
 * Only directories are removed (with rmdir); other entries are left alone.
 * Failures on sub-directories are not reported on their own, they show up
 * as a failure to remove @dirname itself.
 *
 * Returns the result of rmdir() on @dirname, or -1 if @dirname cannot be
 * opened.  When rmdir() fails its errno is preserved for the caller.
 */
int recursive_rmdir(char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	int ret, saved_errno;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
		WARN("failed to open directory: %m");
		return -1;
	}

	/*
	 * readdir() rather than the deprecated readdir_r(): a caller-supplied
	 * struct dirent may be too small for long names, and each recursion
	 * level works on its own DIR stream anyway.
	 */
	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			ERROR("pathname too long");
			continue;
		}
		ret = stat(pathname, &mystat);
		if (ret)
			continue;
		if (S_ISDIR(mystat.st_mode))
			recursive_rmdir(pathname);
	}

	ret = rmdir(dirname);
	/* keep rmdir's errno: closedir() and logging may overwrite it */
	saved_errno = errno;

	if (closedir(dir))
		ERROR("failed to close directory");

	errno = saved_errno;
	return ret;
}
689
/*
 * Destroy a container's cgroup in one cgroup hierarchy.
 * @mntent: mount table entry of the hierarchy; its mnt_dir is the base
 * @cgpath: the (relative) cgroup path of the container
 *
 * Returns 0 on success, -1 if the full path is too long or the directory
 * cannot be removed.
 */
static int lxc_one_cgroup_destroy(struct mntent *mntent, const char *cgpath)
{
	char target[MAXPATHLEN];
	int len;

	len = snprintf(target, MAXPATHLEN, "%s/%s", mntent->mnt_dir, cgpath);
	if (len < 0 || len >= MAXPATHLEN) {
		ERROR("name too long");
		return -1;
	}

	DEBUG("destroying %s\n", target);

	if (recursive_rmdir(target) != 0) {
		SYSERROR("failed to remove cgroup '%s'", target);
		return -1;
	}

	DEBUG("'%s' unlinked", target);
	return 0;
}
711
712 /*
713 * for each mounted cgroup, destroy the cgroup for the container
714 */
715 int lxc_cgroup_destroy(const char *cgpath)
716 {
717 struct mntent *mntent;
718 FILE *file = NULL;
719 int err, retv = 0;
720
721 file = setmntent(MTAB, "r");
722 if (!file) {
723 SYSERROR("failed to open %s", MTAB);
724 return -1;
725 }
726
727 while ((mntent = getmntent(file))) {
728 if (strcmp(mntent->mnt_type, "cgroup"))
729 continue;
730 if (!mount_has_subsystem(mntent))
731 continue;
732
733 err = lxc_one_cgroup_destroy(mntent, cgpath);
734 if (err) // keep trying to clean up the others
735 retv = -1;
736 }
737
738 endmntent(file);
739 return retv;
740 }
741
742 int lxc_cgroup_attach(pid_t pid, const char *name, const char *lxcpath)
743 {
744 const char *dirpath;
745
746 if (lxc_get_cgpath(&dirpath, NULL, name, lxcpath) < 0) {
747 ERROR("Error getting cgroup for container %s: %s", lxcpath, name);
748 return -1;
749 }
750 INFO("joining pid %d to cgroup %s", pid, dirpath);
751
752 return lxc_cgroup_enter(dirpath, pid);
753 }