]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/cgmanager.c
pivot_root: switch to a new mechanism (v2)
[mirror_lxc.git] / src / lxc / cgmanager.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #include "config.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <errno.h>
28 #include <unistd.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <fcntl.h>
32 #include <ctype.h>
33 #include <pthread.h>
34 #include <grp.h>
35 #include <sys/types.h>
36 #include <sys/stat.h>
37 #include <sys/param.h>
38 #include <sys/inotify.h>
39 #include <sys/mount.h>
40 #include <netinet/in.h>
41 #include <net/if.h>
42
43 #include "error.h"
44 #include "commands.h"
45 #include "list.h"
46 #include "conf.h"
47 #include "utils.h"
48 #include "bdev.h"
49 #include "log.h"
50 #include "cgroup.h"
51 #include "start.h"
52 #include "state.h"
53
/*
 * cgmanager API version thresholds.  Each one is compared against the
 * api_version value obtained from the daemon in cgm_dbus_connect().
 */
/* At or above this version absolute cgroup lookups by pid are used. */
#define CGM_SUPPORTS_GET_ABS 3
/* Below this version "name=" controllers are culled from the list. */
#define CGM_SUPPORTS_NAMED 4
/* At or above this version the single "all" controller list is used. */
#define CGM_SUPPORTS_MULT_CONTROLLERS 10
57
58 #ifdef HAVE_CGMANAGER
59 lxc_log_define(lxc_cgmanager, lxc);
60
61 #include <nih-dbus/dbus_connection.h>
62 #include <cgmanager/cgmanager-client.h>
63 #include <nih/alloc.h>
64 #include <nih/error.h>
65 #include <nih/string.h>
66
/* Per-container state passed to the cgroup_ops callbacks as hdata. */
struct cgm_data {
	/* heap copy of the container name, made by cgm_init() */
	char *name;
	/* heap copy of the cgroup path, filled in by cgm_create() */
	char *cgroup_path;
	/* pattern in which "%n" is replaced by name; not owned by this struct */
	const char *cgroup_pattern;
};
72
/* Taken by cgm_dbus_connect() and released by cgm_dbus_disconnect(). */
static pthread_mutex_t cgm_mutex = PTHREAD_MUTEX_INITIALIZER;
74
/*
 * Acquire the mutex @l.
 *
 * A locking failure is treated as fatal: the pthread error is printed on
 * stderr and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
84
/*
 * Release the mutex @l.
 *
 * An unlocking failure is treated as fatal: the pthread error is printed
 * on stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
94
95 void cgm_lock(void)
96 {
97 lock_mutex(&cgm_mutex);
98 }
99
100 void cgm_unlock(void)
101 {
102 unlock_mutex(&cgm_mutex);
103 }
104
#ifdef HAVE_PTHREAD_ATFORK
/*
 * Constructor: registers fork handlers so that the cgmanager mutex is
 * taken before fork() and released afterwards in both the parent and the
 * child.  The return value of pthread_atfork() is not checked.
 */
static void __attribute__((constructor)) process_lock_setup_atfork(void)
{
	pthread_atfork(cgm_lock, cgm_unlock, cgm_unlock);
}
#endif
112
/* Proxy for the cgmanager D-Bus object; NULL while disconnected. */
static NihDBusProxy *cgroup_manager = NULL;
/* API version reported by cgmanager; refreshed by cgm_dbus_connect(). */
static int32_t api_version;

static struct cgroup_ops cgmanager_ops;
/* Number of entries currently stored in subsystems. */
static int nr_subsystems;
/*
 * subsystems: NULL-terminated list of controller names built by
 * collect_subsytems().  subsystems_inone: the list { "all", NULL }, used
 * instead of subsystems when cgm_supports_multiple_controllers() is true.
 */
static char **subsystems, **subsystems_inone;
/* Set once dbus_threads_init_default() has been called. */
static bool dbus_threads_initialized = false;
static void cull_user_controllers(void);
121
122 static void cgm_dbus_disconnect(void)
123 {
124 if (cgroup_manager) {
125 dbus_connection_flush(cgroup_manager->connection);
126 dbus_connection_close(cgroup_manager->connection);
127 nih_free(cgroup_manager);
128 }
129 cgroup_manager = NULL;
130 cgm_unlock();
131 }
132
133 #define CGMANAGER_DBUS_SOCK "unix:path=/sys/fs/cgroup/cgmanager/sock"
134 static bool cgm_dbus_connect(void)
135 {
136 DBusError dbus_error;
137 static DBusConnection *connection;
138
139 cgm_lock();
140 if (!dbus_threads_initialized) {
141 // tell dbus to do struct locking for thread safety
142 dbus_threads_init_default();
143 dbus_threads_initialized = true;
144 }
145
146 dbus_error_init(&dbus_error);
147
148 connection = dbus_connection_open_private(CGMANAGER_DBUS_SOCK, &dbus_error);
149 if (!connection) {
150 DEBUG("Failed opening dbus connection: %s: %s",
151 dbus_error.name, dbus_error.message);
152 dbus_error_free(&dbus_error);
153 cgm_unlock();
154 return false;
155 }
156 dbus_connection_set_exit_on_disconnect(connection, FALSE);
157 dbus_error_free(&dbus_error);
158 cgroup_manager = nih_dbus_proxy_new(NULL, connection,
159 NULL /* p2p */,
160 "/org/linuxcontainers/cgmanager", NULL, NULL);
161 dbus_connection_unref(connection);
162 if (!cgroup_manager) {
163 NihError *nerr;
164 nerr = nih_error_get();
165 ERROR("Error opening cgmanager proxy: %s", nerr->message);
166 nih_free(nerr);
167 cgm_dbus_disconnect();
168 return false;
169 }
170
171 // get the api version
172 if (cgmanager_get_api_version_sync(NULL, cgroup_manager, &api_version) != 0) {
173 NihError *nerr;
174 nerr = nih_error_get();
175 ERROR("Error cgroup manager api version: %s", nerr->message);
176 nih_free(nerr);
177 cgm_dbus_disconnect();
178 return false;
179 }
180 if (api_version < CGM_SUPPORTS_NAMED)
181 cull_user_controllers();
182 return true;
183 }
184
185 static inline bool cgm_supports_multiple_controllers(void)
186 {
187 return api_version >= CGM_SUPPORTS_MULT_CONTROLLERS;
188 }
189
190 static int send_creds(int sock, int rpid, int ruid, int rgid)
191 {
192 struct msghdr msg = { 0 };
193 struct iovec iov;
194 struct cmsghdr *cmsg;
195 struct ucred cred = {
196 .pid = rpid,
197 .uid = ruid,
198 .gid = rgid,
199 };
200 char cmsgbuf[CMSG_SPACE(sizeof(cred))];
201 char buf[1];
202 buf[0] = 'p';
203
204 msg.msg_control = cmsgbuf;
205 msg.msg_controllen = sizeof(cmsgbuf);
206
207 cmsg = CMSG_FIRSTHDR(&msg);
208 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
209 cmsg->cmsg_level = SOL_SOCKET;
210 cmsg->cmsg_type = SCM_CREDENTIALS;
211 memcpy(CMSG_DATA(cmsg), &cred, sizeof(cred));
212
213 msg.msg_name = NULL;
214 msg.msg_namelen = 0;
215
216 iov.iov_base = buf;
217 iov.iov_len = sizeof(buf);
218 msg.msg_iov = &iov;
219 msg.msg_iovlen = 1;
220
221 if (sendmsg(sock, &msg, 0) < 0)
222 return -1;
223 return 0;
224 }
225
226 static bool lxc_cgmanager_create(const char *controller, const char *cgroup_path, int32_t *existed)
227 {
228 bool ret = true;
229 if ( cgmanager_create_sync(NULL, cgroup_manager, controller,
230 cgroup_path, existed) != 0) {
231 NihError *nerr;
232 nerr = nih_error_get();
233 ERROR("call to cgmanager_create_sync failed: %s", nerr->message);
234 nih_free(nerr);
235 ERROR("Failed to create %s:%s", controller, cgroup_path);
236 ret = false;
237 }
238
239 return ret;
240 }
241
242 /*
243 * Escape to the root cgroup if we are root, so that the container will
244 * be in "/lxc/c1" rather than "/user/..../c1"
245 * called internally with connection already open
246 */
247 static bool lxc_cgmanager_escape(void)
248 {
249 bool ret = true;
250 pid_t me = getpid();
251 char **slist = subsystems;
252 int i;
253
254 if (cgm_supports_multiple_controllers())
255 slist = subsystems_inone;
256
257 for (i = 0; slist[i]; i++) {
258 if (cgmanager_move_pid_abs_sync(NULL, cgroup_manager,
259 slist[i], "/", me) != 0) {
260 NihError *nerr;
261 nerr = nih_error_get();
262 ERROR("call to cgmanager_move_pid_abs_sync(%s) failed: %s",
263 slist[i], nerr->message);
264 nih_free(nerr);
265 ret = false;
266 break;
267 }
268 }
269
270 return ret;
271 }
272
/* Argument block handed to chown_cgroup_wrapper() via userns_exec_1(). */
struct chown_data {
	/* cgroup to chown; borrowed from the caller */
	const char *cgroup_path;
	/* effective uid of the caller, captured before entering the namespace */
	uid_t origuid;
};
277
278 static int do_chown_cgroup(const char *controller, const char *cgroup_path,
279 uid_t newuid)
280 {
281 int sv[2] = {-1, -1}, optval = 1, ret = -1;
282 char buf[1];
283
284 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) {
285 SYSERROR("Error creating socketpair");
286 goto out;
287 }
288 if (setsockopt(sv[1], SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
289 SYSERROR("setsockopt failed");
290 goto out;
291 }
292 if (setsockopt(sv[0], SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
293 SYSERROR("setsockopt failed");
294 goto out;
295 }
296 if ( cgmanager_chown_scm_sync(NULL, cgroup_manager, controller,
297 cgroup_path, sv[1]) != 0) {
298 NihError *nerr;
299 nerr = nih_error_get();
300 ERROR("call to cgmanager_chown_scm_sync failed: %s", nerr->message);
301 nih_free(nerr);
302 goto out;
303 }
304 /* now send credentials */
305
306 fd_set rfds;
307 FD_ZERO(&rfds);
308 FD_SET(sv[0], &rfds);
309 if (select(sv[0]+1, &rfds, NULL, NULL, NULL) < 0) {
310 ERROR("Error getting go-ahead from server: %s", strerror(errno));
311 goto out;
312 }
313 if (read(sv[0], &buf, 1) != 1) {
314 ERROR("Error getting reply from server over socketpair");
315 goto out;
316 }
317 if (send_creds(sv[0], getpid(), getuid(), getgid())) {
318 SYSERROR("%s: Error sending pid over SCM_CREDENTIAL", __func__);
319 goto out;
320 }
321 FD_ZERO(&rfds);
322 FD_SET(sv[0], &rfds);
323 if (select(sv[0]+1, &rfds, NULL, NULL, NULL) < 0) {
324 ERROR("Error getting go-ahead from server: %s", strerror(errno));
325 goto out;
326 }
327 if (read(sv[0], &buf, 1) != 1) {
328 ERROR("Error getting reply from server over socketpair");
329 goto out;
330 }
331 if (send_creds(sv[0], getpid(), newuid, 0)) {
332 SYSERROR("%s: Error sending pid over SCM_CREDENTIAL", __func__);
333 goto out;
334 }
335 FD_ZERO(&rfds);
336 FD_SET(sv[0], &rfds);
337 if (select(sv[0]+1, &rfds, NULL, NULL, NULL) < 0) {
338 ERROR("Error getting go-ahead from server: %s", strerror(errno));
339 goto out;
340 }
341 ret = read(sv[0], buf, 1);
342 out:
343 close(sv[0]);
344 close(sv[1]);
345 if (ret == 1 && *buf == '1')
346 return 0;
347 return -1;
348 }
349
350 static int chown_cgroup_wrapper(void *data)
351 {
352 struct chown_data *arg = data;
353 char **slist = subsystems;
354 int i, ret = -1;
355 uid_t destuid;
356
357 if (setresgid(0,0,0) < 0)
358 SYSERROR("Failed to setgid to 0");
359 if (setresuid(0,0,0) < 0)
360 SYSERROR("Failed to setuid to 0");
361 if (setgroups(0, NULL) < 0)
362 SYSERROR("Failed to clear groups");
363 cgm_dbus_disconnect();
364 if (!cgm_dbus_connect()) {
365 ERROR("Error connecting to cgroup manager");
366 return -1;
367 }
368 destuid = get_ns_uid(arg->origuid);
369
370 if (cgm_supports_multiple_controllers())
371 slist = subsystems_inone;
372
373 for (i = 0; slist[i]; i++) {
374 if (do_chown_cgroup(slist[i], arg->cgroup_path, destuid) < 0) {
375 ERROR("Failed to chown %s:%s to container root",
376 slist[i], arg->cgroup_path);
377 goto fail;
378 }
379 }
380 ret = 0;
381 fail:
382 cgm_dbus_disconnect();
383 return ret;
384 }
385
386 /* Internal helper. Must be called with the cgmanager dbus socket open */
387 static bool lxc_cgmanager_chmod(const char *controller,
388 const char *cgroup_path, const char *file, int mode)
389 {
390 if (cgmanager_chmod_sync(NULL, cgroup_manager, controller,
391 cgroup_path, file, mode) != 0) {
392 NihError *nerr;
393 nerr = nih_error_get();
394 ERROR("call to cgmanager_chmod_sync failed: %s", nerr->message);
395 nih_free(nerr);
396 return false;
397 }
398 return true;
399 }
400
401 /* Internal helper. Must be called with the cgmanager dbus socket open */
402 static bool chown_cgroup(const char *cgroup_path, struct lxc_conf *conf)
403 {
404 struct chown_data data;
405 char **slist = subsystems;
406 int i;
407
408 if (lxc_list_empty(&conf->id_map))
409 /* If there's no mapping then we don't need to chown */
410 return true;
411
412 data.cgroup_path = cgroup_path;
413 data.origuid = geteuid();
414
415 /* Unpriv users can't chown it themselves, so chown from
416 * a child namespace mapping both our own and the target uid
417 */
418 if (userns_exec_1(conf, chown_cgroup_wrapper, &data) < 0) {
419 ERROR("Error requesting cgroup chown in new namespace");
420 return false;
421 }
422
423 /*
424 * Now chmod 775 the directory else the container cannot create cgroups.
425 * This can't be done in the child namespace because it only group-owns
426 * the cgroup
427 */
428 if (cgm_supports_multiple_controllers())
429 slist = subsystems_inone;
430
431 for (i = 0; slist[i]; i++) {
432 if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "", 0775))
433 return false;
434 if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "tasks", 0775))
435 return false;
436 if (!lxc_cgmanager_chmod(slist[i], cgroup_path, "cgroup.procs", 0775))
437 return false;
438 }
439
440 return true;
441 }
442
443 #define CG_REMOVE_RECURSIVE 1
444 /* Internal helper. Must be called with the cgmanager dbus socket open */
445 static void cgm_remove_cgroup(const char *controller, const char *path)
446 {
447 int existed;
448 if ( cgmanager_remove_sync(NULL, cgroup_manager, controller,
449 path, CG_REMOVE_RECURSIVE, &existed) != 0) {
450 NihError *nerr;
451 nerr = nih_error_get();
452 ERROR("call to cgmanager_remove_sync failed: %s", nerr->message);
453 nih_free(nerr);
454 ERROR("Error removing %s:%s", controller, path);
455 }
456 if (existed == -1)
457 INFO("cgroup removal attempt: %s:%s did not exist", controller, path);
458 }
459
460 static void *cgm_init(const char *name)
461 {
462 struct cgm_data *d;
463
464 if (!cgm_dbus_connect()) {
465 ERROR("Error connecting to cgroup manager");
466 return NULL;
467 }
468 d = malloc(sizeof(*d));
469 if (!d) {
470 cgm_dbus_disconnect();
471 return NULL;
472 }
473
474 memset(d, 0, sizeof(*d));
475 d->name = strdup(name);
476 if (!d->name) {
477 cgm_dbus_disconnect();
478 goto err1;
479 }
480
481 /* if we are running as root, use system cgroup pattern, otherwise
482 * just create a cgroup under the current one. But also fall back to
483 * that if for some reason reading the configuration fails and no
484 * default value is available
485 */
486 if (geteuid() == 0)
487 d->cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
488 if (!d->cgroup_pattern)
489 d->cgroup_pattern = "%n";
490 // cgm_create immediately gets called so keep the connection open
491 return d;
492
493 err1:
494 free(d);
495 return NULL;
496 }
497
498 /* Called after a failed container startup */
499 static void cgm_destroy(void *hdata)
500 {
501 struct cgm_data *d = hdata;
502 char **slist = subsystems;
503 int i;
504
505 if (!d || !d->cgroup_path)
506 return;
507 if (!cgm_dbus_connect()) {
508 ERROR("Error connecting to cgroup manager");
509 return;
510 }
511
512 if (cgm_supports_multiple_controllers())
513 slist = subsystems_inone;
514 for (i = 0; slist[i]; i++)
515 cgm_remove_cgroup(slist[i], d->cgroup_path);
516
517 free(d->name);
518 if (d->cgroup_path)
519 free(d->cgroup_path);
520 free(d);
521 cgm_dbus_disconnect();
522 }
523
524 /*
525 * remove all the cgroups created
526 * called internally with dbus connection open
527 */
528 static inline void cleanup_cgroups(char *path)
529 {
530 int i;
531 char **slist = subsystems;
532
533 if (cgm_supports_multiple_controllers())
534 slist = subsystems_inone;
535 for (i = 0; slist[i]; i++)
536 cgm_remove_cgroup(slist[i], path);
537 }
538
539 static inline bool cgm_create(void *hdata)
540 {
541 struct cgm_data *d = hdata;
542 char **slist = subsystems;
543 int i, index=0, baselen, ret;
544 int32_t existed;
545 char result[MAXPATHLEN], *tmp, *cgroup_path;
546
547 if (!d)
548 return false;
549 // XXX we should send a hint to the cgmanager that when these
550 // cgroups become empty they should be deleted. Requires a cgmanager
551 // extension
552
553 memset(result, 0, MAXPATHLEN);
554 tmp = lxc_string_replace("%n", d->name, d->cgroup_pattern);
555 if (!tmp)
556 goto bad;
557 if (strlen(tmp) >= MAXPATHLEN) {
558 free(tmp);
559 goto bad;
560 }
561 strcpy(result, tmp);
562 baselen = strlen(result);
563 free(tmp);
564 tmp = result;
565 while (*tmp == '/')
566 tmp++;
567 again:
568 if (index == 100) { // turn this into a warn later
569 ERROR("cgroup error? 100 cgroups with this name already running");
570 goto bad;
571 }
572 if (index) {
573 ret = snprintf(result+baselen, MAXPATHLEN-baselen, "-%d", index);
574 if (ret < 0 || ret >= MAXPATHLEN-baselen)
575 goto bad;
576 }
577 existed = 0;
578
579 if (cgm_supports_multiple_controllers())
580 slist = subsystems_inone;
581
582 for (i = 0; slist[i]; i++) {
583 if (!lxc_cgmanager_create(slist[i], tmp, &existed)) {
584 ERROR("Error creating cgroup %s:%s", slist[i], result);
585 cleanup_cgroups(tmp);
586 goto bad;
587 }
588 if (existed == 1)
589 goto next;
590 }
591 // success
592 cgroup_path = strdup(tmp);
593 if (!cgroup_path) {
594 cleanup_cgroups(tmp);
595 goto bad;
596 }
597 d->cgroup_path = cgroup_path;
598 cgm_dbus_disconnect();
599 return true;
600
601 next:
602 index++;
603 goto again;
604 bad:
605 cgm_dbus_disconnect();
606 return false;
607 }
608
609 /*
610 * Use the cgmanager to move a task into a cgroup for a particular
611 * hierarchy.
612 * All the subsystems in this hierarchy are co-mounted, so we only
613 * need to transition the task into one of the cgroups
614 *
615 * Internal helper, must be called with cgmanager dbus socket open
616 */
617 static bool lxc_cgmanager_enter(pid_t pid, const char *controller,
618 const char *cgroup_path, bool abs)
619 {
620 int ret;
621
622 if (abs)
623 ret = cgmanager_move_pid_abs_sync(NULL, cgroup_manager,
624 controller, cgroup_path, pid);
625 else
626 ret = cgmanager_move_pid_sync(NULL, cgroup_manager,
627 controller, cgroup_path, pid);
628 if (ret != 0) {
629 NihError *nerr;
630 nerr = nih_error_get();
631 ERROR("call to cgmanager_move_pid_%ssync failed: %s",
632 abs ? "abs_" : "", nerr->message);
633 nih_free(nerr);
634 return false;
635 }
636 return true;
637 }
638
639 /* Internal helper, must be called with cgmanager dbus socket open */
640 static bool do_cgm_enter(pid_t pid, const char *cgroup_path, bool abs)
641 {
642 char **slist = subsystems;
643 int i;
644
645 if (cgm_supports_multiple_controllers())
646 slist = subsystems_inone;
647
648 for (i = 0; slist[i]; i++) {
649 if (!lxc_cgmanager_enter(pid, slist[i], cgroup_path, abs))
650 return false;
651 }
652 return true;
653 }
654
655 static inline bool cgm_enter(void *hdata, pid_t pid)
656 {
657 struct cgm_data *d = hdata;
658 bool ret = false;
659
660 if (!cgm_dbus_connect()) {
661 ERROR("Error connecting to cgroup manager");
662 return false;
663 }
664 if (!d || !d->cgroup_path)
665 goto out;
666 if (do_cgm_enter(pid, d->cgroup_path, false))
667 ret = true;
668 out:
669 cgm_dbus_disconnect();
670 return ret;
671 }
672
673 static const char *cgm_get_cgroup(void *hdata, const char *subsystem)
674 {
675 struct cgm_data *d = hdata;
676
677 if (!d || !d->cgroup_path)
678 return NULL;
679 return d->cgroup_path;
680 }
681
#if !HAVE_CGMANAGER_GET_PID_CGROUP_ABS_SYNC
/* The client library lacks the call: make every use of it fail. */
#define cgmanager_get_pid_cgroup_abs_sync(...) -1
#endif

/*
 * True when absolute cgroup lookups can be used: the client library must
 * provide cgmanager_get_pid_cgroup_abs_sync() and the connected daemon's
 * API version must be at least CGM_SUPPORTS_GET_ABS.
 */
static inline bool abs_cgroup_supported(void) {
#if HAVE_CGMANAGER_GET_PID_CGROUP_ABS_SYNC
	return api_version >= CGM_SUPPORTS_GET_ABS;
#else
	return false;
#endif
}
692
693 static char *try_get_abs_cgroup(const char *name, const char *lxcpath,
694 const char *controller)
695 {
696 char *cgroup = NULL;
697
698 if (abs_cgroup_supported()) {
699 /* get the container init pid and ask for its abs cgroup */
700 pid_t pid = lxc_cmd_get_init_pid(name, lxcpath);
701 if (pid < 0)
702 return NULL;
703 if (cgmanager_get_pid_cgroup_abs_sync(NULL, cgroup_manager,
704 controller, pid, &cgroup) != 0) {
705 cgroup = NULL;
706 NihError *nerr;
707 nerr = nih_error_get();
708 nih_free(nerr);
709 }
710 return cgroup;
711 }
712
713 /* use the command interface to look for the cgroup */
714 return lxc_cmd_get_cgroup_path(name, lxcpath, controller);
715 }
716
717 /*
718 * nrtasks is called by the utmp helper by the container monitor.
719 * cgmanager socket was closed after cgroup setup was complete, so we need
720 * to reopen here.
721 *
722 * Return -1 on error.
723 */
724 static int cgm_get_nrtasks(void *hdata)
725 {
726 struct cgm_data *d = hdata;
727 int32_t *pids;
728 size_t pids_len;
729
730 if (!d || !d->cgroup_path)
731 return -1;
732
733 if (!cgm_dbus_connect()) {
734 ERROR("Error connecting to cgroup manager");
735 return -1;
736 }
737 if (cgmanager_get_tasks_sync(NULL, cgroup_manager, subsystems[0],
738 d->cgroup_path, &pids, &pids_len) != 0) {
739 NihError *nerr;
740 nerr = nih_error_get();
741 ERROR("call to cgmanager_get_tasks_sync failed: %s", nerr->message);
742 nih_free(nerr);
743 pids_len = -1;
744 goto out;
745 }
746 nih_free(pids);
747 out:
748 cgm_dbus_disconnect();
749 return pids_len;
750 }
751
/*
 * Release a string returned by try_get_abs_cgroup().  The allocator must
 * match the lookup path that produced it: nih_free() when absolute lookups
 * are supported, free() otherwise.  NULL is accepted and ignored.
 */
static inline void free_abs_cgroup(char *cgroup)
{
	if (cgroup == NULL)
		return;

	if (!abs_cgroup_supported()) {
		free(cgroup);
		return;
	}
	nih_free(cgroup);
}
761
762 static void do_cgm_get(const char *name, const char *lxcpath, const char *filename, int outp, bool sendvalue)
763 {
764 char *controller, *key, *cgroup = NULL, *cglast;
765 int len = -1;
766 int ret;
767 nih_local char *result = NULL;
768
769 controller = alloca(strlen(filename)+1);
770 strcpy(controller, filename);
771 key = strchr(controller, '.');
772 if (!key) {
773 ret = write(outp, &len, sizeof(len));
774 if (ret != sizeof(len))
775 WARN("Failed to warn cgm_get of error; parent may hang");
776 exit(1);
777 }
778 *key = '\0';
779
780 if (!cgm_dbus_connect()) {
781 ERROR("Error connecting to cgroup manager");
782 ret = write(outp, &len, sizeof(len));
783 if (ret != sizeof(len))
784 WARN("Failed to warn cgm_get of error; parent may hang");
785 exit(1);
786 }
787 cgroup = try_get_abs_cgroup(name, lxcpath, subsystems[0]);
788 if (!cgroup) {
789 cgm_dbus_disconnect();
790 ret = write(outp, &len, sizeof(len));
791 if (ret != sizeof(len))
792 WARN("Failed to warn cgm_get of error; parent may hang");
793 exit(1);
794 }
795 cglast = strrchr(cgroup, '/');
796 if (!cglast) {
797 cgm_dbus_disconnect();
798 free_abs_cgroup(cgroup);
799 ret = write(outp, &len, sizeof(len));
800 if (ret != sizeof(len))
801 WARN("Failed to warn cgm_get of error; parent may hang");
802 exit(1);
803 }
804 *cglast = '\0';
805 if (!lxc_cgmanager_enter(getpid(), controller, cgroup, abs_cgroup_supported())) {
806 ERROR("Failed to enter container cgroup %s:%s", controller, cgroup);
807 ret = write(outp, &len, sizeof(len));
808 if (ret != sizeof(len))
809 WARN("Failed to warn cgm_get of error; parent may hang");
810 cgm_dbus_disconnect();
811 free_abs_cgroup(cgroup);
812 exit(1);
813 }
814 if (cgmanager_get_value_sync(NULL, cgroup_manager, controller, cglast+1, filename, &result) != 0) {
815 NihError *nerr;
816 nerr = nih_error_get();
817 nih_free(nerr);
818 free_abs_cgroup(cgroup);
819 cgm_dbus_disconnect();
820 ret = write(outp, &len, sizeof(len));
821 if (ret != sizeof(len))
822 WARN("Failed to warn cgm_get of error; parent may hang");
823 exit(1);
824 }
825 free_abs_cgroup(cgroup);
826 cgm_dbus_disconnect();
827 len = strlen(result);
828 ret = write(outp, &len, sizeof(len));
829 if (ret != sizeof(len)) {
830 WARN("Failed to send length to parent");
831 exit(1);
832 }
833 if (!len || !sendvalue) {
834 exit(0);
835 }
836 ret = write(outp, result, len);
837 if (ret < 0)
838 exit(1);
839 exit(0);
840 }
841
/*
 * cgm_get is called to get container cgroup settings, not during startup.
 *
 * Forks a child (do_cgm_get) that fetches @filename for container @name
 * and reports back over a pipe: first the length, then the data.
 *
 * When @value is NULL or @len is 0 only the reported length is returned.
 * Otherwise up to @len bytes are copied into @value, which is always
 * NUL-terminated; a trailing newline is appended when at least two spare
 * bytes remain, because cgmanager does not terminate the last entry.
 *
 * Returns the resulting length, or -1 on error.
 */
static int cgm_get(const char *filename, char *value, size_t len, const char *name, const char *lxcpath)
{
	pid_t pid;
	int p[2], newlen, readlen, nread;
	int ret = -1;

	if (pipe(p) < 0)
		return -1;

	pid = fork();
	if (pid < 0) {
		close(p[0]);
		close(p[1]);
		return -1;
	}
	if (pid == 0) // do_cgm_get exits
		do_cgm_get(name, lxcpath, filename, p[1], len && value);

	close(p[1]);

	nread = read(p[0], &newlen, sizeof(newlen));
	if (nread != sizeof(newlen))
		goto out;

	/* caller only wants to know the length */
	if (!len || !value) {
		ret = newlen;
		goto out;
	}

	memset(value, 0, len);
	if (newlen < 0) // child is reporting an error
		goto out;
	if (newlen == 0) { // empty read
		ret = 0;
		goto out;
	}

	readlen = newlen > len ? len : newlen;
	nread = read(p[0], value, readlen);
	if (nread != readlen)
		goto out;

	if (newlen >= len) {
		/* truncated: sacrifice the last byte for the terminator */
		value[len-1] = '\0';
		newlen = len-1;
	} else if (newlen+1 < len) {
		// cgmanager doesn't add eol to last entry
		value[newlen++] = '\n';
		value[newlen] = '\0';
	}
	ret = newlen;

out:
	close(p[0]);
	if (wait_for_pid(pid))
		WARN("do_cgm_get exited with error");
	return ret;
}
901
902 static void do_cgm_set(const char *name, const char *lxcpath, const char *filename, const char *value, int outp)
903 {
904 char *controller, *key, *cgroup = NULL;
905 int retval = 0; // value we are sending to the parent over outp
906 int ret;
907 char *cglast;
908
909 controller = alloca(strlen(filename)+1);
910 strcpy(controller, filename);
911 key = strchr(controller, '.');
912 if (!key) {
913 ret = write(outp, &retval, sizeof(retval));
914 if (ret != sizeof(retval))
915 WARN("Failed to warn cgm_set of error; parent may hang");
916 exit(1);
917 }
918 *key = '\0';
919
920 if (!cgm_dbus_connect()) {
921 ERROR("Error connecting to cgroup manager");
922 ret = write(outp, &retval, sizeof(retval));
923 if (ret != sizeof(retval))
924 WARN("Failed to warn cgm_set of error; parent may hang");
925 exit(1);
926 }
927 cgroup = try_get_abs_cgroup(name, lxcpath, subsystems[0]);
928 if (!cgroup) {
929 cgm_dbus_disconnect();
930 ret = write(outp, &retval, sizeof(retval));
931 if (ret != sizeof(retval))
932 WARN("Failed to warn cgm_set of error; parent may hang");
933 exit(1);
934 }
935 cglast = strrchr(cgroup, '/');
936 if (!cglast) {
937 cgm_dbus_disconnect();
938 free_abs_cgroup(cgroup);
939 ret = write(outp, &retval, sizeof(retval));
940 if (ret != sizeof(retval))
941 WARN("Failed to warn cgm_set of error; parent may hang");
942 exit(1);
943 }
944 *cglast = '\0';
945 if (!lxc_cgmanager_enter(getpid(), controller, cgroup, abs_cgroup_supported())) {
946 ERROR("Failed to enter container cgroup %s:%s", controller, cgroup);
947 ret = write(outp, &retval, sizeof(retval));
948 if (ret != sizeof(retval))
949 WARN("Failed to warn cgm_set of error; parent may hang");
950 cgm_dbus_disconnect();
951 free_abs_cgroup(cgroup);
952 exit(1);
953 }
954 if (cgmanager_set_value_sync(NULL, cgroup_manager, controller, cglast+1, filename, value) != 0) {
955 NihError *nerr;
956 nerr = nih_error_get();
957 ERROR("Error setting cgroup value %s for %s:%s", filename, controller, cgroup);
958 ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message);
959 nih_free(nerr);
960 free_abs_cgroup(cgroup);
961 cgm_dbus_disconnect();
962 ret = write(outp, &retval, sizeof(retval));
963 if (ret != sizeof(retval))
964 WARN("Failed to warn cgm_set of error; parent may hang");
965 exit(1);
966 }
967 free_abs_cgroup(cgroup);
968 cgm_dbus_disconnect();
969 /* tell parent that we are done */
970 retval = 1;
971 ret = write(outp, &retval, sizeof(retval));
972 if (ret != sizeof(retval)) {
973 exit(1);
974 }
975 exit(0);
976 }
977
/*
 * cgm_set is called to change cgroup settings, not during startup.
 *
 * Forks a child (do_cgm_set) that writes @value to @filename for container
 * @name and reports a single int over a pipe: non-zero on success.
 *
 * Returns 0 on success, -1 on any failure.
 */
static int cgm_set(const char *filename, const char *value, const char *name, const char *lxcpath)
{
	pid_t pid;
	int p[2], v, nread;

	if (pipe(p) < 0)
		return -1;

	pid = fork();
	if (pid < 0) {
		close(p[1]);
		close(p[0]);
		return -1;
	}
	if (pid == 0) // do_cgm_set exits
		do_cgm_set(name, lxcpath, filename, value, p[1]);

	close(p[1]);
	nread = read(p[0], &v, sizeof(v));
	close(p[0]);

	if (wait_for_pid(pid))
		WARN("do_cgm_set exited with error");

	/* v is only inspected when a full int was received */
	if (nread == sizeof(v) && v)
		return 0;
	return -1;
}
1002
1003 static void free_subsystems(void)
1004 {
1005 int i;
1006
1007 for (i = 0; i < nr_subsystems; i++)
1008 free(subsystems[i]);
1009 free(subsystems);
1010 subsystems = NULL;
1011 nr_subsystems = 0;
1012 }
1013
1014 static void cull_user_controllers(void)
1015 {
1016 int i, j;
1017
1018 for (i = 0; i < nr_subsystems; i++) {
1019 if (strncmp(subsystems[i], "name=", 5) != 0)
1020 continue;
1021 for (j = i; j < nr_subsystems-1; j++)
1022 subsystems[j] = subsystems[j+1];
1023 nr_subsystems--;
1024 }
1025 }
1026
1027 static bool collect_subsytems(void)
1028 {
1029 char *line = NULL;
1030 size_t sz = 0;
1031 FILE *f;
1032
1033 if (subsystems) // already initialized
1034 return true;
1035
1036 subsystems_inone = malloc(2 * sizeof(char *));
1037 if (!subsystems_inone)
1038 return false;
1039 subsystems_inone[0] = "all";
1040 subsystems_inone[1] = NULL;
1041
1042 f = fopen_cloexec("/proc/self/cgroup", "r");
1043 if (!f) {
1044 f = fopen_cloexec("/proc/1/cgroup", "r");
1045 if (!f)
1046 return false;
1047 }
1048 while (getline(&line, &sz, f) != -1) {
1049 /* file format: hierarchy:subsystems:group,
1050 * with multiple subsystems being ,-separated */
1051 char *slist, *end, *p, *saveptr = NULL, **tmp;
1052
1053 if (!line[0])
1054 continue;
1055
1056 slist = strchr(line, ':');
1057 if (!slist)
1058 continue;
1059 slist++;
1060 end = strchr(slist, ':');
1061 if (!end)
1062 continue;
1063 *end = '\0';
1064
1065 for (p = strtok_r(slist, ",", &saveptr);
1066 p;
1067 p = strtok_r(NULL, ",", &saveptr)) {
1068 tmp = realloc(subsystems, (nr_subsystems+2)*sizeof(char *));
1069 if (!tmp)
1070 goto out_free;
1071
1072 subsystems = tmp;
1073 tmp[nr_subsystems] = strdup(p);
1074 tmp[nr_subsystems+1] = NULL;
1075 if (!tmp[nr_subsystems])
1076 goto out_free;
1077 nr_subsystems++;
1078 }
1079 }
1080 fclose(f);
1081
1082 free(line);
1083 if (!nr_subsystems) {
1084 ERROR("No cgroup subsystems found");
1085 return false;
1086 }
1087
1088 return true;
1089
1090 out_free:
1091 free(line);
1092 fclose(f);
1093 free_subsystems();
1094 return false;
1095 }
1096
1097 /*
1098 * called during cgroup.c:cgroup_ops_init(), at startup. No threads.
1099 * We check whether we can talk to cgmanager, escape to root cgroup if
1100 * we are root, then close the connection.
1101 */
1102 struct cgroup_ops *cgm_ops_init(void)
1103 {
1104 if (!collect_subsytems())
1105 return NULL;
1106 if (!cgm_dbus_connect())
1107 goto err1;
1108
1109 // root; try to escape to root cgroup
1110 if (geteuid() == 0 && !lxc_cgmanager_escape())
1111 goto err2;
1112 cgm_dbus_disconnect();
1113
1114 return &cgmanager_ops;
1115
1116 err2:
1117 cgm_dbus_disconnect();
1118 err1:
1119 free_subsystems();
1120 return NULL;
1121 }
1122
1123 /* unfreeze is called by the command api after killing a container. */
1124 static bool cgm_unfreeze(void *hdata)
1125 {
1126 struct cgm_data *d = hdata;
1127 bool ret = true;
1128
1129 if (!d || !d->cgroup_path)
1130 return false;
1131
1132 if (!cgm_dbus_connect()) {
1133 ERROR("Error connecting to cgroup manager");
1134 return false;
1135 }
1136 if (cgmanager_set_value_sync(NULL, cgroup_manager, "freezer", d->cgroup_path,
1137 "freezer.state", "THAWED") != 0) {
1138 NihError *nerr;
1139 nerr = nih_error_get();
1140 ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message);
1141 nih_free(nerr);
1142 ERROR("Error unfreezing %s", d->cgroup_path);
1143 ret = false;
1144 }
1145 cgm_dbus_disconnect();
1146 return ret;
1147 }
1148
1149 static bool cgm_setup_limits(void *hdata, struct lxc_list *cgroup_settings, bool do_devices)
1150 {
1151 struct cgm_data *d = hdata;
1152 struct lxc_list *iterator;
1153 struct lxc_cgroup *cg;
1154 bool ret = false;
1155
1156 if (lxc_list_empty(cgroup_settings))
1157 return true;
1158
1159 if (!d || !d->cgroup_path)
1160 return false;
1161
1162 if (!cgm_dbus_connect()) {
1163 ERROR("Error connecting to cgroup manager");
1164 return false;
1165 }
1166
1167 lxc_list_for_each(iterator, cgroup_settings) {
1168 char controller[100], *p;
1169 cg = iterator->elem;
1170 if (do_devices != !strncmp("devices", cg->subsystem, 7))
1171 continue;
1172 if (strlen(cg->subsystem) > 100) // i smell a rat
1173 goto out;
1174 strcpy(controller, cg->subsystem);
1175 p = strchr(controller, '.');
1176 if (p)
1177 *p = '\0';
1178 if (cgmanager_set_value_sync(NULL, cgroup_manager, controller,
1179 d->cgroup_path, cg->subsystem, cg->value) != 0) {
1180 NihError *nerr;
1181 nerr = nih_error_get();
1182 ERROR("call to cgmanager_set_value_sync failed: %s", nerr->message);
1183 nih_free(nerr);
1184 ERROR("Error setting cgroup %s:%s limit type %s", controller,
1185 d->cgroup_path, cg->subsystem);
1186 goto out;
1187 }
1188
1189 DEBUG("cgroup '%s' set to '%s'", cg->subsystem, cg->value);
1190 }
1191
1192 ret = true;
1193 INFO("cgroup limits have been setup");
1194 out:
1195 cgm_dbus_disconnect();
1196 return ret;
1197 }
1198
1199 static bool cgm_chown(void *hdata, struct lxc_conf *conf)
1200 {
1201 struct cgm_data *d = hdata;
1202
1203 if (!d || !d->cgroup_path)
1204 return false;
1205 if (!cgm_dbus_connect()) {
1206 ERROR("Error connecting to cgroup manager");
1207 return false;
1208 }
1209 if (!chown_cgroup(d->cgroup_path, conf))
1210 WARN("Failed to chown %s to container root", d->cgroup_path);
1211 cgm_dbus_disconnect();
1212 return true;
1213 }
1214
1215 /*
1216 * TODO: this should be re-written to use the get_config_item("lxc.id_map")
1217 * cmd api instead of getting the idmap from c->lxc_conf. The reason is
1218 * that the id_maps may be different if the container was started with a
1219 * -f or -s argument.
1220 * The reason I'm punting on that is because we'll need to parse the
1221 * idmap results.
1222 */
1223 static bool cgm_attach(const char *name, const char *lxcpath, pid_t pid)
1224 {
1225 bool pass;
1226 char *cgroup = NULL;
1227
1228 if (!cgm_dbus_connect()) {
1229 ERROR("Error connecting to cgroup manager");
1230 return false;
1231 }
1232 // cgm_create makes sure that we have the same cgroup name for all
1233 // subsystems, so since this is a slow command over the cmd socket,
1234 // just get the cgroup name for the first one.
1235 cgroup = try_get_abs_cgroup(name, lxcpath, subsystems[0]);
1236 if (!cgroup) {
1237 ERROR("Failed to get cgroup for controller %s", subsystems[0]);
1238 cgm_dbus_disconnect();
1239 return false;
1240 }
1241
1242 pass = do_cgm_enter(pid, cgroup, abs_cgroup_supported());
1243 cgm_dbus_disconnect();
1244 if (!pass)
1245 ERROR("Failed to enter group %s", cgroup);
1246
1247 free_abs_cgroup(cgroup);
1248 return pass;
1249 }
1250
1251 static bool cgm_bind_dir(const char *root, const char *dirname)
1252 {
1253 nih_local char *cgpath = NULL;
1254
1255 /* /sys should have been mounted by now */
1256 cgpath = NIH_MUST( nih_strdup(NULL, root) );
1257 NIH_MUST( nih_strcat(&cgpath, NULL, "/sys/fs/cgroup") );
1258
1259 if (!dir_exists(cgpath)) {
1260 ERROR("%s does not exist", cgpath);
1261 return false;
1262 }
1263
1264 /* mount a tmpfs there so we can create subdirs */
1265 if (mount("cgroup", cgpath, "tmpfs", 0, "size=10000,mode=755")) {
1266 SYSERROR("Failed to mount tmpfs at %s", cgpath);
1267 return false;
1268 }
1269 NIH_MUST( nih_strcat(&cgpath, NULL, "/cgmanager") );
1270
1271 if (mkdir(cgpath, 0755) < 0) {
1272 SYSERROR("Failed to create %s", cgpath);
1273 return false;
1274 }
1275
1276 if (mount(dirname, cgpath, "none", MS_BIND, 0)) {
1277 SYSERROR("Failed to bind mount %s to %s", dirname, cgpath);
1278 return false;
1279 }
1280
1281 return true;
1282 }
1283
/*
 * cgm_mount_cgroup:
 * Pick the host directory to bind mount onto /sys/fs/cgroup/cgmanager/
 * in the container:
 *   - /sys/fs/cgroup/cgmanager.lower/ if it exists,
 *   - otherwise /sys/fs/cgroup/cgmanager if it exists.
 * Returns the result of cgm_bind_dir() for the chosen directory, or false
 * when neither directory exists.  @hdata and @type are not used.
 */
#define CGMANAGER_LOWER_SOCK "/sys/fs/cgroup/cgmanager.lower"
#define CGMANAGER_UPPER_SOCK "/sys/fs/cgroup/cgmanager"
static bool cgm_mount_cgroup(void *hdata, const char *root, int type)
{
	const char *sock_dir = NULL;

	if (dir_exists(CGMANAGER_LOWER_SOCK))
		sock_dir = CGMANAGER_LOWER_SOCK;
	else if (dir_exists(CGMANAGER_UPPER_SOCK))
		sock_dir = CGMANAGER_UPPER_SOCK;

	// Host doesn't have cgmanager running?  Then how did we get here?
	if (sock_dir == NULL)
		return false;

	return cgm_bind_dir(root, sock_dir);
}
1302
1303 static struct cgroup_ops cgmanager_ops = {
1304 .init = cgm_init,
1305 .destroy = cgm_destroy,
1306 .create = cgm_create,
1307 .enter = cgm_enter,
1308 .create_legacy = NULL,
1309 .get_cgroup = cgm_get_cgroup,
1310 .get = cgm_get,
1311 .set = cgm_set,
1312 .unfreeze = cgm_unfreeze,
1313 .setup_limits = cgm_setup_limits,
1314 .name = "cgmanager",
1315 .chown = cgm_chown,
1316 .attach = cgm_attach,
1317 .mount_cgroup = cgm_mount_cgroup,
1318 .nrtasks = cgm_get_nrtasks,
1319 .disconnect = NULL,
1320 };
1321 #endif