]> git.proxmox.com Git - systemd.git/blob - src/core/cgroup.c
Imported Upstream version 218
[systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45 }
46
47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
48 assert(c);
49 assert(a);
50
51 LIST_REMOVE(device_allow, c->device_allow, a);
52 free(a->path);
53 free(a);
54 }
55
56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
57 assert(c);
58 assert(w);
59
60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
61 free(w->path);
62 free(w);
63 }
64
65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
66 assert(c);
67 assert(b);
68
69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
70 free(b->path);
71 free(b);
72 }
73
74 void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85 }
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n"
109 "%sDelegate=%s\n",
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
114 prefix, c->startup_cpu_shares,
115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
116 prefix, c->blockio_weight,
117 prefix, c->startup_blockio_weight,
118 prefix, c->memory_limit,
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
121
122 LIST_FOREACH(device_allow, a, c->device_allow)
123 fprintf(f,
124 "%sDeviceAllow=%s %s%s%s\n",
125 prefix,
126 a->path,
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
128
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
130 fprintf(f,
131 "%sBlockIODeviceWeight=%s %lu",
132 prefix,
133 w->path,
134 w->weight);
135
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
138
139 fprintf(f,
140 "%s%s=%s %s\n",
141 prefix,
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
143 b->path,
144 format_bytes(buf, sizeof(buf), b->bandwidth));
145 }
146 }
147
/* Resolves 'p' to the block device to use for blkio attributes: a block
 * device node maps to itself, any other file maps to the whole disk
 * backing the file system it lives on. Returns 0 and fills *dev on
 * success, negative errno on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                /* Virtual file system (or non-device node on one):
                 * there is no meaningful block device behind it. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: start from the device backing the file,
         * and if that is a partition, walk up to the whole disk.
         * A failure here is ignored and the partition itself is used. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 if (r < 0)
203 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
204
205 return r;
206 }
207
/* Adds a "devices.allow" entry covering every minor ("<maj>:*") of each
 * device major whose driver name in /proc/devices matches the fnmatch()
 * pattern 'name', for device type 'type' ('b' or 'c'), with access
 * modes 'acc'. Returns 0 on success (individual cgroup write failures
 * are only logged), negative errno if /proc/devices cannot be read. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices has two sections, each opened by a
                 * header line; 'good' tracks whether we are inside the
                 * section matching the requested device type. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* An empty line terminates the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                /* Each entry is "<major> <driver name>"; split at the
                 * first whitespace. */
                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match the driver name against the caller's pattern. */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                /* Kernel syntax: "<c|b> <major>:* <rwm>" — all minors. */
                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
282
/* Writes the cgroup attributes described by 'c' for the controllers
 * selected in 'mask' to the cgroup at 'path'. Write failures are logged
 * (at debug level for -ENOENT, since the controller may simply not be
 * mounted) but never propagated — this function cannot fail. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore them there */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* During startup the StartupCPUShares= value wins if
                 * set; 1024 is the kernel's default share weight. */
                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* Scale the per-second quota down to the CFS period set
                 * above; writing -1 disables the bandwidth limit. */
                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* One buffer sized for the largest of the three value
                 * formats written in this section. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* 1000 is the kernel's default blkio weight. */
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                /* Kernel syntax: "<maj>:<min> <weight>" */
                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        /* Kernel syntax: "<maj>:<min> <bytes per second>" */
                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Establish the baseline: deny everything whenever a
                 * whitelist exists or a non-default policy was chosen,
                 * otherwise allow everything. The whitelisted entries
                 * are added back below. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices that are always safe
                         * to expose. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string (subset of "rwm")
                         * from the per-entry flags. */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* "block-<name>"/"char-<name>" select by driver
                         * name from /proc/devices; everything else must
                         * be a device node path under /dev. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
439
440 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
441 CGroupControllerMask mask = 0;
442
443 /* Figure out which controllers we need */
444
445 if (c->cpu_accounting ||
446 c->cpu_shares != (unsigned long) -1 ||
447 c->startup_cpu_shares != (unsigned long) -1 ||
448 c->cpu_quota_per_sec_usec != USEC_INFINITY)
449 mask |= CGROUP_CPUACCT | CGROUP_CPU;
450
451 if (c->blockio_accounting ||
452 c->blockio_weight != (unsigned long) -1 ||
453 c->startup_blockio_weight != (unsigned long) -1 ||
454 c->blockio_device_weights ||
455 c->blockio_device_bandwidths)
456 mask |= CGROUP_BLKIO;
457
458 if (c->memory_accounting ||
459 c->memory_limit != (uint64_t) -1)
460 mask |= CGROUP_MEMORY;
461
462 if (c->device_allow ||
463 c->device_policy != CGROUP_AUTO)
464 mask |= CGROUP_DEVICE;
465
466 return mask;
467 }
468
469 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
470 CGroupContext *c;
471
472 c = unit_get_cgroup_context(u);
473 if (!c)
474 return 0;
475
476 /* If delegation is turned on, then turn on all cgroups,
477 * unless the process we fork into it is known to drop
478 * privileges anyway, and shouldn't get access to the
479 * controllers anyway. */
480
481 if (c->delegate) {
482 ExecContext *e;
483
484 e = unit_get_exec_context(u);
485 if (!e || exec_context_maintains_privileges(e))
486 return _CGROUP_CONTROLLER_MASK_ALL;
487 }
488
489 return cgroup_context_get_mask(c);
490 }
491
/* Returns the union of the controller masks needed by all units
 * contained in 'u' (non-empty only for slices), using and refreshing
 * the cached value on the unit. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Units contained in a slice are ordered after it, so
                 * walk the UNIT_BEFORE dependencies and keep only those
                 * whose parent slice is actually 'u'. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Count the member's own needs plus, recursively,
                         * those of everything inside it. */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
521
522 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
523 assert(u);
524
525 if (UNIT_ISSET(u->slice))
526 return unit_get_members_mask(UNIT_DEREF(u->slice));
527
528 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
529 }
530
531 CGroupControllerMask unit_get_target_mask(Unit *u) {
532 CGroupControllerMask mask;
533
534 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
535 mask &= u->manager->cgroup_supported;
536
537 return mask;
538 }
539
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only when bits were strictly added relative
         * to the previous (valid) mask and none removed — the one case
         * where the parent's cached value can be updated in place
         * instead of being invalidated. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
588
589 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
590 Unit *u = userdata;
591
592 assert(mask != 0);
593 assert(u);
594
595 while (u) {
596 if (u->cgroup_path &&
597 u->cgroup_realized &&
598 (u->cgroup_realized_mask & mask) == mask)
599 return u->cgroup_path;
600
601 u = UNIT_DEREF(u->slice);
602 }
603
604 return NULL;
605 }
606
607 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
608 CGroupContext *c;
609 int r;
610
611 assert(u);
612
613 c = unit_get_cgroup_context(u);
614 if (!c)
615 return 0;
616
617 if (!u->cgroup_path) {
618 _cleanup_free_ char *path = NULL;
619
620 path = unit_default_cgroup_path(u);
621 if (!path)
622 return log_oom();
623
624 r = hashmap_put(u->manager->cgroup_unit, path, u);
625 if (r < 0) {
626 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
627 return r;
628 }
629 if (r > 0) {
630 u->cgroup_path = path;
631 path = NULL;
632 }
633 }
634
635 /* First, create our own group */
636 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
637 if (r < 0)
638 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
639
640 /* Keep track that this is now realized */
641 u->cgroup_realized = true;
642 u->cgroup_realized_mask = mask;
643
644 if (u->type != UNIT_SLICE && !c->delegate) {
645
646 /* Then, possibly move things over, but not if
647 * subgroups may contain processes, which is the case
648 * for slice and delegation units. */
649 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
650 if (r < 0)
651 log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
652 }
653
654 return 0;
655 }
656
657 int unit_attach_pids_to_cgroup(Unit *u) {
658 int r;
659 assert(u);
660
661 r = unit_realize_cgroup(u);
662 if (r < 0)
663 return r;
664
665 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
666 if (r < 0)
667 return r;
668
669 return 0;
670 }
671
672 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
673 assert(u);
674
675 return u->cgroup_realized && u->cgroup_realized_mask == mask;
676 }
677
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* We are handling the unit now, so drop it from the pending
         * queue if it was sitting there. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly this controller set? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
717
718 static void unit_add_to_cgroup_queue(Unit *u) {
719
720 if (u->in_cgroup_queue)
721 return;
722
723 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
724 u->in_cgroup_queue = true;
725 }
726
727 unsigned manager_dispatch_cgroup_queue(Manager *m) {
728 ManagerState state;
729 unsigned n = 0;
730 Unit *i;
731 int r;
732
733 state = manager_state(m);
734
735 while ((i = m->cgroup_queue)) {
736 assert(i->in_cgroup_queue);
737
738 r = unit_realize_cgroup_now(i, state);
739 if (r < 0)
740 log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
741
742 n++;
743 }
744
745 return n;
746 }
747
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Units contained in a slice are ordered after it, so
                 * the slice's UNIT_BEFORE set contains its members. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Climb one level up and queue that slice's siblings too. */
                u = slice;
        }
}
785
786 int unit_realize_cgroup(Unit *u) {
787 CGroupContext *c;
788
789 assert(u);
790
791 c = unit_get_cgroup_context(u);
792 if (!c)
793 return 0;
794
795 /* So, here's the deal: when realizing the cgroups for this
796 * unit, we need to first create all parents, but there's more
797 * actually: for the weight-based controllers we also need to
798 * make sure that all our siblings (i.e. units that are in the
799 * same slice as we are) have cgroups, too. Otherwise, things
800 * would become very uneven as each of their processes would
801 * get as much resources as all our group together. This call
802 * will synchronously create the parent cgroups, but will
803 * defer work on the siblings to the next event loop
804 * iteration. */
805
806 /* Add all sibling slices to the cgroup queue. */
807 unit_queue_siblings(u);
808
809 /* And realize this one now (and apply the values) */
810 return unit_realize_cgroup_now(u, manager_state(u->manager));
811 }
812
/* Trims the unit's cgroup (removing it where possible) and drops all
 * cgroup state from the unit. If trimming fails (e.g. processes are
 * still inside) the unit's cgroup state is left untouched. */
void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* For the root slice only the children are trimmed, never the
         * cgroup itself. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        /* Drop the path → unit mapping before freeing the key string. */
        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
834
/* Scans the unit's cgroup for the single process that is a direct child
 * of the manager itself (or whose parent cannot be determined) and
 * returns its PID. Returns 0 if the cgroup is missing or unreadable,
 * or if there is no — or more than one — such candidate. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip the candidate we already picked up. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
871
/* Determines the cgroup hierarchy the manager runs in, optionally
 * installs the release agent, moves us into the root cgroup, pins the
 * cgroupfs mount against unmounting and probes which controllers the
 * kernel supports. Returns 0 on success, negative errno on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}
945
946 void manager_shutdown_cgroup(Manager *m, bool delete) {
947 assert(m);
948
949 /* We can't really delete the group, since we are in it. But
950 * let's trim it. */
951 if (delete && m->cgroup_root)
952 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
953
954 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
955
956 free(m->cgroup_root);
957 m->cgroup_root = NULL;
958 }
959
960 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
961 char *p;
962 Unit *u;
963
964 assert(m);
965 assert(cgroup);
966
967 u = hashmap_get(m->cgroup_unit, cgroup);
968 if (u)
969 return u;
970
971 p = strdupa(cgroup);
972 for (;;) {
973 char *e;
974
975 e = strrchr(p, '/');
976 if (e == p || !e)
977 return NULL;
978
979 *e = 0;
980
981 u = hashmap_get(m->cgroup_unit, p);
982 if (u)
983 return u;
984 }
985 }
986
987 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
988 _cleanup_free_ char *cgroup = NULL;
989 int r;
990
991 assert(m);
992
993 if (pid <= 1)
994 return NULL;
995
996 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
997 if (r < 0)
998 return NULL;
999
1000 return manager_get_unit_by_cgroup(m, cgroup);
1001 }
1002
1003 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1004 Unit *u;
1005 int r;
1006
1007 assert(m);
1008 assert(cgroup);
1009
1010 u = manager_get_unit_by_cgroup(m, cgroup);
1011 if (u) {
1012 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1013 if (r > 0) {
1014 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1015 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1016
1017 unit_add_to_gc_queue(u);
1018 }
1019 }
1020
1021 return 0;
1022 }
1023
/* String names for CGroupDevicePolicy, used when parsing and dumping
 * the DevicePolicy= unit file setting. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

/* Generates cgroup_device_policy_to_string()/_from_string() from the table. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);