]> git.proxmox.com Git - systemd.git/blob - src/core/cgroup.c
Imported Upstream version 218
[systemd.git] / src / core / cgroup.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2013 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <fcntl.h>
23 #include <fnmatch.h>
24
25 #include "path-util.h"
26 #include "special.h"
27 #include "cgroup-util.h"
28 #include "cgroup.h"
29
30 #define CGROUP_CPU_QUOTA_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
31
32 void cgroup_context_init(CGroupContext *c) {
33 assert(c);
34
35 /* Initialize everything to the kernel defaults, assuming the
36 * structure is preinitialized to 0 */
37
38 c->cpu_shares = (unsigned long) -1;
39 c->startup_cpu_shares = (unsigned long) -1;
40 c->memory_limit = (uint64_t) -1;
41 c->blockio_weight = (unsigned long) -1;
42 c->startup_blockio_weight = (unsigned long) -1;
43
44 c->cpu_quota_per_sec_usec = USEC_INFINITY;
45 }
46
47 void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
48 assert(c);
49 assert(a);
50
51 LIST_REMOVE(device_allow, c->device_allow, a);
52 free(a->path);
53 free(a);
54 }
55
56 void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
57 assert(c);
58 assert(w);
59
60 LIST_REMOVE(device_weights, c->blockio_device_weights, w);
61 free(w->path);
62 free(w);
63 }
64
65 void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
66 assert(c);
67 assert(b);
68
69 LIST_REMOVE(device_bandwidths, c->blockio_device_bandwidths, b);
70 free(b->path);
71 free(b);
72 }
73
74 void cgroup_context_done(CGroupContext *c) {
75 assert(c);
76
77 while (c->blockio_device_weights)
78 cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
79
80 while (c->blockio_device_bandwidths)
81 cgroup_context_free_blockio_device_bandwidth(c, c->blockio_device_bandwidths);
82
83 while (c->device_allow)
84 cgroup_context_free_device_allow(c, c->device_allow);
85 }
86
87 void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) {
88 CGroupBlockIODeviceBandwidth *b;
89 CGroupBlockIODeviceWeight *w;
90 CGroupDeviceAllow *a;
91 char u[FORMAT_TIMESPAN_MAX];
92
93 assert(c);
94 assert(f);
95
96 prefix = strempty(prefix);
97
98 fprintf(f,
99 "%sCPUAccounting=%s\n"
100 "%sBlockIOAccounting=%s\n"
101 "%sMemoryAccounting=%s\n"
102 "%sCPUShares=%lu\n"
103 "%sStartupCPUShares=%lu\n"
104 "%sCPUQuotaPerSecSec=%s\n"
105 "%sBlockIOWeight=%lu\n"
106 "%sStartupBlockIOWeight=%lu\n"
107 "%sMemoryLimit=%" PRIu64 "\n"
108 "%sDevicePolicy=%s\n"
109 "%sDelegate=%s\n",
110 prefix, yes_no(c->cpu_accounting),
111 prefix, yes_no(c->blockio_accounting),
112 prefix, yes_no(c->memory_accounting),
113 prefix, c->cpu_shares,
114 prefix, c->startup_cpu_shares,
115 prefix, format_timespan(u, sizeof(u), c->cpu_quota_per_sec_usec, 1),
116 prefix, c->blockio_weight,
117 prefix, c->startup_blockio_weight,
118 prefix, c->memory_limit,
119 prefix, cgroup_device_policy_to_string(c->device_policy),
120 prefix, yes_no(c->delegate));
121
122 LIST_FOREACH(device_allow, a, c->device_allow)
123 fprintf(f,
124 "%sDeviceAllow=%s %s%s%s\n",
125 prefix,
126 a->path,
127 a->r ? "r" : "", a->w ? "w" : "", a->m ? "m" : "");
128
129 LIST_FOREACH(device_weights, w, c->blockio_device_weights)
130 fprintf(f,
131 "%sBlockIODeviceWeight=%s %lu",
132 prefix,
133 w->path,
134 w->weight);
135
136 LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
137 char buf[FORMAT_BYTES_MAX];
138
139 fprintf(f,
140 "%s%s=%s %s\n",
141 prefix,
142 b->read ? "BlockIOReadBandwidth" : "BlockIOWriteBandwidth",
143 b->path,
144 format_bytes(buf, sizeof(buf), b->bandwidth));
145 }
146 }
147
/* Resolves 'p' to the block device to use for blkio attributes: a block
 * device node maps to itself, any other file maps to the whole disk
 * backing the file system it lives on. Returns 0 and fills *dev on
 * success, negative errno on failure. */
static int lookup_blkio_device(const char *p, dev_t *dev) {
        struct stat st;

        assert(p);
        assert(dev);

        if (stat(p, &st) < 0)
                return log_warning_errno(errno, "Couldn't stat device %s: %m", p);

        if (S_ISBLK(st.st_mode)) {
                *dev = st.st_rdev;
                return 0;
        }

        if (major(st.st_dev) == 0) {
                /* Virtual file system (or non-device node on one):
                 * there is no meaningful block device behind it. */
                log_warning("%s is not a block device and file system block device cannot be determined or is not local.", p);
                return -ENODEV;
        }

        /* Not a device node: start from the device backing the file,
         * and if that is a partition, walk up to the whole disk.
         * A failure here is ignored and the partition itself is used. */
        *dev = st.st_dev;
        block_get_whole_disk(*dev, dev);

        return 0;
}
176
177 static int whitelist_device(const char *path, const char *node, const char *acc) {
178 char buf[2+DECIMAL_STR_MAX(dev_t)*2+2+4];
179 struct stat st;
180 int r;
181
182 assert(path);
183 assert(acc);
184
185 if (stat(node, &st) < 0) {
186 log_warning("Couldn't stat device %s", node);
187 return -errno;
188 }
189
190 if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
191 log_warning("%s is not a device.", node);
192 return -ENODEV;
193 }
194
195 sprintf(buf,
196 "%c %u:%u %s",
197 S_ISCHR(st.st_mode) ? 'c' : 'b',
198 major(st.st_rdev), minor(st.st_rdev),
199 acc);
200
201 r = cg_set_attribute("devices", path, "devices.allow", buf);
202 if (r < 0)
203 log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
204
205 return r;
206 }
207
/* Adds a "devices.allow" entry covering every minor ("<maj>:*") of each
 * device major whose driver name in /proc/devices matches the fnmatch()
 * pattern 'name', for device type 'type' ('b' or 'c'), with access
 * modes 'acc'. Returns 0 on success (individual cgroup write failures
 * are only logged), negative errno if /proc/devices cannot be read. */
static int whitelist_major(const char *path, const char *name, char type, const char *acc) {
        _cleanup_fclose_ FILE *f = NULL;
        char line[LINE_MAX];
        bool good = false;
        int r;

        assert(path);
        assert(acc);
        assert(type == 'b' || type == 'c');

        f = fopen("/proc/devices", "re");
        if (!f)
                return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type);

        FOREACH_LINE(line, f, goto fail) {
                char buf[2+DECIMAL_STR_MAX(unsigned)+3+4], *p, *w;
                unsigned maj;

                truncate_nl(line);

                /* /proc/devices has two sections, each opened by a
                 * header line; 'good' tracks whether we are inside the
                 * section matching the requested device type. */
                if (type == 'c' && streq(line, "Character devices:")) {
                        good = true;
                        continue;
                }

                if (type == 'b' && streq(line, "Block devices:")) {
                        good = true;
                        continue;
                }

                /* An empty line terminates the current section. */
                if (isempty(line)) {
                        good = false;
                        continue;
                }

                if (!good)
                        continue;

                p = strstrip(line);

                /* Each entry is "<major> <driver name>"; split at the
                 * first whitespace. */
                w = strpbrk(p, WHITESPACE);
                if (!w)
                        continue;
                *w = 0;

                r = safe_atou(p, &maj);
                if (r < 0)
                        continue;
                if (maj <= 0)
                        continue;

                w++;
                w += strspn(w, WHITESPACE);

                /* Match the driver name against the caller's pattern. */
                if (fnmatch(name, w, 0) != 0)
                        continue;

                /* Kernel syntax: "<c|b> <major>:* <rwm>" — all minors. */
                sprintf(buf,
                        "%c %u:* %s",
                        type,
                        maj,
                        acc);

                r = cg_set_attribute("devices", path, "devices.allow", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set devices.allow on %s: %s", path, strerror(-r));
        }

        return 0;

fail:
        log_warning_errno(errno, "Failed to read /proc/devices: %m");
        return -errno;
}
282
/* Writes the cgroup attributes described by 'c' for the controllers
 * selected in 'mask' to the cgroup at 'path'. Write failures are logged
 * (at debug level for -ENOENT, since the controller may simply not be
 * mounted) but never propagated — this function cannot fail. */
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
        bool is_root;
        int r;

        assert(c);
        assert(path);

        if (mask == 0)
                return;

        /* Some cgroup attributes are not supported on the root cgroup,
         * hence silently ignore them there */
        is_root = isempty(path) || path_equal(path, "/");

        if ((mask & CGROUP_CPU) && !is_root) {
                char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];

                /* During startup the StartupCPUShares= value wins if
                 * set; 1024 is the kernel's default share weight. */
                sprintf(buf, "%lu\n",
                        IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_cpu_shares != (unsigned long) -1 ? c->startup_cpu_shares :
                        c->cpu_shares != (unsigned long) -1 ? c->cpu_shares : 1024);
                r = cg_set_attribute("cpu", path, "cpu.shares", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.shares on %s: %s", path, strerror(-r));

                sprintf(buf, USEC_FMT "\n", CGROUP_CPU_QUOTA_PERIOD_USEC);
                r = cg_set_attribute("cpu", path, "cpu.cfs_period_us", buf);
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_period_us on %s: %s", path, strerror(-r));

                /* Scale the per-second quota down to the CFS period set
                 * above; writing -1 disables the bandwidth limit. */
                if (c->cpu_quota_per_sec_usec != USEC_INFINITY) {
                        sprintf(buf, USEC_FMT "\n", c->cpu_quota_per_sec_usec * CGROUP_CPU_QUOTA_PERIOD_USEC / USEC_PER_SEC);
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", buf);
                } else
                        r = cg_set_attribute("cpu", path, "cpu.cfs_quota_us", "-1");
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set cpu.cfs_quota_us on %s: %s", path, strerror(-r));
        }

        if (mask & CGROUP_BLKIO) {
                /* One buffer sized for the largest of the three value
                 * formats written in this section. */
                char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
                              DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
                CGroupBlockIODeviceWeight *w;
                CGroupBlockIODeviceBandwidth *b;

                if (!is_root) {
                        /* 1000 is the kernel's default blkio weight. */
                        sprintf(buf, "%lu\n", IN_SET(state, MANAGER_STARTING, MANAGER_INITIALIZING) && c->startup_blockio_weight != (unsigned long) -1 ? c->startup_blockio_weight :
                                c->blockio_weight != (unsigned long) -1 ? c->blockio_weight : 1000);
                        r = cg_set_attribute("blkio", path, "blkio.weight", buf);
                        if (r < 0)
                                log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight on %s: %s", path, strerror(-r));

                        /* FIXME: no way to reset this list */
                        LIST_FOREACH(device_weights, w, c->blockio_device_weights) {
                                dev_t dev;

                                r = lookup_blkio_device(w->path, &dev);
                                if (r < 0)
                                        continue;

                                /* Kernel syntax: "<maj>:<min> <weight>" */
                                sprintf(buf, "%u:%u %lu", major(dev), minor(dev), w->weight);
                                r = cg_set_attribute("blkio", path, "blkio.weight_device", buf);
                                if (r < 0)
                                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set blkio.weight_device on %s: %s", path, strerror(-r));
                        }
                }

                /* FIXME: no way to reset this list */
                LIST_FOREACH(device_bandwidths, b, c->blockio_device_bandwidths) {
                        const char *a;
                        dev_t dev;

                        r = lookup_blkio_device(b->path, &dev);
                        if (r < 0)
                                continue;

                        a = b->read ? "blkio.throttle.read_bps_device" : "blkio.throttle.write_bps_device";

                        /* Kernel syntax: "<maj>:<min> <bytes per second>" */
                        sprintf(buf, "%u:%u %" PRIu64 "\n", major(dev), minor(dev), b->bandwidth);
                        r = cg_set_attribute("blkio", path, a, buf);
                        if (r < 0)
                                log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set %s on %s: %s", a, path, strerror(-r));
                }
        }

        if (mask & CGROUP_MEMORY) {
                if (c->memory_limit != (uint64_t) -1) {
                        char buf[DECIMAL_STR_MAX(uint64_t) + 1];

                        sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
                } else
                        r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");

                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to set memory.limit_in_bytes on %s: %s", path, strerror(-r));
        }

        if ((mask & CGROUP_DEVICE) && !is_root) {
                CGroupDeviceAllow *a;

                /* Establish the baseline: deny everything whenever a
                 * whitelist exists or a non-default policy was chosen,
                 * otherwise allow everything. The whitelisted entries
                 * are added back below. */
                if (c->device_allow || c->device_policy != CGROUP_AUTO)
                        r = cg_set_attribute("devices", path, "devices.deny", "a");
                else
                        r = cg_set_attribute("devices", path, "devices.allow", "a");
                if (r < 0)
                        log_full(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, "Failed to reset devices.list on %s: %s", path, strerror(-r));

                if (c->device_policy == CGROUP_CLOSED ||
                    (c->device_policy == CGROUP_AUTO && c->device_allow)) {
                        /* Standard pseudo-devices that are always safe
                         * to expose. */
                        static const char auto_devices[] =
                                "/dev/null\0" "rwm\0"
                                "/dev/zero\0" "rwm\0"
                                "/dev/full\0" "rwm\0"
                                "/dev/random\0" "rwm\0"
                                "/dev/urandom\0" "rwm\0"
                                "/dev/tty\0" "rwm\0"
                                "/dev/pts/ptmx\0" "rw\0"; /* /dev/pts/ptmx may not be duplicated, but accessed */

                        const char *x, *y;

                        NULSTR_FOREACH_PAIR(x, y, auto_devices)
                                whitelist_device(path, x, y);

                        whitelist_major(path, "pts", 'c', "rw");
                        whitelist_major(path, "kdbus", 'c', "rw");
                        whitelist_major(path, "kdbus/*", 'c', "rw");
                }

                LIST_FOREACH(device_allow, a, c->device_allow) {
                        char acc[4];
                        unsigned k = 0;

                        /* Build the access string (subset of "rwm")
                         * from the per-entry flags. */
                        if (a->r)
                                acc[k++] = 'r';
                        if (a->w)
                                acc[k++] = 'w';
                        if (a->m)
                                acc[k++] = 'm';

                        if (k == 0)
                                continue;

                        acc[k++] = 0;

                        /* "block-<name>"/"char-<name>" select by driver
                         * name from /proc/devices; everything else must
                         * be a device node path under /dev. */
                        if (startswith(a->path, "/dev/"))
                                whitelist_device(path, a->path, acc);
                        else if (startswith(a->path, "block-"))
                                whitelist_major(path, a->path + 6, 'b', acc);
                        else if (startswith(a->path, "char-"))
                                whitelist_major(path, a->path + 5, 'c', acc);
                        else
                                log_debug("Ignoring device %s while writing cgroup attribute.", a->path);
                }
        }
}
439
440 CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
441 CGroupControllerMask mask = 0;
442
443 /* Figure out which controllers we need */
444
445 if (c->cpu_accounting ||
446 c->cpu_shares != (unsigned long) -1 ||
447 c->startup_cpu_shares != (unsigned long) -1 ||
448 c->cpu_quota_per_sec_usec != USEC_INFINITY)
449 mask |= CGROUP_CPUACCT | CGROUP_CPU;
450
451 if (c->blockio_accounting ||
452 c->blockio_weight != (unsigned long) -1 ||
453 c->startup_blockio_weight != (unsigned long) -1 ||
454 c->blockio_device_weights ||
455 c->blockio_device_bandwidths)
456 mask |= CGROUP_BLKIO;
457
458 if (c->memory_accounting ||
459 c->memory_limit != (uint64_t) -1)
460 mask |= CGROUP_MEMORY;
461
462 if (c->device_allow ||
463 c->device_policy != CGROUP_AUTO)
464 mask |= CGROUP_DEVICE;
465
466 return mask;
467 }
468
469 CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
470 CGroupContext *c;
471
472 c = unit_get_cgroup_context(u);
473 if (!c)
474 return 0;
475
476 /* If delegation is turned on, then turn on all cgroups,
477 * unless the process we fork into it is known to drop
478 * privileges anyway, and shouldn't get access to the
479 * controllers anyway. */
480
481 if (c->delegate) {
482 ExecContext *e;
483
484 e = unit_get_exec_context(u);
485 if (!e || exec_context_maintains_privileges(e))
486 return _CGROUP_CONTROLLER_MASK_ALL;
487 }
488
489 return cgroup_context_get_mask(c);
490 }
491
/* Returns the union of the controller masks needed by all units
 * contained in 'u' (non-empty only for slices), using and refreshing
 * the cached value on the unit. */
CGroupControllerMask unit_get_members_mask(Unit *u) {
        assert(u);

        if (u->cgroup_members_mask_valid)
                return u->cgroup_members_mask;

        u->cgroup_members_mask = 0;

        if (u->type == UNIT_SLICE) {
                Unit *member;
                Iterator i;

                /* Units contained in a slice are ordered after it, so
                 * walk the UNIT_BEFORE dependencies and keep only those
                 * whose parent slice is actually 'u'. */
                SET_FOREACH(member, u->dependencies[UNIT_BEFORE], i) {

                        if (member == u)
                                continue;

                        if (UNIT_DEREF(member->slice) != u)
                                continue;

                        /* Count the member's own needs plus, recursively,
                         * those of everything inside it. */
                        u->cgroup_members_mask |=
                                unit_get_cgroup_mask(member) |
                                unit_get_members_mask(member);
                }
        }

        u->cgroup_members_mask_valid = true;
        return u->cgroup_members_mask;
}
521
522 CGroupControllerMask unit_get_siblings_mask(Unit *u) {
523 assert(u);
524
525 if (UNIT_ISSET(u->slice))
526 return unit_get_members_mask(UNIT_DEREF(u->slice));
527
528 return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
529 }
530
531 CGroupControllerMask unit_get_target_mask(Unit *u) {
532 CGroupControllerMask mask;
533
534 mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
535 mask &= u->manager->cgroup_supported;
536
537 return mask;
538 }
539
/* Recurse from a unit up through its containing slices, propagating
 * mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
        CGroupControllerMask m;
        bool more;

        assert(u);

        /* Calculate subtree mask */
        m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);

        /* See if anything changed from the previous invocation. If
         * not, we're done. */
        if (u->cgroup_subtree_mask_valid && m == u->cgroup_subtree_mask)
                return;

        /* 'more' is true only when bits were strictly added relative
         * to the previous (valid) mask and none removed — the one case
         * where the parent's cached value can be updated in place
         * instead of being invalidated. */
        more =
                u->cgroup_subtree_mask_valid &&
                ((m & ~u->cgroup_subtree_mask) != 0) &&
                ((~m & u->cgroup_subtree_mask) == 0);

        u->cgroup_subtree_mask = m;
        u->cgroup_subtree_mask_valid = true;

        if (UNIT_ISSET(u->slice)) {
                Unit *s = UNIT_DEREF(u->slice);

                if (more)
                        /* There's more set now than before. We
                         * propagate the new mask to the parent's mask
                         * (not caring if it actually was valid or
                         * not). */

                        s->cgroup_members_mask |= m;

                else
                        /* There's less set now than before (or we
                         * don't know), we need to recalculate
                         * everything, so let's invalidate the
                         * parent's members mask */

                        s->cgroup_members_mask_valid = false;

                /* And now make sure that this change also hits our
                 * grandparents */
                unit_update_cgroup_members_masks(s);
        }
}
588
589 static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
590 Unit *u = userdata;
591
592 assert(mask != 0);
593 assert(u);
594
595 while (u) {
596 if (u->cgroup_path &&
597 u->cgroup_realized &&
598 (u->cgroup_realized_mask & mask) == mask)
599 return u->cgroup_path;
600
601 u = UNIT_DEREF(u->slice);
602 }
603
604 return NULL;
605 }
606
607 static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
608 CGroupContext *c;
609 int r;
610
611 assert(u);
612
613 c = unit_get_cgroup_context(u);
614 if (!c)
615 return 0;
616
617 if (!u->cgroup_path) {
618 _cleanup_free_ char *path = NULL;
619
620 path = unit_default_cgroup_path(u);
621 if (!path)
622 return log_oom();
623
624 r = hashmap_put(u->manager->cgroup_unit, path, u);
625 if (r < 0) {
626 log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
627 return r;
628 }
629 if (r > 0) {
630 u->cgroup_path = path;
631 path = NULL;
632 }
633 }
634
635 /* First, create our own group */
636 r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
637 if (r < 0)
638 return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
639
640 /* Keep track that this is now realized */
641 u->cgroup_realized = true;
642 u->cgroup_realized_mask = mask;
643
644 if (u->type != UNIT_SLICE && !c->delegate) {
645
646 /* Then, possibly move things over, but not if
647 * subgroups may contain processes, which is the case
648 * for slice and delegation units. */
649 r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
650 if (r < 0)
651 log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
652 }
653
654 return 0;
655 }
656
657 int unit_attach_pids_to_cgroup(Unit *u) {
658 int r;
659 assert(u);
660
661 r = unit_realize_cgroup(u);
662 if (r < 0)
663 return r;
664
665 r = cg_attach_many_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->pids, migrate_callback, u);
666 if (r < 0)
667 return r;
668
669 return 0;
670 }
671
672 static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
673 assert(u);
674
675 return u->cgroup_realized && u->cgroup_realized_mask == mask;
676 }
677
/* Check if necessary controllers and attributes for a unit are in place.
 *
 * If so, do nothing.
 * If not, create paths, move processes over, and set attributes.
 *
 * Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
        CGroupControllerMask mask;
        int r;

        assert(u);

        /* We are handling the unit now, so drop it from the pending
         * queue if it was sitting there. */
        if (u->in_cgroup_queue) {
                LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
                u->in_cgroup_queue = false;
        }

        mask = unit_get_target_mask(u);

        /* Already realized with exactly this controller set? */
        if (unit_has_mask_realized(u, mask))
                return 0;

        /* First, realize parents */
        if (UNIT_ISSET(u->slice)) {
                r = unit_realize_cgroup_now(UNIT_DEREF(u->slice), state);
                if (r < 0)
                        return r;
        }

        /* And then do the real work */
        r = unit_create_cgroups(u, mask);
        if (r < 0)
                return r;

        /* Finally, apply the necessary attributes. */
        cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);

        return 0;
}
717
718 static void unit_add_to_cgroup_queue(Unit *u) {
719
720 if (u->in_cgroup_queue)
721 return;
722
723 LIST_PREPEND(cgroup_queue, u->manager->cgroup_queue, u);
724 u->in_cgroup_queue = true;
725 }
726
727 unsigned manager_dispatch_cgroup_queue(Manager *m) {
728 ManagerState state;
729 unsigned n = 0;
730 Unit *i;
731 int r;
732
733 state = manager_state(m);
734
735 while ((i = m->cgroup_queue)) {
736 assert(i->in_cgroup_queue);
737
738 r = unit_realize_cgroup_now(i, state);
739 if (r < 0)
740 log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
741
742 n++;
743 }
744
745 return n;
746 }
747
static void unit_queue_siblings(Unit *u) {
        Unit *slice;

        /* This adds the siblings of the specified unit and the
         * siblings of all parent units to the cgroup queue. (But
         * neither the specified unit itself nor the parents.) */

        while ((slice = UNIT_DEREF(u->slice))) {
                Iterator i;
                Unit *m;

                /* Units contained in a slice are ordered after it, so
                 * the slice's UNIT_BEFORE set contains its members. */
                SET_FOREACH(m, slice->dependencies[UNIT_BEFORE], i) {
                        if (m == u)
                                continue;

                        /* Skip units that have a dependency on the slice
                         * but aren't actually in it. */
                        if (UNIT_DEREF(m->slice) != slice)
                                continue;

                        /* No point in doing cgroup application for units
                         * without active processes. */
                        if (UNIT_IS_INACTIVE_OR_FAILED(unit_active_state(m)))
                                continue;

                        /* If the unit doesn't need any new controllers
                         * and has current ones realized, it doesn't need
                         * any changes. */
                        if (unit_has_mask_realized(m, unit_get_target_mask(m)))
                                continue;

                        unit_add_to_cgroup_queue(m);
                }

                /* Climb one level up and queue that slice's siblings too. */
                u = slice;
        }
}
785
786 int unit_realize_cgroup(Unit *u) {
787 CGroupContext *c;
788
789 assert(u);
790
791 c = unit_get_cgroup_context(u);
792 if (!c)
793 return 0;
794
795 /* So, here's the deal: when realizing the cgroups for this
796 * unit, we need to first create all parents, but there's more
797 * actually: for the weight-based controllers we also need to
798 * make sure that all our siblings (i.e. units that are in the
799 * same slice as we are) have cgroups, too. Otherwise, things
800 * would become very uneven as each of their processes would
801 * get as much resources as all our group together. This call
802 * will synchronously create the parent cgroups, but will
803 * defer work on the siblings to the next event loop
804 * iteration. */
805
806 /* Add all sibling slices to the cgroup queue. */
807 unit_queue_siblings(u);
808
809 /* And realize this one now (and apply the values) */
810 return unit_realize_cgroup_now(u, manager_state(u->manager));
811 }
812
/* Trims the unit's cgroup (removing it where possible) and drops all
 * cgroup state from the unit. If trimming fails (e.g. processes are
 * still inside) the unit's cgroup state is left untouched. */
void unit_destroy_cgroup_if_empty(Unit *u) {
        int r;

        assert(u);

        if (!u->cgroup_path)
                return;

        /* For the root slice only the children are trimmed, never the
         * cgroup itself. */
        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
        if (r < 0) {
                log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
                return;
        }

        /* Drop the path → unit mapping before freeing the key string. */
        hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);

        free(u->cgroup_path);
        u->cgroup_path = NULL;
        u->cgroup_realized = false;
        u->cgroup_realized_mask = 0;
}
834
/* Scans the unit's cgroup for the single process that is a direct child
 * of the manager itself (or whose parent cannot be determined) and
 * returns its PID. Returns 0 if the cgroup is missing or unreadable,
 * or if there is no — or more than one — such candidate. */
pid_t unit_search_main_pid(Unit *u) {
        _cleanup_fclose_ FILE *f = NULL;
        pid_t pid = 0, npid, mypid;

        assert(u);

        if (!u->cgroup_path)
                return 0;

        if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
                return 0;

        mypid = getpid();
        while (cg_read_pid(f, &npid) > 0) {
                pid_t ppid;

                /* Skip the candidate we already picked up. */
                if (npid == pid)
                        continue;

                /* Ignore processes that aren't our kids */
                if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
                        continue;

                if (pid != 0) {
                        /* Dang, there's more than one daemonized PID
                        in this group, so we don't know what process
                        is the main process. */
                        pid = 0;
                        break;
                }

                pid = npid;
        }

        return pid;
}
871
/* Determines the cgroup hierarchy the manager runs in, optionally
 * installs the release agent, moves us into the root cgroup, pins the
 * cgroupfs mount against unmounting and probes which controllers the
 * kernel supports. Returns 0 on success, negative errno on failure. */
int manager_setup_cgroup(Manager *m) {
        _cleanup_free_ char *path = NULL;
        int r;

        assert(m);

        /* 1. Determine hierarchy */
        free(m->cgroup_root);
        m->cgroup_root = NULL;

        r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
        if (r < 0)
                return log_error_errno(r, "Cannot determine cgroup we are running in: %m");

        /* LEGACY: Already in /system.slice? If so, let's cut this
         * off. This is to support live upgrades from older systemd
         * versions where PID 1 was moved there. */
        if (m->running_as == SYSTEMD_SYSTEM) {
                char *e;

                e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
                if (!e)
                        e = endswith(m->cgroup_root, "/system");
                if (e)
                        *e = 0;
        }

        /* And make sure to store away the root value without trailing
         * slash, even for the root dir, so that we can easily prepend
         * it everywhere. */
        if (streq(m->cgroup_root, "/"))
                m->cgroup_root[0] = 0;

        /* 2. Show data */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Cannot find cgroup mount point: %m");

        log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
        if (!m->test_run) {

                /* 3. Install agent */
                if (m->running_as == SYSTEMD_SYSTEM) {
                        r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
                        if (r < 0)
                                log_warning_errno(r, "Failed to install release agent, ignoring: %m");
                        else if (r > 0)
                                log_debug("Installed release agent.");
                        else
                                log_debug("Release agent already installed.");
                }

                /* 4. Make sure we are in the root cgroup */
                r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");

                /* 5. And pin it, so that it cannot be unmounted */
                safe_close(m->pin_cgroupfs_fd);

                m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
                if (m->pin_cgroupfs_fd < 0)
                        return log_error_errno(errno, "Failed to open pin file: %m");

                /* 6. Always enable hierarchical support if it exists... */
                cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
        }

        /* 7. Figure out which controllers are supported */
        m->cgroup_supported = cg_mask_supported();

        return 0;
}
945
946 void manager_shutdown_cgroup(Manager *m, bool delete) {
947 assert(m);
948
949 /* We can't really delete the group, since we are in it. But
950 * let's trim it. */
951 if (delete && m->cgroup_root)
952 cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
953
954 m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
955
956 free(m->cgroup_root);
957 m->cgroup_root = NULL;
958 }
959
960 Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
961 char *p;
962 Unit *u;
963
964 assert(m);
965 assert(cgroup);
966
967 u = hashmap_get(m->cgroup_unit, cgroup);
968 if (u)
969 return u;
970
971 p = strdupa(cgroup);
972 for (;;) {
973 char *e;
974
975 e = strrchr(p, '/');
976 if (e == p || !e)
977 return NULL;
978
979 *e = 0;
980
981 u = hashmap_get(m->cgroup_unit, p);
982 if (u)
983 return u;
984 }
985 }
986
987 Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
988 _cleanup_free_ char *cgroup = NULL;
989 int r;
990
991 assert(m);
992
993 if (pid <= 1)
994 return NULL;
995
996 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
997 if (r < 0)
998 return NULL;
999
1000 return manager_get_unit_by_cgroup(m, cgroup);
1001 }
1002
1003 int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
1004 Unit *u;
1005 int r;
1006
1007 assert(m);
1008 assert(cgroup);
1009
1010 u = manager_get_unit_by_cgroup(m, cgroup);
1011 if (u) {
1012 r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, true);
1013 if (r > 0) {
1014 if (UNIT_VTABLE(u)->notify_cgroup_empty)
1015 UNIT_VTABLE(u)->notify_cgroup_empty(u);
1016
1017 unit_add_to_gc_queue(u);
1018 }
1019 }
1020
1021 return 0;
1022 }
1023
/* String names for CGroupDevicePolicy, used when parsing and dumping
 * the DevicePolicy= unit file setting. */
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
        [CGROUP_AUTO] = "auto",
        [CGROUP_CLOSED] = "closed",
        [CGROUP_STRICT] = "strict",
};

/* Generates cgroup_device_policy_to_string()/_from_string() from the table. */
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);