src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <glob.h>
  24 #include <grp.h>
  25 #include <poll.h>
  26 #include <signal.h>
  27 #include <string.h>
  28 #include <sys/capability.h>
  29 #include <sys/eventfd.h>
  30 #include <sys/mman.h>
  31 #include <sys/personality.h>
  32 #include <sys/prctl.h>
  33 #include <sys/shm.h>
  34 #include <sys/socket.h>
  35 #include <sys/stat.h>
  36 #include <sys/types.h>
  37 #include <sys/un.h>
  38 #include <unistd.h>
  39 #include <utmpx.h>
  40
  41 #if HAVE_PAM
  42 #include <security/pam_appl.h>
  43 #endif
  44
  45 #if HAVE_SELINUX
  46 #include <selinux/selinux.h>
  47 #endif
  48
  49 #if HAVE_SECCOMP
  50 #include <seccomp.h>
  51 #endif
  52
  53 #if HAVE_APPARMOR
  54 #include <sys/apparmor.h>
  55 #endif
  56
  57 #include "sd-messages.h"
  58
  59 #include "af-list.h"
  60 #include "alloc-util.h"
  61 #if HAVE_APPARMOR
  62 #include "apparmor-util.h"
  63 #endif
  64 #include "async.h"
  65 #include "barrier.h"
  66 #include "cap-list.h"
  67 #include "capability-util.h"
  68 #include "chown-recursive.h"
  69 #include "cpu-set-util.h"
  70 #include "def.h"
  71 #include "env-util.h"
  72 #include "errno-list.h"
  73 #include "execute.h"
  74 #include "exit-status.h"
  75 #include "fd-util.h"
  76 #include "fileio.h"
  77 #include "format-util.h"
  78 #include "fs-util.h"
  79 #include "glob-util.h"
  80 #include "io-util.h"
  81 #include "ioprio.h"
  82 #include "label.h"
  83 #include "log.h"
  84 #include "macro.h"
  85 #include "missing.h"
  86 #include "mkdir.h"
  87 #include "namespace.h"
  88 #include "parse-util.h"
  89 #include "path-util.h"
  90 #include "process-util.h"
  91 #include "rlimit-util.h"
  92 #include "rm-rf.h"
  93 #if HAVE_SECCOMP
  94 #include "seccomp-util.h"
  95 #endif
  96 #include "securebits.h"
  97 #include "securebits-util.h"
  98 #include "selinux-util.h"
  99 #include "signal-util.h"
 100 #include "smack-util.h"
 101 #include "special.h"
 102 #include "stat-util.h"
 103 #include "string-table.h"
 104 #include "string-util.h"
 105 #include "strv.h"
 106 #include "syslog-util.h"
 107 #include "terminal-util.h"
 108 #include "unit.h"
 109 #include "user-util.h"
 110 #include "util.h"
 111 #include "utmp-wtmp.h"
 112
 113 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 114 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 115
 116 /* This assumes there is a 'tty' group */
 117 #define TTY_MODE 0620
 118
 119 #define SNDBUF_SIZE (8*1024*1024)
 120
 121 static int shift_fds(int fds[], unsigned n_fds) {
 122         int start, restart_from;
 123
 124         if (n_fds <= 0)
 125                 return 0;
 126
 127         /* Modifies the fds array! (sorts it) */
 128
 129         assert(fds);
 130
 131         start = 0;
 132         for (;;) {
 133                 int i;
 134
 135                 restart_from = -1;
 136
 137                 for (i = start; i < (int) n_fds; i++) {
 138                         int nfd;
 139
 140                         /* Already at right index? */
 141                         if (fds[i] == i+3)
 142                                 continue;
 143
 144                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 145                         if (nfd < 0)
 146                                 return -errno;
 147
 148                         safe_close(fds[i]);
 149                         fds[i] = nfd;
 150
 151                         /* Hmm, the fd we wanted isn't free? Then
 152                          * let's remember that and try again from here */
 153                         if (nfd != i+3 && restart_from < 0)
 154                                 restart_from = i;
 155                 }
 156
 157                 if (restart_from < 0)
 158                         break;
 159
 160                 start = restart_from;
 161         }
 162
 163         return 0;
 164 }
 165
 166 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 167         unsigned i, n_fds;
 168         int r;
 169
 170         n_fds = n_storage_fds + n_socket_fds;
 171         if (n_fds <= 0)
 172                 return 0;
 173
 174         assert(fds);
 175
 176         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 177          * O_NONBLOCK only applies to socket activation though. */
 178
 179         for (i = 0; i < n_fds; i++) {
 180
 181                 if (i < n_socket_fds) {
 182                         r = fd_nonblock(fds[i], nonblock);
 183                         if (r < 0)
 184                                 return r;
 185                 }
 186
 187                 /* We unconditionally drop FD_CLOEXEC from the fds,
 188                  * since after all we want to pass these fds to our
 189                  * children */
 190
 191                 r = fd_cloexec(fds[i], false);
 192                 if (r < 0)
 193                         return r;
 194         }
 195
 196         return 0;
 197 }
 198
 199 static const char *exec_context_tty_path(const ExecContext *context) {
 200         assert(context);
 201
 202         if (context->stdio_as_fds)
 203                 return NULL;
 204
 205         if (context->tty_path)
 206                 return context->tty_path;
 207
 208         return "/dev/console";
 209 }
 210
 211 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 212         const char *path;
 213
 214         assert(context);
 215
 216         path = exec_context_tty_path(context);
 217
 218         if (context->tty_vhangup) {
 219                 if (p && p->stdin_fd >= 0)
 220                         (void) terminal_vhangup_fd(p->stdin_fd);
 221                 else if (path)
 222                         (void) terminal_vhangup(path);
 223         }
 224
 225         if (context->tty_reset) {
 226                 if (p && p->stdin_fd >= 0)
 227                         (void) reset_terminal_fd(p->stdin_fd, true);
 228                 else if (path)
 229                         (void) reset_terminal(path);
 230         }
 231
 232         if (context->tty_vt_disallocate && path)
 233                 (void) vt_disallocate(path);
 234 }
 235
 236 static bool is_terminal_input(ExecInput i) {
 237         return IN_SET(i,
 238                       EXEC_INPUT_TTY,
 239                       EXEC_INPUT_TTY_FORCE,
 240                       EXEC_INPUT_TTY_FAIL);
 241 }
 242
 243 static bool is_terminal_output(ExecOutput o) {
 244         return IN_SET(o,
 245                       EXEC_OUTPUT_TTY,
 246                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 247                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 248                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 249 }
 250
 251 static bool is_syslog_output(ExecOutput o) {
 252         return IN_SET(o,
 253                       EXEC_OUTPUT_SYSLOG,
 254                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 255 }
 256
 257 static bool is_kmsg_output(ExecOutput o) {
 258         return IN_SET(o,
 259                       EXEC_OUTPUT_KMSG,
 260                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 261 }
 262
 263 static bool exec_context_needs_term(const ExecContext *c) {
 264         assert(c);
 265
 266         /* Return true if the execution context suggests we should set $TERM to something useful. */
 267
 268         if (is_terminal_input(c->std_input))
 269                 return true;
 270
 271         if (is_terminal_output(c->std_output))
 272                 return true;
 273
 274         if (is_terminal_output(c->std_error))
 275                 return true;
 276
 277         return !!c->tty_path;
 278 }
 279
 280 static int open_null_as(int flags, int nfd) {
 281         int fd;
 282
 283         assert(nfd >= 0);
 284
 285         fd = open("/dev/null", flags|O_NOCTTY);
 286         if (fd < 0)
 287                 return -errno;
 288
 289         return move_fd(fd, nfd, false);
 290 }
 291
 292 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 293         static const union sockaddr_union sa = {
 294                 .un.sun_family = AF_UNIX,
 295                 .un.sun_path = "/run/systemd/journal/stdout",
 296         };
 297         uid_t olduid = UID_INVALID;
 298         gid_t oldgid = GID_INVALID;
 299         int r;
 300
 301         if (gid_is_valid(gid)) {
 302                 oldgid = getgid();
 303
 304                 if (setegid(gid) < 0)
 305                         return -errno;
 306         }
 307
 308         if (uid_is_valid(uid)) {
 309                 olduid = getuid();
 310
 311                 if (seteuid(uid) < 0) {
 312                         r = -errno;
 313                         goto restore_gid;
 314                 }
 315         }
 316
 317         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 318
 319         /* If we fail to restore the uid or gid, things will likely
 320            fail later on. This should only happen if an LSM interferes. */
 321
 322         if (uid_is_valid(uid))
 323                 (void) seteuid(olduid);
 324
 325  restore_gid:
 326         if (gid_is_valid(gid))
 327                 (void) setegid(oldgid);
 328
 329         return r;
 330 }
 331
 332 static int connect_logger_as(
 333                 Unit *unit,
 334                 const ExecContext *context,
 335                 const ExecParameters *params,
 336                 ExecOutput output,
 337                 const char *ident,
 338                 int nfd,
 339                 uid_t uid,
 340                 gid_t gid) {
 341
 342         int fd, r;
 343
 344         assert(context);
 345         assert(params);
 346         assert(output < _EXEC_OUTPUT_MAX);
 347         assert(ident);
 348         assert(nfd >= 0);
 349
 350         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 351         if (fd < 0)
 352                 return -errno;
 353
 354         r = connect_journal_socket(fd, uid, gid);
 355         if (r < 0)
 356                 return r;
 357
 358         if (shutdown(fd, SHUT_RD) < 0) {
 359                 safe_close(fd);
 360                 return -errno;
 361         }
 362
 363         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 364
 365         dprintf(fd,
 366                 "%s\n"
 367                 "%s\n"
 368                 "%i\n"
 369                 "%i\n"
 370                 "%i\n"
 371                 "%i\n"
 372                 "%i\n",
 373                 context->syslog_identifier ?: ident,
 374                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 375                 context->syslog_priority,
 376                 !!context->syslog_level_prefix,
 377                 is_syslog_output(output),
 378                 is_kmsg_output(output),
 379                 is_terminal_output(output));
 380
 381         return move_fd(fd, nfd, false);
 382 }
 383 static int open_terminal_as(const char *path, int flags, int nfd) {
 384         int fd;
 385
 386         assert(path);
 387         assert(nfd >= 0);
 388
 389         fd = open_terminal(path, flags | O_NOCTTY);
 390         if (fd < 0)
 391                 return fd;
 392
 393         return move_fd(fd, nfd, false);
 394 }
 395
 396 static int acquire_path(const char *path, int flags, mode_t mode) {
 397         union sockaddr_union sa = {
 398                 .sa.sa_family = AF_UNIX,
 399         };
 400         int fd, r;
 401
 402         assert(path);
 403
 404         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 405                 flags |= O_CREAT;
 406
 407         fd = open(path, flags|O_NOCTTY, mode);
 408         if (fd >= 0)
 409                 return fd;
 410
 411         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 412                 return -errno;
 413         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 414                 return -ENXIO;
 415
 416         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 417
 418         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 419         if (fd < 0)
 420                 return -errno;
 421
 422         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 423         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 424                 safe_close(fd);
 425                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 426                                                            * indication that his wasn't an AF_UNIX socket after all */
 427         }
 428
 429         if ((flags & O_ACCMODE) == O_RDONLY)
 430                 r = shutdown(fd, SHUT_WR);
 431         else if ((flags & O_ACCMODE) == O_WRONLY)
 432                 r = shutdown(fd, SHUT_RD);
 433         else
 434                 return fd;
 435         if (r < 0) {
 436                 safe_close(fd);
 437                 return -errno;
 438         }
 439
 440         return fd;
 441 }
 442
 443 static int fixup_input(
 444                 const ExecContext *context,
 445                 int socket_fd,
 446                 bool apply_tty_stdin) {
 447
 448         ExecInput std_input;
 449
 450         assert(context);
 451
 452         std_input = context->std_input;
 453
 454         if (is_terminal_input(std_input) && !apply_tty_stdin)
 455                 return EXEC_INPUT_NULL;
 456
 457         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 458                 return EXEC_INPUT_NULL;
 459
 460         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 461                 return EXEC_INPUT_NULL;
 462
 463         return std_input;
 464 }
 465
 466 static int fixup_output(ExecOutput std_output, int socket_fd) {
 467
 468         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 469                 return EXEC_OUTPUT_INHERIT;
 470
 471         return std_output;
 472 }
 473
 474 static int setup_input(
 475                 const ExecContext *context,
 476                 const ExecParameters *params,
 477                 int socket_fd,
 478                 int named_iofds[3]) {
 479
 480         ExecInput i;
 481
 482         assert(context);
 483         assert(params);
 484
 485         if (params->stdin_fd >= 0) {
 486                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 487                         return -errno;
 488
 489                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 490                 if (isatty(STDIN_FILENO)) {
 491                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 492                         (void) reset_terminal_fd(STDIN_FILENO, true);
 493                 }
 494
 495                 return STDIN_FILENO;
 496         }
 497
 498         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 499
 500         switch (i) {
 501
 502         case EXEC_INPUT_NULL:
 503                 return open_null_as(O_RDONLY, STDIN_FILENO);
 504
 505         case EXEC_INPUT_TTY:
 506         case EXEC_INPUT_TTY_FORCE:
 507         case EXEC_INPUT_TTY_FAIL: {
 508                 int fd;
 509
 510                 fd = acquire_terminal(exec_context_tty_path(context),
 511                                       i == EXEC_INPUT_TTY_FAIL,
 512                                       i == EXEC_INPUT_TTY_FORCE,
 513                                       false,
 514                                       USEC_INFINITY);
 515                 if (fd < 0)
 516                         return fd;
 517
 518                 return move_fd(fd, STDIN_FILENO, false);
 519         }
 520
 521         case EXEC_INPUT_SOCKET:
 522                 assert(socket_fd >= 0);
 523
 524                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 525
 526         case EXEC_INPUT_NAMED_FD:
 527                 assert(named_iofds[STDIN_FILENO] >= 0);
 528
 529                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 530                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 531
 532         case EXEC_INPUT_DATA: {
 533                 int fd;
 534
 535                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         case EXEC_INPUT_FILE: {
 543                 bool rw;
 544                 int fd;
 545
 546                 assert(context->stdio_file[STDIN_FILENO]);
 547
 548                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 549                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 550
 551                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 552                 if (fd < 0)
 553                         return fd;
 554
 555                 return move_fd(fd, STDIN_FILENO, false);
 556         }
 557
 558         default:
 559                 assert_not_reached("Unknown input type");
 560         }
 561 }
 562
 563 static int setup_output(
 564                 Unit *unit,
 565                 const ExecContext *context,
 566                 const ExecParameters *params,
 567                 int fileno,
 568                 int socket_fd,
 569                 int named_iofds[3],
 570                 const char *ident,
 571                 uid_t uid,
 572                 gid_t gid,
 573                 dev_t *journal_stream_dev,
 574                 ino_t *journal_stream_ino) {
 575
 576         ExecOutput o;
 577         ExecInput i;
 578         int r;
 579
 580         assert(unit);
 581         assert(context);
 582         assert(params);
 583         assert(ident);
 584         assert(journal_stream_dev);
 585         assert(journal_stream_ino);
 586
 587         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 588
 589                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 590                         return -errno;
 591
 592                 return STDOUT_FILENO;
 593         }
 594
 595         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 596                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 597                         return -errno;
 598
 599                 return STDERR_FILENO;
 600         }
 601
 602         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 603         o = fixup_output(context->std_output, socket_fd);
 604
 605         if (fileno == STDERR_FILENO) {
 606                 ExecOutput e;
 607                 e = fixup_output(context->std_error, socket_fd);
 608
 609                 /* This expects the input and output are already set up */
 610
 611                 /* Don't change the stderr file descriptor if we inherit all
 612                  * the way and are not on a tty */
 613                 if (e == EXEC_OUTPUT_INHERIT &&
 614                     o == EXEC_OUTPUT_INHERIT &&
 615                     i == EXEC_INPUT_NULL &&
 616                     !is_terminal_input(context->std_input) &&
 617                     getppid () != 1)
 618                         return fileno;
 619
 620                 /* Duplicate from stdout if possible */
 621                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 622                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 623
 624                 o = e;
 625
 626         } else if (o == EXEC_OUTPUT_INHERIT) {
 627                 /* If input got downgraded, inherit the original value */
 628                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 629                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 630
 631                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 632                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 633                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 634
 635                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 636                 if (getppid() != 1)
 637                         return fileno;
 638
 639                 /* We need to open /dev/null here anew, to get the right access mode. */
 640                 return open_null_as(O_WRONLY, fileno);
 641         }
 642
 643         switch (o) {
 644
 645         case EXEC_OUTPUT_NULL:
 646                 return open_null_as(O_WRONLY, fileno);
 647
 648         case EXEC_OUTPUT_TTY:
 649                 if (is_terminal_input(i))
 650                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 651
 652                 /* We don't reset the terminal if this is just about output */
 653                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 654
 655         case EXEC_OUTPUT_SYSLOG:
 656         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 657         case EXEC_OUTPUT_KMSG:
 658         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 659         case EXEC_OUTPUT_JOURNAL:
 660         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 661                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 662                 if (r < 0) {
 663                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 664                         r = open_null_as(O_WRONLY, fileno);
 665                 } else {
 666                         struct stat st;
 667
 668                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 669                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 670                          * services to detect whether they are connected to the journal or not.
 671                          *
 672                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 673                          * about STDERR as that's usually the best way to do logging. */
 674
 675                         if (fstat(fileno, &st) >= 0 &&
 676                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 677                                 *journal_stream_dev = st.st_dev;
 678                                 *journal_stream_ino = st.st_ino;
 679                         }
 680                 }
 681                 return r;
 682
 683         case EXEC_OUTPUT_SOCKET:
 684                 assert(socket_fd >= 0);
 685
 686                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 687
 688         case EXEC_OUTPUT_NAMED_FD:
 689                 assert(named_iofds[fileno] >= 0);
 690
 691                 (void) fd_nonblock(named_iofds[fileno], false);
 692                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 693
 694         case EXEC_OUTPUT_FILE: {
 695                 bool rw;
 696                 int fd;
 697
 698                 assert(context->stdio_file[fileno]);
 699
 700                 rw = context->std_input == EXEC_INPUT_FILE &&
 701                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 702
 703                 if (rw)
 704                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 705
 706                 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
 707                 if (fd < 0)
 708                         return fd;
 709
 710                 return move_fd(fd, fileno, false);
 711         }
 712
 713         default:
 714                 assert_not_reached("Unknown error type");
 715         }
 716 }
 717
 718 static int chown_terminal(int fd, uid_t uid) {
 719         struct stat st;
 720
 721         assert(fd >= 0);
 722
 723         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 724         if (isatty(fd) < 1)
 725                 return 0;
 726
 727         /* This might fail. What matters are the results. */
 728         (void) fchown(fd, uid, -1);
 729         (void) fchmod(fd, TTY_MODE);
 730
 731         if (fstat(fd, &st) < 0)
 732                 return -errno;
 733
 734         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 735                 return -EPERM;
 736
 737         return 0;
 738 }
 739
 740 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 741         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 742         int r;
 743
 744         assert(_saved_stdin);
 745         assert(_saved_stdout);
 746
 747         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 748         if (saved_stdin < 0)
 749                 return -errno;
 750
 751         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 752         if (saved_stdout < 0)
 753                 return -errno;
 754
 755         fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
 756         if (fd < 0)
 757                 return fd;
 758
 759         r = chown_terminal(fd, getuid());
 760         if (r < 0)
 761                 return r;
 762
 763         r = reset_terminal_fd(fd, true);
 764         if (r < 0)
 765                 return r;
 766
 767         if (dup2(fd, STDIN_FILENO) < 0)
 768                 return -errno;
 769
 770         if (dup2(fd, STDOUT_FILENO) < 0)
 771                 return -errno;
 772
 773         if (fd >= 2)
 774                 safe_close(fd);
 775         fd = -1;
 776
 777         *_saved_stdin = saved_stdin;
 778         *_saved_stdout = saved_stdout;
 779
 780         saved_stdin = saved_stdout = -1;
 781
 782         return 0;
 783 }
 784
 785 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 786         assert(err < 0);
 787
 788         if (err == -ETIMEDOUT)
 789                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 790         else {
 791                 errno = -err;
 792                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 793         }
 794 }
 795
 796 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 797         _cleanup_close_ int fd = -1;
 798
 799         assert(vc);
 800
 801         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 802         if (fd < 0)
 803                 return;
 804
 805         write_confirm_error_fd(err, fd, u);
 806 }
 807
 808 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 809         int r = 0;
 810
 811         assert(saved_stdin);
 812         assert(saved_stdout);
 813
 814         release_terminal();
 815
 816         if (*saved_stdin >= 0)
 817                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 818                         r = -errno;
 819
 820         if (*saved_stdout >= 0)
 821                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 822                         r = -errno;
 823
 824         *saved_stdin = safe_close(*saved_stdin);
 825         *saved_stdout = safe_close(*saved_stdout);
 826
 827         return r;
 828 }
 829
 830 enum {
 831         CONFIRM_PRETEND_FAILURE = -1,
 832         CONFIRM_PRETEND_SUCCESS =  0,
 833         CONFIRM_EXECUTE = 1,
 834 };
 835
 836 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 837         int saved_stdout = -1, saved_stdin = -1, r;
 838         _cleanup_free_ char *e = NULL;
 839         char c;
 840
 841         /* For any internal errors, assume a positive response. */
 842         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 843         if (r < 0) {
 844                 write_confirm_error(r, vc, u);
 845                 return CONFIRM_EXECUTE;
 846         }
 847
 848         /* confirm_spawn might have been disabled while we were sleeping. */
 849         if (manager_is_confirm_spawn_disabled(u->manager)) {
 850                 r = 1;
 851                 goto restore_stdio;
 852         }
 853
 854         e = ellipsize(cmdline, 60, 100);
 855         if (!e) {
 856                 log_oom();
 857                 r = CONFIRM_EXECUTE;
 858                 goto restore_stdio;
 859         }
 860
 861         for (;;) {
 862                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 863                 if (r < 0) {
 864                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 865                         r = CONFIRM_EXECUTE;
 866                         goto restore_stdio;
 867                 }
 868
 869                 switch (c) {
 870                 case 'c':
 871                         printf("Resuming normal execution.\n");
 872                         manager_disable_confirm_spawn();
 873                         r = 1;
 874                         break;
 875                 case 'D':
 876                         unit_dump(u, stdout, "  ");
 877                         continue; /* ask again */
 878                 case 'f':
 879                         printf("Failing execution.\n");
 880                         r = CONFIRM_PRETEND_FAILURE;
 881                         break;
 882                 case 'h':
 883                         printf("  c - continue, proceed without asking anymore\n"
 884                                "  D - dump, show the state of the unit\n"
 885                                "  f - fail, don't execute the command and pretend it failed\n"
 886                                "  h - help\n"
 887                                "  i - info, show a short summary of the unit\n"
 888                                "  j - jobs, show jobs that are in progress\n"
 889                                "  s - skip, don't execute the command and pretend it succeeded\n"
 890                                "  y - yes, execute the command\n");
 891                         continue; /* ask again */
 892                 case 'i':
 893                         printf("  Description: %s\n"
 894                                "  Unit:        %s\n"
 895                                "  Command:     %s\n",
 896                                u->id, u->description, cmdline);
 897                         continue; /* ask again */
 898                 case 'j':
 899                         manager_dump_jobs(u->manager, stdout, "  ");
 900                         continue; /* ask again */
 901                 case 'n':
 902                         /* 'n' was removed in favor of 'f'. */
 903                         printf("Didn't understand 'n', did you mean 'f'?\n");
 904                         continue; /* ask again */
 905                 case 's':
 906                         printf("Skipping execution.\n");
 907                         r = CONFIRM_PRETEND_SUCCESS;
 908                         break;
 909                 case 'y':
 910                         r = CONFIRM_EXECUTE;
 911                         break;
 912                 default:
 913                         assert_not_reached("Unhandled choice");
 914                 }
 915                 break;
 916         }
 917
 918 restore_stdio:
 919         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 920         return r;
 921 }
 922
 923 static int get_fixed_user(const ExecContext *c, const char **user,
 924                           uid_t *uid, gid_t *gid,
 925                           const char **home, const char **shell) {
 926         int r;
 927         const char *name;
 928
 929         assert(c);
 930
 931         if (!c->user)
 932                 return 0;
 933
 934         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 935          * (i.e. are "/" or "/bin/nologin"). */
 936
 937         name = c->user;
 938         r = get_user_creds_clean(&name, uid, gid, home, shell);
 939         if (r < 0)
 940                 return r;
 941
 942         *user = name;
 943         return 0;
 944 }
 945
 946 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 947         int r;
 948         const char *name;
 949
 950         assert(c);
 951
 952         if (!c->group)
 953                 return 0;
 954
 955         name = c->group;
 956         r = get_group_creds(&name, gid);
 957         if (r < 0)
 958                 return r;
 959
 960         *group = name;
 961         return 0;
 962 }
 963
 964 static int get_supplementary_groups(const ExecContext *c, const char *user,
 965                                     const char *group, gid_t gid,
 966                                     gid_t **supplementary_gids, int *ngids) {
 967         char **i;
 968         int r, k = 0;
 969         int ngroups_max;
 970         bool keep_groups = false;
 971         gid_t *groups = NULL;
 972         _cleanup_free_ gid_t *l_gids = NULL;
 973
 974         assert(c);
 975
 976         /*
 977          * If user is given, then lookup GID and supplementary groups list.
 978          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 979          * here and as early as possible so we keep the list of supplementary
 980          * groups of the caller.
 981          */
 982         if (user && gid_is_valid(gid) && gid != 0) {
 983                 /* First step, initialize groups from /etc/groups */
 984                 if (initgroups(user, gid) < 0)
 985                         return -errno;
 986
 987                 keep_groups = true;
 988         }
 989
 990         if (strv_isempty(c->supplementary_groups))
 991                 return 0;
 992
 993         /*
 994          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 995          * be positive, otherwise fail.
 996          */
 997         errno = 0;
 998         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 999         if (ngroups_max <= 0) {
1000                 if (errno > 0)
1001                         return -errno;
1002                 else
1003                         return -EOPNOTSUPP; /* For all other values */
1004         }
1005
1006         l_gids = new(gid_t, ngroups_max);
1007         if (!l_gids)
1008                 return -ENOMEM;
1009
1010         if (keep_groups) {
1011                 /*
1012                  * Lookup the list of groups that the user belongs to, we
1013                  * avoid NSS lookups here too for gid=0.
1014                  */
1015                 k = ngroups_max;
1016                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017                         return -EINVAL;
1018         } else
1019                 k = 0;
1020
1021         STRV_FOREACH(i, c->supplementary_groups) {
1022                 const char *g;
1023
1024                 if (k >= ngroups_max)
1025                         return -E2BIG;
1026
1027                 g = *i;
1028                 r = get_group_creds(&g, l_gids+k);
1029                 if (r < 0)
1030                         return r;
1031
1032                 k++;
1033         }
1034
1035         /*
1036          * Sets ngids to zero to drop all supplementary groups, happens
1037          * when we are under root and SupplementaryGroups= is empty.
1038          */
1039         if (k == 0) {
1040                 *ngids = 0;
1041                 return 0;
1042         }
1043
1044         /* Otherwise get the final list of supplementary groups */
1045         groups = memdup(l_gids, sizeof(gid_t) * k);
1046         if (!groups)
1047                 return -ENOMEM;
1048
1049         *supplementary_gids = groups;
1050         *ngids = k;
1051
1052         groups = NULL;
1053
1054         return 0;
1055 }
1056
/* Applies the given supplementary group list (if non-empty) and then, if 'gid' is valid, switches
 * the real, effective and saved GID to it. Returns 0 on success, a negative errno-style value
 * otherwise. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1075
1076 static int enforce_user(const ExecContext *context, uid_t uid) {
1077         assert(context);
1078
1079         if (!uid_is_valid(uid))
1080                 return 0;
1081
1082         /* Sets (but doesn't look up) the uid and make sure we keep the
1083          * capabilities while doing so. */
1084
1085         if (context->capability_ambient_set != 0) {
1086
1087                 /* First step: If we need to keep capabilities but
1088                  * drop privileges we need to make sure we keep our
1089                  * caps, while we drop privileges. */
1090                 if (uid != 0) {
1091                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1092
1093                         if (prctl(PR_GET_SECUREBITS) != sb)
1094                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095                                         return -errno;
1096                 }
1097         }
1098
1099         /* Second step: actually set the uids */
1100         if (setresuid(uid, uid, uid) < 0)
1101                 return -errno;
1102
1103         /* At this point we should have all necessary capabilities but
1104            are otherwise a normal user. However, the caps might got
1105            corrupted due to the setresuid() so we need clean them up
1106            later. This is done outside of this call. */
1107
1108         return 0;
1109 }
1110
1111 #if HAVE_PAM
1112
1113 static int null_conv(
1114                 int num_msg,
1115                 const struct pam_message **msg,
1116                 struct pam_response **resp,
1117                 void *appdata_ptr) {
1118
1119         /* We don't support conversations */
1120
1121         return PAM_CONV_ERR;
1122 }
1123
1124 #endif
1125
/* Opens a PAM session and forks off a helper process that closes it again later.
 *
 *   name    PAM service name, passed to pam_start()
 *   user    user name, passed to pam_start()
 *   uid/gid credentials the forked helper process drops to
 *   tty     if non-NULL, set as PAM_TTY on the PAM handle
 *   env     in/out: every entry is exported into the PAM environment; on success the old list is
 *           freed and replaced by the list returned by pam_getenvlist()
 *   fds     array of n_fds file descriptors that are closed in the helper process
 *
 * Returns 0 on success (and always when built without PAM support). On failure a negative errno-style
 * value is returned; PAM errors are reported as -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through a positive error number in its
                                 * return value; it never returns a negative value and does not set
                                 * errno, hence check the return value directly. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list obtained from PAM back to the caller, replacing the old one */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1342
/* Renames the current process to the parenthesized file name component of 'path', e.g. "(foobar)".
 * Only the last 8 characters of the name are used; if there is no file name component the process
 * is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *base;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1373
1374 static bool context_has_address_families(const ExecContext *c) {
1375         assert(c);
1376
1377         return c->address_families_whitelist ||
1378                 !set_isempty(c->address_families);
1379 }
1380
1381 static bool context_has_syscall_filters(const ExecContext *c) {
1382         assert(c);
1383
1384         return c->syscall_whitelist ||
1385                 !hashmap_isempty(c->syscall_filter);
1386 }
1387
/* Decides whether "no new privileges" has to be in effect for this context: either because it was
 * requested explicitly, or because we lack CAP_SYS_ADMIN and any seccomp-backed option is set. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1409
1410 #if HAVE_SECCOMP
1411
1412 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1413
1414         if (is_seccomp_available())
1415                 return false;
1416
1417         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1418         return true;
1419 }
1420
1421 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1422         uint32_t negative_action, default_action, action;
1423         int r;
1424
1425         assert(u);
1426         assert(c);
1427
1428         if (!context_has_syscall_filters(c))
1429                 return 0;
1430
1431         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1432                 return 0;
1433
1434         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1435
1436         if (c->syscall_whitelist) {
1437                 default_action = negative_action;
1438                 action = SCMP_ACT_ALLOW;
1439         } else {
1440                 default_action = SCMP_ACT_ALLOW;
1441                 action = negative_action;
1442         }
1443
1444         if (needs_ambient_hack) {
1445                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1446                 if (r < 0)
1447                         return r;
1448         }
1449
1450         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1451 }
1452
1453 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1454         assert(u);
1455         assert(c);
1456
1457         if (set_isempty(c->syscall_archs))
1458                 return 0;
1459
1460         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1461                 return 0;
1462
1463         return seccomp_restrict_archs(c->syscall_archs);
1464 }
1465
1466 static int apply_address_families(const Unit* u, const ExecContext *c) {
1467         assert(u);
1468         assert(c);
1469
1470         if (!context_has_address_families(c))
1471                 return 0;
1472
1473         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1474                 return 0;
1475
1476         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1477 }
1478
1479 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1480         assert(u);
1481         assert(c);
1482
1483         if (!c->memory_deny_write_execute)
1484                 return 0;
1485
1486         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1487                 return 0;
1488
1489         return seccomp_memory_deny_write_execute();
1490 }
1491
1492 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1493         assert(u);
1494         assert(c);
1495
1496         if (!c->restrict_realtime)
1497                 return 0;
1498
1499         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1500                 return 0;
1501
1502         return seccomp_restrict_realtime();
1503 }
1504
1505 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1506         assert(u);
1507         assert(c);
1508
1509         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1510          * let's protect even those systems where this is left on in the kernel. */
1511
1512         if (!c->protect_kernel_tunables)
1513                 return 0;
1514
1515         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1516                 return 0;
1517
1518         return seccomp_protect_sysctl();
1519 }
1520
1521 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1522         assert(u);
1523         assert(c);
1524
1525         /* Turn off module syscalls on ProtectKernelModules=yes */
1526
1527         if (!c->protect_kernel_modules)
1528                 return 0;
1529
1530         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1531                 return 0;
1532
1533         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1534 }
1535
1536 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1537         assert(u);
1538         assert(c);
1539
1540         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1541
1542         if (!c->private_devices)
1543                 return 0;
1544
1545         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1546                 return 0;
1547
1548         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1549 }
1550
1551 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1552         assert(u);
1553         assert(c);
1554
1555         if (!exec_context_restrict_namespaces_set(c))
1556                 return 0;
1557
1558         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1559                 return 0;
1560
1561         return seccomp_restrict_namespaces(c->restrict_namespaces);
1562 }
1563
1564 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1565         unsigned long personality;
1566         int r;
1567
1568         assert(u);
1569         assert(c);
1570
1571         if (!c->lock_personality)
1572                 return 0;
1573
1574         if (skip_seccomp_unavailable(u, "LockPersonality="))
1575                 return 0;
1576
1577         personality = c->personality;
1578
1579         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1580         if (personality == PERSONALITY_INVALID) {
1581
1582                 r = opinionated_personality(&personality);
1583                 if (r < 0)
1584                         return r;
1585         }
1586
1587         return seccomp_lock_personality(personality);
1588 }
1589
1590 #endif
1591
1592 static void do_idle_pipe_dance(int idle_pipe[4]) {
1593         assert(idle_pipe);
1594
1595         idle_pipe[1] = safe_close(idle_pipe[1]);
1596         idle_pipe[2] = safe_close(idle_pipe[2]);
1597
1598         if (idle_pipe[0] >= 0) {
1599                 int r;
1600
1601                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1602
1603                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1604                         ssize_t n;
1605
1606                         /* Signal systemd that we are bored and want to continue. */
1607                         n = write(idle_pipe[3], "x", 1);
1608                         if (n > 0)
1609                                 /* Wait for systemd to react to the signal above. */
1610                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1611                 }
1612
1613                 idle_pipe[0] = safe_close(idle_pipe[0]);
1614
1615         }
1616
1617         idle_pipe[3] = safe_close(idle_pipe[3]);
1618 }
1619
1620 static int build_environment(
1621                 Unit *u,
1622                 const ExecContext *c,
1623                 const ExecParameters *p,
1624                 unsigned n_fds,
1625                 const char *home,
1626                 const char *username,
1627                 const char *shell,
1628                 dev_t journal_stream_dev,
1629                 ino_t journal_stream_ino,
1630                 char ***ret) {
1631
1632         _cleanup_strv_free_ char **our_env = NULL;
1633         unsigned n_env = 0;
1634         char *x;
1635
1636         assert(u);
1637         assert(c);
1638         assert(ret);
1639
1640         our_env = new0(char*, 14);
1641         if (!our_env)
1642                 return -ENOMEM;
1643
1644         if (n_fds > 0) {
1645                 _cleanup_free_ char *joined = NULL;
1646
1647                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1648                         return -ENOMEM;
1649                 our_env[n_env++] = x;
1650
1651                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1652                         return -ENOMEM;
1653                 our_env[n_env++] = x;
1654
1655                 joined = strv_join(p->fd_names, ":");
1656                 if (!joined)
1657                         return -ENOMEM;
1658
1659                 x = strjoin("LISTEN_FDNAMES=", joined);
1660                 if (!x)
1661                         return -ENOMEM;
1662                 our_env[n_env++] = x;
1663         }
1664
1665         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1666                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1667                         return -ENOMEM;
1668                 our_env[n_env++] = x;
1669
1670                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1671                         return -ENOMEM;
1672                 our_env[n_env++] = x;
1673         }
1674
1675         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1676          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1677          * check the database directly. */
1678         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1679                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1680                 if (!x)
1681                         return -ENOMEM;
1682                 our_env[n_env++] = x;
1683         }
1684
1685         if (home) {
1686                 x = strappend("HOME=", home);
1687                 if (!x)
1688                         return -ENOMEM;
1689                 our_env[n_env++] = x;
1690         }
1691
1692         if (username) {
1693                 x = strappend("LOGNAME=", username);
1694                 if (!x)
1695                         return -ENOMEM;
1696                 our_env[n_env++] = x;
1697
1698                 x = strappend("USER=", username);
1699                 if (!x)
1700                         return -ENOMEM;
1701                 our_env[n_env++] = x;
1702         }
1703
1704         if (shell) {
1705                 x = strappend("SHELL=", shell);
1706                 if (!x)
1707                         return -ENOMEM;
1708                 our_env[n_env++] = x;
1709         }
1710
1711         if (!sd_id128_is_null(u->invocation_id)) {
1712                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1713                         return -ENOMEM;
1714
1715                 our_env[n_env++] = x;
1716         }
1717
1718         if (exec_context_needs_term(c)) {
1719                 const char *tty_path, *term = NULL;
1720
1721                 tty_path = exec_context_tty_path(c);
1722
1723                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1724                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1725                  * passes to PID 1 ends up all the way in the console login shown. */
1726
1727                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1728                         term = getenv("TERM");
1729                 if (!term)
1730                         term = default_term_for_tty(tty_path);
1731
1732                 x = strappend("TERM=", term);
1733                 if (!x)
1734                         return -ENOMEM;
1735                 our_env[n_env++] = x;
1736         }
1737
1738         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1739                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1740                         return -ENOMEM;
1741
1742                 our_env[n_env++] = x;
1743         }
1744
1745         our_env[n_env++] = NULL;
1746         assert(n_env <= 12);
1747
1748         *ret = our_env;
1749         our_env = NULL;
1750
1751         return 0;
1752 }
1753
1754 static int build_pass_environment(const ExecContext *c, char ***ret) {
1755         _cleanup_strv_free_ char **pass_env = NULL;
1756         size_t n_env = 0, n_bufsize = 0;
1757         char **i;
1758
1759         STRV_FOREACH(i, c->pass_environment) {
1760                 _cleanup_free_ char *x = NULL;
1761                 char *v;
1762
1763                 v = getenv(*i);
1764                 if (!v)
1765                         continue;
1766                 x = strjoin(*i, "=", v);
1767                 if (!x)
1768                         return -ENOMEM;
1769
1770                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1771                         return -ENOMEM;
1772
1773                 pass_env[n_env++] = x;
1774                 pass_env[n_env] = NULL;
1775                 x = NULL;
1776         }
1777
1778         *ret = pass_env;
1779         pass_env = NULL;
1780
1781         return 0;
1782 }
1783
1784 static bool exec_needs_mount_namespace(
1785                 const ExecContext *context,
1786                 const ExecParameters *params,
1787                 ExecRuntime *runtime) {
1788
1789         assert(context);
1790         assert(params);
1791
1792         if (context->root_image)
1793                 return true;
1794
1795         if (!strv_isempty(context->read_write_paths) ||
1796             !strv_isempty(context->read_only_paths) ||
1797             !strv_isempty(context->inaccessible_paths))
1798                 return true;
1799
1800         if (context->n_bind_mounts > 0 ||
1801             !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1802             !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1803             !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1804             !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1805             !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1806                 return true;
1807
1808         if (context->mount_flags != 0)
1809                 return true;
1810
1811         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1812                 return true;
1813
1814         if (context->private_devices ||
1815             context->protect_system != PROTECT_SYSTEM_NO ||
1816             context->protect_home != PROTECT_HOME_NO ||
1817             context->protect_kernel_tunables ||
1818             context->protect_kernel_modules ||
1819             context->protect_control_groups)
1820                 return true;
1821
1822         if (context->mount_apivfs && (context->root_image || context->root_directory))
1823                 return true;
1824
1825         return false;
1826 }
1827
1828 static int setup_private_users(uid_t uid, gid_t gid) {
1829         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1830         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1831         _cleanup_close_ int unshare_ready_fd = -1;
1832         _cleanup_(sigkill_waitp) pid_t pid = 0;
1833         uint64_t c = 1;
1834         siginfo_t si;
1835         ssize_t n;
1836         int r;
1837
1838         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1839          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1840          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1841          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1842          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1843          * continues execution normally. */
1844
1845         if (uid != 0 && uid_is_valid(uid)) {
1846                 r = asprintf(&uid_map,
1847                              "0 0 1\n"                      /* Map root → root */
1848                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1849                              uid, uid);
1850                 if (r < 0)
1851                         return -ENOMEM;
1852         } else {
1853                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1854                 if (!uid_map)
1855                         return -ENOMEM;
1856         }
1857
1858         if (gid != 0 && gid_is_valid(gid)) {
1859                 r = asprintf(&gid_map,
1860                              "0 0 1\n"                      /* Map root → root */
1861                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1862                              gid, gid);
1863                 if (r < 0)
1864                         return -ENOMEM;
1865         } else {
1866                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1867                 if (!gid_map)
1868                         return -ENOMEM;
1869         }
1870
1871         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1872          * namespace. */
1873         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1874         if (unshare_ready_fd < 0)
1875                 return -errno;
1876
1877         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1878          * failed. */
1879         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1880                 return -errno;
1881
1882         pid = fork();
1883         if (pid < 0)
1884                 return -errno;
1885
1886         if (pid == 0) {
1887                 _cleanup_close_ int fd = -1;
1888                 const char *a;
1889                 pid_t ppid;
1890
1891                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1892                  * here, after the parent opened its own user namespace. */
1893
1894                 ppid = getppid();
1895                 errno_pipe[0] = safe_close(errno_pipe[0]);
1896
1897                 /* Wait until the parent unshared the user namespace */
1898                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1899                         r = -errno;
1900                         goto child_fail;
1901                 }
1902
1903                 /* Disable the setgroups() system call in the child user namespace, for good. */
1904                 a = procfs_file_alloca(ppid, "setgroups");
1905                 fd = open(a, O_WRONLY|O_CLOEXEC);
1906                 if (fd < 0) {
1907                         if (errno != ENOENT) {
1908                                 r = -errno;
1909                                 goto child_fail;
1910                         }
1911
1912                         /* If the file is missing the kernel is too old, let's continue anyway. */
1913                 } else {
1914                         if (write(fd, "deny\n", 5) < 0) {
1915                                 r = -errno;
1916                                 goto child_fail;
1917                         }
1918
1919                         fd = safe_close(fd);
1920                 }
1921
1922                 /* First write the GID map */
1923                 a = procfs_file_alloca(ppid, "gid_map");
1924                 fd = open(a, O_WRONLY|O_CLOEXEC);
1925                 if (fd < 0) {
1926                         r = -errno;
1927                         goto child_fail;
1928                 }
1929                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1930                         r = -errno;
1931                         goto child_fail;
1932                 }
1933                 fd = safe_close(fd);
1934
1935                 /* The write the UID map */
1936                 a = procfs_file_alloca(ppid, "uid_map");
1937                 fd = open(a, O_WRONLY|O_CLOEXEC);
1938                 if (fd < 0) {
1939                         r = -errno;
1940                         goto child_fail;
1941                 }
1942                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1943                         r = -errno;
1944                         goto child_fail;
1945                 }
1946
1947                 _exit(EXIT_SUCCESS);
1948
1949         child_fail:
1950                 (void) write(errno_pipe[1], &r, sizeof(r));
1951                 _exit(EXIT_FAILURE);
1952         }
1953
1954         errno_pipe[1] = safe_close(errno_pipe[1]);
1955
1956         if (unshare(CLONE_NEWUSER) < 0)
1957                 return -errno;
1958
1959         /* Let the child know that the namespace is ready now */
1960         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1961                 return -errno;
1962
1963         /* Try to read an error code from the child */
1964         n = read(errno_pipe[0], &r, sizeof(r));
1965         if (n < 0)
1966                 return -errno;
1967         if (n == sizeof(r)) { /* an error code was sent to us */
1968                 if (r < 0)
1969                         return r;
1970                 return -EIO;
1971         }
1972         if (n != 0) /* on success we should have read 0 bytes */
1973                 return -EIO;
1974
1975         r = wait_for_terminate(pid, &si);
1976         if (r < 0)
1977                 return r;
1978         pid = 0;
1979
1980         /* If something strange happened with the child, let's consider this fatal, too */
1981         if (si.si_code != CLD_EXITED || si.si_status != 0)
1982                 return -EIO;
1983
1984         return 0;
1985 }
1986
1987 static int setup_exec_directory(
1988                 const ExecContext *context,
1989                 const ExecParameters *params,
1990                 uid_t uid,
1991                 gid_t gid,
1992                 ExecDirectoryType type,
1993                 int *exit_status) {
1994
1995         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1996                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1997                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1998                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1999                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2000                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2001         };
2002         char **rt;
2003         int r;
2004
2005         assert(context);
2006         assert(params);
2007         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2008         assert(exit_status);
2009
2010         if (!params->prefix[type])
2011                 return 0;
2012
2013         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2014                 if (!uid_is_valid(uid))
2015                         uid = 0;
2016                 if (!gid_is_valid(gid))
2017                         gid = 0;
2018         }
2019
2020         STRV_FOREACH(rt, context->directories[type].paths) {
2021                 _cleanup_free_ char *p = NULL, *pp = NULL;
2022                 const char *effective;
2023
2024                 p = strjoin(params->prefix[type], "/", *rt);
2025                 if (!p) {
2026                         r = -ENOMEM;
2027                         goto fail;
2028                 }
2029
2030                 r = mkdir_parents_label(p, 0755);
2031                 if (r < 0)
2032                         goto fail;
2033
2034                 if (context->dynamic_user &&
2035                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2036                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2037
2038                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2039                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2040                          * whose UID is later on reused. To lock this down we use the same trick used by container
2041                          * managers to prohibit host users to get access to files of the same UID in containers: we
2042                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2043                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2044                          * to make this directory permeable for the service itself.
2045                          *
2046                          * Specifically: for a service which wants a special directory "foo/" we first create a
2047                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2048                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2049                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2050                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2051                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2052                          * disabling the access boundary for the service and making sure it only gets access to the
2053                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2054                          *
2055                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2056                          * owned by the service itself.
2057                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2058                          * files or sockets with other services. */
2059
2060                         private_root = strjoin(params->prefix[type], "/private");
2061                         if (!private_root) {
2062                                 r = -ENOMEM;
2063                                 goto fail;
2064                         }
2065
2066                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2067                         r = mkdir_safe_label(private_root, 0700, 0, 0, false);
2068                         if (r < 0)
2069                                 goto fail;
2070
2071                         pp = strjoin(private_root, "/", *rt);
2072                         if (!pp) {
2073                                 r = -ENOMEM;
2074                                 goto fail;
2075                         }
2076
2077                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2078                         r = mkdir_parents_label(pp, 0755);
2079                         if (r < 0)
2080                                 goto fail;
2081
2082                         if (is_dir(p, false) > 0 &&
2083                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2084
2085                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2086                                  * it over. Most likely the service has been upgraded from one that didn't use
2087                                  * DynamicUser=1, to one that does. */
2088
2089                                 if (rename(p, pp) < 0) {
2090                                         r = -errno;
2091                                         goto fail;
2092                                 }
2093                         } else {
2094                                 /* Otherwise, create the actual directory for the service */
2095
2096                                 r = mkdir_label(pp, context->directories[type].mode);
2097                                 if (r < 0 && r != -EEXIST)
2098                                         goto fail;
2099                         }
2100
2101                         parent = dirname_malloc(p);
2102                         if (!parent) {
2103                                 r = -ENOMEM;
2104                                 goto fail;
2105                         }
2106
2107                         r = path_make_relative(parent, pp, &relative);
2108                         if (r < 0)
2109                                 goto fail;
2110
2111                         /* And link it up from the original place */
2112                         r = symlink_idempotent(relative, p);
2113                         if (r < 0)
2114                                 goto fail;
2115
2116                         effective = pp;
2117
2118                 } else {
2119                         r = mkdir_label(p, context->directories[type].mode);
2120                         if (r < 0 && r != -EEXIST)
2121                                 goto fail;
2122
2123                         effective = p;
2124                 }
2125
2126                 /* First lock down the access mode */
2127                 if (chmod(effective, context->directories[type].mode) < 0) {
2128                         r = -errno;
2129                         goto fail;
2130                 }
2131
2132                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2133                  * a service, and shall not be writable. */
2134                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2135                         continue;
2136
2137                 /* Then, change the ownership of the whole tree, if necessary */
2138                 r = path_chown_recursive(effective, uid, gid);
2139                 if (r < 0)
2140                         goto fail;
2141         }
2142
2143         return 0;
2144
2145 fail:
2146         *exit_status = exit_status_table[type];
2147         return r;
2148 }
2149
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process. If the context carries an explicit process label that one
 * is applied. Otherwise, and only if SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK exec
 * label read from the command's binary is applied, falling back to that default label. Returns 0 on
 * success, a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing label or lack of support for it is fine, we then use the default below */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2182
2183 static int compile_bind_mounts(
2184                 const ExecContext *context,
2185                 const ExecParameters *params,
2186                 BindMount **ret_bind_mounts,
2187                 unsigned *ret_n_bind_mounts,
2188                 char ***ret_empty_directories) {
2189
2190         _cleanup_strv_free_ char **empty_directories = NULL;
2191         BindMount *bind_mounts;
2192         unsigned n, h = 0, i;
2193         ExecDirectoryType t;
2194         int r;
2195
2196         assert(context);
2197         assert(params);
2198         assert(ret_bind_mounts);
2199         assert(ret_n_bind_mounts);
2200         assert(ret_empty_directories);
2201
2202         n = context->n_bind_mounts;
2203         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2204                 if (!params->prefix[t])
2205                         continue;
2206
2207                 n += strv_length(context->directories[t].paths);
2208         }
2209
2210         if (n <= 0) {
2211                 *ret_bind_mounts = NULL;
2212                 *ret_n_bind_mounts = 0;
2213                 *ret_empty_directories = NULL;
2214                 return 0;
2215         }
2216
2217         bind_mounts = new(BindMount, n);
2218         if (!bind_mounts)
2219                 return -ENOMEM;
2220
2221         for (i = 0; i < context->n_bind_mounts; i++) {
2222                 BindMount *item = context->bind_mounts + i;
2223                 char *s, *d;
2224
2225                 s = strdup(item->source);
2226                 if (!s) {
2227                         r = -ENOMEM;
2228                         goto finish;
2229                 }
2230
2231                 d = strdup(item->destination);
2232                 if (!d) {
2233                         free(s);
2234                         r = -ENOMEM;
2235                         goto finish;
2236                 }
2237
2238                 bind_mounts[h++] = (BindMount) {
2239                         .source = s,
2240                         .destination = d,
2241                         .read_only = item->read_only,
2242                         .recursive = item->recursive,
2243                         .ignore_enoent = item->ignore_enoent,
2244                 };
2245         }
2246
2247         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2248                 char **suffix;
2249
2250                 if (!params->prefix[t])
2251                         continue;
2252
2253                 if (strv_isempty(context->directories[t].paths))
2254                         continue;
2255
2256                 if (context->dynamic_user &&
2257                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2258                         char *private_root;
2259
2260                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2261                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2262                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2263
2264                         private_root = strjoin(params->prefix[t], "/private");
2265                         if (!private_root) {
2266                                 r = -ENOMEM;
2267                                 goto finish;
2268                         }
2269
2270                         r = strv_consume(&empty_directories, private_root);
2271                         if (r < 0) {
2272                                 r = -ENOMEM;
2273                                 goto finish;
2274                         }
2275                 }
2276
2277                 STRV_FOREACH(suffix, context->directories[t].paths) {
2278                         char *s, *d;
2279
2280                         if (context->dynamic_user &&
2281                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2282                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2283                         else
2284                                 s = strjoin(params->prefix[t], "/", *suffix);
2285                         if (!s) {
2286                                 r = -ENOMEM;
2287                                 goto finish;
2288                         }
2289
2290                         d = strdup(s);
2291                         if (!d) {
2292                                 free(s);
2293                                 r = -ENOMEM;
2294                                 goto finish;
2295                         }
2296
2297                         bind_mounts[h++] = (BindMount) {
2298                                 .source = s,
2299                                 .destination = d,
2300                                 .read_only = false,
2301                                 .recursive = true,
2302                                 .ignore_enoent = false,
2303                         };
2304                 }
2305         }
2306
2307         assert(h == n);
2308
2309         *ret_bind_mounts = bind_mounts;
2310         *ret_n_bind_mounts = n;
2311         *ret_empty_directories = empty_directories;
2312
2313         empty_directories = NULL;
2314
2315         return (int) n;
2316
2317 finish:
2318         bind_mount_free_many(bind_mounts, h);
2319         return r;
2320 }
2321
2322 static int apply_mount_namespace(
2323                 Unit *u,
2324                 ExecCommand *command,
2325                 const ExecContext *context,
2326                 const ExecParameters *params,
2327                 ExecRuntime *runtime) {
2328
2329         _cleanup_strv_free_ char **empty_directories = NULL;
2330         char *tmp = NULL, *var = NULL;
2331         const char *root_dir = NULL, *root_image = NULL;
2332         NamespaceInfo ns_info = {
2333                 .ignore_protect_paths = false,
2334                 .private_dev = context->private_devices,
2335                 .protect_control_groups = context->protect_control_groups,
2336                 .protect_kernel_tunables = context->protect_kernel_tunables,
2337                 .protect_kernel_modules = context->protect_kernel_modules,
2338                 .mount_apivfs = context->mount_apivfs,
2339         };
2340         bool needs_sandboxing;
2341         BindMount *bind_mounts = NULL;
2342         unsigned n_bind_mounts = 0;
2343         int r;
2344
2345         assert(context);
2346
2347         /* The runtime struct only contains the parent of the private /tmp,
2348          * which is non-accessible to world users. Inside of it there's a /tmp
2349          * that is sticky, and that's the one we want to use here. */
2350
2351         if (context->private_tmp && runtime) {
2352                 if (runtime->tmp_dir)
2353                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2354                 if (runtime->var_tmp_dir)
2355                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2356         }
2357
2358         if (params->flags & EXEC_APPLY_CHROOT) {
2359                 root_image = context->root_image;
2360
2361                 if (!root_image)
2362                         root_dir = context->root_directory;
2363         }
2364
2365         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2366         if (r < 0)
2367                 return r;
2368
2369         /*
2370          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2371          * sandbox info, otherwise enforce it, don't ignore protected paths and
2372          * fail if we are enable to apply the sandbox inside the mount namespace.
2373          */
2374         if (!context->dynamic_user && root_dir)
2375                 ns_info.ignore_protect_paths = true;
2376
2377         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2378
2379         r = setup_namespace(root_dir, root_image,
2380                             &ns_info, context->read_write_paths,
2381                             needs_sandboxing ? context->read_only_paths : NULL,
2382                             needs_sandboxing ? context->inaccessible_paths : NULL,
2383                             empty_directories,
2384                             bind_mounts,
2385                             n_bind_mounts,
2386                             tmp,
2387                             var,
2388                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2389                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2390                             context->mount_flags,
2391                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2392
2393         bind_mount_free_many(bind_mounts, n_bind_mounts);
2394
2395         /* If we couldn't set up the namespace this is probably due to a
2396          * missing capability. In this case, silently proceeed. */
2397         if (IN_SET(r, -EPERM, -EACCES)) {
2398                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2399                 return 0;
2400         }
2401
2402         return r;
2403 }
2404
2405 static int apply_working_directory(
2406                 const ExecContext *context,
2407                 const ExecParameters *params,
2408                 const char *home,
2409                 const bool needs_mount_ns,
2410                 int *exit_status) {
2411
2412         const char *d, *wd;
2413
2414         assert(context);
2415         assert(exit_status);
2416
2417         if (context->working_directory_home) {
2418
2419                 if (!home) {
2420                         *exit_status = EXIT_CHDIR;
2421                         return -ENXIO;
2422                 }
2423
2424                 wd = home;
2425
2426         } else if (context->working_directory)
2427                 wd = context->working_directory;
2428         else
2429                 wd = "/";
2430
2431         if (params->flags & EXEC_APPLY_CHROOT) {
2432                 if (!needs_mount_ns && context->root_directory)
2433                         if (chroot(context->root_directory) < 0) {
2434                                 *exit_status = EXIT_CHROOT;
2435                                 return -errno;
2436                         }
2437
2438                 d = wd;
2439         } else
2440                 d = prefix_roota(context->root_directory, wd);
2441
2442         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2443                 *exit_status = EXIT_CHDIR;
2444                 return -errno;
2445         }
2446
2447         return 0;
2448 }
2449
2450 static int setup_keyring(
2451                 Unit *u,
2452                 const ExecContext *context,
2453                 const ExecParameters *p,
2454                 uid_t uid, gid_t gid) {
2455
2456         key_serial_t keyring;
2457         int r;
2458
2459         assert(u);
2460         assert(context);
2461         assert(p);
2462
2463         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2464          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2465          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2466          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2467          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2468          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2469
2470         if (!(p->flags & EXEC_NEW_KEYRING))
2471                 return 0;
2472
2473         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2474                 return 0;
2475
2476         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2477         if (keyring == -1) {
2478                 if (errno == ENOSYS)
2479                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2480                 else if (IN_SET(errno, EACCES, EPERM))
2481                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2482                 else if (errno == EDQUOT)
2483                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2484                 else
2485                         return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2486
2487                 return 0;
2488         }
2489
2490         /* Populate they keyring with the invocation ID by default. */
2491         if (!sd_id128_is_null(u->invocation_id)) {
2492                 key_serial_t key;
2493
2494                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2495                 if (key == -1)
2496                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2497                 else {
2498                         if (keyctl(KEYCTL_SETPERM, key,
2499                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2500                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2501                                 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2502                 }
2503         }
2504
2505         /* And now, make the keyring owned by the service's user */
2506         if (uid_is_valid(uid) || gid_is_valid(gid))
2507                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2508                         return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2509
2510         /* When requested link the user keyring into the session keyring. */
2511         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2512                 uid_t saved_uid;
2513                 gid_t saved_gid;
2514
2515                 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2516                  * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2517                  * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2518
2519                 saved_uid = getuid();
2520                 saved_gid = getgid();
2521
2522                 if (gid_is_valid(gid) && gid != saved_gid) {
2523                         if (setregid(gid, -1) < 0)
2524                                 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2525                 }
2526
2527                 if (uid_is_valid(uid) && uid != saved_uid) {
2528                         if (setreuid(uid, -1) < 0) {
2529                                 (void) setregid(saved_gid, -1);
2530                                 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2531                         }
2532                 }
2533
2534                 if (keyctl(KEYCTL_LINK,
2535                            KEY_SPEC_USER_KEYRING,
2536                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2537
2538                         r = -errno;
2539
2540                         (void) setreuid(saved_uid, -1);
2541                         (void) setregid(saved_gid, -1);
2542
2543                         return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2544                 }
2545
2546                 if (uid_is_valid(uid) && uid != saved_uid) {
2547                         if (setreuid(saved_uid, -1) < 0) {
2548                                 (void) setregid(saved_gid, -1);
2549                                 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2550                         }
2551                 }
2552
2553                 if (gid_is_valid(gid) && gid != saved_gid) {
2554                         if (setregid(saved_gid, -1) < 0)
2555                                 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2556                 }
2557         }
2558
2559         return 0;
2560 }
2561
/* Appends the valid (i.e. non-negative) fds of a socket pair to array[], in order, bumping *n once for each fd
 * stored. A NULL pair is silently ignored. No bounds checking is done here: the caller has to make sure array[]
 * has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        int k;

        assert(array);
        assert(n);

        if (pair)
                for (k = 0; k < 2; k++) {
                        /* Negative entries denote "no fd", skip them */
                        if (pair[k] < 0)
                                continue;

                        array[*n] = pair[k];
                        (*n)++;
                }
}
2574
2575 static int close_remaining_fds(
2576                 const ExecParameters *params,
2577                 ExecRuntime *runtime,
2578                 DynamicCreds *dcreds,
2579                 int user_lookup_fd,
2580                 int socket_fd,
2581                 int *fds, unsigned n_fds) {
2582
2583         unsigned n_dont_close = 0;
2584         int dont_close[n_fds + 12];
2585
2586         assert(params);
2587
2588         if (params->stdin_fd >= 0)
2589                 dont_close[n_dont_close++] = params->stdin_fd;
2590         if (params->stdout_fd >= 0)
2591                 dont_close[n_dont_close++] = params->stdout_fd;
2592         if (params->stderr_fd >= 0)
2593                 dont_close[n_dont_close++] = params->stderr_fd;
2594
2595         if (socket_fd >= 0)
2596                 dont_close[n_dont_close++] = socket_fd;
2597         if (n_fds > 0) {
2598                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2599                 n_dont_close += n_fds;
2600         }
2601
2602         if (runtime)
2603                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2604
2605         if (dcreds) {
2606                 if (dcreds->user)
2607                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2608                 if (dcreds->group)
2609                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2610         }
2611
2612         if (user_lookup_fd >= 0)
2613                 dont_close[n_dont_close++] = user_lookup_fd;
2614
2615         return close_all_fds(dont_close, n_dont_close);
2616 }
2617
2618 static int send_user_lookup(
2619                 Unit *unit,
2620                 int user_lookup_fd,
2621                 uid_t uid,
2622                 gid_t gid) {
2623
2624         assert(unit);
2625
2626         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2627          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2628          * specified. */
2629
2630         if (user_lookup_fd < 0)
2631                 return 0;
2632
2633         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2634                 return 0;
2635
2636         if (writev(user_lookup_fd,
2637                (struct iovec[]) {
2638                            IOVEC_INIT(&uid, sizeof(uid)),
2639                            IOVEC_INIT(&gid, sizeof(gid)),
2640                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2641                 return -errno;
2642
2643         return 0;
2644 }
2645
2646 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2647         int r;
2648
2649         assert(c);
2650         assert(home);
2651         assert(buf);
2652
2653         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2654
2655         if (*home)
2656                 return 0;
2657
2658         if (!c->working_directory_home)
2659                 return 0;
2660
2661         if (uid == 0) {
2662                 /* Hardcode /root as home directory for UID 0 */
2663                 *home = "/root";
2664                 return 1;
2665         }
2666
2667         r = get_home_dir(buf);
2668         if (r < 0)
2669                 return r;
2670
2671         *home = *buf;
2672         return 1;
2673 }
2674
2675 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2676         _cleanup_strv_free_ char ** list = NULL;
2677         ExecDirectoryType t;
2678         int r;
2679
2680         assert(c);
2681         assert(p);
2682         assert(ret);
2683
2684         assert(c->dynamic_user);
2685
2686         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2687          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2688          * directories. */
2689
2690         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2691                 char **i;
2692
2693                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2694                         continue;
2695
2696                 if (!p->prefix[t])
2697                         continue;
2698
2699                 STRV_FOREACH(i, c->directories[t].paths) {
2700                         char *e;
2701
2702                         if (t == EXEC_DIRECTORY_RUNTIME)
2703                                 e = strjoin(p->prefix[t], "/", *i);
2704                         else
2705                                 e = strjoin(p->prefix[t], "/private/", *i);
2706                         if (!e)
2707                                 return -ENOMEM;
2708
2709                         r = strv_consume(&list, e);
2710                         if (r < 0)
2711                                 return r;
2712                 }
2713         }
2714
2715         *ret = list;
2716         list = NULL;
2717
2718         return 0;
2719 }
2720
2721 static int exec_child(
2722                 Unit *unit,
2723                 ExecCommand *command,
2724                 const ExecContext *context,
2725                 const ExecParameters *params,
2726                 ExecRuntime *runtime,
2727                 DynamicCreds *dcreds,
2728                 char **argv,
2729                 int socket_fd,
2730                 int named_iofds[3],
2731                 int *fds,
2732                 unsigned n_storage_fds,
2733                 unsigned n_socket_fds,
2734                 char **files_env,
2735                 int user_lookup_fd,
2736                 int *exit_status) {
2737
2738         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2739         _cleanup_free_ char *home_buffer = NULL;
2740         _cleanup_free_ gid_t *supplementary_gids = NULL;
2741         const char *username = NULL, *groupname = NULL;
2742         const char *home = NULL, *shell = NULL;
2743         dev_t journal_stream_dev = 0;
2744         ino_t journal_stream_ino = 0;
2745         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2746                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2747                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2748                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2749 #if HAVE_SELINUX
2750         _cleanup_free_ char *mac_selinux_context_net = NULL;
2751         bool use_selinux = false;
2752 #endif
2753 #if ENABLE_SMACK
2754         bool use_smack = false;
2755 #endif
2756 #if HAVE_APPARMOR
2757         bool use_apparmor = false;
2758 #endif
2759         uid_t uid = UID_INVALID;
2760         gid_t gid = GID_INVALID;
2761         int i, r, ngids = 0;
2762         unsigned n_fds;
2763         ExecDirectoryType dt;
2764         int secure_bits;
2765
2766         assert(unit);
2767         assert(command);
2768         assert(context);
2769         assert(params);
2770         assert(exit_status);
2771
2772         rename_process_from_path(command->path);
2773
2774         /* We reset exactly these signals, since they are the
2775          * only ones we set to SIG_IGN in the main daemon. All
2776          * others we leave untouched because we set them to
2777          * SIG_DFL or a valid handler initially, both of which
2778          * will be demoted to SIG_DFL. */
2779         (void) default_signals(SIGNALS_CRASH_HANDLER,
2780                                SIGNALS_IGNORE, -1);
2781
2782         if (context->ignore_sigpipe)
2783                 (void) ignore_signals(SIGPIPE, -1);
2784
2785         r = reset_signal_mask();
2786         if (r < 0) {
2787                 *exit_status = EXIT_SIGNAL_MASK;
2788                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2789         }
2790
2791         if (params->idle_pipe)
2792                 do_idle_pipe_dance(params->idle_pipe);
2793
2794         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2795          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2796          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2797          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2798
2799         log_forget_fds();
2800         log_set_open_when_needed(true);
2801
2802         /* In case anything used libc syslog(), close this here, too */
2803         closelog();
2804
2805         n_fds = n_storage_fds + n_socket_fds;
2806         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2807         if (r < 0) {
2808                 *exit_status = EXIT_FDS;
2809                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2810         }
2811
2812         if (!context->same_pgrp)
2813                 if (setsid() < 0) {
2814                         *exit_status = EXIT_SETSID;
2815                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2816                 }
2817
2818         exec_context_tty_reset(context, params);
2819
2820         if (unit_shall_confirm_spawn(unit)) {
2821                 const char *vc = params->confirm_spawn;
2822                 _cleanup_free_ char *cmdline = NULL;
2823
2824                 cmdline = exec_command_line(argv);
2825                 if (!cmdline) {
2826                         *exit_status = EXIT_MEMORY;
2827                         return log_oom();
2828                 }
2829
2830                 r = ask_for_confirmation(vc, unit, cmdline);
2831                 if (r != CONFIRM_EXECUTE) {
2832                         if (r == CONFIRM_PRETEND_SUCCESS) {
2833                                 *exit_status = EXIT_SUCCESS;
2834                                 return 0;
2835                         }
2836                         *exit_status = EXIT_CONFIRM;
2837                         log_unit_error(unit, "Execution cancelled by the user");
2838                         return -ECANCELED;
2839                 }
2840         }
2841
2842         if (context->dynamic_user && dcreds) {
2843                 _cleanup_strv_free_ char **suggested_paths = NULL;
2844
2845                 /* Make sure we bypass our own NSS module for any NSS checks */
2846                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2847                         *exit_status = EXIT_USER;
2848                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2849                 }
2850
2851                 r = compile_suggested_paths(context, params, &suggested_paths);
2852                 if (r < 0) {
2853                         *exit_status = EXIT_MEMORY;
2854                         return log_oom();
2855                 }
2856
2857                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2858                 if (r < 0) {
2859                         *exit_status = EXIT_USER;
2860                         if (r == -EILSEQ) {
2861                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2862                                 return -EOPNOTSUPP;
2863                         }
2864                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2865                 }
2866
2867                 if (!uid_is_valid(uid)) {
2868                         *exit_status = EXIT_USER;
2869                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2870                         return -ESRCH;
2871                 }
2872
2873                 if (!gid_is_valid(gid)) {
2874                         *exit_status = EXIT_USER;
2875                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2876                         return -ESRCH;
2877                 }
2878
2879                 if (dcreds->user)
2880                         username = dcreds->user->name;
2881
2882         } else {
2883                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2884                 if (r < 0) {
2885                         *exit_status = EXIT_USER;
2886                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2887                 }
2888
2889                 r = get_fixed_group(context, &groupname, &gid);
2890                 if (r < 0) {
2891                         *exit_status = EXIT_GROUP;
2892                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2893                 }
2894         }
2895
2896         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2897         r = get_supplementary_groups(context, username, groupname, gid,
2898                                      &supplementary_gids, &ngids);
2899         if (r < 0) {
2900                 *exit_status = EXIT_GROUP;
2901                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2902         }
2903
2904         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2905         if (r < 0) {
2906                 *exit_status = EXIT_USER;
2907                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2908         }
2909
2910         user_lookup_fd = safe_close(user_lookup_fd);
2911
2912         r = acquire_home(context, uid, &home, &home_buffer);
2913         if (r < 0) {
2914                 *exit_status = EXIT_CHDIR;
2915                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2916         }
2917
2918         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2919          * must be sure to drop O_NONBLOCK */
2920         if (socket_fd >= 0)
2921                 (void) fd_nonblock(socket_fd, false);
2922
2923         r = setup_input(context, params, socket_fd, named_iofds);
2924         if (r < 0) {
2925                 *exit_status = EXIT_STDIN;
2926                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2927         }
2928
2929         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2930         if (r < 0) {
2931                 *exit_status = EXIT_STDOUT;
2932                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2933         }
2934
2935         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2936         if (r < 0) {
2937                 *exit_status = EXIT_STDERR;
2938                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2939         }
2940
2941         if (params->cgroup_path) {
2942                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2943                 if (r < 0) {
2944                         *exit_status = EXIT_CGROUP;
2945                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2946                 }
2947         }
2948
2949         if (context->oom_score_adjust_set) {
2950                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2951
2952                 /* When we can't make this change due to EPERM, then
2953                  * let's silently skip over it. User namespaces
2954                  * prohibit write access to this file, and we
2955                  * shouldn't trip up over that. */
2956
2957                 sprintf(t, "%i", context->oom_score_adjust);
2958                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2959                 if (IN_SET(r, -EPERM, -EACCES))
2960                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2961                 else if (r < 0) {
2962                         *exit_status = EXIT_OOM_ADJUST;
2963                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2964                 }
2965         }
2966
2967         if (context->nice_set)
2968                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2969                         *exit_status = EXIT_NICE;
2970                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2971                 }
2972
2973         if (context->cpu_sched_set) {
2974                 struct sched_param param = {
2975                         .sched_priority = context->cpu_sched_priority,
2976                 };
2977
2978                 r = sched_setscheduler(0,
2979                                        context->cpu_sched_policy |
2980                                        (context->cpu_sched_reset_on_fork ?
2981                                         SCHED_RESET_ON_FORK : 0),
2982                                        &param);
2983                 if (r < 0) {
2984                         *exit_status = EXIT_SETSCHEDULER;
2985                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2986                 }
2987         }
2988
2989         if (context->cpuset)
2990                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2991                         *exit_status = EXIT_CPUAFFINITY;
2992                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2993                 }
2994
2995         if (context->ioprio_set)
2996                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2997                         *exit_status = EXIT_IOPRIO;
2998                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2999                 }
3000
3001         if (context->timer_slack_nsec != NSEC_INFINITY)
3002                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3003                         *exit_status = EXIT_TIMERSLACK;
3004                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3005                 }
3006
3007         if (context->personality != PERSONALITY_INVALID) {
3008                 r = safe_personality(context->personality);
3009                 if (r < 0) {
3010                         *exit_status = EXIT_PERSONALITY;
3011                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3012                 }
3013         }
3014
3015         if (context->utmp_id)
3016                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3017                                       context->tty_path,
3018                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3019                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3020                                       USER_PROCESS,
3021                                       username);
3022
3023         if (context->user) {
3024                 r = chown_terminal(STDIN_FILENO, uid);
3025                 if (r < 0) {
3026                         *exit_status = EXIT_STDIN;
3027                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3028                 }
3029         }
3030
3031         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3032          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3033          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3034          * touch a single hierarchy too. */
3035         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3036                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3037                 if (r < 0) {
3038                         *exit_status = EXIT_CGROUP;
3039                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3040                 }
3041         }
3042
3043         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3044                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3045                 if (r < 0)
3046                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3047         }
3048
3049         r = build_environment(
3050                         unit,
3051                         context,
3052                         params,
3053                         n_fds,
3054                         home,
3055                         username,
3056                         shell,
3057                         journal_stream_dev,
3058                         journal_stream_ino,
3059                         &our_env);
3060         if (r < 0) {
3061                 *exit_status = EXIT_MEMORY;
3062                 return log_oom();
3063         }
3064
3065         r = build_pass_environment(context, &pass_env);
3066         if (r < 0) {
3067                 *exit_status = EXIT_MEMORY;
3068                 return log_oom();
3069         }
3070
3071         accum_env = strv_env_merge(5,
3072                                    params->environment,
3073                                    our_env,
3074                                    pass_env,
3075                                    context->environment,
3076                                    files_env,
3077                                    NULL);
3078         if (!accum_env) {
3079                 *exit_status = EXIT_MEMORY;
3080                 return log_oom();
3081         }
3082         accum_env = strv_env_clean(accum_env);
3083
3084         (void) umask(context->umask);
3085
3086         r = setup_keyring(unit, context, params, uid, gid);
3087         if (r < 0) {
3088                 *exit_status = EXIT_KEYRING;
3089                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3090         }
3091
3092         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3093         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3094
3095         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3096         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3097
3098         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3099         if (needs_ambient_hack)
3100                 needs_setuid = false;
3101         else
3102                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3103
3104         if (needs_sandboxing) {
3105                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3106                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3107                  * impacting our own code paths. */
3108
3109 #if HAVE_SELINUX
3110                 use_selinux = mac_selinux_use();
3111 #endif
3112 #if ENABLE_SMACK
3113                 use_smack = mac_smack_use();
3114 #endif
3115 #if HAVE_APPARMOR
3116                 use_apparmor = mac_apparmor_use();
3117 #endif
3118         }
3119
3120         if (needs_setuid) {
3121                 if (context->pam_name && username) {
3122                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3123                         if (r < 0) {
3124                                 *exit_status = EXIT_PAM;
3125                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3126                         }
3127                 }
3128         }
3129
3130         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3131                 if (ns_type_supported(NAMESPACE_NET)) {
3132                         r = setup_netns(runtime->netns_storage_socket);
3133                         if (r < 0) {
3134                                 *exit_status = EXIT_NETWORK;
3135                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3136                         }
3137                 } else
3138                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3139         }
3140
3141         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3142         if (needs_mount_namespace) {
3143                 r = apply_mount_namespace(unit, command, context, params, runtime);
3144                 if (r < 0) {
3145                         *exit_status = EXIT_NAMESPACE;
3146                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3147                 }
3148         }
3149
3150         /* Apply just after mount namespace setup */
3151         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3152         if (r < 0)
3153                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3154
3155         /* Drop groups as early as possible */
3156         if (needs_setuid) {
3157                 r = enforce_groups(gid, supplementary_gids, ngids);
3158                 if (r < 0) {
3159                         *exit_status = EXIT_GROUP;
3160                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3161                 }
3162         }
3163
3164         if (needs_sandboxing) {
3165 #if HAVE_SELINUX
3166                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3167                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3168                         if (r < 0) {
3169                                 *exit_status = EXIT_SELINUX_CONTEXT;
3170                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3171                         }
3172                 }
3173 #endif
3174
3175                 if (context->private_users) {
3176                         r = setup_private_users(uid, gid);
3177                         if (r < 0) {
3178                                 *exit_status = EXIT_USER;
3179                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3180                         }
3181                 }
3182         }
3183
3184         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3185          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3186          * was needed to upload the policy and can now be closed as well. */
3187         r = close_all_fds(fds, n_fds);
3188         if (r >= 0)
3189                 r = shift_fds(fds, n_fds);
3190         if (r >= 0)
3191                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3192         if (r < 0) {
3193                 *exit_status = EXIT_FDS;
3194                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3195         }
3196
3197         secure_bits = context->secure_bits;
3198
3199         if (needs_sandboxing) {
3200                 uint64_t bset;
3201
3202                 for (i = 0; i < _RLIMIT_MAX; i++) {
3203
3204                         if (!context->rlimit[i])
3205                                 continue;
3206
3207                         r = setrlimit_closest(i, context->rlimit[i]);
3208                         if (r < 0) {
3209                                 *exit_status = EXIT_LIMITS;
3210                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3211                         }
3212                 }
3213
3214                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3215                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3216                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3217                                 *exit_status = EXIT_LIMITS;
3218                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3219                         }
3220                 }
3221
3222 #if ENABLE_SMACK
3223                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3224                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3225                 if (use_smack) {
3226                         r = setup_smack(context, command);
3227                         if (r < 0) {
3228                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3229                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3230                         }
3231                 }
3232 #endif
3233
3234                 bset = context->capability_bounding_set;
3235                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3236                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3237                  * instead of us doing that */
3238                 if (needs_ambient_hack)
3239                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3240                                 (UINT64_C(1) << CAP_SETUID) |
3241                                 (UINT64_C(1) << CAP_SETGID);
3242
3243                 if (!cap_test_all(bset)) {
3244                         r = capability_bounding_set_drop(bset, false);
3245                         if (r < 0) {
3246                                 *exit_status = EXIT_CAPABILITIES;
3247                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3248                         }
3249                 }
3250
3251                 /* This is done before enforce_user, but ambient set
3252                  * does not survive over setresuid() if keep_caps is not set. */
3253                 if (!needs_ambient_hack &&
3254                     context->capability_ambient_set != 0) {
3255                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3256                         if (r < 0) {
3257                                 *exit_status = EXIT_CAPABILITIES;
3258                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3259                         }
3260                 }
3261         }
3262
3263         if (needs_setuid) {
3264                 if (context->user) {
3265                         r = enforce_user(context, uid);
3266                         if (r < 0) {
3267                                 *exit_status = EXIT_USER;
3268                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3269                         }
3270
3271                         if (!needs_ambient_hack &&
3272                             context->capability_ambient_set != 0) {
3273
3274                                 /* Fix the ambient capabilities after user change. */
3275                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3276                                 if (r < 0) {
3277                                         *exit_status = EXIT_CAPABILITIES;
3278                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3279                                 }
3280
3281                                 /* If we were asked to change user and ambient capabilities
3282                                  * were requested, we had to add keep-caps to the securebits
3283                                  * so that we would maintain the inherited capability set
3284                                  * through the setresuid(). Make sure that the bit is added
3285                                  * also to the context secure_bits so that we don't try to
3286                                  * drop the bit away next. */
3287
3288                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3289                         }
3290                 }
3291         }
3292
3293         if (needs_sandboxing) {
3294                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3295                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3296                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3297                  * are restricted. */
3298
3299 #if HAVE_SELINUX
3300                 if (use_selinux) {
3301                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3302
3303                         if (exec_context) {
3304                                 r = setexeccon(exec_context);
3305                                 if (r < 0) {
3306                                         *exit_status = EXIT_SELINUX_CONTEXT;
3307                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3308                                 }
3309                         }
3310                 }
3311 #endif
3312
3313 #if HAVE_APPARMOR
3314                 if (use_apparmor && context->apparmor_profile) {
3315                         r = aa_change_onexec(context->apparmor_profile);
3316                         if (r < 0 && !context->apparmor_profile_ignore) {
3317                                 *exit_status = EXIT_APPARMOR_PROFILE;
3318                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3319                         }
3320                 }
3321 #endif
3322
3323                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3324                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3325                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3326                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3327                                 *exit_status = EXIT_SECUREBITS;
3328                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3329                         }
3330
3331                 if (context_has_no_new_privileges(context))
3332                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3333                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3334                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3335                         }
3336
3337 #if HAVE_SECCOMP
3338                 r = apply_address_families(unit, context);
3339                 if (r < 0) {
3340                         *exit_status = EXIT_ADDRESS_FAMILIES;
3341                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3342                 }
3343
3344                 r = apply_memory_deny_write_execute(unit, context);
3345                 if (r < 0) {
3346                         *exit_status = EXIT_SECCOMP;
3347                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3348                 }
3349
3350                 r = apply_restrict_realtime(unit, context);
3351                 if (r < 0) {
3352                         *exit_status = EXIT_SECCOMP;
3353                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3354                 }
3355
3356                 r = apply_restrict_namespaces(unit, context);
3357                 if (r < 0) {
3358                         *exit_status = EXIT_SECCOMP;
3359                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3360                 }
3361
3362                 r = apply_protect_sysctl(unit, context);
3363                 if (r < 0) {
3364                         *exit_status = EXIT_SECCOMP;
3365                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3366                 }
3367
3368                 r = apply_protect_kernel_modules(unit, context);
3369                 if (r < 0) {
3370                         *exit_status = EXIT_SECCOMP;
3371                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3372                 }
3373
3374                 r = apply_private_devices(unit, context);
3375                 if (r < 0) {
3376                         *exit_status = EXIT_SECCOMP;
3377                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3378                 }
3379
3380                 r = apply_syscall_archs(unit, context);
3381                 if (r < 0) {
3382                         *exit_status = EXIT_SECCOMP;
3383                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3384                 }
3385
3386                 r = apply_lock_personality(unit, context);
3387                 if (r < 0) {
3388                         *exit_status = EXIT_SECCOMP;
3389                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3390                 }
3391
3392                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3393                  * by the filter as little as possible. */
3394                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3395                 if (r < 0) {
3396                         *exit_status = EXIT_SECCOMP;
3397                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3398                 }
3399 #endif
3400         }
3401
3402         if (!strv_isempty(context->unset_environment)) {
3403                 char **ee = NULL;
3404
3405                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3406                 if (!ee) {
3407                         *exit_status = EXIT_MEMORY;
3408                         return log_oom();
3409                 }
3410
3411                 strv_free(accum_env);
3412                 accum_env = ee;
3413         }
3414
3415         final_argv = replace_env_argv(argv, accum_env);
3416         if (!final_argv) {
3417                 *exit_status = EXIT_MEMORY;
3418                 return log_oom();
3419         }
3420
3421         if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3422                 _cleanup_free_ char *line;
3423
3424                 line = exec_command_line(final_argv);
3425                 if (line) {
3426                         log_struct(LOG_DEBUG,
3427                                    "EXECUTABLE=%s", command->path,
3428                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3429                                    LOG_UNIT_ID(unit),
3430                                    LOG_UNIT_INVOCATION_ID(unit),
3431                                    NULL);
3432                 }
3433         }
3434
3435         execve(command->path, final_argv, accum_env);
3436
3437         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3438
3439                 log_struct_errno(LOG_INFO, errno,
3440                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3441                                  LOG_UNIT_ID(unit),
3442                                  LOG_UNIT_INVOCATION_ID(unit),
3443                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3444                                                   command->path),
3445                                  "EXECUTABLE=%s", command->path,
3446                                  NULL);
3447
3448                 return 0;
3449         }
3450
3451         *exit_status = EXIT_EXEC;
3452         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3453 }
3454
3455 int exec_spawn(Unit *unit,
3456                ExecCommand *command,
3457                const ExecContext *context,
3458                const ExecParameters *params,
3459                ExecRuntime *runtime,
3460                DynamicCreds *dcreds,
3461                pid_t *ret) {
3462
3463         _cleanup_strv_free_ char **files_env = NULL;
3464         int *fds = NULL;
3465         unsigned n_storage_fds = 0, n_socket_fds = 0;
3466         _cleanup_free_ char *line = NULL;
3467         int socket_fd, r;
3468         int named_iofds[3] = { -1, -1, -1 };
3469         char **argv;
3470         pid_t pid;
3471
3472         assert(unit);
3473         assert(command);
3474         assert(context);
3475         assert(ret);
3476         assert(params);
3477         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3478
3479         if (context->std_input == EXEC_INPUT_SOCKET ||
3480             context->std_output == EXEC_OUTPUT_SOCKET ||
3481             context->std_error == EXEC_OUTPUT_SOCKET) {
3482
3483                 if (params->n_socket_fds > 1) {
3484                         log_unit_error(unit, "Got more than one socket.");
3485                         return -EINVAL;
3486                 }
3487
3488                 if (params->n_socket_fds == 0) {
3489                         log_unit_error(unit, "Got no socket.");
3490                         return -EINVAL;
3491                 }
3492
3493                 socket_fd = params->fds[0];
3494         } else {
3495                 socket_fd = -1;
3496                 fds = params->fds;
3497                 n_storage_fds = params->n_storage_fds;
3498                 n_socket_fds = params->n_socket_fds;
3499         }
3500
3501         r = exec_context_named_iofds(unit, context, params, named_iofds);
3502         if (r < 0)
3503                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3504
3505         r = exec_context_load_environment(unit, context, &files_env);
3506         if (r < 0)
3507                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3508
3509         argv = params->argv ?: command->argv;
3510         line = exec_command_line(argv);
3511         if (!line)
3512                 return log_oom();
3513
3514         log_struct(LOG_DEBUG,
3515                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3516                    "EXECUTABLE=%s", command->path,
3517                    LOG_UNIT_ID(unit),
3518                    LOG_UNIT_INVOCATION_ID(unit),
3519                    NULL);
3520
3521         pid = fork();
3522         if (pid < 0)
3523                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3524
3525         if (pid == 0) {
3526                 int exit_status = EXIT_SUCCESS;
3527
3528                 r = exec_child(unit,
3529                                command,
3530                                context,
3531                                params,
3532                                runtime,
3533                                dcreds,
3534                                argv,
3535                                socket_fd,
3536                                named_iofds,
3537                                fds,
3538                                n_storage_fds,
3539                                n_socket_fds,
3540                                files_env,
3541                                unit->manager->user_lookup_fds[1],
3542                                &exit_status);
3543
3544                 if (r < 0) {
3545                         log_struct_errno(LOG_ERR, r,
3546                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3547                                          LOG_UNIT_ID(unit),
3548                                          LOG_UNIT_INVOCATION_ID(unit),
3549                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3550                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3551                                                           command->path),
3552                                          "EXECUTABLE=%s", command->path,
3553                                          NULL);
3554                 }
3555
3556                 _exit(exit_status);
3557         }
3558
3559         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3560
3561         /* We add the new process to the cgroup both in the child (so
3562          * that we can be sure that no user code is ever executed
3563          * outside of the cgroup) and in the parent (so that we can be
3564          * sure that when we kill the cgroup the process will be
3565          * killed too). */
3566         if (params->cgroup_path)
3567                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3568
3569         exec_status_start(&command->exec_status, pid);
3570
3571         *ret = pid;
3572         return 0;
3573 }
3574
3575 void exec_context_init(ExecContext *c) {
3576         ExecDirectoryType i;
3577
3578         assert(c);
3579
3580         c->umask = 0022;
3581         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3582         c->cpu_sched_policy = SCHED_OTHER;
3583         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3584         c->syslog_level_prefix = true;
3585         c->ignore_sigpipe = true;
3586         c->timer_slack_nsec = NSEC_INFINITY;
3587         c->personality = PERSONALITY_INVALID;
3588         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3589                 c->directories[i].mode = 0755;
3590         c->capability_bounding_set = CAP_ALL;
3591         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3592         c->log_level_max = -1;
3593 }
3594
3595 void exec_context_done(ExecContext *c) {
3596         ExecDirectoryType i;
3597         size_t l;
3598
3599         assert(c);
3600
3601         c->environment = strv_free(c->environment);
3602         c->environment_files = strv_free(c->environment_files);
3603         c->pass_environment = strv_free(c->pass_environment);
3604         c->unset_environment = strv_free(c->unset_environment);
3605
3606         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3607                 c->rlimit[l] = mfree(c->rlimit[l]);
3608
3609         for (l = 0; l < 3; l++) {
3610                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3611                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3612         }
3613
3614         c->working_directory = mfree(c->working_directory);
3615         c->root_directory = mfree(c->root_directory);
3616         c->root_image = mfree(c->root_image);
3617         c->tty_path = mfree(c->tty_path);
3618         c->syslog_identifier = mfree(c->syslog_identifier);
3619         c->user = mfree(c->user);
3620         c->group = mfree(c->group);
3621
3622         c->supplementary_groups = strv_free(c->supplementary_groups);
3623
3624         c->pam_name = mfree(c->pam_name);
3625
3626         c->read_only_paths = strv_free(c->read_only_paths);
3627         c->read_write_paths = strv_free(c->read_write_paths);
3628         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3629
3630         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3631
3632         c->cpuset = cpu_set_mfree(c->cpuset);
3633
3634         c->utmp_id = mfree(c->utmp_id);
3635         c->selinux_context = mfree(c->selinux_context);
3636         c->apparmor_profile = mfree(c->apparmor_profile);
3637         c->smack_process_label = mfree(c->smack_process_label);
3638
3639         c->syscall_filter = hashmap_free(c->syscall_filter);
3640         c->syscall_archs = set_free(c->syscall_archs);
3641         c->address_families = set_free(c->address_families);
3642
3643         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3644                 c->directories[i].paths = strv_free(c->directories[i].paths);
3645
3646         c->log_level_max = -1;
3647
3648         exec_context_free_log_extra_fields(c);
3649
3650         c->stdin_data = mfree(c->stdin_data);
3651         c->stdin_data_size = 0;
3652 }
3653
3654 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3655         char **i;
3656
3657         assert(c);
3658
3659         if (!runtime_prefix)
3660                 return 0;
3661
3662         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3663                 _cleanup_free_ char *p;
3664
3665                 p = strjoin(runtime_prefix, "/", *i);
3666                 if (!p)
3667                         return -ENOMEM;
3668
3669                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3670                  * next. */
3671                 (void) rm_rf(p, REMOVE_ROOT);
3672         }
3673
3674         return 0;
3675 }
3676
3677 void exec_command_done(ExecCommand *c) {
3678         assert(c);
3679
3680         c->path = mfree(c->path);
3681
3682         c->argv = strv_free(c->argv);
3683 }
3684
3685 void exec_command_done_array(ExecCommand *c, unsigned n) {
3686         unsigned i;
3687
3688         for (i = 0; i < n; i++)
3689                 exec_command_done(c+i);
3690 }
3691
3692 ExecCommand* exec_command_free_list(ExecCommand *c) {
3693         ExecCommand *i;
3694
3695         while ((i = c)) {
3696                 LIST_REMOVE(command, c, i);
3697                 exec_command_done(i);
3698                 free(i);
3699         }
3700
3701         return NULL;
3702 }
3703
3704 void exec_command_free_array(ExecCommand **c, unsigned n) {
3705         unsigned i;
3706
3707         for (i = 0; i < n; i++)
3708                 c[i] = exec_command_free_list(c[i]);
3709 }
3710
3711 typedef struct InvalidEnvInfo {
3712         Unit *unit;
3713         const char *path;
3714 } InvalidEnvInfo;
3715
3716 static void invalid_env(const char *p, void *userdata) {
3717         InvalidEnvInfo *info = userdata;
3718
3719         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3720 }
3721
3722 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3723         assert(c);
3724
3725         switch (fd_index) {
3726
3727         case STDIN_FILENO:
3728                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3729                         return NULL;
3730
3731                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3732
3733         case STDOUT_FILENO:
3734                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3735                         return NULL;
3736
3737                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3738
3739         case STDERR_FILENO:
3740                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3741                         return NULL;
3742
3743                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3744
3745         default:
3746                 return NULL;
3747         }
3748 }
3749
3750 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3751         unsigned i, targets;
3752         const char* stdio_fdname[3];
3753         unsigned n_fds;
3754
3755         assert(c);
3756         assert(p);
3757
3758         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3759                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3760                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3761
3762         for (i = 0; i < 3; i++)
3763                 stdio_fdname[i] = exec_context_fdname(c, i);
3764
3765         n_fds = p->n_storage_fds + p->n_socket_fds;
3766
3767         for (i = 0; i < n_fds  && targets > 0; i++)
3768                 if (named_iofds[STDIN_FILENO] < 0 &&
3769                     c->std_input == EXEC_INPUT_NAMED_FD &&
3770                     stdio_fdname[STDIN_FILENO] &&
3771                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3772
3773                         named_iofds[STDIN_FILENO] = p->fds[i];
3774                         targets--;
3775
3776                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3777                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3778                            stdio_fdname[STDOUT_FILENO] &&
3779                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3780
3781                         named_iofds[STDOUT_FILENO] = p->fds[i];
3782                         targets--;
3783
3784                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3785                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3786                            stdio_fdname[STDERR_FILENO] &&
3787                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3788
3789                         named_iofds[STDERR_FILENO] = p->fds[i];
3790                         targets--;
3791                 }
3792
3793         return targets == 0 ? 0 : -ENOENT;
3794 }
3795
3796 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3797         char **i, **r = NULL;
3798
3799         assert(c);
3800         assert(l);
3801
3802         STRV_FOREACH(i, c->environment_files) {
3803                 char *fn;
3804                 int k;
3805                 unsigned n;
3806                 bool ignore = false;
3807                 char **p;
3808                 _cleanup_globfree_ glob_t pglob = {};
3809
3810                 fn = *i;
3811
3812                 if (fn[0] == '-') {
3813                         ignore = true;
3814                         fn++;
3815                 }
3816
3817                 if (!path_is_absolute(fn)) {
3818                         if (ignore)
3819                                 continue;
3820
3821                         strv_free(r);
3822                         return -EINVAL;
3823                 }
3824
3825                 /* Filename supports globbing, take all matching files */
3826                 k = safe_glob(fn, 0, &pglob);
3827                 if (k < 0) {
3828                         if (ignore)
3829                                 continue;
3830
3831                         strv_free(r);
3832                         return k;
3833                 }
3834
3835                 /* When we don't match anything, -ENOENT should be returned */
3836                 assert(pglob.gl_pathc > 0);
3837
3838                 for (n = 0; n < pglob.gl_pathc; n++) {
3839                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3840                         if (k < 0) {
3841                                 if (ignore)
3842                                         continue;
3843
3844                                 strv_free(r);
3845                                 return k;
3846                         }
3847                         /* Log invalid environment variables with filename */
3848                         if (p) {
3849                                 InvalidEnvInfo info = {
3850                                         .unit = unit,
3851                                         .path = pglob.gl_pathv[n]
3852                                 };
3853
3854                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3855                         }
3856
3857                         if (!r)
3858                                 r = p;
3859                         else {
3860                                 char **m;
3861
3862                                 m = strv_env_merge(2, r, p);
3863                                 strv_free(r);
3864                                 strv_free(p);
3865                                 if (!m)
3866                                         return -ENOMEM;
3867
3868                                 r = m;
3869                         }
3870                 }
3871         }
3872
3873         *l = r;
3874
3875         return 0;
3876 }
3877
3878 static bool tty_may_match_dev_console(const char *tty) {
3879         _cleanup_free_ char *active = NULL;
3880         char *console;
3881
3882         if (!tty)
3883                 return true;
3884
3885         tty = skip_dev_prefix(tty);
3886
3887         /* trivial identity? */
3888         if (streq(tty, "console"))
3889                 return true;
3890
3891         console = resolve_dev_console(&active);
3892         /* if we could not resolve, assume it may */
3893         if (!console)
3894                 return true;
3895
3896         /* "tty0" means the active VC, so it may be the same sometimes */
3897         return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3898 }
3899
3900 bool exec_context_may_touch_console(ExecContext *ec) {
3901
3902         return (ec->tty_reset ||
3903                 ec->tty_vhangup ||
3904                 ec->tty_vt_disallocate ||
3905                 is_terminal_input(ec->std_input) ||
3906                 is_terminal_output(ec->std_output) ||
3907                 is_terminal_output(ec->std_error)) &&
3908                tty_may_match_dev_console(exec_context_tty_path(ec));
3909 }
3910
/* Writes each string of the NULL-terminated string vector to the stream, each one prefixed with a single
 * space. A NULL vector is treated like an empty one. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        if (!l)
                return;

        for (item = l; *item; item++)
                fprintf(f, " %s", *item);
}
3919
3920 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3921         ExecDirectoryType dt;
3922         char **e, **d;
3923         unsigned i;
3924         int r;
3925
3926         assert(c);
3927         assert(f);
3928
3929         prefix = strempty(prefix);
3930
3931         fprintf(f,
3932                 "%sUMask: %04o\n"
3933                 "%sWorkingDirectory: %s\n"
3934                 "%sRootDirectory: %s\n"
3935                 "%sNonBlocking: %s\n"
3936                 "%sPrivateTmp: %s\n"
3937                 "%sPrivateDevices: %s\n"
3938                 "%sProtectKernelTunables: %s\n"
3939                 "%sProtectKernelModules: %s\n"
3940                 "%sProtectControlGroups: %s\n"
3941                 "%sPrivateNetwork: %s\n"
3942                 "%sPrivateUsers: %s\n"
3943                 "%sProtectHome: %s\n"
3944                 "%sProtectSystem: %s\n"
3945                 "%sMountAPIVFS: %s\n"
3946                 "%sIgnoreSIGPIPE: %s\n"
3947                 "%sMemoryDenyWriteExecute: %s\n"
3948                 "%sRestrictRealtime: %s\n"
3949                 "%sKeyringMode: %s\n",
3950                 prefix, c->umask,
3951                 prefix, c->working_directory ? c->working_directory : "/",
3952                 prefix, c->root_directory ? c->root_directory : "/",
3953                 prefix, yes_no(c->non_blocking),
3954                 prefix, yes_no(c->private_tmp),
3955                 prefix, yes_no(c->private_devices),
3956                 prefix, yes_no(c->protect_kernel_tunables),
3957                 prefix, yes_no(c->protect_kernel_modules),
3958                 prefix, yes_no(c->protect_control_groups),
3959                 prefix, yes_no(c->private_network),
3960                 prefix, yes_no(c->private_users),
3961                 prefix, protect_home_to_string(c->protect_home),
3962                 prefix, protect_system_to_string(c->protect_system),
3963                 prefix, yes_no(c->mount_apivfs),
3964                 prefix, yes_no(c->ignore_sigpipe),
3965                 prefix, yes_no(c->memory_deny_write_execute),
3966                 prefix, yes_no(c->restrict_realtime),
3967                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3968
3969         if (c->root_image)
3970                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3971
3972         STRV_FOREACH(e, c->environment)
3973                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3974
3975         STRV_FOREACH(e, c->environment_files)
3976                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3977
3978         STRV_FOREACH(e, c->pass_environment)
3979                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3980
3981         STRV_FOREACH(e, c->unset_environment)
3982                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3983
3984         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3985
3986         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3987                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3988
3989                 STRV_FOREACH(d, c->directories[dt].paths)
3990                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3991         }
3992
3993         if (c->nice_set)
3994                 fprintf(f,
3995                         "%sNice: %i\n",
3996                         prefix, c->nice);
3997
3998         if (c->oom_score_adjust_set)
3999                 fprintf(f,
4000                         "%sOOMScoreAdjust: %i\n",
4001                         prefix, c->oom_score_adjust);
4002
4003         for (i = 0; i < RLIM_NLIMITS; i++)
4004                 if (c->rlimit[i]) {
4005                         fprintf(f, "%s%s: " RLIM_FMT "\n",
4006                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4007                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
4008                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4009                 }
4010
4011         if (c->ioprio_set) {
4012                 _cleanup_free_ char *class_str = NULL;
4013
4014                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4015                 if (r >= 0)
4016                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4017
4018                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4019         }
4020
4021         if (c->cpu_sched_set) {
4022                 _cleanup_free_ char *policy_str = NULL;
4023
4024                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4025                 if (r >= 0)
4026                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4027
4028                 fprintf(f,
4029                         "%sCPUSchedulingPriority: %i\n"
4030                         "%sCPUSchedulingResetOnFork: %s\n",
4031                         prefix, c->cpu_sched_priority,
4032                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4033         }
4034
4035         if (c->cpuset) {
4036                 fprintf(f, "%sCPUAffinity:", prefix);
4037                 for (i = 0; i < c->cpuset_ncpus; i++)
4038                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4039                                 fprintf(f, " %u", i);
4040                 fputs("\n", f);
4041         }
4042
4043         if (c->timer_slack_nsec != NSEC_INFINITY)
4044                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4045
4046         fprintf(f,
4047                 "%sStandardInput: %s\n"
4048                 "%sStandardOutput: %s\n"
4049                 "%sStandardError: %s\n",
4050                 prefix, exec_input_to_string(c->std_input),
4051                 prefix, exec_output_to_string(c->std_output),
4052                 prefix, exec_output_to_string(c->std_error));
4053
4054         if (c->std_input == EXEC_INPUT_NAMED_FD)
4055                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4056         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4057                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4058         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4059                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4060
4061         if (c->std_input == EXEC_INPUT_FILE)
4062                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4063         if (c->std_output == EXEC_OUTPUT_FILE)
4064                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4065         if (c->std_error == EXEC_OUTPUT_FILE)
4066                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4067
4068         if (c->tty_path)
4069                 fprintf(f,
4070                         "%sTTYPath: %s\n"
4071                         "%sTTYReset: %s\n"
4072                         "%sTTYVHangup: %s\n"
4073                         "%sTTYVTDisallocate: %s\n",
4074                         prefix, c->tty_path,
4075                         prefix, yes_no(c->tty_reset),
4076                         prefix, yes_no(c->tty_vhangup),
4077                         prefix, yes_no(c->tty_vt_disallocate));
4078
4079         if (IN_SET(c->std_output,
4080                    EXEC_OUTPUT_SYSLOG,
4081                    EXEC_OUTPUT_KMSG,
4082                    EXEC_OUTPUT_JOURNAL,
4083                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4084                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4085                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4086             IN_SET(c->std_error,
4087                    EXEC_OUTPUT_SYSLOG,
4088                    EXEC_OUTPUT_KMSG,
4089                    EXEC_OUTPUT_JOURNAL,
4090                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4091                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4092                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4093
4094                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4095
4096                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4097                 if (r >= 0)
4098                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4099
4100                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4101                 if (r >= 0)
4102                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4103         }
4104
4105         if (c->log_level_max >= 0) {
4106                 _cleanup_free_ char *t = NULL;
4107
4108                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4109
4110                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4111         }
4112
4113         if (c->n_log_extra_fields > 0) {
4114                 size_t j;
4115
4116                 for (j = 0; j < c->n_log_extra_fields; j++) {
4117                         fprintf(f, "%sLogExtraFields: ", prefix);
4118                         fwrite(c->log_extra_fields[j].iov_base,
4119                                1, c->log_extra_fields[j].iov_len,
4120                                f);
4121                         fputc('\n', f);
4122                 }
4123         }
4124
4125         if (c->secure_bits) {
4126                 _cleanup_free_ char *str = NULL;
4127
4128                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4129                 if (r >= 0)
4130                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4131         }
4132
4133         if (c->capability_bounding_set != CAP_ALL) {
4134                 _cleanup_free_ char *str = NULL;
4135
4136                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4137                 if (r >= 0)
4138                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4139         }
4140
4141         if (c->capability_ambient_set != 0) {
4142                 _cleanup_free_ char *str = NULL;
4143
4144                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4145                 if (r >= 0)
4146                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4147         }
4148
4149         if (c->user)
4150                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4151         if (c->group)
4152                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4153
4154         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4155
4156         if (!strv_isempty(c->supplementary_groups)) {
4157                 fprintf(f, "%sSupplementaryGroups:", prefix);
4158                 strv_fprintf(f, c->supplementary_groups);
4159                 fputs("\n", f);
4160         }
4161
4162         if (c->pam_name)
4163                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4164
4165         if (strv_length(c->read_write_paths) > 0) {
4166                 fprintf(f, "%sReadWritePaths:", prefix);
4167                 strv_fprintf(f, c->read_write_paths);
4168                 fputs("\n", f);
4169         }
4170
4171         if (strv_length(c->read_only_paths) > 0) {
4172                 fprintf(f, "%sReadOnlyPaths:", prefix);
4173                 strv_fprintf(f, c->read_only_paths);
4174                 fputs("\n", f);
4175         }
4176
4177         if (strv_length(c->inaccessible_paths) > 0) {
4178                 fprintf(f, "%sInaccessiblePaths:", prefix);
4179                 strv_fprintf(f, c->inaccessible_paths);
4180                 fputs("\n", f);
4181         }
4182
4183         if (c->n_bind_mounts > 0)
4184                 for (i = 0; i < c->n_bind_mounts; i++) {
4185                         fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4186                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4187                                 c->bind_mounts[i].source,
4188                                 c->bind_mounts[i].destination,
4189                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4190                 }
4191
4192         if (c->utmp_id)
4193                 fprintf(f,
4194                         "%sUtmpIdentifier: %s\n",
4195                         prefix, c->utmp_id);
4196
4197         if (c->selinux_context)
4198                 fprintf(f,
4199                         "%sSELinuxContext: %s%s\n",
4200                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4201
4202         if (c->apparmor_profile)
4203                 fprintf(f,
4204                         "%sAppArmorProfile: %s%s\n",
4205                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4206
4207         if (c->smack_process_label)
4208                 fprintf(f,
4209                         "%sSmackProcessLabel: %s%s\n",
4210                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4211
4212         if (c->personality != PERSONALITY_INVALID)
4213                 fprintf(f,
4214                         "%sPersonality: %s\n",
4215                         prefix, strna(personality_to_string(c->personality)));
4216
4217         fprintf(f,
4218                 "%sLockPersonality: %s\n",
4219                 prefix, yes_no(c->lock_personality));
4220
4221         if (c->syscall_filter) {
4222 #if HAVE_SECCOMP
4223                 Iterator j;
4224                 void *id, *val;
4225                 bool first = true;
4226 #endif
4227
4228                 fprintf(f,
4229                         "%sSystemCallFilter: ",
4230                         prefix);
4231
4232                 if (!c->syscall_whitelist)
4233                         fputc('~', f);
4234
4235 #if HAVE_SECCOMP
4236                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4237                         _cleanup_free_ char *name = NULL;
4238                         const char *errno_name = NULL;
4239                         int num = PTR_TO_INT(val);
4240
4241                         if (first)
4242                                 first = false;
4243                         else
4244                                 fputc(' ', f);
4245
4246                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4247                         fputs(strna(name), f);
4248
4249                         if (num >= 0) {
4250                                 errno_name = errno_to_name(num);
4251                                 if (errno_name)
4252                                         fprintf(f, ":%s", errno_name);
4253                                 else
4254                                         fprintf(f, ":%d", num);
4255                         }
4256                 }
4257 #endif
4258
4259                 fputc('\n', f);
4260         }
4261
4262         if (c->syscall_archs) {
4263 #if HAVE_SECCOMP
4264                 Iterator j;
4265                 void *id;
4266 #endif
4267
4268                 fprintf(f,
4269                         "%sSystemCallArchitectures:",
4270                         prefix);
4271
4272 #if HAVE_SECCOMP
4273                 SET_FOREACH(id, c->syscall_archs, j)
4274                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4275 #endif
4276                 fputc('\n', f);
4277         }
4278
4279         if (exec_context_restrict_namespaces_set(c)) {
4280                 _cleanup_free_ char *s = NULL;
4281
4282                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4283                 if (r >= 0)
4284                         fprintf(f, "%sRestrictNamespaces: %s\n",
4285                                 prefix, s);
4286         }
4287
4288         if (c->syscall_errno > 0) {
4289                 const char *errno_name;
4290
4291                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4292
4293                 errno_name = errno_to_name(c->syscall_errno);
4294                 if (errno_name)
4295                         fprintf(f, "%s\n", errno_name);
4296                 else
4297                         fprintf(f, "%d\n", c->syscall_errno);
4298         }
4299
4300         if (c->apparmor_profile)
4301                 fprintf(f,
4302                         "%sAppArmorProfile: %s%s\n",
4303                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4304 }
4305
4306 bool exec_context_maintains_privileges(ExecContext *c) {
4307         assert(c);
4308
4309         /* Returns true if the process forked off would run under
4310          * an unchanged UID or as root. */
4311
4312         if (!c->user)
4313                 return true;
4314
4315         if (streq(c->user, "root") || streq(c->user, "0"))
4316                 return true;
4317
4318         return false;
4319 }
4320
4321 int exec_context_get_effective_ioprio(ExecContext *c) {
4322         int p;
4323
4324         assert(c);
4325
4326         if (c->ioprio_set)
4327                 return c->ioprio;
4328
4329         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4330         if (p < 0)
4331                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4332
4333         return p;
4334 }
4335
4336 void exec_context_free_log_extra_fields(ExecContext *c) {
4337         size_t l;
4338
4339         assert(c);
4340
4341         for (l = 0; l < c->n_log_extra_fields; l++)
4342                 free(c->log_extra_fields[l].iov_base);
4343         c->log_extra_fields = mfree(c->log_extra_fields);
4344         c->n_log_extra_fields = 0;
4345 }
4346
4347 void exec_status_start(ExecStatus *s, pid_t pid) {
4348         assert(s);
4349
4350         zero(*s);
4351         s->pid = pid;
4352         dual_timestamp_get(&s->start_timestamp);
4353 }
4354
4355 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4356         assert(s);
4357
4358         if (s->pid && s->pid != pid)
4359                 zero(*s);
4360
4361         s->pid = pid;
4362         dual_timestamp_get(&s->exit_timestamp);
4363
4364         s->code = code;
4365         s->status = status;
4366
4367         if (context) {
4368                 if (context->utmp_id)
4369                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4370
4371                 exec_context_tty_reset(context, NULL);
4372         }
4373 }
4374
4375 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4376         char buf[FORMAT_TIMESTAMP_MAX];
4377
4378         assert(s);
4379         assert(f);
4380
4381         if (s->pid <= 0)
4382                 return;
4383
4384         prefix = strempty(prefix);
4385
4386         fprintf(f,
4387                 "%sPID: "PID_FMT"\n",
4388                 prefix, s->pid);
4389
4390         if (dual_timestamp_is_set(&s->start_timestamp))
4391                 fprintf(f,
4392                         "%sStart Timestamp: %s\n",
4393                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4394
4395         if (dual_timestamp_is_set(&s->exit_timestamp))
4396                 fprintf(f,
4397                         "%sExit Timestamp: %s\n"
4398                         "%sExit Code: %s\n"
4399                         "%sExit Status: %i\n",
4400                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4401                         prefix, sigchld_code_to_string(s->code),
4402                         prefix, s->status);
4403 }
4404
4405 char *exec_command_line(char **argv) {
4406         size_t k;
4407         char *n, *p, **a;
4408         bool first = true;
4409
4410         assert(argv);
4411
4412         k = 1;
4413         STRV_FOREACH(a, argv)
4414                 k += strlen(*a)+3;
4415
4416         n = new(char, k);
4417         if (!n)
4418                 return NULL;
4419
4420         p = n;
4421         STRV_FOREACH(a, argv) {
4422
4423                 if (!first)
4424                         *(p++) = ' ';
4425                 else
4426                         first = false;
4427
4428                 if (strpbrk(*a, WHITESPACE)) {
4429                         *(p++) = '\'';
4430                         p = stpcpy(p, *a);
4431                         *(p++) = '\'';
4432                 } else
4433                         p = stpcpy(p, *a);
4434
4435         }
4436
4437         *p = 0;
4438
4439         /* FIXME: this doesn't really handle arguments that have
4440          * spaces and ticks in them */
4441
4442         return n;
4443 }
4444
4445 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4446         _cleanup_free_ char *cmd = NULL;
4447         const char *prefix2;
4448
4449         assert(c);
4450         assert(f);
4451
4452         prefix = strempty(prefix);
4453         prefix2 = strjoina(prefix, "\t");
4454
4455         cmd = exec_command_line(c->argv);
4456         fprintf(f,
4457                 "%sCommand Line: %s\n",
4458                 prefix, cmd ? cmd : strerror(ENOMEM));
4459
4460         exec_status_dump(&c->exec_status, f, prefix2);
4461 }
4462
4463 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4464         assert(f);
4465
4466         prefix = strempty(prefix);
4467
4468         LIST_FOREACH(command, c, c)
4469                 exec_command_dump(c, f, prefix);
4470 }
4471
4472 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4473         ExecCommand *end;
4474
4475         assert(l);
4476         assert(e);
4477
4478         if (*l) {
4479                 /* It's kind of important, that we keep the order here */
4480                 LIST_FIND_TAIL(command, *l, end);
4481                 LIST_INSERT_AFTER(command, *l, end, e);
4482         } else
4483               *l = e;
4484 }
4485
4486 int exec_command_set(ExecCommand *c, const char *path, ...) {
4487         va_list ap;
4488         char **l, *p;
4489
4490         assert(c);
4491         assert(path);
4492
4493         va_start(ap, path);
4494         l = strv_new_ap(path, ap);
4495         va_end(ap);
4496
4497         if (!l)
4498                 return -ENOMEM;
4499
4500         p = strdup(path);
4501         if (!p) {
4502                 strv_free(l);
4503                 return -ENOMEM;
4504         }
4505
4506         free(c->path);
4507         c->path = p;
4508
4509         strv_free(c->argv);
4510         c->argv = l;
4511
4512         return 0;
4513 }
4514
4515 int exec_command_append(ExecCommand *c, const char *path, ...) {
4516         _cleanup_strv_free_ char **l = NULL;
4517         va_list ap;
4518         int r;
4519
4520         assert(c);
4521         assert(path);
4522
4523         va_start(ap, path);
4524         l = strv_new_ap(path, ap);
4525         va_end(ap);
4526
4527         if (!l)
4528                 return -ENOMEM;
4529
4530         r = strv_extend_strv(&c->argv, l, false);
4531         if (r < 0)
4532                 return r;
4533
4534         return 0;
4535 }
4536
4537
4538 static int exec_runtime_allocate(ExecRuntime **rt) {
4539
4540         if (*rt)
4541                 return 0;
4542
4543         *rt = new0(ExecRuntime, 1);
4544         if (!*rt)
4545                 return -ENOMEM;
4546
4547         (*rt)->n_ref = 1;
4548         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4549
4550         return 0;
4551 }
4552
4553 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4554         int r;
4555
4556         assert(rt);
4557         assert(c);
4558         assert(id);
4559
4560         if (*rt)
4561                 return 1;
4562
4563         if (!c->private_network && !c->private_tmp)
4564                 return 0;
4565
4566         r = exec_runtime_allocate(rt);
4567         if (r < 0)
4568                 return r;
4569
4570         if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4571                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4572                         return -errno;
4573         }
4574
4575         if (c->private_tmp && !(*rt)->tmp_dir) {
4576                 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4577                 if (r < 0)
4578                         return r;
4579         }
4580
4581         return 1;
4582 }
4583
4584 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4585         assert(r);
4586         assert(r->n_ref > 0);
4587
4588         r->n_ref++;
4589         return r;
4590 }
4591
4592 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4593
4594         if (!r)
4595                 return NULL;
4596
4597         assert(r->n_ref > 0);
4598
4599         r->n_ref--;
4600         if (r->n_ref > 0)
4601                 return NULL;
4602
4603         free(r->tmp_dir);
4604         free(r->var_tmp_dir);
4605         safe_close_pair(r->netns_storage_socket);
4606         return mfree(r);
4607 }
4608
4609 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4610         assert(u);
4611         assert(f);
4612         assert(fds);
4613
4614         if (!rt)
4615                 return 0;
4616
4617         if (rt->tmp_dir)
4618                 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4619
4620         if (rt->var_tmp_dir)
4621                 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4622
4623         if (rt->netns_storage_socket[0] >= 0) {
4624                 int copy;
4625
4626                 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4627                 if (copy < 0)
4628                         return copy;
4629
4630                 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4631         }
4632
4633         if (rt->netns_storage_socket[1] >= 0) {
4634                 int copy;
4635
4636                 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4637                 if (copy < 0)
4638                         return copy;
4639
4640                 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4641         }
4642
4643         return 0;
4644 }
4645
4646 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4647         int r;
4648
4649         assert(rt);
4650         assert(key);
4651         assert(value);
4652
4653         if (streq(key, "tmp-dir")) {
4654                 char *copy;
4655
4656                 r = exec_runtime_allocate(rt);
4657                 if (r < 0)
4658                         return log_oom();
4659
4660                 copy = strdup(value);
4661                 if (!copy)
4662                         return log_oom();
4663
4664                 free((*rt)->tmp_dir);
4665                 (*rt)->tmp_dir = copy;
4666
4667         } else if (streq(key, "var-tmp-dir")) {
4668                 char *copy;
4669
4670                 r = exec_runtime_allocate(rt);
4671                 if (r < 0)
4672                         return log_oom();
4673
4674                 copy = strdup(value);
4675                 if (!copy)
4676                         return log_oom();
4677
4678                 free((*rt)->var_tmp_dir);
4679                 (*rt)->var_tmp_dir = copy;
4680
4681         } else if (streq(key, "netns-socket-0")) {
4682                 int fd;
4683
4684                 r = exec_runtime_allocate(rt);
4685                 if (r < 0)
4686                         return log_oom();
4687
4688                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4689                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4690                 else {
4691                         safe_close((*rt)->netns_storage_socket[0]);
4692                         (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4693                 }
4694         } else if (streq(key, "netns-socket-1")) {
4695                 int fd;
4696
4697                 r = exec_runtime_allocate(rt);
4698                 if (r < 0)
4699                         return log_oom();
4700
4701                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4702                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4703                 else {
4704                         safe_close((*rt)->netns_storage_socket[1]);
4705                         (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4706                 }
4707         } else
4708                 return 0;
4709
4710         return 1;
4711 }
4712
4713 static void *remove_tmpdir_thread(void *p) {
4714         _cleanup_free_ char *path = p;
4715
4716         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4717         return NULL;
4718 }
4719
4720 void exec_runtime_destroy(ExecRuntime *rt) {
4721         int r;
4722
4723         if (!rt)
4724                 return;
4725
4726         /* If there are multiple users of this, let's leave the stuff around */
4727         if (rt->n_ref > 1)
4728                 return;
4729
4730         if (rt->tmp_dir) {
4731                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4732
4733                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4734                 if (r < 0) {
4735                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4736                         free(rt->tmp_dir);
4737                 }
4738
4739                 rt->tmp_dir = NULL;
4740         }
4741
4742         if (rt->var_tmp_dir) {
4743                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4744
4745                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4746                 if (r < 0) {
4747                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4748                         free(rt->var_tmp_dir);
4749                 }
4750
4751                 rt->var_tmp_dir = NULL;
4752         }
4753
4754         safe_close_pair(rt->netns_storage_socket);
4755 }
4756
4757 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4758         [EXEC_INPUT_NULL] = "null",
4759         [EXEC_INPUT_TTY] = "tty",
4760         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4761         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4762         [EXEC_INPUT_SOCKET] = "socket",
4763         [EXEC_INPUT_NAMED_FD] = "fd",
4764         [EXEC_INPUT_DATA] = "data",
4765         [EXEC_INPUT_FILE] = "file",
4766 };
4767
4768 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4769
4770 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4771         [EXEC_OUTPUT_INHERIT] = "inherit",
4772         [EXEC_OUTPUT_NULL] = "null",
4773         [EXEC_OUTPUT_TTY] = "tty",
4774         [EXEC_OUTPUT_SYSLOG] = "syslog",
4775         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4776         [EXEC_OUTPUT_KMSG] = "kmsg",
4777         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4778         [EXEC_OUTPUT_JOURNAL] = "journal",
4779         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4780         [EXEC_OUTPUT_SOCKET] = "socket",
4781         [EXEC_OUTPUT_NAMED_FD] = "fd",
4782         [EXEC_OUTPUT_FILE] = "file",
4783 };
4784
4785 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4786
4787 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4788         [EXEC_UTMP_INIT] = "init",
4789         [EXEC_UTMP_LOGIN] = "login",
4790         [EXEC_UTMP_USER] = "user",
4791 };
4792
4793 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4794
4795 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4796         [EXEC_PRESERVE_NO] = "no",
4797         [EXEC_PRESERVE_YES] = "yes",
4798         [EXEC_PRESERVE_RESTART] = "restart",
4799 };
4800
4801 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4802
4803 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4804         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4805         [EXEC_DIRECTORY_STATE] = "StateDirectory",
4806         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4807         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4808         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4809 };
4810
4811 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4812
4813 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4814         [EXEC_KEYRING_INHERIT] = "inherit",
4815         [EXEC_KEYRING_PRIVATE] = "private",
4816         [EXEC_KEYRING_SHARED] = "shared",
4817 };
4818
4819 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);