src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/personality.h>
  10 #include <sys/prctl.h>
  11 #include <sys/shm.h>
  12 #include <sys/types.h>
  13 #include <sys/un.h>
  14 #include <unistd.h>
  15 #include <utmpx.h>
  16
  17 #if HAVE_PAM
  18 #include <security/pam_appl.h>
  19 #endif
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #if HAVE_SECCOMP
  26 #include <seccomp.h>
  27 #endif
  28
  29 #if HAVE_APPARMOR
  30 #include <sys/apparmor.h>
  31 #endif
  32
  33 #include "sd-messages.h"
  34
  35 #include "af-list.h"
  36 #include "alloc-util.h"
  37 #if HAVE_APPARMOR
  38 #include "apparmor-util.h"
  39 #endif
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "cap-list.h"
  43 #include "capability-util.h"
  44 #include "chown-recursive.h"
  45 #include "cgroup-setup.h"
  46 #include "cpu-set-util.h"
  47 #include "def.h"
  48 #include "env-file.h"
  49 #include "env-util.h"
  50 #include "errno-list.h"
  51 #include "execute.h"
  52 #include "exit-status.h"
  53 #include "fd-util.h"
  54 #include "format-util.h"
  55 #include "fs-util.h"
  56 #include "glob-util.h"
  57 #include "io-util.h"
  58 #include "ioprio.h"
  59 #include "label.h"
  60 #include "log.h"
  61 #include "macro.h"
  62 #include "manager.h"
  63 #include "memory-util.h"
  64 #include "missing_fs.h"
  65 #include "mkdir.h"
  66 #include "namespace.h"
  67 #include "parse-util.h"
  68 #include "path-util.h"
  69 #include "process-util.h"
  70 #include "rlimit-util.h"
  71 #include "rm-rf.h"
  72 #if HAVE_SECCOMP
  73 #include "seccomp-util.h"
  74 #endif
  75 #include "securebits-util.h"
  76 #include "selinux-util.h"
  77 #include "signal-util.h"
  78 #include "smack-util.h"
  79 #include "socket-util.h"
  80 #include "special.h"
  81 #include "stat-util.h"
  82 #include "string-table.h"
  83 #include "string-util.h"
  84 #include "strv.h"
  85 #include "syslog-util.h"
  86 #include "terminal-util.h"
  87 #include "umask-util.h"
  88 #include "unit.h"
  89 #include "user-util.h"
  90 #include "utmp-wtmp.h"
  91
/* Two idle timeouts, five and one times USEC_PER_SEC respectively (presumably 5s and 1s expressed in
 * microseconds — confirm against the definition of USEC_PER_SEC). Their users are not part of the
 * beginning of this file. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) passed to fd_inc_sndbuf() for the logger stream socket in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
  96
  97 static int shift_fds(int fds[], size_t n_fds) {
  98         int start, restart_from;
  99
 100         if (n_fds <= 0)
 101                 return 0;
 102
 103         /* Modifies the fds array! (sorts it) */
 104
 105         assert(fds);
 106
 107         start = 0;
 108         for (;;) {
 109                 int i;
 110
 111                 restart_from = -1;
 112
 113                 for (i = start; i < (int) n_fds; i++) {
 114                         int nfd;
 115
 116                         /* Already at right index? */
 117                         if (fds[i] == i+3)
 118                                 continue;
 119
 120                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 121                         if (nfd < 0)
 122                                 return -errno;
 123
 124                         safe_close(fds[i]);
 125                         fds[i] = nfd;
 126
 127                         /* Hmm, the fd we wanted isn't free? Then
 128                          * let's remember that and try again from here */
 129                         if (nfd != i+3 && restart_from < 0)
 130                                 restart_from = i;
 131                 }
 132
 133                 if (restart_from < 0)
 134                         break;
 135
 136                 start = restart_from;
 137         }
 138
 139         return 0;
 140 }
 141
 142 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 143         size_t i, n_fds;
 144         int r;
 145
 146         n_fds = n_socket_fds + n_storage_fds;
 147         if (n_fds <= 0)
 148                 return 0;
 149
 150         assert(fds);
 151
 152         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 153          * O_NONBLOCK only applies to socket activation though. */
 154
 155         for (i = 0; i < n_fds; i++) {
 156
 157                 if (i < n_socket_fds) {
 158                         r = fd_nonblock(fds[i], nonblock);
 159                         if (r < 0)
 160                                 return r;
 161                 }
 162
 163                 /* We unconditionally drop FD_CLOEXEC from the fds,
 164                  * since after all we want to pass these fds to our
 165                  * children */
 166
 167                 r = fd_cloexec(fds[i], false);
 168                 if (r < 0)
 169                         return r;
 170         }
 171
 172         return 0;
 173 }
 174
 175 static const char *exec_context_tty_path(const ExecContext *context) {
 176         assert(context);
 177
 178         if (context->stdio_as_fds)
 179                 return NULL;
 180
 181         if (context->tty_path)
 182                 return context->tty_path;
 183
 184         return "/dev/console";
 185 }
 186
 187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 188         const char *path;
 189
 190         assert(context);
 191
 192         path = exec_context_tty_path(context);
 193
 194         if (context->tty_vhangup) {
 195                 if (p && p->stdin_fd >= 0)
 196                         (void) terminal_vhangup_fd(p->stdin_fd);
 197                 else if (path)
 198                         (void) terminal_vhangup(path);
 199         }
 200
 201         if (context->tty_reset) {
 202                 if (p && p->stdin_fd >= 0)
 203                         (void) reset_terminal_fd(p->stdin_fd, true);
 204                 else if (path)
 205                         (void) reset_terminal(path);
 206         }
 207
 208         if (context->tty_vt_disallocate && path)
 209                 (void) vt_disallocate(path);
 210 }
 211
 212 static bool is_terminal_input(ExecInput i) {
 213         return IN_SET(i,
 214                       EXEC_INPUT_TTY,
 215                       EXEC_INPUT_TTY_FORCE,
 216                       EXEC_INPUT_TTY_FAIL);
 217 }
 218
 219 static bool is_terminal_output(ExecOutput o) {
 220         return IN_SET(o,
 221                       EXEC_OUTPUT_TTY,
 222                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 223                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 224                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 225 }
 226
 227 static bool is_syslog_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_SYSLOG,
 230                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 231 }
 232
 233 static bool is_kmsg_output(ExecOutput o) {
 234         return IN_SET(o,
 235                       EXEC_OUTPUT_KMSG,
 236                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 237 }
 238
 239 static bool exec_context_needs_term(const ExecContext *c) {
 240         assert(c);
 241
 242         /* Return true if the execution context suggests we should set $TERM to something useful. */
 243
 244         if (is_terminal_input(c->std_input))
 245                 return true;
 246
 247         if (is_terminal_output(c->std_output))
 248                 return true;
 249
 250         if (is_terminal_output(c->std_error))
 251                 return true;
 252
 253         return !!c->tty_path;
 254 }
 255
 256 static int open_null_as(int flags, int nfd) {
 257         int fd;
 258
 259         assert(nfd >= 0);
 260
 261         fd = open("/dev/null", flags|O_NOCTTY);
 262         if (fd < 0)
 263                 return -errno;
 264
 265         return move_fd(fd, nfd, false);
 266 }
 267
 268 static int connect_journal_socket(
 269                 int fd,
 270                 const char *log_namespace,
 271                 uid_t uid,
 272                 gid_t gid) {
 273
 274         union sockaddr_union sa;
 275         socklen_t sa_len;
 276         uid_t olduid = UID_INVALID;
 277         gid_t oldgid = GID_INVALID;
 278         const char *j;
 279         int r;
 280
 281         j = log_namespace ?
 282                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 283                 "/run/systemd/journal/stdout";
 284         r = sockaddr_un_set_path(&sa.un, j);
 285         if (r < 0)
 286                 return r;
 287         sa_len = r;
 288
 289         if (gid_is_valid(gid)) {
 290                 oldgid = getgid();
 291
 292                 if (setegid(gid) < 0)
 293                         return -errno;
 294         }
 295
 296         if (uid_is_valid(uid)) {
 297                 olduid = getuid();
 298
 299                 if (seteuid(uid) < 0) {
 300                         r = -errno;
 301                         goto restore_gid;
 302                 }
 303         }
 304
 305         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 306
 307         /* If we fail to restore the uid or gid, things will likely
 308            fail later on. This should only happen if an LSM interferes. */
 309
 310         if (uid_is_valid(uid))
 311                 (void) seteuid(olduid);
 312
 313  restore_gid:
 314         if (gid_is_valid(gid))
 315                 (void) setegid(oldgid);
 316
 317         return r;
 318 }
 319
 320 static int connect_logger_as(
 321                 const Unit *unit,
 322                 const ExecContext *context,
 323                 const ExecParameters *params,
 324                 ExecOutput output,
 325                 const char *ident,
 326                 int nfd,
 327                 uid_t uid,
 328                 gid_t gid) {
 329
 330         _cleanup_close_ int fd = -1;
 331         int r;
 332
 333         assert(context);
 334         assert(params);
 335         assert(output < _EXEC_OUTPUT_MAX);
 336         assert(ident);
 337         assert(nfd >= 0);
 338
 339         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 344         if (r < 0)
 345                 return r;
 346
 347         if (shutdown(fd, SHUT_RD) < 0)
 348                 return -errno;
 349
 350         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 351
 352         if (dprintf(fd,
 353                 "%s\n"
 354                 "%s\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n",
 360                 context->syslog_identifier ?: ident,
 361                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 362                 context->syslog_priority,
 363                 !!context->syslog_level_prefix,
 364                 is_syslog_output(output),
 365                 is_kmsg_output(output),
 366                 is_terminal_output(output)) < 0)
 367                 return -errno;
 368
 369         return move_fd(TAKE_FD(fd), nfd, false);
 370 }
 371
 372 static int open_terminal_as(const char *path, int flags, int nfd) {
 373         int fd;
 374
 375         assert(path);
 376         assert(nfd >= 0);
 377
 378         fd = open_terminal(path, flags | O_NOCTTY);
 379         if (fd < 0)
 380                 return fd;
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384
 385 static int acquire_path(const char *path, int flags, mode_t mode) {
 386         union sockaddr_union sa;
 387         socklen_t sa_len;
 388         _cleanup_close_ int fd = -1;
 389         int r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return TAKE_FD(fd);
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402
 403         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 404
 405         r = sockaddr_un_set_path(&sa.un, path);
 406         if (r < 0)
 407                 return r == -EINVAL ? -ENXIO : r;
 408         sa_len = r;
 409
 410         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 411         if (fd < 0)
 412                 return -errno;
 413
 414         if (connect(fd, &sa.sa, sa_len) < 0)
 415                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 416                                                            * indication that his wasn't an AF_UNIX socket after all */
 417
 418         if ((flags & O_ACCMODE) == O_RDONLY)
 419                 r = shutdown(fd, SHUT_WR);
 420         else if ((flags & O_ACCMODE) == O_WRONLY)
 421                 r = shutdown(fd, SHUT_RD);
 422         else
 423                 r = 0;
 424         if (r < 0)
 425                 return -errno;
 426
 427         return TAKE_FD(fd);
 428 }
 429
 430 static int fixup_input(
 431                 const ExecContext *context,
 432                 int socket_fd,
 433                 bool apply_tty_stdin) {
 434
 435         ExecInput std_input;
 436
 437         assert(context);
 438
 439         std_input = context->std_input;
 440
 441         if (is_terminal_input(std_input) && !apply_tty_stdin)
 442                 return EXEC_INPUT_NULL;
 443
 444         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 445                 return EXEC_INPUT_NULL;
 446
 447         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 448                 return EXEC_INPUT_NULL;
 449
 450         return std_input;
 451 }
 452
 453 static int fixup_output(ExecOutput std_output, int socket_fd) {
 454
 455         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 456                 return EXEC_OUTPUT_INHERIT;
 457
 458         return std_output;
 459 }
 460
 461 static int setup_input(
 462                 const ExecContext *context,
 463                 const ExecParameters *params,
 464                 int socket_fd,
 465                 const int named_iofds[static 3]) {
 466
 467         ExecInput i;
 468
 469         assert(context);
 470         assert(params);
 471         assert(named_iofds);
 472
 473         if (params->stdin_fd >= 0) {
 474                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 475                         return -errno;
 476
 477                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 478                 if (isatty(STDIN_FILENO)) {
 479                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 480                         (void) reset_terminal_fd(STDIN_FILENO, true);
 481                 }
 482
 483                 return STDIN_FILENO;
 484         }
 485
 486         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 487
 488         switch (i) {
 489
 490         case EXEC_INPUT_NULL:
 491                 return open_null_as(O_RDONLY, STDIN_FILENO);
 492
 493         case EXEC_INPUT_TTY:
 494         case EXEC_INPUT_TTY_FORCE:
 495         case EXEC_INPUT_TTY_FAIL: {
 496                 int fd;
 497
 498                 fd = acquire_terminal(exec_context_tty_path(context),
 499                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 500                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 501                                                                   ACQUIRE_TERMINAL_WAIT,
 502                                       USEC_INFINITY);
 503                 if (fd < 0)
 504                         return fd;
 505
 506                 return move_fd(fd, STDIN_FILENO, false);
 507         }
 508
 509         case EXEC_INPUT_SOCKET:
 510                 assert(socket_fd >= 0);
 511
 512                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_NAMED_FD:
 515                 assert(named_iofds[STDIN_FILENO] >= 0);
 516
 517                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 518                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 519
 520         case EXEC_INPUT_DATA: {
 521                 int fd;
 522
 523                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 524                 if (fd < 0)
 525                         return fd;
 526
 527                 return move_fd(fd, STDIN_FILENO, false);
 528         }
 529
 530         case EXEC_INPUT_FILE: {
 531                 bool rw;
 532                 int fd;
 533
 534                 assert(context->stdio_file[STDIN_FILENO]);
 535
 536                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 537                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 538
 539                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 540                 if (fd < 0)
 541                         return fd;
 542
 543                 return move_fd(fd, STDIN_FILENO, false);
 544         }
 545
 546         default:
 547                 assert_not_reached("Unknown input type");
 548         }
 549 }
 550
 551 static bool can_inherit_stderr_from_stdout(
 552                 const ExecContext *context,
 553                 ExecOutput o,
 554                 ExecOutput e) {
 555
 556         assert(context);
 557
 558         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 559          * stderr fd */
 560
 561         if (e == EXEC_OUTPUT_INHERIT)
 562                 return true;
 563         if (e != o)
 564                 return false;
 565
 566         if (e == EXEC_OUTPUT_NAMED_FD)
 567                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 568
 569         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 570                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 571
 572         return true;
 573 }
 574
/* Sets up stdout or stderr (selected by 'fileno') for the process about to be executed.
 *
 * An explicitly passed params->stdout_fd/stderr_fd takes precedence. Otherwise the effective input and
 * output modes (fixup_input()/fixup_output()) decide what is installed at 'fileno'. Stdin and, for
 * stderr, stdout are expected to be set up already, since several paths dup2() from them.
 *
 * 'ident', 'uid' and 'gid' are only used when connecting to the journal (connect_logger_as()). When such
 * a connection is established, *journal_stream_dev and *journal_stream_ino are updated with the
 * device/inode of the resulting fd.
 *
 * Returns a negative errno-style value on failure. The dup2() and "leave as is" paths return 'fileno';
 * the other paths return the result of the helper that installs the fd. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds override the configured output modes. */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* i is the effective stdin mode, o the effective stdout mode. For stderr, o is replaced by the
         * effective stderr mode further down if stderr cannot simply follow stdout. */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* stderr needs its own setup: continue below with the stderr mode. */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, reuse that fd rather than opening the TTY again. */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: log a warning and fall back to /dev/null for this stream. */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Best effort: switch the named fd to blocking mode before handing it over. */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to use the very same file, reuse the stdin fd (setup_input()
                 * opens it read-write for EXEC_OUTPUT_FILE) instead of opening the file a second time. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 734
 735 static int chown_terminal(int fd, uid_t uid) {
 736         int r;
 737
 738         assert(fd >= 0);
 739
 740         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 741         if (isatty(fd) < 1) {
 742                 if (IN_SET(errno, EINVAL, ENOTTY))
 743                         return 0; /* not a tty */
 744
 745                 return -errno;
 746         }
 747
 748         /* This might fail. What matters are the results. */
 749         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 750         if (r < 0)
 751                 return r;
 752
 753         return 1;
 754 }
 755
 756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 757         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 758         int r;
 759
 760         assert(_saved_stdin);
 761         assert(_saved_stdout);
 762
 763         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 764         if (saved_stdin < 0)
 765                 return -errno;
 766
 767         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 768         if (saved_stdout < 0)
 769                 return -errno;
 770
 771         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 772         if (fd < 0)
 773                 return fd;
 774
 775         r = chown_terminal(fd, getuid());
 776         if (r < 0)
 777                 return r;
 778
 779         r = reset_terminal_fd(fd, true);
 780         if (r < 0)
 781                 return r;
 782
 783         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 784         fd = -1;
 785         if (r < 0)
 786                 return r;
 787
 788         *_saved_stdin = saved_stdin;
 789         *_saved_stdout = saved_stdout;
 790
 791         saved_stdin = saved_stdout = -1;
 792
 793         return 0;
 794 }
 795
 796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 797         assert(err < 0);
 798
 799         if (err == -ETIMEDOUT)
 800                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 801         else {
 802                 errno = -err;
 803                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 804         }
 805 }
 806
 807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 808         _cleanup_close_ int fd = -1;
 809
 810         assert(vc);
 811
 812         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 813         if (fd < 0)
 814                 return;
 815
 816         write_confirm_error_fd(err, fd, u);
 817 }
 818
 819 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 820         int r = 0;
 821
 822         assert(saved_stdin);
 823         assert(saved_stdout);
 824
 825         release_terminal();
 826
 827         if (*saved_stdin >= 0)
 828                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 829                         r = -errno;
 830
 831         if (*saved_stdout >= 0)
 832                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 833                         r = -errno;
 834
 835         *saved_stdin = safe_close(*saved_stdin);
 836         *saved_stdout = safe_close(*saved_stdout);
 837
 838         return r;
 839 }
 840
/* Possible results of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y'/'c', and the fallback on internal errors: execute the command */
};
 846
 847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 848         int saved_stdout = -1, saved_stdin = -1, r;
 849         _cleanup_free_ char *e = NULL;
 850         char c;
 851
 852         /* For any internal errors, assume a positive response. */
 853         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 854         if (r < 0) {
 855                 write_confirm_error(r, vc, u);
 856                 return CONFIRM_EXECUTE;
 857         }
 858
 859         /* confirm_spawn might have been disabled while we were sleeping. */
 860         if (manager_is_confirm_spawn_disabled(u->manager)) {
 861                 r = 1;
 862                 goto restore_stdio;
 863         }
 864
 865         e = ellipsize(cmdline, 60, 100);
 866         if (!e) {
 867                 log_oom();
 868                 r = CONFIRM_EXECUTE;
 869                 goto restore_stdio;
 870         }
 871
 872         for (;;) {
 873                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 874                 if (r < 0) {
 875                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 876                         r = CONFIRM_EXECUTE;
 877                         goto restore_stdio;
 878                 }
 879
 880                 switch (c) {
 881                 case 'c':
 882                         printf("Resuming normal execution.\n");
 883                         manager_disable_confirm_spawn();
 884                         r = 1;
 885                         break;
 886                 case 'D':
 887                         unit_dump(u, stdout, "  ");
 888                         continue; /* ask again */
 889                 case 'f':
 890                         printf("Failing execution.\n");
 891                         r = CONFIRM_PRETEND_FAILURE;
 892                         break;
 893                 case 'h':
 894                         printf("  c - continue, proceed without asking anymore\n"
 895                                "  D - dump, show the state of the unit\n"
 896                                "  f - fail, don't execute the command and pretend it failed\n"
 897                                "  h - help\n"
 898                                "  i - info, show a short summary of the unit\n"
 899                                "  j - jobs, show jobs that are in progress\n"
 900                                "  s - skip, don't execute the command and pretend it succeeded\n"
 901                                "  y - yes, execute the command\n");
 902                         continue; /* ask again */
 903                 case 'i':
 904                         printf("  Description: %s\n"
 905                                "  Unit:        %s\n"
 906                                "  Command:     %s\n",
 907                                u->id, u->description, cmdline);
 908                         continue; /* ask again */
 909                 case 'j':
 910                         manager_dump_jobs(u->manager, stdout, "  ");
 911                         continue; /* ask again */
 912                 case 'n':
 913                         /* 'n' was removed in favor of 'f'. */
 914                         printf("Didn't understand 'n', did you mean 'f'?\n");
 915                         continue; /* ask again */
 916                 case 's':
 917                         printf("Skipping execution.\n");
 918                         r = CONFIRM_PRETEND_SUCCESS;
 919                         break;
 920                 case 'y':
 921                         r = CONFIRM_EXECUTE;
 922                         break;
 923                 default:
 924                         assert_not_reached("Unhandled choice");
 925                 }
 926                 break;
 927         }
 928
 929 restore_stdio:
 930         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 931         return r;
 932 }
 933
 934 static int get_fixed_user(const ExecContext *c, const char **user,
 935                           uid_t *uid, gid_t *gid,
 936                           const char **home, const char **shell) {
 937         int r;
 938         const char *name;
 939
 940         assert(c);
 941
 942         if (!c->user)
 943                 return 0;
 944
 945         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 946          * (i.e. are "/" or "/bin/nologin"). */
 947
 948         name = c->user;
 949         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 950         if (r < 0)
 951                 return r;
 952
 953         *user = name;
 954         return 0;
 955 }
 956
 957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 958         int r;
 959         const char *name;
 960
 961         assert(c);
 962
 963         if (!c->group)
 964                 return 0;
 965
 966         name = c->group;
 967         r = get_group_creds(&name, gid, 0);
 968         if (r < 0)
 969                 return r;
 970
 971         *group = name;
 972         return 0;
 973 }
 974
 975 static int get_supplementary_groups(const ExecContext *c, const char *user,
 976                                     const char *group, gid_t gid,
 977                                     gid_t **supplementary_gids, int *ngids) {
 978         char **i;
 979         int r, k = 0;
 980         int ngroups_max;
 981         bool keep_groups = false;
 982         gid_t *groups = NULL;
 983         _cleanup_free_ gid_t *l_gids = NULL;
 984
 985         assert(c);
 986
 987         /*
 988          * If user is given, then lookup GID and supplementary groups list.
 989          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 990          * here and as early as possible so we keep the list of supplementary
 991          * groups of the caller.
 992          */
 993         if (user && gid_is_valid(gid) && gid != 0) {
 994                 /* First step, initialize groups from /etc/groups */
 995                 if (initgroups(user, gid) < 0)
 996                         return -errno;
 997
 998                 keep_groups = true;
 999         }
1000
1001         if (strv_isempty(c->supplementary_groups))
1002                 return 0;
1003
1004         /*
1005          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006          * be positive, otherwise fail.
1007          */
1008         errno = 0;
1009         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010         if (ngroups_max <= 0)
1011                 return errno_or_else(EOPNOTSUPP);
1012
1013         l_gids = new(gid_t, ngroups_max);
1014         if (!l_gids)
1015                 return -ENOMEM;
1016
1017         if (keep_groups) {
1018                 /*
1019                  * Lookup the list of groups that the user belongs to, we
1020                  * avoid NSS lookups here too for gid=0.
1021                  */
1022                 k = ngroups_max;
1023                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024                         return -EINVAL;
1025         } else
1026                 k = 0;
1027
1028         STRV_FOREACH(i, c->supplementary_groups) {
1029                 const char *g;
1030
1031                 if (k >= ngroups_max)
1032                         return -E2BIG;
1033
1034                 g = *i;
1035                 r = get_group_creds(&g, l_gids+k, 0);
1036                 if (r < 0)
1037                         return r;
1038
1039                 k++;
1040         }
1041
1042         /*
1043          * Sets ngids to zero to drop all supplementary groups, happens
1044          * when we are under root and SupplementaryGroups= is empty.
1045          */
1046         if (k == 0) {
1047                 *ngids = 0;
1048                 return 0;
1049         }
1050
1051         /* Otherwise get the final list of supplementary groups */
1052         groups = memdup(l_gids, sizeof(gid_t) * k);
1053         if (!groups)
1054                 return -ENOMEM;
1055
1056         *supplementary_gids = groups;
1057         *ngids = k;
1058
1059         groups = NULL;
1060
1061         return 0;
1062 }
1063
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * ngids > 0), then the real, effective and saved GID (only if gid is valid). Returns 0 on success, the
 * negative result of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= is only handled if the list is not empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Now switch our own GIDs */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1082
1083 static int enforce_user(const ExecContext *context, uid_t uid) {
1084         assert(context);
1085
1086         if (!uid_is_valid(uid))
1087                 return 0;
1088
1089         /* Sets (but doesn't look up) the uid and make sure we keep the
1090          * capabilities while doing so. */
1091
1092         if (context->capability_ambient_set != 0) {
1093
1094                 /* First step: If we need to keep capabilities but
1095                  * drop privileges we need to make sure we keep our
1096                  * caps, while we drop privileges. */
1097                 if (uid != 0) {
1098                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1099
1100                         if (prctl(PR_GET_SECUREBITS) != sb)
1101                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1102                                         return -errno;
1103                 }
1104         }
1105
1106         /* Second step: actually set the uids */
1107         if (setresuid(uid, uid, uid) < 0)
1108                 return -errno;
1109
1110         /* At this point we should have all necessary capabilities but
1111            are otherwise a normal user. However, the caps might got
1112            corrupted due to the setresuid() so we need clean them up
1113            later. This is done outside of this call. */
1114
1115         return 0;
1116 }
1117
1118 #if HAVE_PAM
1119
1120 static int null_conv(
1121                 int num_msg,
1122                 const struct pam_message **msg,
1123                 struct pam_response **resp,
1124                 void *appdata_ptr) {
1125
1126         /* We don't support conversations */
1127
1128         return PAM_CONV_ERR;
1129 }
1130
1131 #endif
1132
/* Opens a PAM session for the service 'name' and the user 'user', and forks off a helper process
 * ("(sd-pam)") that ends the session again later.
 *
 * Parameters:
 *   name, user  PAM service name and user name passed to pam_start().
 *   uid, gid    Credentials the helper process drops to before it starts waiting.
 *   tty         TTY to register as PAM_TTY; if NULL, the TTY name of stdin is tried instead.
 *   env         In: environment exported into the PAM handle. Out: replaced by the environment list PAM
 *               returns, via strv_free_and_replace().
 *   fds, n_fds  File descriptors that are closed in the helper process.
 *
 * Returns the result of strv_free_and_replace() on success. On failure returns -EPERM for PAM errors (PAM
 * codes do not map to errno) or the negative errno-style error of the failing non-PAM step. When built
 * without PAM support this is a no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep the PAM modules quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we were passed into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence look at the
                                 * return value directly, and only trust 'sig' if the call succeeded. */
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set), or a non-PAM step did (r carries the error) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1356
/* Renames the calling process after the last component of 'path', wrapped in parentheses. Only the last
 * 8 characters of the component are used. If the component is empty the name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char title[11];
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps. If we have to cut, keep the end of the name: it is usually the more interesting part,
         * since the beginning might just be "systemd-". */
        n = strlen(base);
        if (n > 8) {
                base += n - 8;
                n = 8;
        }

        /* Assemble "(" + name + ")" + NUL: at most 1 + 8 + 1 + 1 = 11 bytes */
        title[0] = '(';
        memcpy(title + 1, base, n);
        title[n + 1] = ')';
        title[n + 2] = 0;

        rename_process(title);
}
1387
1388 static bool context_has_address_families(const ExecContext *c) {
1389         assert(c);
1390
1391         return c->address_families_whitelist ||
1392                 !set_isempty(c->address_families);
1393 }
1394
1395 static bool context_has_syscall_filters(const ExecContext *c) {
1396         assert(c);
1397
1398         return c->syscall_whitelist ||
1399                 !hashmap_isempty(c->syscall_filter);
1400 }
1401
1402 static bool context_has_no_new_privileges(const ExecContext *c) {
1403         assert(c);
1404
1405         if (c->no_new_privileges)
1406                 return true;
1407
1408         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1409                 return false;
1410
1411         /* We need NNP if we have any form of seccomp and are unprivileged */
1412         return context_has_address_families(c) ||
1413                 c->memory_deny_write_execute ||
1414                 c->restrict_realtime ||
1415                 c->restrict_suid_sgid ||
1416                 exec_context_restrict_namespaces_set(c) ||
1417                 c->protect_clock ||
1418                 c->protect_kernel_tunables ||
1419                 c->protect_kernel_modules ||
1420                 c->protect_kernel_logs ||
1421                 c->private_devices ||
1422                 context_has_syscall_filters(c) ||
1423                 !set_isempty(c->syscall_archs) ||
1424                 c->lock_personality ||
1425                 c->protect_hostname;
1426 }
1427
1428 #if HAVE_SECCOMP
1429
1430 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1431
1432         if (is_seccomp_available())
1433                 return false;
1434
1435         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1436         return true;
1437 }
1438
1439 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1440         uint32_t negative_action, default_action, action;
1441         int r;
1442
1443         assert(u);
1444         assert(c);
1445
1446         if (!context_has_syscall_filters(c))
1447                 return 0;
1448
1449         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1450                 return 0;
1451
1452         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1453
1454         if (c->syscall_whitelist) {
1455                 default_action = negative_action;
1456                 action = SCMP_ACT_ALLOW;
1457         } else {
1458                 default_action = SCMP_ACT_ALLOW;
1459                 action = negative_action;
1460         }
1461
1462         if (needs_ambient_hack) {
1463                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1464                 if (r < 0)
1465                         return r;
1466         }
1467
1468         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1469 }
1470
1471 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1472         assert(u);
1473         assert(c);
1474
1475         if (set_isempty(c->syscall_archs))
1476                 return 0;
1477
1478         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1479                 return 0;
1480
1481         return seccomp_restrict_archs(c->syscall_archs);
1482 }
1483
1484 static int apply_address_families(const Unit* u, const ExecContext *c) {
1485         assert(u);
1486         assert(c);
1487
1488         if (!context_has_address_families(c))
1489                 return 0;
1490
1491         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1492                 return 0;
1493
1494         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1495 }
1496
1497 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1498         assert(u);
1499         assert(c);
1500
1501         if (!c->memory_deny_write_execute)
1502                 return 0;
1503
1504         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1505                 return 0;
1506
1507         return seccomp_memory_deny_write_execute();
1508 }
1509
1510 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1511         assert(u);
1512         assert(c);
1513
1514         if (!c->restrict_realtime)
1515                 return 0;
1516
1517         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1518                 return 0;
1519
1520         return seccomp_restrict_realtime();
1521 }
1522
1523 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1524         assert(u);
1525         assert(c);
1526
1527         if (!c->restrict_suid_sgid)
1528                 return 0;
1529
1530         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1531                 return 0;
1532
1533         return seccomp_restrict_suid_sgid();
1534 }
1535
1536 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1537         assert(u);
1538         assert(c);
1539
1540         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1541          * let's protect even those systems where this is left on in the kernel. */
1542
1543         if (!c->protect_kernel_tunables)
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1547                 return 0;
1548
1549         return seccomp_protect_sysctl();
1550 }
1551
1552 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1553         assert(u);
1554         assert(c);
1555
1556         /* Turn off module syscalls on ProtectKernelModules=yes */
1557
1558         if (!c->protect_kernel_modules)
1559                 return 0;
1560
1561         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1562                 return 0;
1563
1564         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1565 }
1566
1567 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1568         assert(u);
1569         assert(c);
1570
1571         if (!c->protect_kernel_logs)
1572                 return 0;
1573
1574         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1575                 return 0;
1576
1577         return seccomp_protect_syslog();
1578 }
1579
1580 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1581         assert(u);
1582         assert(c);
1583
1584         if (!c->protect_clock)
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(u, "ProtectClock="))
1588                 return 0;
1589
1590         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1591 }
1592
1593 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1594         assert(u);
1595         assert(c);
1596
1597         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1598
1599         if (!c->private_devices)
1600                 return 0;
1601
1602         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1603                 return 0;
1604
1605         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1609         assert(u);
1610         assert(c);
1611
1612         if (!exec_context_restrict_namespaces_set(c))
1613                 return 0;
1614
1615         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1616                 return 0;
1617
1618         return seccomp_restrict_namespaces(c->restrict_namespaces);
1619 }
1620
1621 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1622         unsigned long personality;
1623         int r;
1624
1625         assert(u);
1626         assert(c);
1627
1628         if (!c->lock_personality)
1629                 return 0;
1630
1631         if (skip_seccomp_unavailable(u, "LockPersonality="))
1632                 return 0;
1633
1634         personality = c->personality;
1635
1636         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1637         if (personality == PERSONALITY_INVALID) {
1638
1639                 r = opinionated_personality(&personality);
1640                 if (r < 0)
1641                         return r;
1642         }
1643
1644         return seccomp_lock_personality(personality);
1645 }
1646
1647 #endif
1648
1649 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1650         assert(u);
1651         assert(c);
1652
1653         if (!c->protect_hostname)
1654                 return 0;
1655
1656         if (ns_type_supported(NAMESPACE_UTS)) {
1657                 if (unshare(CLONE_NEWUTS) < 0) {
1658                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1659                                 *ret_exit_status = EXIT_NAMESPACE;
1660                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1661                         }
1662
1663                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1664                 }
1665         } else
1666                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1667
1668 #if HAVE_SECCOMP
1669         int r;
1670
1671         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1672                 return 0;
1673
1674         r = seccomp_protect_hostname();
1675         if (r < 0) {
1676                 *ret_exit_status = EXIT_SECCOMP;
1677                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1678         }
1679 #endif
1680
1681         return 0;
1682 }
1683
1684 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1685         assert(idle_pipe);
1686
1687         idle_pipe[1] = safe_close(idle_pipe[1]);
1688         idle_pipe[2] = safe_close(idle_pipe[2]);
1689
1690         if (idle_pipe[0] >= 0) {
1691                 int r;
1692
1693                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1694
1695                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1696                         ssize_t n;
1697
1698                         /* Signal systemd that we are bored and want to continue. */
1699                         n = write(idle_pipe[3], "x", 1);
1700                         if (n > 0)
1701                                 /* Wait for systemd to react to the signal above. */
1702                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1703                 }
1704
1705                 idle_pipe[0] = safe_close(idle_pipe[0]);
1706
1707         }
1708
1709         idle_pipe[3] = safe_close(idle_pipe[3]);
1710 }
1711
1712 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1713
1714 static int build_environment(
1715                 const Unit *u,
1716                 const ExecContext *c,
1717                 const ExecParameters *p,
1718                 size_t n_fds,
1719                 const char *home,
1720                 const char *username,
1721                 const char *shell,
1722                 dev_t journal_stream_dev,
1723                 ino_t journal_stream_ino,
1724                 char ***ret) {
1725
1726         _cleanup_strv_free_ char **our_env = NULL;
1727         ExecDirectoryType t;
1728         size_t n_env = 0;
1729         char *x;
1730
1731         assert(u);
1732         assert(c);
1733         assert(p);
1734         assert(ret);
1735
1736         our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1737         if (!our_env)
1738                 return -ENOMEM;
1739
1740         if (n_fds > 0) {
1741                 _cleanup_free_ char *joined = NULL;
1742
1743                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1744                         return -ENOMEM;
1745                 our_env[n_env++] = x;
1746
1747                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1748                         return -ENOMEM;
1749                 our_env[n_env++] = x;
1750
1751                 joined = strv_join(p->fd_names, ":");
1752                 if (!joined)
1753                         return -ENOMEM;
1754
1755                 x = strjoin("LISTEN_FDNAMES=", joined);
1756                 if (!x)
1757                         return -ENOMEM;
1758                 our_env[n_env++] = x;
1759         }
1760
1761         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1762                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1763                         return -ENOMEM;
1764                 our_env[n_env++] = x;
1765
1766                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1767                         return -ENOMEM;
1768                 our_env[n_env++] = x;
1769         }
1770
1771         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1772          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1773          * check the database directly. */
1774         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1775                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1776                 if (!x)
1777                         return -ENOMEM;
1778                 our_env[n_env++] = x;
1779         }
1780
1781         if (home) {
1782                 x = strjoin("HOME=", home);
1783                 if (!x)
1784                         return -ENOMEM;
1785
1786                 path_simplify(x + 5, true);
1787                 our_env[n_env++] = x;
1788         }
1789
1790         if (username) {
1791                 x = strjoin("LOGNAME=", username);
1792                 if (!x)
1793                         return -ENOMEM;
1794                 our_env[n_env++] = x;
1795
1796                 x = strjoin("USER=", username);
1797                 if (!x)
1798                         return -ENOMEM;
1799                 our_env[n_env++] = x;
1800         }
1801
1802         if (shell) {
1803                 x = strjoin("SHELL=", shell);
1804                 if (!x)
1805                         return -ENOMEM;
1806
1807                 path_simplify(x + 6, true);
1808                 our_env[n_env++] = x;
1809         }
1810
1811         if (!sd_id128_is_null(u->invocation_id)) {
1812                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1813                         return -ENOMEM;
1814
1815                 our_env[n_env++] = x;
1816         }
1817
1818         if (exec_context_needs_term(c)) {
1819                 const char *tty_path, *term = NULL;
1820
1821                 tty_path = exec_context_tty_path(c);
1822
1823                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1824                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1825                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1826
1827                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1828                         term = getenv("TERM");
1829
1830                 if (!term)
1831                         term = default_term_for_tty(tty_path);
1832
1833                 x = strjoin("TERM=", term);
1834                 if (!x)
1835                         return -ENOMEM;
1836                 our_env[n_env++] = x;
1837         }
1838
1839         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1840                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1841                         return -ENOMEM;
1842
1843                 our_env[n_env++] = x;
1844         }
1845
1846         if (c->log_namespace) {
1847                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1848                 if (!x)
1849                         return -ENOMEM;
1850
1851                 our_env[n_env++] = x;
1852         }
1853
1854         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1855                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1856                 const char *n;
1857
1858                 if (!p->prefix[t])
1859                         continue;
1860
1861                 if (strv_isempty(c->directories[t].paths))
1862                         continue;
1863
1864                 n = exec_directory_env_name_to_string(t);
1865                 if (!n)
1866                         continue;
1867
1868                 pre = strjoin(p->prefix[t], "/");
1869                 if (!pre)
1870                         return -ENOMEM;
1871
1872                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1873                 if (!joined)
1874                         return -ENOMEM;
1875
1876                 x = strjoin(n, "=", joined);
1877                 if (!x)
1878                         return -ENOMEM;
1879
1880                 our_env[n_env++] = x;
1881         }
1882
1883         our_env[n_env++] = NULL;
1884         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1885
1886         *ret = TAKE_PTR(our_env);
1887
1888         return 0;
1889 }
1890
1891 static int build_pass_environment(const ExecContext *c, char ***ret) {
1892         _cleanup_strv_free_ char **pass_env = NULL;
1893         size_t n_env = 0, n_bufsize = 0;
1894         char **i;
1895
1896         STRV_FOREACH(i, c->pass_environment) {
1897                 _cleanup_free_ char *x = NULL;
1898                 char *v;
1899
1900                 v = getenv(*i);
1901                 if (!v)
1902                         continue;
1903                 x = strjoin(*i, "=", v);
1904                 if (!x)
1905                         return -ENOMEM;
1906
1907                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1908                         return -ENOMEM;
1909
1910                 pass_env[n_env++] = TAKE_PTR(x);
1911                 pass_env[n_env] = NULL;
1912         }
1913
1914         *ret = TAKE_PTR(pass_env);
1915
1916         return 0;
1917 }
1918
1919 static bool exec_needs_mount_namespace(
1920                 const ExecContext *context,
1921                 const ExecParameters *params,
1922                 const ExecRuntime *runtime) {
1923
1924         assert(context);
1925         assert(params);
1926
1927         if (context->root_image)
1928                 return true;
1929
1930         if (!strv_isempty(context->read_write_paths) ||
1931             !strv_isempty(context->read_only_paths) ||
1932             !strv_isempty(context->inaccessible_paths))
1933                 return true;
1934
1935         if (context->n_bind_mounts > 0)
1936                 return true;
1937
1938         if (context->n_temporary_filesystems > 0)
1939                 return true;
1940
1941         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1942                 return true;
1943
1944         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1945                 return true;
1946
1947         if (context->private_devices ||
1948             context->private_mounts ||
1949             context->protect_system != PROTECT_SYSTEM_NO ||
1950             context->protect_home != PROTECT_HOME_NO ||
1951             context->protect_kernel_tunables ||
1952             context->protect_kernel_modules ||
1953             context->protect_kernel_logs ||
1954             context->protect_control_groups)
1955                 return true;
1956
1957         if (context->root_directory) {
1958                 ExecDirectoryType t;
1959
1960                 if (context->mount_apivfs)
1961                         return true;
1962
1963                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1964                         if (!params->prefix[t])
1965                                 continue;
1966
1967                         if (!strv_isempty(context->directories[t].paths))
1968                                 return true;
1969                 }
1970         }
1971
1972         if (context->dynamic_user &&
1973             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1974              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1975              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1976                 return true;
1977
1978         if (context->log_namespace)
1979                 return true;
1980
1981         return false;
1982 }
1983
1984 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1985         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1986         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1987         _cleanup_close_ int unshare_ready_fd = -1;
1988         _cleanup_(sigkill_waitp) pid_t pid = 0;
1989         uint64_t c = 1;
1990         ssize_t n;
1991         int r;
1992
1993         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1994          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1995          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1996          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1997          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1998          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1999          * continues execution normally.
2000          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2001          * does not need CAP_SETUID to write the single line mapping to itself. */
2002
2003         /* Can only set up multiple mappings with CAP_SETUID. */
2004         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2005                 r = asprintf(&uid_map,
2006                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2007                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2008                              ouid, ouid, uid, uid);
2009         else
2010                 r = asprintf(&uid_map,
2011                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2012                              ouid, ouid);
2013
2014         if (r < 0)
2015                 return -ENOMEM;
2016
2017         /* Can only set up multiple mappings with CAP_SETGID. */
2018         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2019                 r = asprintf(&gid_map,
2020                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2021                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2022                              ogid, ogid, gid, gid);
2023         else
2024                 r = asprintf(&gid_map,
2025                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2026                              ogid, ogid);
2027
2028         if (r < 0)
2029                 return -ENOMEM;
2030
2031         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2032          * namespace. */
2033         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2034         if (unshare_ready_fd < 0)
2035                 return -errno;
2036
2037         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2038          * failed. */
2039         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2040                 return -errno;
2041
2042         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2043         if (r < 0)
2044                 return r;
2045         if (r == 0) {
2046                 _cleanup_close_ int fd = -1;
2047                 const char *a;
2048                 pid_t ppid;
2049
2050                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2051                  * here, after the parent opened its own user namespace. */
2052
2053                 ppid = getppid();
2054                 errno_pipe[0] = safe_close(errno_pipe[0]);
2055
2056                 /* Wait until the parent unshared the user namespace */
2057                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2058                         r = -errno;
2059                         goto child_fail;
2060                 }
2061
2062                 /* Disable the setgroups() system call in the child user namespace, for good. */
2063                 a = procfs_file_alloca(ppid, "setgroups");
2064                 fd = open(a, O_WRONLY|O_CLOEXEC);
2065                 if (fd < 0) {
2066                         if (errno != ENOENT) {
2067                                 r = -errno;
2068                                 goto child_fail;
2069                         }
2070
2071                         /* If the file is missing the kernel is too old, let's continue anyway. */
2072                 } else {
2073                         if (write(fd, "deny\n", 5) < 0) {
2074                                 r = -errno;
2075                                 goto child_fail;
2076                         }
2077
2078                         fd = safe_close(fd);
2079                 }
2080
2081                 /* First write the GID map */
2082                 a = procfs_file_alloca(ppid, "gid_map");
2083                 fd = open(a, O_WRONLY|O_CLOEXEC);
2084                 if (fd < 0) {
2085                         r = -errno;
2086                         goto child_fail;
2087                 }
2088                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2089                         r = -errno;
2090                         goto child_fail;
2091                 }
2092                 fd = safe_close(fd);
2093
2094                 /* The write the UID map */
2095                 a = procfs_file_alloca(ppid, "uid_map");
2096                 fd = open(a, O_WRONLY|O_CLOEXEC);
2097                 if (fd < 0) {
2098                         r = -errno;
2099                         goto child_fail;
2100                 }
2101                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2102                         r = -errno;
2103                         goto child_fail;
2104                 }
2105
2106                 _exit(EXIT_SUCCESS);
2107
2108         child_fail:
2109                 (void) write(errno_pipe[1], &r, sizeof(r));
2110                 _exit(EXIT_FAILURE);
2111         }
2112
2113         errno_pipe[1] = safe_close(errno_pipe[1]);
2114
2115         if (unshare(CLONE_NEWUSER) < 0)
2116                 return -errno;
2117
2118         /* Let the child know that the namespace is ready now */
2119         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2120                 return -errno;
2121
2122         /* Try to read an error code from the child */
2123         n = read(errno_pipe[0], &r, sizeof(r));
2124         if (n < 0)
2125                 return -errno;
2126         if (n == sizeof(r)) { /* an error code was sent to us */
2127                 if (r < 0)
2128                         return r;
2129                 return -EIO;
2130         }
2131         if (n != 0) /* on success we should have read 0 bytes */
2132                 return -EIO;
2133
2134         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2135         pid = 0;
2136         if (r < 0)
2137                 return r;
2138         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2139                 return -EIO;
2140
2141         return 0;
2142 }
2143
2144 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2145         if (!context->dynamic_user)
2146                 return false;
2147
2148         if (type == EXEC_DIRECTORY_CONFIGURATION)
2149                 return false;
2150
2151         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2152                 return false;
2153
2154         return true;
2155 }
2156
2157 static int setup_exec_directory(
2158                 const ExecContext *context,
2159                 const ExecParameters *params,
2160                 uid_t uid,
2161                 gid_t gid,
2162                 ExecDirectoryType type,
2163                 int *exit_status) {
2164
2165         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2166                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2167                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2168                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2169                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2170                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2171         };
2172         char **rt;
2173         int r;
2174
2175         assert(context);
2176         assert(params);
2177         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2178         assert(exit_status);
2179
2180         if (!params->prefix[type])
2181                 return 0;
2182
2183         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2184                 if (!uid_is_valid(uid))
2185                         uid = 0;
2186                 if (!gid_is_valid(gid))
2187                         gid = 0;
2188         }
2189
2190         STRV_FOREACH(rt, context->directories[type].paths) {
2191                 _cleanup_free_ char *p = NULL, *pp = NULL;
2192
2193                 p = path_join(params->prefix[type], *rt);
2194                 if (!p) {
2195                         r = -ENOMEM;
2196                         goto fail;
2197                 }
2198
2199                 r = mkdir_parents_label(p, 0755);
2200                 if (r < 0)
2201                         goto fail;
2202
2203                 if (exec_directory_is_private(context, type)) {
2204                         _cleanup_free_ char *private_root = NULL;
2205
2206                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2207                          * case we want to avoid leaving a directory around fully accessible that is owned by
2208                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2209                          * trick used by container managers to prohibit host users to get access to files of
2210                          * the same UID in containers: we place everything inside a directory that has an
2211                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2212                          * for unprivileged host code. We then use fs namespacing to make this directory
2213                          * permeable for the service itself.
2214                          *
2215                          * Specifically: for a service which wants a special directory "foo/" we first create
2216                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2217                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2218                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2219                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2220                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2221                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2222                          * for the service and making sure it only gets access to the dirs it needs but no
2223                          * others. Tricky? Yes, absolutely, but it works!
2224                          *
2225                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2226                          * to be owned by the service itself.
2227                          *
2228                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2229                          * for sharing files or sockets with other services. */
2230
2231                         private_root = path_join(params->prefix[type], "private");
2232                         if (!private_root) {
2233                                 r = -ENOMEM;
2234                                 goto fail;
2235                         }
2236
2237                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2238                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2239                         if (r < 0)
2240                                 goto fail;
2241
2242                         pp = path_join(private_root, *rt);
2243                         if (!pp) {
2244                                 r = -ENOMEM;
2245                                 goto fail;
2246                         }
2247
2248                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2249                         r = mkdir_parents_label(pp, 0755);
2250                         if (r < 0)
2251                                 goto fail;
2252
2253                         if (is_dir(p, false) > 0 &&
2254                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2255
2256                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2257                                  * it over. Most likely the service has been upgraded from one that didn't use
2258                                  * DynamicUser=1, to one that does. */
2259
2260                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2261                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2262                                          exec_directory_type_to_string(type), p, pp);
2263
2264                                 if (rename(p, pp) < 0) {
2265                                         r = -errno;
2266                                         goto fail;
2267                                 }
2268                         } else {
2269                                 /* Otherwise, create the actual directory for the service */
2270
2271                                 r = mkdir_label(pp, context->directories[type].mode);
2272                                 if (r < 0 && r != -EEXIST)
2273                                         goto fail;
2274                         }
2275
2276                         /* And link it up from the original place */
2277                         r = symlink_idempotent(pp, p, true);
2278                         if (r < 0)
2279                                 goto fail;
2280
2281                 } else {
2282                         _cleanup_free_ char *target = NULL;
2283
2284                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2285                             readlink_and_make_absolute(p, &target) >= 0) {
2286                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2287
2288                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2289                                  * by DynamicUser=1 (see above)?
2290                                  *
2291                                  * We do this for all directory types except for ConfigurationDirectory=,
2292                                  * since they all support the private/ symlink logic at least in some
2293                                  * configurations, see above. */
2294
2295                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2296                                 if (r < 0)
2297                                         goto fail;
2298
2299                                 q = path_join(params->prefix[type], "private", *rt);
2300                                 if (!q) {
2301                                         r = -ENOMEM;
2302                                         goto fail;
2303                                 }
2304
2305                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2306                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2307                                 if (r < 0)
2308                                         goto fail;
2309
2310                                 if (path_equal(q_resolved, target_resolved)) {
2311
2312                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2313                                          * but is no longer. Let's move the directory back up. */
2314
2315                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2316                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2317                                                  exec_directory_type_to_string(type), q, p);
2318
2319                                         if (unlink(p) < 0) {
2320                                                 r = -errno;
2321                                                 goto fail;
2322                                         }
2323
2324                                         if (rename(q, p) < 0) {
2325                                                 r = -errno;
2326                                                 goto fail;
2327                                         }
2328                                 }
2329                         }
2330
2331                         r = mkdir_label(p, context->directories[type].mode);
2332                         if (r < 0) {
2333                                 if (r != -EEXIST)
2334                                         goto fail;
2335
2336                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2337                                         struct stat st;
2338
2339                                         /* Don't change the owner/access mode of the configuration directory,
2340                                          * as in the common case it is not written to by a service, and shall
2341                                          * not be writable. */
2342
2343                                         if (stat(p, &st) < 0) {
2344                                                 r = -errno;
2345                                                 goto fail;
2346                                         }
2347
2348                                         /* Still complain if the access mode doesn't match */
2349                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2350                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2351                                                             "(File system: %o %sMode: %o)",
2352                                                             exec_directory_type_to_string(type), *rt,
2353                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2354
2355                                         continue;
2356                                 }
2357                         }
2358                 }
2359
2360                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2361                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2362                  * current UID/GID ownership.) */
2363                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2364                 if (r < 0)
2365                         goto fail;
2366
2367                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2368                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2369                  * assignments to exist.*/
2370                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2371                 if (r < 0)
2372                         goto fail;
2373         }
2374
2375         return 0;
2376
2377 fail:
2378         *exit_status = exit_status_table[type];
2379         return r;
2380 }
2381
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from the
 * command's binary is applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *binary_label = NULL;

                /* A binary without the attribute, or a file system without support for it, is not
                 * an error: we use the default label then. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &binary_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, binary_label ? binary_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2414
2415 static int compile_bind_mounts(
2416                 const ExecContext *context,
2417                 const ExecParameters *params,
2418                 BindMount **ret_bind_mounts,
2419                 size_t *ret_n_bind_mounts,
2420                 char ***ret_empty_directories) {
2421
2422         _cleanup_strv_free_ char **empty_directories = NULL;
2423         BindMount *bind_mounts;
2424         size_t n, h = 0, i;
2425         ExecDirectoryType t;
2426         int r;
2427
2428         assert(context);
2429         assert(params);
2430         assert(ret_bind_mounts);
2431         assert(ret_n_bind_mounts);
2432         assert(ret_empty_directories);
2433
2434         n = context->n_bind_mounts;
2435         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2436                 if (!params->prefix[t])
2437                         continue;
2438
2439                 n += strv_length(context->directories[t].paths);
2440         }
2441
2442         if (n <= 0) {
2443                 *ret_bind_mounts = NULL;
2444                 *ret_n_bind_mounts = 0;
2445                 *ret_empty_directories = NULL;
2446                 return 0;
2447         }
2448
2449         bind_mounts = new(BindMount, n);
2450         if (!bind_mounts)
2451                 return -ENOMEM;
2452
2453         for (i = 0; i < context->n_bind_mounts; i++) {
2454                 BindMount *item = context->bind_mounts + i;
2455                 char *s, *d;
2456
2457                 s = strdup(item->source);
2458                 if (!s) {
2459                         r = -ENOMEM;
2460                         goto finish;
2461                 }
2462
2463                 d = strdup(item->destination);
2464                 if (!d) {
2465                         free(s);
2466                         r = -ENOMEM;
2467                         goto finish;
2468                 }
2469
2470                 bind_mounts[h++] = (BindMount) {
2471                         .source = s,
2472                         .destination = d,
2473                         .read_only = item->read_only,
2474                         .recursive = item->recursive,
2475                         .ignore_enoent = item->ignore_enoent,
2476                 };
2477         }
2478
2479         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2480                 char **suffix;
2481
2482                 if (!params->prefix[t])
2483                         continue;
2484
2485                 if (strv_isempty(context->directories[t].paths))
2486                         continue;
2487
2488                 if (exec_directory_is_private(context, t) &&
2489                     !(context->root_directory || context->root_image)) {
2490                         char *private_root;
2491
2492                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2493                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2494                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2495
2496                         private_root = path_join(params->prefix[t], "private");
2497                         if (!private_root) {
2498                                 r = -ENOMEM;
2499                                 goto finish;
2500                         }
2501
2502                         r = strv_consume(&empty_directories, private_root);
2503                         if (r < 0)
2504                                 goto finish;
2505                 }
2506
2507                 STRV_FOREACH(suffix, context->directories[t].paths) {
2508                         char *s, *d;
2509
2510                         if (exec_directory_is_private(context, t))
2511                                 s = path_join(params->prefix[t], "private", *suffix);
2512                         else
2513                                 s = path_join(params->prefix[t], *suffix);
2514                         if (!s) {
2515                                 r = -ENOMEM;
2516                                 goto finish;
2517                         }
2518
2519                         if (exec_directory_is_private(context, t) &&
2520                             (context->root_directory || context->root_image))
2521                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2522                                  * directory is not created on the root directory. So, let's bind-mount the directory
2523                                  * on the 'non-private' place. */
2524                                 d = path_join(params->prefix[t], *suffix);
2525                         else
2526                                 d = strdup(s);
2527                         if (!d) {
2528                                 free(s);
2529                                 r = -ENOMEM;
2530                                 goto finish;
2531                         }
2532
2533                         bind_mounts[h++] = (BindMount) {
2534                                 .source = s,
2535                                 .destination = d,
2536                                 .read_only = false,
2537                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2538                                 .recursive = true,
2539                                 .ignore_enoent = false,
2540                         };
2541                 }
2542         }
2543
2544         assert(h == n);
2545
2546         *ret_bind_mounts = bind_mounts;
2547         *ret_n_bind_mounts = n;
2548         *ret_empty_directories = TAKE_PTR(empty_directories);
2549
2550         return (int) n;
2551
2552 finish:
2553         bind_mount_free_many(bind_mounts, h);
2554         return r;
2555 }
2556
2557 static bool insist_on_sandboxing(
2558                 const ExecContext *context,
2559                 const char *root_dir,
2560                 const char *root_image,
2561                 const BindMount *bind_mounts,
2562                 size_t n_bind_mounts) {
2563
2564         size_t i;
2565
2566         assert(context);
2567         assert(n_bind_mounts == 0 || bind_mounts);
2568
2569         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2570          * would alter the view on the file system beyond making things read-only or invisble, i.e. would
2571          * rearrange stuff in a way we cannot ignore gracefully. */
2572
2573         if (context->n_temporary_filesystems > 0)
2574                 return true;
2575
2576         if (root_dir || root_image)
2577                 return true;
2578
2579         if (context->dynamic_user)
2580                 return true;
2581
2582         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2583          * essential. */
2584         for (i = 0; i < n_bind_mounts; i++)
2585                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2586                         return true;
2587
2588         if (context->log_namespace)
2589                 return true;
2590
2591         return false;
2592 }
2593
2594 static int apply_mount_namespace(
2595                 const Unit *u,
2596                 const ExecCommand *command,
2597                 const ExecContext *context,
2598                 const ExecParameters *params,
2599                 const ExecRuntime *runtime,
2600                 char **error_path) {
2601
2602         _cleanup_strv_free_ char **empty_directories = NULL;
2603         char *tmp = NULL, *var = NULL;
2604         const char *root_dir = NULL, *root_image = NULL;
2605         NamespaceInfo ns_info;
2606         bool needs_sandboxing;
2607         BindMount *bind_mounts = NULL;
2608         size_t n_bind_mounts = 0;
2609         int r;
2610
2611         assert(context);
2612
2613         if (params->flags & EXEC_APPLY_CHROOT) {
2614                 root_image = context->root_image;
2615
2616                 if (!root_image)
2617                         root_dir = context->root_directory;
2618         }
2619
2620         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2621         if (r < 0)
2622                 return r;
2623
2624         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2625         if (needs_sandboxing) {
2626                 /* The runtime struct only contains the parent of the private /tmp,
2627                  * which is non-accessible to world users. Inside of it there's a /tmp
2628                  * that is sticky, and that's the one we want to use here. */
2629
2630                 if (context->private_tmp && runtime) {
2631                         if (runtime->tmp_dir)
2632                                 tmp = strjoina(runtime->tmp_dir, "/tmp");
2633                         if (runtime->var_tmp_dir)
2634                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
2635                 }
2636
2637                 ns_info = (NamespaceInfo) {
2638                         .ignore_protect_paths = false,
2639                         .private_dev = context->private_devices,
2640                         .protect_control_groups = context->protect_control_groups,
2641                         .protect_kernel_tunables = context->protect_kernel_tunables,
2642                         .protect_kernel_modules = context->protect_kernel_modules,
2643                         .protect_kernel_logs = context->protect_kernel_logs,
2644                         .protect_hostname = context->protect_hostname,
2645                         .mount_apivfs = context->mount_apivfs,
2646                         .private_mounts = context->private_mounts,
2647                 };
2648         } else if (!context->dynamic_user && root_dir)
2649                 /*
2650                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2651                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2652                  * fail if we are enable to apply the sandbox inside the mount namespace.
2653                  */
2654                 ns_info = (NamespaceInfo) {
2655                         .ignore_protect_paths = true,
2656                 };
2657         else
2658                 ns_info = (NamespaceInfo) {};
2659
2660         if (context->mount_flags == MS_SHARED)
2661                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2662
2663         r = setup_namespace(root_dir, root_image,
2664                             &ns_info, context->read_write_paths,
2665                             needs_sandboxing ? context->read_only_paths : NULL,
2666                             needs_sandboxing ? context->inaccessible_paths : NULL,
2667                             empty_directories,
2668                             bind_mounts,
2669                             n_bind_mounts,
2670                             context->temporary_filesystems,
2671                             context->n_temporary_filesystems,
2672                             tmp,
2673                             var,
2674                             context->log_namespace,
2675                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2676                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2677                             context->mount_flags,
2678                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2679                             error_path);
2680
2681         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2682          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2683          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2684          * completely different execution environment. */
2685         if (r == -ENOANO) {
2686                 if (insist_on_sandboxing(
2687                                     context,
2688                                     root_dir, root_image,
2689                                     bind_mounts,
2690                                     n_bind_mounts)) {
2691                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2692                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2693                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2694
2695                         r = -EOPNOTSUPP;
2696                 } else {
2697                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2698                         r = 0;
2699                 }
2700         }
2701
2702         bind_mount_free_many(bind_mounts, n_bind_mounts);
2703         return r;
2704 }
2705
2706 static int apply_working_directory(
2707                 const ExecContext *context,
2708                 const ExecParameters *params,
2709                 const char *home,
2710                 int *exit_status) {
2711
2712         const char *d, *wd;
2713
2714         assert(context);
2715         assert(exit_status);
2716
2717         if (context->working_directory_home) {
2718
2719                 if (!home) {
2720                         *exit_status = EXIT_CHDIR;
2721                         return -ENXIO;
2722                 }
2723
2724                 wd = home;
2725
2726         } else if (context->working_directory)
2727                 wd = context->working_directory;
2728         else
2729                 wd = "/";
2730
2731         if (params->flags & EXEC_APPLY_CHROOT)
2732                 d = wd;
2733         else
2734                 d = prefix_roota(context->root_directory, wd);
2735
2736         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2737                 *exit_status = EXIT_CHDIR;
2738                 return -errno;
2739         }
2740
2741         return 0;
2742 }
2743
2744 static int apply_root_directory(
2745                 const ExecContext *context,
2746                 const ExecParameters *params,
2747                 const bool needs_mount_ns,
2748                 int *exit_status) {
2749
2750         assert(context);
2751         assert(exit_status);
2752
2753         if (params->flags & EXEC_APPLY_CHROOT) {
2754                 if (!needs_mount_ns && context->root_directory)
2755                         if (chroot(context->root_directory) < 0) {
2756                                 *exit_status = EXIT_CHROOT;
2757                                 return -errno;
2758                         }
2759         }
2760
2761         return 0;
2762 }
2763
2764 static int setup_keyring(
2765                 const Unit *u,
2766                 const ExecContext *context,
2767                 const ExecParameters *p,
2768                 uid_t uid, gid_t gid) {
2769
2770         key_serial_t keyring;
2771         int r = 0;
2772         uid_t saved_uid;
2773         gid_t saved_gid;
2774
2775         assert(u);
2776         assert(context);
2777         assert(p);
2778
2779         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2780          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2781          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2782          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2783          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2784          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2785
2786         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2787                 return 0;
2788
2789         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2790          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2791          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2792          * & group is just as nasty as acquiring a reference to the user keyring. */
2793
2794         saved_uid = getuid();
2795         saved_gid = getgid();
2796
2797         if (gid_is_valid(gid) && gid != saved_gid) {
2798                 if (setregid(gid, -1) < 0)
2799                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2800         }
2801
2802         if (uid_is_valid(uid) && uid != saved_uid) {
2803                 if (setreuid(uid, -1) < 0) {
2804                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2805                         goto out;
2806                 }
2807         }
2808
2809         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2810         if (keyring == -1) {
2811                 if (errno == ENOSYS)
2812                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2813                 else if (IN_SET(errno, EACCES, EPERM))
2814                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2815                 else if (errno == EDQUOT)
2816                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2817                 else
2818                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2819
2820                 goto out;
2821         }
2822
2823         /* When requested link the user keyring into the session keyring. */
2824         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2825
2826                 if (keyctl(KEYCTL_LINK,
2827                            KEY_SPEC_USER_KEYRING,
2828                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2829                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2830                         goto out;
2831                 }
2832         }
2833
2834         /* Restore uid/gid back */
2835         if (uid_is_valid(uid) && uid != saved_uid) {
2836                 if (setreuid(saved_uid, -1) < 0) {
2837                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2838                         goto out;
2839                 }
2840         }
2841
2842         if (gid_is_valid(gid) && gid != saved_gid) {
2843                 if (setregid(saved_gid, -1) < 0)
2844                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2845         }
2846
2847         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2848         if (!sd_id128_is_null(u->invocation_id)) {
2849                 key_serial_t key;
2850
2851                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2852                 if (key == -1)
2853                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2854                 else {
2855                         if (keyctl(KEYCTL_SETPERM, key,
2856                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2857                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2858                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2859                 }
2860         }
2861
2862 out:
2863         /* Revert back uid & gid for the the last time, and exit */
2864         /* no extra logging, as only the first already reported error matters */
2865         if (getuid() != saved_uid)
2866                 (void) setreuid(saved_uid, -1);
2867
2868         if (getgid() != saved_gid)
2869                 (void) setregid(saved_gid, -1);
2870
2871         return r;
2872 }
2873
/* Appends each non-negative fd of the given pair to array, advancing *n for every fd stored.
 * Negative entries are skipped. The caller must make sure array has room for two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t side;

        assert(array);
        assert(n);
        assert(pair);

        for (side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
2884
2885 static int close_remaining_fds(
2886                 const ExecParameters *params,
2887                 const ExecRuntime *runtime,
2888                 const DynamicCreds *dcreds,
2889                 int user_lookup_fd,
2890                 int socket_fd,
2891                 int exec_fd,
2892                 const int *fds, size_t n_fds) {
2893
2894         size_t n_dont_close = 0;
2895         int dont_close[n_fds + 12];
2896
2897         assert(params);
2898
2899         if (params->stdin_fd >= 0)
2900                 dont_close[n_dont_close++] = params->stdin_fd;
2901         if (params->stdout_fd >= 0)
2902                 dont_close[n_dont_close++] = params->stdout_fd;
2903         if (params->stderr_fd >= 0)
2904                 dont_close[n_dont_close++] = params->stderr_fd;
2905
2906         if (socket_fd >= 0)
2907                 dont_close[n_dont_close++] = socket_fd;
2908         if (exec_fd >= 0)
2909                 dont_close[n_dont_close++] = exec_fd;
2910         if (n_fds > 0) {
2911                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2912                 n_dont_close += n_fds;
2913         }
2914
2915         if (runtime)
2916                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2917
2918         if (dcreds) {
2919                 if (dcreds->user)
2920                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2921                 if (dcreds->group)
2922                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2923         }
2924
2925         if (user_lookup_fd >= 0)
2926                 dont_close[n_dont_close++] = user_lookup_fd;
2927
2928         return close_all_fds(dont_close, n_dont_close);
2929 }
2930
2931 static int send_user_lookup(
2932                 Unit *unit,
2933                 int user_lookup_fd,
2934                 uid_t uid,
2935                 gid_t gid) {
2936
2937         assert(unit);
2938
2939         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2940          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2941          * specified. */
2942
2943         if (user_lookup_fd < 0)
2944                 return 0;
2945
2946         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2947                 return 0;
2948
2949         if (writev(user_lookup_fd,
2950                (struct iovec[]) {
2951                            IOVEC_INIT(&uid, sizeof(uid)),
2952                            IOVEC_INIT(&gid, sizeof(gid)),
2953                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2954                 return -errno;
2955
2956         return 0;
2957 }
2958
2959 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2960         int r;
2961
2962         assert(c);
2963         assert(home);
2964         assert(buf);
2965
2966         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2967
2968         if (*home)
2969                 return 0;
2970
2971         if (!c->working_directory_home)
2972                 return 0;
2973
2974         r = get_home_dir(buf);
2975         if (r < 0)
2976                 return r;
2977
2978         *home = *buf;
2979         return 1;
2980 }
2981
2982 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2983         _cleanup_strv_free_ char ** list = NULL;
2984         ExecDirectoryType t;
2985         int r;
2986
2987         assert(c);
2988         assert(p);
2989         assert(ret);
2990
2991         assert(c->dynamic_user);
2992
2993         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2994          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2995          * directories. */
2996
2997         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2998                 char **i;
2999
3000                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3001                         continue;
3002
3003                 if (!p->prefix[t])
3004                         continue;
3005
3006                 STRV_FOREACH(i, c->directories[t].paths) {
3007                         char *e;
3008
3009                         if (exec_directory_is_private(c, t))
3010                                 e = path_join(p->prefix[t], "private", *i);
3011                         else
3012                                 e = path_join(p->prefix[t], *i);
3013                         if (!e)
3014                                 return -ENOMEM;
3015
3016                         r = strv_consume(&list, e);
3017                         if (r < 0)
3018                                 return r;
3019                 }
3020         }
3021
3022         *ret = TAKE_PTR(list);
3023
3024         return 0;
3025 }
3026
3027 static char *exec_command_line(char **argv);
3028
3029 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3030         bool using_subcgroup;
3031         char *p;
3032
3033         assert(params);
3034         assert(ret);
3035
3036         if (!params->cgroup_path)
3037                 return -EINVAL;
3038
3039         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3040          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3041          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3042          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3043          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3044          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3045          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3046          * flag, which is only passed for the former statements, not for the latter. */
3047
3048         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3049         if (using_subcgroup)
3050                 p = path_join(params->cgroup_path, ".control");
3051         else
3052                 p = strdup(params->cgroup_path);
3053         if (!p)
3054                 return -ENOMEM;
3055
3056         *ret = p;
3057         return using_subcgroup;
3058 }
3059
3060 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3061         _cleanup_(cpu_set_reset) CPUSet s = {};
3062         int r;
3063
3064         assert(c);
3065         assert(ret);
3066
3067         if (!c->numa_policy.nodes.set) {
3068                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3069                 return 0;
3070         }
3071
3072         r = numa_to_cpu_set(&c->numa_policy, &s);
3073         if (r < 0)
3074                 return r;
3075
3076         cpu_set_reset(ret);
3077
3078         return cpu_set_add_all(ret, &s);
3079 }
3080
3081 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3082         assert(c);
3083
3084         return c->cpu_affinity_from_numa;
3085 }
3086
3087 static int exec_child(
3088                 Unit *unit,
3089                 const ExecCommand *command,
3090                 const ExecContext *context,
3091                 const ExecParameters *params,
3092                 ExecRuntime *runtime,
3093                 DynamicCreds *dcreds,
3094                 int socket_fd,
3095                 const int named_iofds[static 3],
3096                 int *fds,
3097                 size_t n_socket_fds,
3098                 size_t n_storage_fds,
3099                 char **files_env,
3100                 int user_lookup_fd,
3101                 int *exit_status) {
3102
3103         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3104         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3105         _cleanup_free_ gid_t *supplementary_gids = NULL;
3106         const char *username = NULL, *groupname = NULL;
3107         _cleanup_free_ char *home_buffer = NULL;
3108         const char *home = NULL, *shell = NULL;
3109         char **final_argv = NULL;
3110         dev_t journal_stream_dev = 0;
3111         ino_t journal_stream_ino = 0;
3112         bool userns_set_up = false;
3113         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3114                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3115                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3116                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3117 #if HAVE_SELINUX
3118         _cleanup_free_ char *mac_selinux_context_net = NULL;
3119         bool use_selinux = false;
3120 #endif
3121 #if ENABLE_SMACK
3122         bool use_smack = false;
3123 #endif
3124 #if HAVE_APPARMOR
3125         bool use_apparmor = false;
3126 #endif
3127         uid_t saved_uid = getuid();
3128         gid_t saved_gid = getgid();
3129         uid_t uid = UID_INVALID;
3130         gid_t gid = GID_INVALID;
3131         size_t n_fds;
3132         ExecDirectoryType dt;
3133         int secure_bits;
3134         _cleanup_free_ gid_t *gids_after_pam = NULL;
3135         int ngids_after_pam = 0;
3136
3137         assert(unit);
3138         assert(command);
3139         assert(context);
3140         assert(params);
3141         assert(exit_status);
3142
3143         rename_process_from_path(command->path);
3144
3145         /* We reset exactly these signals, since they are the
3146          * only ones we set to SIG_IGN in the main daemon. All
3147          * others we leave untouched because we set them to
3148          * SIG_DFL or a valid handler initially, both of which
3149          * will be demoted to SIG_DFL. */
3150         (void) default_signals(SIGNALS_CRASH_HANDLER,
3151                                SIGNALS_IGNORE, -1);
3152
3153         if (context->ignore_sigpipe)
3154                 (void) ignore_signals(SIGPIPE, -1);
3155
3156         r = reset_signal_mask();
3157         if (r < 0) {
3158                 *exit_status = EXIT_SIGNAL_MASK;
3159                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3160         }
3161
3162         if (params->idle_pipe)
3163                 do_idle_pipe_dance(params->idle_pipe);
3164
3165         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3166          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3167          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3168          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3169
3170         log_forget_fds();
3171         log_set_open_when_needed(true);
3172
3173         /* In case anything used libc syslog(), close this here, too */
3174         closelog();
3175
3176         n_fds = n_socket_fds + n_storage_fds;
3177         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3178         if (r < 0) {
3179                 *exit_status = EXIT_FDS;
3180                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3181         }
3182
3183         if (!context->same_pgrp)
3184                 if (setsid() < 0) {
3185                         *exit_status = EXIT_SETSID;
3186                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3187                 }
3188
3189         exec_context_tty_reset(context, params);
3190
3191         if (unit_shall_confirm_spawn(unit)) {
3192                 const char *vc = params->confirm_spawn;
3193                 _cleanup_free_ char *cmdline = NULL;
3194
3195                 cmdline = exec_command_line(command->argv);
3196                 if (!cmdline) {
3197                         *exit_status = EXIT_MEMORY;
3198                         return log_oom();
3199                 }
3200
3201                 r = ask_for_confirmation(vc, unit, cmdline);
3202                 if (r != CONFIRM_EXECUTE) {
3203                         if (r == CONFIRM_PRETEND_SUCCESS) {
3204                                 *exit_status = EXIT_SUCCESS;
3205                                 return 0;
3206                         }
3207                         *exit_status = EXIT_CONFIRM;
3208                         log_unit_error(unit, "Execution cancelled by the user");
3209                         return -ECANCELED;
3210                 }
3211         }
3212
3213         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3214          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3215          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3216          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3217          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3218         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3219             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3220                 *exit_status = EXIT_MEMORY;
3221                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3222         }
3223
3224         if (context->dynamic_user && dcreds) {
3225                 _cleanup_strv_free_ char **suggested_paths = NULL;
3226
3227                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3228                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3229                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3230                         *exit_status = EXIT_USER;
3231                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3232                 }
3233
3234                 r = compile_suggested_paths(context, params, &suggested_paths);
3235                 if (r < 0) {
3236                         *exit_status = EXIT_MEMORY;
3237                         return log_oom();
3238                 }
3239
3240                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3241                 if (r < 0) {
3242                         *exit_status = EXIT_USER;
3243                         if (r == -EILSEQ) {
3244                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3245                                 return -EOPNOTSUPP;
3246                         }
3247                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3248                 }
3249
3250                 if (!uid_is_valid(uid)) {
3251                         *exit_status = EXIT_USER;
3252                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3253                         return -ESRCH;
3254                 }
3255
3256                 if (!gid_is_valid(gid)) {
3257                         *exit_status = EXIT_USER;
3258                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3259                         return -ESRCH;
3260                 }
3261
3262                 if (dcreds->user)
3263                         username = dcreds->user->name;
3264
3265         } else {
3266                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3267                 if (r < 0) {
3268                         *exit_status = EXIT_USER;
3269                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3270                 }
3271
3272                 r = get_fixed_group(context, &groupname, &gid);
3273                 if (r < 0) {
3274                         *exit_status = EXIT_GROUP;
3275                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3276                 }
3277         }
3278
3279         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3280         r = get_supplementary_groups(context, username, groupname, gid,
3281                                      &supplementary_gids, &ngids);
3282         if (r < 0) {
3283                 *exit_status = EXIT_GROUP;
3284                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3285         }
3286
3287         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3288         if (r < 0) {
3289                 *exit_status = EXIT_USER;
3290                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3291         }
3292
3293         user_lookup_fd = safe_close(user_lookup_fd);
3294
3295         r = acquire_home(context, uid, &home, &home_buffer);
3296         if (r < 0) {
3297                 *exit_status = EXIT_CHDIR;
3298                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3299         }
3300
3301         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3302          * must make sure to drop O_NONBLOCK */
3303         if (socket_fd >= 0)
3304                 (void) fd_nonblock(socket_fd, false);
3305
3306         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3307          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3308         if (params->cgroup_path) {
3309                 _cleanup_free_ char *p = NULL;
3310
3311                 r = exec_parameters_get_cgroup_path(params, &p);
3312                 if (r < 0) {
3313                         *exit_status = EXIT_CGROUP;
3314                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3315                 }
3316
3317                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3318                 if (r < 0) {
3319                         *exit_status = EXIT_CGROUP;
3320                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3321                 }
3322         }
3323
3324         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3325                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3326                 if (r < 0) {
3327                         *exit_status = EXIT_NETWORK;
3328                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3329                 }
3330         }
3331
3332         r = setup_input(context, params, socket_fd, named_iofds);
3333         if (r < 0) {
3334                 *exit_status = EXIT_STDIN;
3335                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3336         }
3337
3338         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3339         if (r < 0) {
3340                 *exit_status = EXIT_STDOUT;
3341                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3342         }
3343
3344         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3345         if (r < 0) {
3346                 *exit_status = EXIT_STDERR;
3347                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3348         }
3349
3350         if (context->oom_score_adjust_set) {
3351                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3352                  * prohibit write access to this file, and we shouldn't trip up over that. */
3353                 r = set_oom_score_adjust(context->oom_score_adjust);
3354                 if (IN_SET(r, -EPERM, -EACCES))
3355                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3356                 else if (r < 0) {
3357                         *exit_status = EXIT_OOM_ADJUST;
3358                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3359                 }
3360         }
3361
3362         if (context->nice_set) {
3363                 r = setpriority_closest(context->nice);
3364                 if (r < 0)
3365                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3366         }
3367
3368         if (context->cpu_sched_set) {
3369                 struct sched_param param = {
3370                         .sched_priority = context->cpu_sched_priority,
3371                 };
3372
3373                 r = sched_setscheduler(0,
3374                                        context->cpu_sched_policy |
3375                                        (context->cpu_sched_reset_on_fork ?
3376                                         SCHED_RESET_ON_FORK : 0),
3377                                        &param);
3378                 if (r < 0) {
3379                         *exit_status = EXIT_SETSCHEDULER;
3380                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3381                 }
3382         }
3383
3384         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3385                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3386                 const CPUSet *cpu_set;
3387
3388                 if (context->cpu_affinity_from_numa) {
3389                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3390                         if (r < 0) {
3391                                 *exit_status = EXIT_CPUAFFINITY;
3392                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3393                         }
3394
3395                         cpu_set = &converted_cpu_set;
3396                 } else
3397                         cpu_set = &context->cpu_set;
3398
3399                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3400                         *exit_status = EXIT_CPUAFFINITY;
3401                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3402                 }
3403         }
3404
3405         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3406                 r = apply_numa_policy(&context->numa_policy);
3407                 if (r == -EOPNOTSUPP)
3408                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3409                 else if (r < 0) {
3410                         *exit_status = EXIT_NUMA_POLICY;
3411                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3412                 }
3413         }
3414
3415         if (context->ioprio_set)
3416                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3417                         *exit_status = EXIT_IOPRIO;
3418                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3419                 }
3420
3421         if (context->timer_slack_nsec != NSEC_INFINITY)
3422                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3423                         *exit_status = EXIT_TIMERSLACK;
3424                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3425                 }
3426
3427         if (context->personality != PERSONALITY_INVALID) {
3428                 r = safe_personality(context->personality);
3429                 if (r < 0) {
3430                         *exit_status = EXIT_PERSONALITY;
3431                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3432                 }
3433         }
3434
3435         if (context->utmp_id)
3436                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3437                                       context->tty_path,
3438                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3439                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3440                                       USER_PROCESS,
3441                                       username);
3442
3443         if (uid_is_valid(uid)) {
3444                 r = chown_terminal(STDIN_FILENO, uid);
3445                 if (r < 0) {
3446                         *exit_status = EXIT_STDIN;
3447                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3448                 }
3449         }
3450
3451         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3452          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3453          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3454          * touch a single hierarchy too. */
3455         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3456                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3457                 if (r < 0) {
3458                         *exit_status = EXIT_CGROUP;
3459                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3460                 }
3461         }
3462
3463         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3464                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3465                 if (r < 0)
3466                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3467         }
3468
3469         r = build_environment(
3470                         unit,
3471                         context,
3472                         params,
3473                         n_fds,
3474                         home,
3475                         username,
3476                         shell,
3477                         journal_stream_dev,
3478                         journal_stream_ino,
3479                         &our_env);
3480         if (r < 0) {
3481                 *exit_status = EXIT_MEMORY;
3482                 return log_oom();
3483         }
3484
3485         r = build_pass_environment(context, &pass_env);
3486         if (r < 0) {
3487                 *exit_status = EXIT_MEMORY;
3488                 return log_oom();
3489         }
3490
3491         accum_env = strv_env_merge(5,
3492                                    params->environment,
3493                                    our_env,
3494                                    pass_env,
3495                                    context->environment,
3496                                    files_env);
3497         if (!accum_env) {
3498                 *exit_status = EXIT_MEMORY;
3499                 return log_oom();
3500         }
3501         accum_env = strv_env_clean(accum_env);
3502
3503         (void) umask(context->umask);
3504
3505         r = setup_keyring(unit, context, params, uid, gid);
3506         if (r < 0) {
3507                 *exit_status = EXIT_KEYRING;
3508                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3509         }
3510
3511         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3512         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3513
3514         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3515         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3516
3517         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3518         if (needs_ambient_hack)
3519                 needs_setuid = false;
3520         else
3521                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3522
3523         if (needs_sandboxing) {
3524                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3525                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3526                  * impacting our own code paths. */
3527
3528 #if HAVE_SELINUX
3529                 use_selinux = mac_selinux_use();
3530 #endif
3531 #if ENABLE_SMACK
3532                 use_smack = mac_smack_use();
3533 #endif
3534 #if HAVE_APPARMOR
3535                 use_apparmor = mac_apparmor_use();
3536 #endif
3537         }
3538
3539         if (needs_sandboxing) {
3540                 int which_failed;
3541
3542                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3543                  * is set here. (See below.) */
3544
3545                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3546                 if (r < 0) {
3547                         *exit_status = EXIT_LIMITS;
3548                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3549                 }
3550         }
3551
3552         if (needs_setuid) {
3553
3554                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3555                  * wins here. (See above.) */
3556
3557                 if (context->pam_name && username) {
3558                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3559                         if (r < 0) {
3560                                 *exit_status = EXIT_PAM;
3561                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3562                         }
3563
3564                         ngids_after_pam = getgroups_alloc(&gids_after_pam);
3565                         if (ngids_after_pam < 0) {
3566                                 *exit_status = EXIT_MEMORY;
3567                                 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3568                         }
3569                 }
3570         }
3571
3572         if (needs_sandboxing) {
3573 #if HAVE_SELINUX
3574                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3575                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3576                         if (r < 0) {
3577                                 *exit_status = EXIT_SELINUX_CONTEXT;
3578                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3579                         }
3580                 }
3581 #endif
3582
3583                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3584                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3585                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3586                 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3587                         userns_set_up = true;
3588                         r = setup_private_users(saved_uid, saved_gid, uid, gid);
3589                         if (r < 0) {
3590                                 *exit_status = EXIT_USER;
3591                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3592                         }
3593                 }
3594         }
3595
3596         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3597
3598                 if (ns_type_supported(NAMESPACE_NET)) {
3599                         r = setup_netns(runtime->netns_storage_socket);
3600                         if (r == -EPERM)
3601                                 log_unit_warning_errno(unit, r,
3602                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3603                         else if (r < 0) {
3604                                 *exit_status = EXIT_NETWORK;
3605                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3606                         }
3607                 } else if (context->network_namespace_path) {
3608                         *exit_status = EXIT_NETWORK;
3609                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3610                                                     "NetworkNamespacePath= is not supported, refusing.");
3611                 } else
3612                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3613         }
3614
3615         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3616         if (needs_mount_namespace) {
3617                 _cleanup_free_ char *error_path = NULL;
3618
3619                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3620                 if (r < 0) {
3621                         *exit_status = EXIT_NAMESPACE;
3622                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3623                                                     error_path ? ": " : "", strempty(error_path));
3624                 }
3625         }
3626
3627         if (needs_sandboxing) {
3628                 r = apply_protect_hostname(unit, context, exit_status);
3629                 if (r < 0)
3630                         return r;
3631         }
3632
3633         /* Drop groups as early as possible.
3634          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3635          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3636         if (needs_setuid) {
3637                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3638                 int ngids_to_enforce = 0;
3639
3640                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3641                                                    ngids,
3642                                                    gids_after_pam,
3643                                                    ngids_after_pam,
3644                                                    &gids_to_enforce);
3645                 if (ngids_to_enforce < 0) {
3646                         *exit_status = EXIT_MEMORY;
3647                         return log_unit_error_errno(unit,
3648                                                     ngids_to_enforce,
3649                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
3650                 }
3651
3652                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3653                 if (r < 0) {
3654                         *exit_status = EXIT_GROUP;
3655                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3656                 }
3657         }
3658
3659         /* If the user namespace was not set up above, try to do it now.
3660          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3661          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3662          * case of mount namespaces being less privileged when the mount point list is copied from a
3663          * different user namespace). */
3664
3665         if (needs_sandboxing && context->private_users && !userns_set_up) {
3666                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3667                 if (r < 0) {
3668                         *exit_status = EXIT_USER;
3669                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3670                 }
3671         }
3672
3673         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3674          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3675          * however if we have it as we want to keep it open until the final execve(). */
3676
3677         if (params->exec_fd >= 0) {
3678                 exec_fd = params->exec_fd;
3679
3680                 if (exec_fd < 3 + (int) n_fds) {
3681                         int moved_fd;
3682
3683                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3684                          * process we are about to execute. */
3685
3686                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3687                         if (moved_fd < 0) {
3688                                 *exit_status = EXIT_FDS;
3689                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3690                         }
3691
3692                         safe_close(exec_fd);
3693                         exec_fd = moved_fd;
3694                 } else {
3695                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3696                         r = fd_cloexec(exec_fd, true);
3697                         if (r < 0) {
3698                                 *exit_status = EXIT_FDS;
3699                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3700                         }
3701                 }
3702
3703                 fds_with_exec_fd = newa(int, n_fds + 1);
3704                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3705                 fds_with_exec_fd[n_fds] = exec_fd;
3706                 n_fds_with_exec_fd = n_fds + 1;
3707         } else {
3708                 fds_with_exec_fd = fds;
3709                 n_fds_with_exec_fd = n_fds;
3710         }
3711
3712         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3713         if (r >= 0)
3714                 r = shift_fds(fds, n_fds);
3715         if (r >= 0)
3716                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3717         if (r < 0) {
3718                 *exit_status = EXIT_FDS;
3719                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3720         }
3721
3722         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3723          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3724          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3725          * came this far. */
3726
3727         secure_bits = context->secure_bits;
3728
3729         if (needs_sandboxing) {
3730                 uint64_t bset;
3731
3732                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3733                  * requested. (Note this is placed after the general resource limit initialization, see
3734                  * above, in order to take precedence.) */
3735                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3736                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3737                                 *exit_status = EXIT_LIMITS;
3738                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3739                         }
3740                 }
3741
3742 #if ENABLE_SMACK
3743                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3744                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3745                 if (use_smack) {
3746                         r = setup_smack(context, command);
3747                         if (r < 0) {
3748                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3749                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3750                         }
3751                 }
3752 #endif
3753
3754                 bset = context->capability_bounding_set;
3755                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3756                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3757                  * instead of us doing that */
3758                 if (needs_ambient_hack)
3759                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3760                                 (UINT64_C(1) << CAP_SETUID) |
3761                                 (UINT64_C(1) << CAP_SETGID);
3762
3763                 if (!cap_test_all(bset)) {
3764                         r = capability_bounding_set_drop(bset, false);
3765                         if (r < 0) {
3766                                 *exit_status = EXIT_CAPABILITIES;
3767                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3768                         }
3769                 }
3770
3771                 /* This is done before enforce_user, but ambient set
3772                  * does not survive over setresuid() if keep_caps is not set. */
3773                 if (!needs_ambient_hack) {
3774                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3775                         if (r < 0) {
3776                                 *exit_status = EXIT_CAPABILITIES;
3777                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3778                         }
3779                 }
3780         }
3781
3782         /* chroot to root directory first, before we lose the ability to chroot */
3783         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3784         if (r < 0)
3785                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3786
3787         if (needs_setuid) {
3788                 if (uid_is_valid(uid)) {
3789                         r = enforce_user(context, uid);
3790                         if (r < 0) {
3791                                 *exit_status = EXIT_USER;
3792                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3793                         }
3794
3795                         if (!needs_ambient_hack &&
3796                             context->capability_ambient_set != 0) {
3797
3798                                 /* Fix the ambient capabilities after user change. */
3799                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3800                                 if (r < 0) {
3801                                         *exit_status = EXIT_CAPABILITIES;
3802                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3803                                 }
3804
3805                                 /* If we were asked to change user and ambient capabilities
3806                                  * were requested, we had to add keep-caps to the securebits
3807                                  * so that we would maintain the inherited capability set
3808                                  * through the setresuid(). Make sure that the bit is added
3809                                  * also to the context secure_bits so that we don't try to
3810                                  * drop the bit away next. */
3811
3812                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3813                         }
3814                 }
3815         }
3816
3817         /* Apply working directory here, because the working directory might be on NFS and only the user running
3818          * this service might have the correct privilege to change to the working directory */
3819         r = apply_working_directory(context, params, home, exit_status);
3820         if (r < 0)
3821                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3822
3823         if (needs_sandboxing) {
3824                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3825                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3826                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3827                  * are restricted. */
3828
3829 #if HAVE_SELINUX
3830                 if (use_selinux) {
3831                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3832
3833                         if (exec_context) {
3834                                 r = setexeccon(exec_context);
3835                                 if (r < 0) {
3836                                         *exit_status = EXIT_SELINUX_CONTEXT;
3837                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3838                                 }
3839                         }
3840                 }
3841 #endif
3842
3843 #if HAVE_APPARMOR
3844                 if (use_apparmor && context->apparmor_profile) {
3845                         r = aa_change_onexec(context->apparmor_profile);
3846                         if (r < 0 && !context->apparmor_profile_ignore) {
3847                                 *exit_status = EXIT_APPARMOR_PROFILE;
3848                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3849                         }
3850                 }
3851 #endif
3852
3853                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3854                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3855                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3856                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3857                                 *exit_status = EXIT_SECUREBITS;
3858                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3859                         }
3860
3861                 if (context_has_no_new_privileges(context))
3862                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3863                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3864                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3865                         }
3866
3867 #if HAVE_SECCOMP
3868                 r = apply_address_families(unit, context);
3869                 if (r < 0) {
3870                         *exit_status = EXIT_ADDRESS_FAMILIES;
3871                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3872                 }
3873
3874                 r = apply_memory_deny_write_execute(unit, context);
3875                 if (r < 0) {
3876                         *exit_status = EXIT_SECCOMP;
3877                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3878                 }
3879
3880                 r = apply_restrict_realtime(unit, context);
3881                 if (r < 0) {
3882                         *exit_status = EXIT_SECCOMP;
3883                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3884                 }
3885
3886                 r = apply_restrict_suid_sgid(unit, context);
3887                 if (r < 0) {
3888                         *exit_status = EXIT_SECCOMP;
3889                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3890                 }
3891
3892                 r = apply_restrict_namespaces(unit, context);
3893                 if (r < 0) {
3894                         *exit_status = EXIT_SECCOMP;
3895                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3896                 }
3897
3898                 r = apply_protect_sysctl(unit, context);
3899                 if (r < 0) {
3900                         *exit_status = EXIT_SECCOMP;
3901                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3902                 }
3903
3904                 r = apply_protect_kernel_modules(unit, context);
3905                 if (r < 0) {
3906                         *exit_status = EXIT_SECCOMP;
3907                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3908                 }
3909
3910                 r = apply_protect_kernel_logs(unit, context);
3911                 if (r < 0) {
3912                         *exit_status = EXIT_SECCOMP;
3913                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3914                 }
3915
3916                 r = apply_protect_clock(unit, context);
3917                 if (r < 0) {
3918                         *exit_status = EXIT_SECCOMP;
3919                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3920                 }
3921
3922                 r = apply_private_devices(unit, context);
3923                 if (r < 0) {
3924                         *exit_status = EXIT_SECCOMP;
3925                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3926                 }
3927
3928                 r = apply_syscall_archs(unit, context);
3929                 if (r < 0) {
3930                         *exit_status = EXIT_SECCOMP;
3931                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3932                 }
3933
3934                 r = apply_lock_personality(unit, context);
3935                 if (r < 0) {
3936                         *exit_status = EXIT_SECCOMP;
3937                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3938                 }
3939
3940                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3941                  * by the filter as little as possible. */
3942                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3943                 if (r < 0) {
3944                         *exit_status = EXIT_SECCOMP;
3945                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3946                 }
3947 #endif
3948         }
3949
3950         if (!strv_isempty(context->unset_environment)) {
3951                 char **ee = NULL;
3952
3953                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3954                 if (!ee) {
3955                         *exit_status = EXIT_MEMORY;
3956                         return log_oom();
3957                 }
3958
3959                 strv_free_and_replace(accum_env, ee);
3960         }
3961
3962         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3963                 replaced_argv = replace_env_argv(command->argv, accum_env);
3964                 if (!replaced_argv) {
3965                         *exit_status = EXIT_MEMORY;
3966                         return log_oom();
3967                 }
3968                 final_argv = replaced_argv;
3969         } else
3970                 final_argv = command->argv;
3971
3972         if (DEBUG_LOGGING) {
3973                 _cleanup_free_ char *line;
3974
3975                 line = exec_command_line(final_argv);
3976                 if (line)
3977                         log_struct(LOG_DEBUG,
3978                                    "EXECUTABLE=%s", command->path,
3979                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3980                                    LOG_UNIT_ID(unit),
3981                                    LOG_UNIT_INVOCATION_ID(unit));
3982         }
3983
3984         if (exec_fd >= 0) {
3985                 uint8_t hot = 1;
3986
3987                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3988                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3989
3990                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3991                         *exit_status = EXIT_EXEC;
3992                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3993                 }
3994         }
3995
3996         execve(command->path, final_argv, accum_env);
3997         r = -errno;
3998
3999         if (exec_fd >= 0) {
4000                 uint8_t hot = 0;
4001
4002                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4003                  * that POLLHUP on it no longer means execve() succeeded. */
4004
4005                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4006                         *exit_status = EXIT_EXEC;
4007                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4008                 }
4009         }
4010
4011         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4012                 log_struct_errno(LOG_INFO, r,
4013                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4014                                  LOG_UNIT_ID(unit),
4015                                  LOG_UNIT_INVOCATION_ID(unit),
4016                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4017                                                   command->path),
4018                                  "EXECUTABLE=%s", command->path);
4019                 return 0;
4020         }
4021
4022         *exit_status = EXIT_EXEC;
4023         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4024 }
4025
4026 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4027 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4028
4029 int exec_spawn(Unit *unit,
4030                ExecCommand *command,
4031                const ExecContext *context,
4032                const ExecParameters *params,
4033                ExecRuntime *runtime,
4034                DynamicCreds *dcreds,
4035                pid_t *ret) {
4036
4037         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4038         _cleanup_free_ char *subcgroup_path = NULL;
4039         _cleanup_strv_free_ char **files_env = NULL;
4040         size_t n_storage_fds = 0, n_socket_fds = 0;
4041         _cleanup_free_ char *line = NULL;
4042         pid_t pid;
4043
4044         assert(unit);
4045         assert(command);
4046         assert(context);
4047         assert(ret);
4048         assert(params);
4049         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4050
4051         if (context->std_input == EXEC_INPUT_SOCKET ||
4052             context->std_output == EXEC_OUTPUT_SOCKET ||
4053             context->std_error == EXEC_OUTPUT_SOCKET) {
4054
4055                 if (params->n_socket_fds > 1) {
4056                         log_unit_error(unit, "Got more than one socket.");
4057                         return -EINVAL;
4058                 }
4059
4060                 if (params->n_socket_fds == 0) {
4061                         log_unit_error(unit, "Got no socket.");
4062                         return -EINVAL;
4063                 }
4064
4065                 socket_fd = params->fds[0];
4066         } else {
4067                 socket_fd = -1;
4068                 fds = params->fds;
4069                 n_socket_fds = params->n_socket_fds;
4070                 n_storage_fds = params->n_storage_fds;
4071         }
4072
4073         r = exec_context_named_iofds(context, params, named_iofds);
4074         if (r < 0)
4075                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4076
4077         r = exec_context_load_environment(unit, context, &files_env);
4078         if (r < 0)
4079                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4080
4081         line = exec_command_line(command->argv);
4082         if (!line)
4083                 return log_oom();
4084
4085         log_struct(LOG_DEBUG,
4086                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4087                    "EXECUTABLE=%s", command->path,
4088                    LOG_UNIT_ID(unit),
4089                    LOG_UNIT_INVOCATION_ID(unit));
4090
4091         if (params->cgroup_path) {
4092                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4093                 if (r < 0)
4094                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4095                 if (r > 0) { /* We are using a child cgroup */
4096                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4097                         if (r < 0)
4098                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4099                 }
4100         }
4101
4102         pid = fork();
4103         if (pid < 0)
4104                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4105
4106         if (pid == 0) {
4107                 int exit_status = EXIT_SUCCESS;
4108
4109                 r = exec_child(unit,
4110                                command,
4111                                context,
4112                                params,
4113                                runtime,
4114                                dcreds,
4115                                socket_fd,
4116                                named_iofds,
4117                                fds,
4118                                n_socket_fds,
4119                                n_storage_fds,
4120                                files_env,
4121                                unit->manager->user_lookup_fds[1],
4122                                &exit_status);
4123
4124                 if (r < 0) {
4125                         const char *status =
4126                                 exit_status_to_string(exit_status,
4127                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4128
4129                         log_struct_errno(LOG_ERR, r,
4130                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4131                                          LOG_UNIT_ID(unit),
4132                                          LOG_UNIT_INVOCATION_ID(unit),
4133                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4134                                                           status, command->path),
4135                                          "EXECUTABLE=%s", command->path);
4136                 }
4137
4138                 _exit(exit_status);
4139         }
4140
4141         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4142
4143         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4144          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4145          * process will be killed too). */
4146         if (subcgroup_path)
4147                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4148
4149         exec_status_start(&command->exec_status, pid);
4150
4151         *ret = pid;
4152         return 0;
4153 }
4154
4155 void exec_context_init(ExecContext *c) {
4156         ExecDirectoryType i;
4157
4158         assert(c);
4159
4160         c->umask = 0022;
4161         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4162         c->cpu_sched_policy = SCHED_OTHER;
4163         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4164         c->syslog_level_prefix = true;
4165         c->ignore_sigpipe = true;
4166         c->timer_slack_nsec = NSEC_INFINITY;
4167         c->personality = PERSONALITY_INVALID;
4168         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4169                 c->directories[i].mode = 0755;
4170         c->timeout_clean_usec = USEC_INFINITY;
4171         c->capability_bounding_set = CAP_ALL;
4172         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4173         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4174         c->log_level_max = -1;
4175         numa_policy_reset(&c->numa_policy);
4176 }
4177
4178 void exec_context_done(ExecContext *c) {
4179         ExecDirectoryType i;
4180         size_t l;
4181
4182         assert(c);
4183
4184         c->environment = strv_free(c->environment);
4185         c->environment_files = strv_free(c->environment_files);
4186         c->pass_environment = strv_free(c->pass_environment);
4187         c->unset_environment = strv_free(c->unset_environment);
4188
4189         rlimit_free_all(c->rlimit);
4190
4191         for (l = 0; l < 3; l++) {
4192                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4193                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4194         }
4195
4196         c->working_directory = mfree(c->working_directory);
4197         c->root_directory = mfree(c->root_directory);
4198         c->root_image = mfree(c->root_image);
4199         c->tty_path = mfree(c->tty_path);
4200         c->syslog_identifier = mfree(c->syslog_identifier);
4201         c->user = mfree(c->user);
4202         c->group = mfree(c->group);
4203
4204         c->supplementary_groups = strv_free(c->supplementary_groups);
4205
4206         c->pam_name = mfree(c->pam_name);
4207
4208         c->read_only_paths = strv_free(c->read_only_paths);
4209         c->read_write_paths = strv_free(c->read_write_paths);
4210         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4211
4212         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4213         c->bind_mounts = NULL;
4214         c->n_bind_mounts = 0;
4215         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4216         c->temporary_filesystems = NULL;
4217         c->n_temporary_filesystems = 0;
4218
4219         cpu_set_reset(&c->cpu_set);
4220         numa_policy_reset(&c->numa_policy);
4221
4222         c->utmp_id = mfree(c->utmp_id);
4223         c->selinux_context = mfree(c->selinux_context);
4224         c->apparmor_profile = mfree(c->apparmor_profile);
4225         c->smack_process_label = mfree(c->smack_process_label);
4226
4227         c->syscall_filter = hashmap_free(c->syscall_filter);
4228         c->syscall_archs = set_free(c->syscall_archs);
4229         c->address_families = set_free(c->address_families);
4230
4231         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4232                 c->directories[i].paths = strv_free(c->directories[i].paths);
4233
4234         c->log_level_max = -1;
4235
4236         exec_context_free_log_extra_fields(c);
4237
4238         c->log_ratelimit_interval_usec = 0;
4239         c->log_ratelimit_burst = 0;
4240
4241         c->stdin_data = mfree(c->stdin_data);
4242         c->stdin_data_size = 0;
4243
4244         c->network_namespace_path = mfree(c->network_namespace_path);
4245
4246         c->log_namespace = mfree(c->log_namespace);
4247 }
4248
4249 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4250         char **i;
4251
4252         assert(c);
4253
4254         if (!runtime_prefix)
4255                 return 0;
4256
4257         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4258                 _cleanup_free_ char *p;
4259
4260                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4261                         p = path_join(runtime_prefix, "private", *i);
4262                 else
4263                         p = path_join(runtime_prefix, *i);
4264                 if (!p)
4265                         return -ENOMEM;
4266
4267                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4268                  * service next. */
4269                 (void) rm_rf(p, REMOVE_ROOT);
4270         }
4271
4272         return 0;
4273 }
4274
4275 static void exec_command_done(ExecCommand *c) {
4276         assert(c);
4277
4278         c->path = mfree(c->path);
4279         c->argv = strv_free(c->argv);
4280 }
4281
4282 void exec_command_done_array(ExecCommand *c, size_t n) {
4283         size_t i;
4284
4285         for (i = 0; i < n; i++)
4286                 exec_command_done(c+i);
4287 }
4288
4289 ExecCommand* exec_command_free_list(ExecCommand *c) {
4290         ExecCommand *i;
4291
4292         while ((i = c)) {
4293                 LIST_REMOVE(command, c, i);
4294                 exec_command_done(i);
4295                 free(i);
4296         }
4297
4298         return NULL;
4299 }
4300
4301 void exec_command_free_array(ExecCommand **c, size_t n) {
4302         size_t i;
4303
4304         for (i = 0; i < n; i++)
4305                 c[i] = exec_command_free_list(c[i]);
4306 }
4307
4308 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4309         size_t i;
4310
4311         for (i = 0; i < n; i++)
4312                 exec_status_reset(&c[i].exec_status);
4313 }
4314
4315 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4316         size_t i;
4317
4318         for (i = 0; i < n; i++) {
4319                 ExecCommand *z;
4320
4321                 LIST_FOREACH(command, z, c[i])
4322                         exec_status_reset(&z->exec_status);
4323         }
4324 }
4325
4326 typedef struct InvalidEnvInfo {
4327         const Unit *unit;
4328         const char *path;
4329 } InvalidEnvInfo;
4330
4331 static void invalid_env(const char *p, void *userdata) {
4332         InvalidEnvInfo *info = userdata;
4333
4334         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4335 }
4336
4337 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4338         assert(c);
4339
4340         switch (fd_index) {
4341
4342         case STDIN_FILENO:
4343                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4344                         return NULL;
4345
4346                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4347
4348         case STDOUT_FILENO:
4349                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4350                         return NULL;
4351
4352                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4353
4354         case STDERR_FILENO:
4355                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4356                         return NULL;
4357
4358                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4359
4360         default:
4361                 return NULL;
4362         }
4363 }
4364
4365 static int exec_context_named_iofds(
4366                 const ExecContext *c,
4367                 const ExecParameters *p,
4368                 int named_iofds[static 3]) {
4369
4370         size_t i, targets;
4371         const char* stdio_fdname[3];
4372         size_t n_fds;
4373
4374         assert(c);
4375         assert(p);
4376         assert(named_iofds);
4377
4378         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4379                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4380                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4381
4382         for (i = 0; i < 3; i++)
4383                 stdio_fdname[i] = exec_context_fdname(c, i);
4384
4385         n_fds = p->n_storage_fds + p->n_socket_fds;
4386
4387         for (i = 0; i < n_fds  && targets > 0; i++)
4388                 if (named_iofds[STDIN_FILENO] < 0 &&
4389                     c->std_input == EXEC_INPUT_NAMED_FD &&
4390                     stdio_fdname[STDIN_FILENO] &&
4391                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4392
4393                         named_iofds[STDIN_FILENO] = p->fds[i];
4394                         targets--;
4395
4396                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4397                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4398                            stdio_fdname[STDOUT_FILENO] &&
4399                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4400
4401                         named_iofds[STDOUT_FILENO] = p->fds[i];
4402                         targets--;
4403
4404                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4405                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4406                            stdio_fdname[STDERR_FILENO] &&
4407                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4408
4409                         named_iofds[STDERR_FILENO] = p->fds[i];
4410                         targets--;
4411                 }
4412
4413         return targets == 0 ? 0 : -ENOENT;
4414 }
4415
4416 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4417         char **i, **r = NULL;
4418
4419         assert(c);
4420         assert(l);
4421
4422         STRV_FOREACH(i, c->environment_files) {
4423                 char *fn;
4424                 int k;
4425                 unsigned n;
4426                 bool ignore = false;
4427                 char **p;
4428                 _cleanup_globfree_ glob_t pglob = {};
4429
4430                 fn = *i;
4431
4432                 if (fn[0] == '-') {
4433                         ignore = true;
4434                         fn++;
4435                 }
4436
4437                 if (!path_is_absolute(fn)) {
4438                         if (ignore)
4439                                 continue;
4440
4441                         strv_free(r);
4442                         return -EINVAL;
4443                 }
4444
4445                 /* Filename supports globbing, take all matching files */
4446                 k = safe_glob(fn, 0, &pglob);
4447                 if (k < 0) {
4448                         if (ignore)
4449                                 continue;
4450
4451                         strv_free(r);
4452                         return k;
4453                 }
4454
4455                 /* When we don't match anything, -ENOENT should be returned */
4456                 assert(pglob.gl_pathc > 0);
4457
4458                 for (n = 0; n < pglob.gl_pathc; n++) {
4459                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4460                         if (k < 0) {
4461                                 if (ignore)
4462                                         continue;
4463
4464                                 strv_free(r);
4465                                 return k;
4466                         }
4467                         /* Log invalid environment variables with filename */
4468                         if (p) {
4469                                 InvalidEnvInfo info = {
4470                                         .unit = unit,
4471                                         .path = pglob.gl_pathv[n]
4472                                 };
4473
4474                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4475                         }
4476
4477                         if (!r)
4478                                 r = p;
4479                         else {
4480                                 char **m;
4481
4482                                 m = strv_env_merge(2, r, p);
4483                                 strv_free(r);
4484                                 strv_free(p);
4485                                 if (!m)
4486                                         return -ENOMEM;
4487
4488                                 r = m;
4489                         }
4490                 }
4491         }
4492
4493         *l = r;
4494
4495         return 0;
4496 }
4497
4498 static bool tty_may_match_dev_console(const char *tty) {
4499         _cleanup_free_ char *resolved = NULL;
4500
4501         if (!tty)
4502                 return true;
4503
4504         tty = skip_dev_prefix(tty);
4505
4506         /* trivial identity? */
4507         if (streq(tty, "console"))
4508                 return true;
4509
4510         if (resolve_dev_console(&resolved) < 0)
4511                 return true; /* if we could not resolve, assume it may */
4512
4513         /* "tty0" means the active VC, so it may be the same sometimes */
4514         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4515 }
4516
4517 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4518         assert(ec);
4519
4520         return ec->tty_reset ||
4521                 ec->tty_vhangup ||
4522                 ec->tty_vt_disallocate ||
4523                 is_terminal_input(ec->std_input) ||
4524                 is_terminal_output(ec->std_output) ||
4525                 is_terminal_output(ec->std_error);
4526 }
4527
4528 bool exec_context_may_touch_console(const ExecContext *ec) {
4529
4530         return exec_context_may_touch_tty(ec) &&
4531                tty_may_match_dev_console(exec_context_tty_path(ec));
4532 }
4533
4534 static void strv_fprintf(FILE *f, char **l) {
4535         char **g;
4536
4537         assert(f);
4538
4539         STRV_FOREACH(g, l)
4540                 fprintf(f, " %s", *g);
4541 }
4542
4543 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4544         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4545         ExecDirectoryType dt;
4546         unsigned i;
4547         int r;
4548
4549         assert(c);
4550         assert(f);
4551
4552         prefix = strempty(prefix);
4553
4554         fprintf(f,
4555                 "%sUMask: %04o\n"
4556                 "%sWorkingDirectory: %s\n"
4557                 "%sRootDirectory: %s\n"
4558                 "%sNonBlocking: %s\n"
4559                 "%sPrivateTmp: %s\n"
4560                 "%sPrivateDevices: %s\n"
4561                 "%sProtectKernelTunables: %s\n"
4562                 "%sProtectKernelModules: %s\n"
4563                 "%sProtectKernelLogs: %s\n"
4564                 "%sProtectClock: %s\n"
4565                 "%sProtectControlGroups: %s\n"
4566                 "%sPrivateNetwork: %s\n"
4567                 "%sPrivateUsers: %s\n"
4568                 "%sProtectHome: %s\n"
4569                 "%sProtectSystem: %s\n"
4570                 "%sMountAPIVFS: %s\n"
4571                 "%sIgnoreSIGPIPE: %s\n"
4572                 "%sMemoryDenyWriteExecute: %s\n"
4573                 "%sRestrictRealtime: %s\n"
4574                 "%sRestrictSUIDSGID: %s\n"
4575                 "%sKeyringMode: %s\n"
4576                 "%sProtectHostname: %s\n",
4577                 prefix, c->umask,
4578                 prefix, c->working_directory ? c->working_directory : "/",
4579                 prefix, c->root_directory ? c->root_directory : "/",
4580                 prefix, yes_no(c->non_blocking),
4581                 prefix, yes_no(c->private_tmp),
4582                 prefix, yes_no(c->private_devices),
4583                 prefix, yes_no(c->protect_kernel_tunables),
4584                 prefix, yes_no(c->protect_kernel_modules),
4585                 prefix, yes_no(c->protect_kernel_logs),
4586                 prefix, yes_no(c->protect_clock),
4587                 prefix, yes_no(c->protect_control_groups),
4588                 prefix, yes_no(c->private_network),
4589                 prefix, yes_no(c->private_users),
4590                 prefix, protect_home_to_string(c->protect_home),
4591                 prefix, protect_system_to_string(c->protect_system),
4592                 prefix, yes_no(c->mount_apivfs),
4593                 prefix, yes_no(c->ignore_sigpipe),
4594                 prefix, yes_no(c->memory_deny_write_execute),
4595                 prefix, yes_no(c->restrict_realtime),
4596                 prefix, yes_no(c->restrict_suid_sgid),
4597                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4598                 prefix, yes_no(c->protect_hostname));
4599
4600         if (c->root_image)
4601                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4602
4603         STRV_FOREACH(e, c->environment)
4604                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4605
4606         STRV_FOREACH(e, c->environment_files)
4607                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4608
4609         STRV_FOREACH(e, c->pass_environment)
4610                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4611
4612         STRV_FOREACH(e, c->unset_environment)
4613                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4614
4615         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4616
4617         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4618                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4619
4620                 STRV_FOREACH(d, c->directories[dt].paths)
4621                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4622         }
4623
4624         fprintf(f,
4625                 "%sTimeoutCleanSec: %s\n",
4626                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4627
4628         if (c->nice_set)
4629                 fprintf(f,
4630                         "%sNice: %i\n",
4631                         prefix, c->nice);
4632
4633         if (c->oom_score_adjust_set)
4634                 fprintf(f,
4635                         "%sOOMScoreAdjust: %i\n",
4636                         prefix, c->oom_score_adjust);
4637
4638         for (i = 0; i < RLIM_NLIMITS; i++)
4639                 if (c->rlimit[i]) {
4640                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4641                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4642                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4643                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4644                 }
4645
4646         if (c->ioprio_set) {
4647                 _cleanup_free_ char *class_str = NULL;
4648
4649                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4650                 if (r >= 0)
4651                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4652
4653                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4654         }
4655
4656         if (c->cpu_sched_set) {
4657                 _cleanup_free_ char *policy_str = NULL;
4658
4659                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4660                 if (r >= 0)
4661                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4662
4663                 fprintf(f,
4664                         "%sCPUSchedulingPriority: %i\n"
4665                         "%sCPUSchedulingResetOnFork: %s\n",
4666                         prefix, c->cpu_sched_priority,
4667                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4668         }
4669
4670         if (c->cpu_set.set) {
4671                 _cleanup_free_ char *affinity = NULL;
4672
4673                 affinity = cpu_set_to_range_string(&c->cpu_set);
4674                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4675         }
4676
4677         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4678                 _cleanup_free_ char *nodes = NULL;
4679
4680                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4681                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4682                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4683         }
4684
4685         if (c->timer_slack_nsec != NSEC_INFINITY)
4686                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4687
4688         fprintf(f,
4689                 "%sStandardInput: %s\n"
4690                 "%sStandardOutput: %s\n"
4691                 "%sStandardError: %s\n",
4692                 prefix, exec_input_to_string(c->std_input),
4693                 prefix, exec_output_to_string(c->std_output),
4694                 prefix, exec_output_to_string(c->std_error));
4695
4696         if (c->std_input == EXEC_INPUT_NAMED_FD)
4697                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4698         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4699                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4700         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4701                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4702
4703         if (c->std_input == EXEC_INPUT_FILE)
4704                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4705         if (c->std_output == EXEC_OUTPUT_FILE)
4706                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4707         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4708                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4709         if (c->std_error == EXEC_OUTPUT_FILE)
4710                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4711         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4712                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4713
4714         if (c->tty_path)
4715                 fprintf(f,
4716                         "%sTTYPath: %s\n"
4717                         "%sTTYReset: %s\n"
4718                         "%sTTYVHangup: %s\n"
4719                         "%sTTYVTDisallocate: %s\n",
4720                         prefix, c->tty_path,
4721                         prefix, yes_no(c->tty_reset),
4722                         prefix, yes_no(c->tty_vhangup),
4723                         prefix, yes_no(c->tty_vt_disallocate));
4724
4725         if (IN_SET(c->std_output,
4726                    EXEC_OUTPUT_SYSLOG,
4727                    EXEC_OUTPUT_KMSG,
4728                    EXEC_OUTPUT_JOURNAL,
4729                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4730                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4731                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4732             IN_SET(c->std_error,
4733                    EXEC_OUTPUT_SYSLOG,
4734                    EXEC_OUTPUT_KMSG,
4735                    EXEC_OUTPUT_JOURNAL,
4736                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4737                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4738                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4739
4740                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4741
4742                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4743                 if (r >= 0)
4744                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4745
4746                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4747                 if (r >= 0)
4748                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4749         }
4750
4751         if (c->log_level_max >= 0) {
4752                 _cleanup_free_ char *t = NULL;
4753
4754                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4755
4756                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4757         }
4758
4759         if (c->log_ratelimit_interval_usec > 0) {
4760                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4761
4762                 fprintf(f,
4763                         "%sLogRateLimitIntervalSec: %s\n",
4764                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4765         }
4766
4767         if (c->log_ratelimit_burst > 0)
4768                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4769
4770         if (c->n_log_extra_fields > 0) {
4771                 size_t j;
4772
4773                 for (j = 0; j < c->n_log_extra_fields; j++) {
4774                         fprintf(f, "%sLogExtraFields: ", prefix);
4775                         fwrite(c->log_extra_fields[j].iov_base,
4776                                1, c->log_extra_fields[j].iov_len,
4777                                f);
4778                         fputc('\n', f);
4779                 }
4780         }
4781
4782         if (c->log_namespace)
4783                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4784
4785         if (c->secure_bits) {
4786                 _cleanup_free_ char *str = NULL;
4787
4788                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4789                 if (r >= 0)
4790                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4791         }
4792
4793         if (c->capability_bounding_set != CAP_ALL) {
4794                 _cleanup_free_ char *str = NULL;
4795
4796                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4797                 if (r >= 0)
4798                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4799         }
4800
4801         if (c->capability_ambient_set != 0) {
4802                 _cleanup_free_ char *str = NULL;
4803
4804                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4805                 if (r >= 0)
4806                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4807         }
4808
4809         if (c->user)
4810                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4811         if (c->group)
4812                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4813
4814         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4815
4816         if (!strv_isempty(c->supplementary_groups)) {
4817                 fprintf(f, "%sSupplementaryGroups:", prefix);
4818                 strv_fprintf(f, c->supplementary_groups);
4819                 fputs("\n", f);
4820         }
4821
4822         if (c->pam_name)
4823                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4824
4825         if (!strv_isempty(c->read_write_paths)) {
4826                 fprintf(f, "%sReadWritePaths:", prefix);
4827                 strv_fprintf(f, c->read_write_paths);
4828                 fputs("\n", f);
4829         }
4830
4831         if (!strv_isempty(c->read_only_paths)) {
4832                 fprintf(f, "%sReadOnlyPaths:", prefix);
4833                 strv_fprintf(f, c->read_only_paths);
4834                 fputs("\n", f);
4835         }
4836
4837         if (!strv_isempty(c->inaccessible_paths)) {
4838                 fprintf(f, "%sInaccessiblePaths:", prefix);
4839                 strv_fprintf(f, c->inaccessible_paths);
4840                 fputs("\n", f);
4841         }
4842
4843         if (c->n_bind_mounts > 0)
4844                 for (i = 0; i < c->n_bind_mounts; i++)
4845                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4846                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4847                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4848                                 c->bind_mounts[i].source,
4849                                 c->bind_mounts[i].destination,
4850                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4851
4852         if (c->n_temporary_filesystems > 0)
4853                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4854                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4855
4856                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4857                                 t->path,
4858                                 isempty(t->options) ? "" : ":",
4859                                 strempty(t->options));
4860                 }
4861
4862         if (c->utmp_id)
4863                 fprintf(f,
4864                         "%sUtmpIdentifier: %s\n",
4865                         prefix, c->utmp_id);
4866
4867         if (c->selinux_context)
4868                 fprintf(f,
4869                         "%sSELinuxContext: %s%s\n",
4870                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4871
4872         if (c->apparmor_profile)
4873                 fprintf(f,
4874                         "%sAppArmorProfile: %s%s\n",
4875                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4876
4877         if (c->smack_process_label)
4878                 fprintf(f,
4879                         "%sSmackProcessLabel: %s%s\n",
4880                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4881
4882         if (c->personality != PERSONALITY_INVALID)
4883                 fprintf(f,
4884                         "%sPersonality: %s\n",
4885                         prefix, strna(personality_to_string(c->personality)));
4886
4887         fprintf(f,
4888                 "%sLockPersonality: %s\n",
4889                 prefix, yes_no(c->lock_personality));
4890
4891         if (c->syscall_filter) {
4892 #if HAVE_SECCOMP
4893                 Iterator j;
4894                 void *id, *val;
4895                 bool first = true;
4896 #endif
4897
4898                 fprintf(f,
4899                         "%sSystemCallFilter: ",
4900                         prefix);
4901
4902                 if (!c->syscall_whitelist)
4903                         fputc('~', f);
4904
4905 #if HAVE_SECCOMP
4906                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4907                         _cleanup_free_ char *name = NULL;
4908                         const char *errno_name = NULL;
4909                         int num = PTR_TO_INT(val);
4910
4911                         if (first)
4912                                 first = false;
4913                         else
4914                                 fputc(' ', f);
4915
4916                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4917                         fputs(strna(name), f);
4918
4919                         if (num >= 0) {
4920                                 errno_name = errno_to_name(num);
4921                                 if (errno_name)
4922                                         fprintf(f, ":%s", errno_name);
4923                                 else
4924                                         fprintf(f, ":%d", num);
4925                         }
4926                 }
4927 #endif
4928
4929                 fputc('\n', f);
4930         }
4931
4932         if (c->syscall_archs) {
4933 #if HAVE_SECCOMP
4934                 Iterator j;
4935                 void *id;
4936 #endif
4937
4938                 fprintf(f,
4939                         "%sSystemCallArchitectures:",
4940                         prefix);
4941
4942 #if HAVE_SECCOMP
4943                 SET_FOREACH(id, c->syscall_archs, j)
4944                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4945 #endif
4946                 fputc('\n', f);
4947         }
4948
4949         if (exec_context_restrict_namespaces_set(c)) {
4950                 _cleanup_free_ char *s = NULL;
4951
4952                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4953                 if (r >= 0)
4954                         fprintf(f, "%sRestrictNamespaces: %s\n",
4955                                 prefix, strna(s));
4956         }
4957
4958         if (c->network_namespace_path)
4959                 fprintf(f,
4960                         "%sNetworkNamespacePath: %s\n",
4961                         prefix, c->network_namespace_path);
4962
4963         if (c->syscall_errno > 0) {
4964                 const char *errno_name;
4965
4966                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4967
4968                 errno_name = errno_to_name(c->syscall_errno);
4969                 if (errno_name)
4970                         fprintf(f, "%s\n", errno_name);
4971                 else
4972                         fprintf(f, "%d\n", c->syscall_errno);
4973         }
4974 }
4975
4976 bool exec_context_maintains_privileges(const ExecContext *c) {
4977         assert(c);
4978
4979         /* Returns true if the process forked off would run under
4980          * an unchanged UID or as root. */
4981
4982         if (!c->user)
4983                 return true;
4984
4985         if (streq(c->user, "root") || streq(c->user, "0"))
4986                 return true;
4987
4988         return false;
4989 }
4990
4991 int exec_context_get_effective_ioprio(const ExecContext *c) {
4992         int p;
4993
4994         assert(c);
4995
4996         if (c->ioprio_set)
4997                 return c->ioprio;
4998
4999         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5000         if (p < 0)
5001                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5002
5003         return p;
5004 }
5005
5006 void exec_context_free_log_extra_fields(ExecContext *c) {
5007         size_t l;
5008
5009         assert(c);
5010
5011         for (l = 0; l < c->n_log_extra_fields; l++)
5012                 free(c->log_extra_fields[l].iov_base);
5013         c->log_extra_fields = mfree(c->log_extra_fields);
5014         c->n_log_extra_fields = 0;
5015 }
5016
5017 void exec_context_revert_tty(ExecContext *c) {
5018         int r;
5019
5020         assert(c);
5021
5022         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5023         exec_context_tty_reset(c, NULL);
5024
5025         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5026          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5027          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5028
5029         if (exec_context_may_touch_tty(c)) {
5030                 const char *path;
5031
5032                 path = exec_context_tty_path(c);
5033                 if (path) {
5034                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5035                         if (r < 0 && r != -ENOENT)
5036                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5037                 }
5038         }
5039 }
5040
5041 int exec_context_get_clean_directories(
5042                 ExecContext *c,
5043                 char **prefix,
5044                 ExecCleanMask mask,
5045                 char ***ret) {
5046
5047         _cleanup_strv_free_ char **l = NULL;
5048         ExecDirectoryType t;
5049         int r;
5050
5051         assert(c);
5052         assert(prefix);
5053         assert(ret);
5054
5055         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5056                 char **i;
5057
5058                 if (!FLAGS_SET(mask, 1U << t))
5059                         continue;
5060
5061                 if (!prefix[t])
5062                         continue;
5063
5064                 STRV_FOREACH(i, c->directories[t].paths) {
5065                         char *j;
5066
5067                         j = path_join(prefix[t], *i);
5068                         if (!j)
5069                                 return -ENOMEM;
5070
5071                         r = strv_consume(&l, j);
5072                         if (r < 0)
5073                                 return r;
5074
5075                         /* Also remove private directories unconditionally. */
5076                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5077                                 j = path_join(prefix[t], "private", *i);
5078                                 if (!j)
5079                                         return -ENOMEM;
5080
5081                                 r = strv_consume(&l, j);
5082                                 if (r < 0)
5083                                         return r;
5084                         }
5085                 }
5086         }
5087
5088         *ret = TAKE_PTR(l);
5089         return 0;
5090 }
5091
5092 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5093         ExecCleanMask mask = 0;
5094
5095         assert(c);
5096         assert(ret);
5097
5098         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5099                 if (!strv_isempty(c->directories[t].paths))
5100                         mask |= 1U << t;
5101
5102         *ret = mask;
5103         return 0;
5104 }
5105
5106 void exec_status_start(ExecStatus *s, pid_t pid) {
5107         assert(s);
5108
5109         *s = (ExecStatus) {
5110                 .pid = pid,
5111         };
5112
5113         dual_timestamp_get(&s->start_timestamp);
5114 }
5115
5116 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5117         assert(s);
5118
5119         if (s->pid != pid) {
5120                 *s = (ExecStatus) {
5121                         .pid = pid,
5122                 };
5123         }
5124
5125         dual_timestamp_get(&s->exit_timestamp);
5126
5127         s->code = code;
5128         s->status = status;
5129
5130         if (context && context->utmp_id)
5131                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5132 }
5133
5134 void exec_status_reset(ExecStatus *s) {
5135         assert(s);
5136
5137         *s = (ExecStatus) {};
5138 }
5139
5140 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5141         char buf[FORMAT_TIMESTAMP_MAX];
5142
5143         assert(s);
5144         assert(f);
5145
5146         if (s->pid <= 0)
5147                 return;
5148
5149         prefix = strempty(prefix);
5150
5151         fprintf(f,
5152                 "%sPID: "PID_FMT"\n",
5153                 prefix, s->pid);
5154
5155         if (dual_timestamp_is_set(&s->start_timestamp))
5156                 fprintf(f,
5157                         "%sStart Timestamp: %s\n",
5158                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5159
5160         if (dual_timestamp_is_set(&s->exit_timestamp))
5161                 fprintf(f,
5162                         "%sExit Timestamp: %s\n"
5163                         "%sExit Code: %s\n"
5164                         "%sExit Status: %i\n",
5165                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5166                         prefix, sigchld_code_to_string(s->code),
5167                         prefix, s->status);
5168 }
5169
5170 static char *exec_command_line(char **argv) {
5171         size_t k;
5172         char *n, *p, **a;
5173         bool first = true;
5174
5175         assert(argv);
5176
5177         k = 1;
5178         STRV_FOREACH(a, argv)
5179                 k += strlen(*a)+3;
5180
5181         n = new(char, k);
5182         if (!n)
5183                 return NULL;
5184
5185         p = n;
5186         STRV_FOREACH(a, argv) {
5187
5188                 if (!first)
5189                         *(p++) = ' ';
5190                 else
5191                         first = false;
5192
5193                 if (strpbrk(*a, WHITESPACE)) {
5194                         *(p++) = '\'';
5195                         p = stpcpy(p, *a);
5196                         *(p++) = '\'';
5197                 } else
5198                         p = stpcpy(p, *a);
5199
5200         }
5201
5202         *p = 0;
5203
5204         /* FIXME: this doesn't really handle arguments that have
5205          * spaces and ticks in them */
5206
5207         return n;
5208 }
5209
5210 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5211         _cleanup_free_ char *cmd = NULL;
5212         const char *prefix2;
5213
5214         assert(c);
5215         assert(f);
5216
5217         prefix = strempty(prefix);
5218         prefix2 = strjoina(prefix, "\t");
5219
5220         cmd = exec_command_line(c->argv);
5221         fprintf(f,
5222                 "%sCommand Line: %s\n",
5223                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5224
5225         exec_status_dump(&c->exec_status, f, prefix2);
5226 }
5227
5228 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5229         assert(f);
5230
5231         prefix = strempty(prefix);
5232
5233         LIST_FOREACH(command, c, c)
5234                 exec_command_dump(c, f, prefix);
5235 }
5236
5237 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5238         ExecCommand *end;
5239
5240         assert(l);
5241         assert(e);
5242
5243         if (*l) {
5244                 /* It's kind of important, that we keep the order here */
5245                 LIST_FIND_TAIL(command, *l, end);
5246                 LIST_INSERT_AFTER(command, *l, end, e);
5247         } else
5248               *l = e;
5249 }
5250
5251 int exec_command_set(ExecCommand *c, const char *path, ...) {
5252         va_list ap;
5253         char **l, *p;
5254
5255         assert(c);
5256         assert(path);
5257
5258         va_start(ap, path);
5259         l = strv_new_ap(path, ap);
5260         va_end(ap);
5261
5262         if (!l)
5263                 return -ENOMEM;
5264
5265         p = strdup(path);
5266         if (!p) {
5267                 strv_free(l);
5268                 return -ENOMEM;
5269         }
5270
5271         free_and_replace(c->path, p);
5272
5273         return strv_free_and_replace(c->argv, l);
5274 }
5275
5276 int exec_command_append(ExecCommand *c, const char *path, ...) {
5277         _cleanup_strv_free_ char **l = NULL;
5278         va_list ap;
5279         int r;
5280
5281         assert(c);
5282         assert(path);
5283
5284         va_start(ap, path);
5285         l = strv_new_ap(path, ap);
5286         va_end(ap);
5287
5288         if (!l)
5289                 return -ENOMEM;
5290
5291         r = strv_extend_strv(&c->argv, l, false);
5292         if (r < 0)
5293                 return r;
5294
5295         return 0;
5296 }
5297
5298 static void *remove_tmpdir_thread(void *p) {
5299         _cleanup_free_ char *path = p;
5300
5301         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5302         return NULL;
5303 }
5304
5305 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5306         int r;
5307
5308         if (!rt)
5309                 return NULL;
5310
5311         if (rt->manager)
5312                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5313
5314         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5315         if (destroy && rt->tmp_dir) {
5316                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5317
5318                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5319                 if (r < 0) {
5320                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5321                         free(rt->tmp_dir);
5322                 }
5323
5324                 rt->tmp_dir = NULL;
5325         }
5326
5327         if (destroy && rt->var_tmp_dir) {
5328                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5329
5330                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5331                 if (r < 0) {
5332                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5333                         free(rt->var_tmp_dir);
5334                 }
5335
5336                 rt->var_tmp_dir = NULL;
5337         }
5338
5339         rt->id = mfree(rt->id);
5340         rt->tmp_dir = mfree(rt->tmp_dir);
5341         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5342         safe_close_pair(rt->netns_storage_socket);
5343         return mfree(rt);
5344 }
5345
5346 static void exec_runtime_freep(ExecRuntime **rt) {
5347         (void) exec_runtime_free(*rt, false);
5348 }
5349
5350 static int exec_runtime_allocate(ExecRuntime **ret) {
5351         ExecRuntime *n;
5352
5353         assert(ret);
5354
5355         n = new(ExecRuntime, 1);
5356         if (!n)
5357                 return -ENOMEM;
5358
5359         *n = (ExecRuntime) {
5360                 .netns_storage_socket = { -1, -1 },
5361         };
5362
5363         *ret = n;
5364         return 0;
5365 }
5366
5367 static int exec_runtime_add(
5368                 Manager *m,
5369                 const char *id,
5370                 const char *tmp_dir,
5371                 const char *var_tmp_dir,
5372                 const int netns_storage_socket[2],
5373                 ExecRuntime **ret) {
5374
5375         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5376         int r;
5377
5378         assert(m);
5379         assert(id);
5380
5381         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5382         if (r < 0)
5383                 return r;
5384
5385         r = exec_runtime_allocate(&rt);
5386         if (r < 0)
5387                 return r;
5388
5389         rt->id = strdup(id);
5390         if (!rt->id)
5391                 return -ENOMEM;
5392
5393         if (tmp_dir) {
5394                 rt->tmp_dir = strdup(tmp_dir);
5395                 if (!rt->tmp_dir)
5396                         return -ENOMEM;
5397
5398                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5399                 assert(var_tmp_dir);
5400                 rt->var_tmp_dir = strdup(var_tmp_dir);
5401                 if (!rt->var_tmp_dir)
5402                         return -ENOMEM;
5403         }
5404
5405         if (netns_storage_socket) {
5406                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5407                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5408         }
5409
5410         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5411         if (r < 0)
5412                 return r;
5413
5414         rt->manager = m;
5415
5416         if (ret)
5417                 *ret = rt;
5418
5419         /* do not remove created ExecRuntime object when the operation succeeds. */
5420         rt = NULL;
5421         return 0;
5422 }
5423
5424 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5425         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5426         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5427         int r;
5428
5429         assert(m);
5430         assert(c);
5431         assert(id);
5432
5433         /* It is not necessary to create ExecRuntime object. */
5434         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5435                 return 0;
5436
5437         if (c->private_tmp) {
5438                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5439                 if (r < 0)
5440                         return r;
5441         }
5442
5443         if (c->private_network || c->network_namespace_path) {
5444                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5445                         return -errno;
5446         }
5447
5448         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5449         if (r < 0)
5450                 return r;
5451
5452         /* Avoid cleanup */
5453         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5454         return 1;
5455 }
5456
5457 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5458         ExecRuntime *rt;
5459         int r;
5460
5461         assert(m);
5462         assert(id);
5463         assert(ret);
5464
5465         rt = hashmap_get(m->exec_runtime_by_id, id);
5466         if (rt)
5467                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5468                 goto ref;
5469
5470         if (!create)
5471                 return 0;
5472
5473         /* If not found, then create a new object. */
5474         r = exec_runtime_make(m, c, id, &rt);
5475         if (r <= 0)
5476                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5477                 return r;
5478
5479 ref:
5480         /* increment reference counter. */
5481         rt->n_ref++;
5482         *ret = rt;
5483         return 1;
5484 }
5485
5486 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5487         if (!rt)
5488                 return NULL;
5489
5490         assert(rt->n_ref > 0);
5491
5492         rt->n_ref--;
5493         if (rt->n_ref > 0)
5494                 return NULL;
5495
5496         return exec_runtime_free(rt, destroy);
5497 }
5498
5499 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5500         ExecRuntime *rt;
5501         Iterator i;
5502
5503         assert(m);
5504         assert(f);
5505         assert(fds);
5506
5507         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5508                 fprintf(f, "exec-runtime=%s", rt->id);
5509
5510                 if (rt->tmp_dir)
5511                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5512
5513                 if (rt->var_tmp_dir)
5514                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5515
5516                 if (rt->netns_storage_socket[0] >= 0) {
5517                         int copy;
5518
5519                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5520                         if (copy < 0)
5521                                 return copy;
5522
5523                         fprintf(f, " netns-socket-0=%i", copy);
5524                 }
5525
5526                 if (rt->netns_storage_socket[1] >= 0) {
5527                         int copy;
5528
5529                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5530                         if (copy < 0)
5531                                 return copy;
5532
5533                         fprintf(f, " netns-socket-1=%i", copy);
5534                 }
5535
5536                 fputc('\n', f);
5537         }
5538
5539         return 0;
5540 }
5541
5542 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5543         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5544         ExecRuntime *rt;
5545         int r;
5546
5547         /* This is for the migration from old (v237 or earlier) deserialization text.
5548          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5549          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5550          * so or not from the serialized text, then we always creates a new object owned by this. */
5551
5552         assert(u);
5553         assert(key);
5554         assert(value);
5555
5556         /* Manager manages ExecRuntime objects by the unit id.
5557          * So, we omit the serialized text when the unit does not have id (yet?)... */
5558         if (isempty(u->id)) {
5559                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5560                 return 0;
5561         }
5562
5563         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5564         if (r < 0) {
5565                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5566                 return 0;
5567         }
5568
5569         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5570         if (!rt) {
5571                 r = exec_runtime_allocate(&rt_create);
5572                 if (r < 0)
5573                         return log_oom();
5574
5575                 rt_create->id = strdup(u->id);
5576                 if (!rt_create->id)
5577                         return log_oom();
5578
5579                 rt = rt_create;
5580         }
5581
5582         if (streq(key, "tmp-dir")) {
5583                 char *copy;
5584
5585                 copy = strdup(value);
5586                 if (!copy)
5587                         return log_oom();
5588
5589                 free_and_replace(rt->tmp_dir, copy);
5590
5591         } else if (streq(key, "var-tmp-dir")) {
5592                 char *copy;
5593
5594                 copy = strdup(value);
5595                 if (!copy)
5596                         return log_oom();
5597
5598                 free_and_replace(rt->var_tmp_dir, copy);
5599
5600         } else if (streq(key, "netns-socket-0")) {
5601                 int fd;
5602
5603                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5604                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5605                         return 0;
5606                 }
5607
5608                 safe_close(rt->netns_storage_socket[0]);
5609                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5610
5611         } else if (streq(key, "netns-socket-1")) {
5612                 int fd;
5613
5614                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5615                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5616                         return 0;
5617                 }
5618
5619                 safe_close(rt->netns_storage_socket[1]);
5620                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5621         } else
5622                 return 0;
5623
5624         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5625         if (rt_create) {
5626                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5627                 if (r < 0) {
5628                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5629                         return 0;
5630                 }
5631
5632                 rt_create->manager = u->manager;
5633
5634                 /* Avoid cleanup */
5635                 rt_create = NULL;
5636         }
5637
5638         return 1;
5639 }
5640
5641 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5642         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5643         int r, fd0 = -1, fd1 = -1;
5644         const char *p, *v = value;
5645         size_t n;
5646
5647         assert(m);
5648         assert(value);
5649         assert(fds);
5650
5651         n = strcspn(v, " ");
5652         id = strndupa(v, n);
5653         if (v[n] != ' ')
5654                 goto finalize;
5655         p = v + n + 1;
5656
5657         v = startswith(p, "tmp-dir=");
5658         if (v) {
5659                 n = strcspn(v, " ");
5660                 tmp_dir = strndupa(v, n);
5661                 if (v[n] != ' ')
5662                         goto finalize;
5663                 p = v + n + 1;
5664         }
5665
5666         v = startswith(p, "var-tmp-dir=");
5667         if (v) {
5668                 n = strcspn(v, " ");
5669                 var_tmp_dir = strndupa(v, n);
5670                 if (v[n] != ' ')
5671                         goto finalize;
5672                 p = v + n + 1;
5673         }
5674
5675         v = startswith(p, "netns-socket-0=");
5676         if (v) {
5677                 char *buf;
5678
5679                 n = strcspn(v, " ");
5680                 buf = strndupa(v, n);
5681                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5682                         log_debug("Unable to process exec-runtime netns fd specification.");
5683                         return;
5684                 }
5685                 fd0 = fdset_remove(fds, fd0);
5686                 if (v[n] != ' ')
5687                         goto finalize;
5688                 p = v + n + 1;
5689         }
5690
5691         v = startswith(p, "netns-socket-1=");
5692         if (v) {
5693                 char *buf;
5694
5695                 n = strcspn(v, " ");
5696                 buf = strndupa(v, n);
5697                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5698                         log_debug("Unable to process exec-runtime netns fd specification.");
5699                         return;
5700                 }
5701                 fd1 = fdset_remove(fds, fd1);
5702         }
5703
5704 finalize:
5705
5706         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5707         if (r < 0)
5708                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5709 }
5710
5711 void exec_runtime_vacuum(Manager *m) {
5712         ExecRuntime *rt;
5713         Iterator i;
5714
5715         assert(m);
5716
5717         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5718
5719         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5720                 if (rt->n_ref > 0)
5721                         continue;
5722
5723                 (void) exec_runtime_free(rt, false);
5724         }
5725 }
5726
5727 void exec_params_clear(ExecParameters *p) {
5728         if (!p)
5729                 return;
5730
5731         strv_free(p->environment);
5732 }
5733
5734 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5735         [EXEC_INPUT_NULL] = "null",
5736         [EXEC_INPUT_TTY] = "tty",
5737         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5738         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5739         [EXEC_INPUT_SOCKET] = "socket",
5740         [EXEC_INPUT_NAMED_FD] = "fd",
5741         [EXEC_INPUT_DATA] = "data",
5742         [EXEC_INPUT_FILE] = "file",
5743 };
5744
5745 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5746
5747 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5748         [EXEC_OUTPUT_INHERIT] = "inherit",
5749         [EXEC_OUTPUT_NULL] = "null",
5750         [EXEC_OUTPUT_TTY] = "tty",
5751         [EXEC_OUTPUT_SYSLOG] = "syslog",
5752         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5753         [EXEC_OUTPUT_KMSG] = "kmsg",
5754         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5755         [EXEC_OUTPUT_JOURNAL] = "journal",
5756         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5757         [EXEC_OUTPUT_SOCKET] = "socket",
5758         [EXEC_OUTPUT_NAMED_FD] = "fd",
5759         [EXEC_OUTPUT_FILE] = "file",
5760         [EXEC_OUTPUT_FILE_APPEND] = "append",
5761 };
5762
5763 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5764
5765 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5766         [EXEC_UTMP_INIT] = "init",
5767         [EXEC_UTMP_LOGIN] = "login",
5768         [EXEC_UTMP_USER] = "user",
5769 };
5770
5771 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5772
5773 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5774         [EXEC_PRESERVE_NO] = "no",
5775         [EXEC_PRESERVE_YES] = "yes",
5776         [EXEC_PRESERVE_RESTART] = "restart",
5777 };
5778
5779 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5780
5781 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5782 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5783         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5784         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5785         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5786         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5787         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5788 };
5789
5790 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5791
5792 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5793  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5794  * directories, specifically .timer units with their timestamp touch file. */
5795 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5796         [EXEC_DIRECTORY_RUNTIME] = "runtime",
5797         [EXEC_DIRECTORY_STATE] = "state",
5798         [EXEC_DIRECTORY_CACHE] = "cache",
5799         [EXEC_DIRECTORY_LOGS] = "logs",
5800         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5801 };
5802
5803 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5804
5805 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5806  * the service payload in. */
5807 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5808         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5809         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5810         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5811         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5812         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5813 };
5814
5815 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5816
5817 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5818         [EXEC_KEYRING_INHERIT] = "inherit",
5819         [EXEC_KEYRING_PRIVATE] = "private",
5820         [EXEC_KEYRING_SHARED] = "shared",
5821 };
5822
5823 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);