]> git.proxmox.com Git - systemd.git/blob - src/core/execute.c
New upstream version 245.6
[systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/personality.h>
10 #include <sys/prctl.h>
11 #include <sys/shm.h>
12 #include <sys/types.h>
13 #include <sys/un.h>
14 #include <unistd.h>
15 #include <utmpx.h>
16
17 #if HAVE_PAM
18 #include <security/pam_appl.h>
19 #endif
20
21 #if HAVE_SELINUX
22 #include <selinux/selinux.h>
23 #endif
24
25 #if HAVE_SECCOMP
26 #include <seccomp.h>
27 #endif
28
29 #if HAVE_APPARMOR
30 #include <sys/apparmor.h>
31 #endif
32
33 #include "sd-messages.h"
34
35 #include "af-list.h"
36 #include "alloc-util.h"
37 #if HAVE_APPARMOR
38 #include "apparmor-util.h"
39 #endif
40 #include "async.h"
41 #include "barrier.h"
42 #include "cap-list.h"
43 #include "capability-util.h"
44 #include "chown-recursive.h"
45 #include "cgroup-setup.h"
46 #include "cpu-set-util.h"
47 #include "def.h"
48 #include "env-file.h"
49 #include "env-util.h"
50 #include "errno-list.h"
51 #include "execute.h"
52 #include "exit-status.h"
53 #include "fd-util.h"
54 #include "format-util.h"
55 #include "fs-util.h"
56 #include "glob-util.h"
57 #include "io-util.h"
58 #include "ioprio.h"
59 #include "label.h"
60 #include "log.h"
61 #include "macro.h"
62 #include "manager.h"
63 #include "memory-util.h"
64 #include "missing_fs.h"
65 #include "mkdir.h"
66 #include "namespace.h"
67 #include "parse-util.h"
68 #include "path-util.h"
69 #include "process-util.h"
70 #include "rlimit-util.h"
71 #include "rm-rf.h"
72 #if HAVE_SECCOMP
73 #include "seccomp-util.h"
74 #endif
75 #include "securebits-util.h"
76 #include "selinux-util.h"
77 #include "signal-util.h"
78 #include "smack-util.h"
79 #include "socket-util.h"
80 #include "special.h"
81 #include "stat-util.h"
82 #include "string-table.h"
83 #include "string-util.h"
84 #include "strv.h"
85 #include "syslog-util.h"
86 #include "terminal-util.h"
87 #include "umask-util.h"
88 #include "unit.h"
89 #include "user-util.h"
90 #include "utmp-wtmp.h"
91
92 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
93 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
94
95 #define SNDBUF_SIZE (8*1024*1024)
96
/* Relocates the n_fds descriptors listed in fds[] so that entry i ends up as fd number i+3, i.e. the
 * descriptors form a contiguous range directly after stdin/stdout/stderr. The array is rewritten in place
 * with the new fd numbers. Returns 0 on success or a negative errno value if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first_pending = 0;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! */

        assert(fds);

        /* Keep making passes until one of them leaves every entry at its target number. */
        while (first_pending >= 0) {
                int retry_at = -1;

                for (int idx = first_pending; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if this entry already sits in its slot. */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= wanted. If the slot was still occupied we
                         * got a higher number, so remember the first such index and redo from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first_pending = retry_at;
        }

        return 0;
}
141
/* Prepares the fds that are passed on to the child: the first n_socket_fds entries get O_NONBLOCK set or
 * cleared according to 'nonblock', and every entry (socket and storage fds alike) gets FD_CLOEXEC cleared.
 * Returns 0 on success, or the first negative error reported by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK only applies to the socket activation fds, which come first in the array. */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally: these fds are meant to survive into the children. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
174
175 static const char *exec_context_tty_path(const ExecContext *context) {
176 assert(context);
177
178 if (context->stdio_as_fds)
179 return NULL;
180
181 if (context->tty_path)
182 return context->tty_path;
183
184 return "/dev/console";
185 }
186
187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
188 const char *path;
189
190 assert(context);
191
192 path = exec_context_tty_path(context);
193
194 if (context->tty_vhangup) {
195 if (p && p->stdin_fd >= 0)
196 (void) terminal_vhangup_fd(p->stdin_fd);
197 else if (path)
198 (void) terminal_vhangup(path);
199 }
200
201 if (context->tty_reset) {
202 if (p && p->stdin_fd >= 0)
203 (void) reset_terminal_fd(p->stdin_fd, true);
204 else if (path)
205 (void) reset_terminal(path);
206 }
207
208 if (context->tty_vt_disallocate && path)
209 (void) vt_disallocate(path);
210 }
211
212 static bool is_terminal_input(ExecInput i) {
213 return IN_SET(i,
214 EXEC_INPUT_TTY,
215 EXEC_INPUT_TTY_FORCE,
216 EXEC_INPUT_TTY_FAIL);
217 }
218
219 static bool is_terminal_output(ExecOutput o) {
220 return IN_SET(o,
221 EXEC_OUTPUT_TTY,
222 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
223 EXEC_OUTPUT_KMSG_AND_CONSOLE,
224 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
225 }
226
227 static bool is_syslog_output(ExecOutput o) {
228 return IN_SET(o,
229 EXEC_OUTPUT_SYSLOG,
230 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
231 }
232
233 static bool is_kmsg_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_KMSG,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE);
237 }
238
239 static bool exec_context_needs_term(const ExecContext *c) {
240 assert(c);
241
242 /* Return true if the execution context suggests we should set $TERM to something useful. */
243
244 if (is_terminal_input(c->std_input))
245 return true;
246
247 if (is_terminal_output(c->std_output))
248 return true;
249
250 if (is_terminal_output(c->std_error))
251 return true;
252
253 return !!c->tty_path;
254 }
255
/* Opens /dev/null with the given flags (plus O_NOCTTY) and moves the result to fd number 'nfd'. Returns
 * whatever move_fd() returns, or a negative errno value if the open() fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd >= 0)
                return move_fd(null_fd, nfd, false);

        return -errno;
}
267
268 static int connect_journal_socket(
269 int fd,
270 const char *log_namespace,
271 uid_t uid,
272 gid_t gid) {
273
274 union sockaddr_union sa;
275 socklen_t sa_len;
276 uid_t olduid = UID_INVALID;
277 gid_t oldgid = GID_INVALID;
278 const char *j;
279 int r;
280
281 j = log_namespace ?
282 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
283 "/run/systemd/journal/stdout";
284 r = sockaddr_un_set_path(&sa.un, j);
285 if (r < 0)
286 return r;
287 sa_len = r;
288
289 if (gid_is_valid(gid)) {
290 oldgid = getgid();
291
292 if (setegid(gid) < 0)
293 return -errno;
294 }
295
296 if (uid_is_valid(uid)) {
297 olduid = getuid();
298
299 if (seteuid(uid) < 0) {
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
305 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
306
307 /* If we fail to restore the uid or gid, things will likely
308 fail later on. This should only happen if an LSM interferes. */
309
310 if (uid_is_valid(uid))
311 (void) seteuid(olduid);
312
313 restore_gid:
314 if (gid_is_valid(gid))
315 (void) setegid(oldgid);
316
317 return r;
318 }
319
320 static int connect_logger_as(
321 const Unit *unit,
322 const ExecContext *context,
323 const ExecParameters *params,
324 ExecOutput output,
325 const char *ident,
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
330 _cleanup_close_ int fd = -1;
331 int r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0)
348 return -errno;
349
350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
351
352 if (dprintf(fd,
353 "%s\n"
354 "%s\n"
355 "%i\n"
356 "%i\n"
357 "%i\n"
358 "%i\n"
359 "%i\n",
360 context->syslog_identifier ?: ident,
361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
364 is_syslog_output(output),
365 is_kmsg_output(output),
366 is_terminal_output(output)) < 0)
367 return -errno;
368
369 return move_fd(TAKE_FD(fd), nfd, false);
370 }
371
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the result of move_fd(), or the negative error returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd >= 0)
                return move_fd(tty_fd, nfd, false);

        return tty_fd;
}
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 union sockaddr_union sa;
387 socklen_t sa_len;
388 _cleanup_close_ int fd = -1;
389 int r;
390
391 assert(path);
392
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
398 return TAKE_FD(fd);
399
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
402
403 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
404
405 r = sockaddr_un_set_path(&sa.un, path);
406 if (r < 0)
407 return r == -EINVAL ? -ENXIO : r;
408 sa_len = r;
409
410 fd = socket(AF_UNIX, SOCK_STREAM, 0);
411 if (fd < 0)
412 return -errno;
413
414 if (connect(fd, &sa.sa, sa_len) < 0)
415 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
416 * indication that his wasn't an AF_UNIX socket after all */
417
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
423 r = 0;
424 if (r < 0)
425 return -errno;
426
427 return TAKE_FD(fd);
428 }
429
430 static int fixup_input(
431 const ExecContext *context,
432 int socket_fd,
433 bool apply_tty_stdin) {
434
435 ExecInput std_input;
436
437 assert(context);
438
439 std_input = context->std_input;
440
441 if (is_terminal_input(std_input) && !apply_tty_stdin)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
445 return EXEC_INPUT_NULL;
446
447 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
448 return EXEC_INPUT_NULL;
449
450 return std_input;
451 }
452
453 static int fixup_output(ExecOutput std_output, int socket_fd) {
454
455 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
456 return EXEC_OUTPUT_INHERIT;
457
458 return std_output;
459 }
460
461 static int setup_input(
462 const ExecContext *context,
463 const ExecParameters *params,
464 int socket_fd,
465 const int named_iofds[static 3]) {
466
467 ExecInput i;
468
469 assert(context);
470 assert(params);
471 assert(named_iofds);
472
473 if (params->stdin_fd >= 0) {
474 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
475 return -errno;
476
477 /* Try to make this the controlling tty, if it is a tty, and reset it */
478 if (isatty(STDIN_FILENO)) {
479 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
480 (void) reset_terminal_fd(STDIN_FILENO, true);
481 }
482
483 return STDIN_FILENO;
484 }
485
486 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
487
488 switch (i) {
489
490 case EXEC_INPUT_NULL:
491 return open_null_as(O_RDONLY, STDIN_FILENO);
492
493 case EXEC_INPUT_TTY:
494 case EXEC_INPUT_TTY_FORCE:
495 case EXEC_INPUT_TTY_FAIL: {
496 int fd;
497
498 fd = acquire_terminal(exec_context_tty_path(context),
499 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
500 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
501 ACQUIRE_TERMINAL_WAIT,
502 USEC_INFINITY);
503 if (fd < 0)
504 return fd;
505
506 return move_fd(fd, STDIN_FILENO, false);
507 }
508
509 case EXEC_INPUT_SOCKET:
510 assert(socket_fd >= 0);
511
512 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
514 case EXEC_INPUT_NAMED_FD:
515 assert(named_iofds[STDIN_FILENO] >= 0);
516
517 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
518 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
519
520 case EXEC_INPUT_DATA: {
521 int fd;
522
523 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
524 if (fd < 0)
525 return fd;
526
527 return move_fd(fd, STDIN_FILENO, false);
528 }
529
530 case EXEC_INPUT_FILE: {
531 bool rw;
532 int fd;
533
534 assert(context->stdio_file[STDIN_FILENO]);
535
536 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
537 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
538
539 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
540 if (fd < 0)
541 return fd;
542
543 return move_fd(fd, STDIN_FILENO, false);
544 }
545
546 default:
547 assert_not_reached("Unknown input type");
548 }
549 }
550
551 static bool can_inherit_stderr_from_stdout(
552 const ExecContext *context,
553 ExecOutput o,
554 ExecOutput e) {
555
556 assert(context);
557
558 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
559 * stderr fd */
560
561 if (e == EXEC_OUTPUT_INHERIT)
562 return true;
563 if (e != o)
564 return false;
565
566 if (e == EXEC_OUTPUT_NAMED_FD)
567 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
568
569 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
570 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
571
572 return true;
573 }
574
575 static int setup_output(
576 const Unit *unit,
577 const ExecContext *context,
578 const ExecParameters *params,
579 int fileno,
580 int socket_fd,
581 const int named_iofds[static 3],
582 const char *ident,
583 uid_t uid,
584 gid_t gid,
585 dev_t *journal_stream_dev,
586 ino_t *journal_stream_ino) {
587
588 ExecOutput o;
589 ExecInput i;
590 int r;
591
592 assert(unit);
593 assert(context);
594 assert(params);
595 assert(ident);
596 assert(journal_stream_dev);
597 assert(journal_stream_ino);
598
599 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
600
601 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
602 return -errno;
603
604 return STDOUT_FILENO;
605 }
606
607 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
608 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
609 return -errno;
610
611 return STDERR_FILENO;
612 }
613
614 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
615 o = fixup_output(context->std_output, socket_fd);
616
617 if (fileno == STDERR_FILENO) {
618 ExecOutput e;
619 e = fixup_output(context->std_error, socket_fd);
620
621 /* This expects the input and output are already set up */
622
623 /* Don't change the stderr file descriptor if we inherit all
624 * the way and are not on a tty */
625 if (e == EXEC_OUTPUT_INHERIT &&
626 o == EXEC_OUTPUT_INHERIT &&
627 i == EXEC_INPUT_NULL &&
628 !is_terminal_input(context->std_input) &&
629 getppid () != 1)
630 return fileno;
631
632 /* Duplicate from stdout if possible */
633 if (can_inherit_stderr_from_stdout(context, o, e))
634 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
635
636 o = e;
637
638 } else if (o == EXEC_OUTPUT_INHERIT) {
639 /* If input got downgraded, inherit the original value */
640 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
641 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
642
643 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
644 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
645 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
646
647 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
648 if (getppid() != 1)
649 return fileno;
650
651 /* We need to open /dev/null here anew, to get the right access mode. */
652 return open_null_as(O_WRONLY, fileno);
653 }
654
655 switch (o) {
656
657 case EXEC_OUTPUT_NULL:
658 return open_null_as(O_WRONLY, fileno);
659
660 case EXEC_OUTPUT_TTY:
661 if (is_terminal_input(i))
662 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
663
664 /* We don't reset the terminal if this is just about output */
665 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
666
667 case EXEC_OUTPUT_SYSLOG:
668 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
669 case EXEC_OUTPUT_KMSG:
670 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
671 case EXEC_OUTPUT_JOURNAL:
672 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
673 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
674 if (r < 0) {
675 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
676 r = open_null_as(O_WRONLY, fileno);
677 } else {
678 struct stat st;
679
680 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
681 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
682 * services to detect whether they are connected to the journal or not.
683 *
684 * If both stdout and stderr are connected to a stream then let's make sure to store the data
685 * about STDERR as that's usually the best way to do logging. */
686
687 if (fstat(fileno, &st) >= 0 &&
688 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
689 *journal_stream_dev = st.st_dev;
690 *journal_stream_ino = st.st_ino;
691 }
692 }
693 return r;
694
695 case EXEC_OUTPUT_SOCKET:
696 assert(socket_fd >= 0);
697
698 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
699
700 case EXEC_OUTPUT_NAMED_FD:
701 assert(named_iofds[fileno] >= 0);
702
703 (void) fd_nonblock(named_iofds[fileno], false);
704 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
705
706 case EXEC_OUTPUT_FILE:
707 case EXEC_OUTPUT_FILE_APPEND: {
708 bool rw;
709 int fd, flags;
710
711 assert(context->stdio_file[fileno]);
712
713 rw = context->std_input == EXEC_INPUT_FILE &&
714 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
715
716 if (rw)
717 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
718
719 flags = O_WRONLY;
720 if (o == EXEC_OUTPUT_FILE_APPEND)
721 flags |= O_APPEND;
722
723 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
724 if (fd < 0)
725 return fd;
726
727 return move_fd(fd, fileno, 0);
728 }
729
730 default:
731 assert_not_reached("Unknown error type");
732 }
733 }
734
735 static int chown_terminal(int fd, uid_t uid) {
736 int r;
737
738 assert(fd >= 0);
739
740 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
741 if (isatty(fd) < 1) {
742 if (IN_SET(errno, EINVAL, ENOTTY))
743 return 0; /* not a tty */
744
745 return -errno;
746 }
747
748 /* This might fail. What matters are the results. */
749 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
750 if (r < 0)
751 return r;
752
753 return 1;
754 }
755
756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
757 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
758 int r;
759
760 assert(_saved_stdin);
761 assert(_saved_stdout);
762
763 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
764 if (saved_stdin < 0)
765 return -errno;
766
767 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
768 if (saved_stdout < 0)
769 return -errno;
770
771 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
772 if (fd < 0)
773 return fd;
774
775 r = chown_terminal(fd, getuid());
776 if (r < 0)
777 return r;
778
779 r = reset_terminal_fd(fd, true);
780 if (r < 0)
781 return r;
782
783 r = rearrange_stdio(fd, fd, STDERR_FILENO);
784 fd = -1;
785 if (r < 0)
786 return r;
787
788 *_saved_stdin = saved_stdin;
789 *_saved_stdout = saved_stdout;
790
791 saved_stdin = saved_stdout = -1;
792
793 return 0;
794 }
795
796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
797 assert(err < 0);
798
799 if (err == -ETIMEDOUT)
800 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
801 else {
802 errno = -err;
803 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
804 }
805 }
806
807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
808 _cleanup_close_ int fd = -1;
809
810 assert(vc);
811
812 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
813 if (fd < 0)
814 return;
815
816 write_confirm_error_fd(err, fd, u);
817 }
818
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout copies back in place
 * and closes the copies (resetting *saved_stdin / *saved_stdout to what safe_close() returns). Returns 0 on
 * success, otherwise the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
840
/* Possible results of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,  /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,  /* 'y': execute the command */
};
846
847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
848 int saved_stdout = -1, saved_stdin = -1, r;
849 _cleanup_free_ char *e = NULL;
850 char c;
851
852 /* For any internal errors, assume a positive response. */
853 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
854 if (r < 0) {
855 write_confirm_error(r, vc, u);
856 return CONFIRM_EXECUTE;
857 }
858
859 /* confirm_spawn might have been disabled while we were sleeping. */
860 if (manager_is_confirm_spawn_disabled(u->manager)) {
861 r = 1;
862 goto restore_stdio;
863 }
864
865 e = ellipsize(cmdline, 60, 100);
866 if (!e) {
867 log_oom();
868 r = CONFIRM_EXECUTE;
869 goto restore_stdio;
870 }
871
872 for (;;) {
873 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
874 if (r < 0) {
875 write_confirm_error_fd(r, STDOUT_FILENO, u);
876 r = CONFIRM_EXECUTE;
877 goto restore_stdio;
878 }
879
880 switch (c) {
881 case 'c':
882 printf("Resuming normal execution.\n");
883 manager_disable_confirm_spawn();
884 r = 1;
885 break;
886 case 'D':
887 unit_dump(u, stdout, " ");
888 continue; /* ask again */
889 case 'f':
890 printf("Failing execution.\n");
891 r = CONFIRM_PRETEND_FAILURE;
892 break;
893 case 'h':
894 printf(" c - continue, proceed without asking anymore\n"
895 " D - dump, show the state of the unit\n"
896 " f - fail, don't execute the command and pretend it failed\n"
897 " h - help\n"
898 " i - info, show a short summary of the unit\n"
899 " j - jobs, show jobs that are in progress\n"
900 " s - skip, don't execute the command and pretend it succeeded\n"
901 " y - yes, execute the command\n");
902 continue; /* ask again */
903 case 'i':
904 printf(" Description: %s\n"
905 " Unit: %s\n"
906 " Command: %s\n",
907 u->id, u->description, cmdline);
908 continue; /* ask again */
909 case 'j':
910 manager_dump_jobs(u->manager, stdout, " ");
911 continue; /* ask again */
912 case 'n':
913 /* 'n' was removed in favor of 'f'. */
914 printf("Didn't understand 'n', did you mean 'f'?\n");
915 continue; /* ask again */
916 case 's':
917 printf("Skipping execution.\n");
918 r = CONFIRM_PRETEND_SUCCESS;
919 break;
920 case 'y':
921 r = CONFIRM_EXECUTE;
922 break;
923 default:
924 assert_not_reached("Unhandled choice");
925 }
926 break;
927 }
928
929 restore_stdio:
930 restore_confirm_stdio(&saved_stdin, &saved_stdout);
931 return r;
932 }
933
934 static int get_fixed_user(const ExecContext *c, const char **user,
935 uid_t *uid, gid_t *gid,
936 const char **home, const char **shell) {
937 int r;
938 const char *name;
939
940 assert(c);
941
942 if (!c->user)
943 return 0;
944
945 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
946 * (i.e. are "/" or "/bin/nologin"). */
947
948 name = c->user;
949 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
950 if (r < 0)
951 return r;
952
953 *user = name;
954 return 0;
955 }
956
957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
958 int r;
959 const char *name;
960
961 assert(c);
962
963 if (!c->group)
964 return 0;
965
966 name = c->group;
967 r = get_group_creds(&name, gid, 0);
968 if (r < 0)
969 return r;
970
971 *group = name;
972 return 0;
973 }
974
975 static int get_supplementary_groups(const ExecContext *c, const char *user,
976 const char *group, gid_t gid,
977 gid_t **supplementary_gids, int *ngids) {
978 char **i;
979 int r, k = 0;
980 int ngroups_max;
981 bool keep_groups = false;
982 gid_t *groups = NULL;
983 _cleanup_free_ gid_t *l_gids = NULL;
984
985 assert(c);
986
987 /*
988 * If user is given, then lookup GID and supplementary groups list.
989 * We avoid NSS lookups for gid=0. Also we have to initialize groups
990 * here and as early as possible so we keep the list of supplementary
991 * groups of the caller.
992 */
993 if (user && gid_is_valid(gid) && gid != 0) {
994 /* First step, initialize groups from /etc/groups */
995 if (initgroups(user, gid) < 0)
996 return -errno;
997
998 keep_groups = true;
999 }
1000
1001 if (strv_isempty(c->supplementary_groups))
1002 return 0;
1003
1004 /*
1005 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006 * be positive, otherwise fail.
1007 */
1008 errno = 0;
1009 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010 if (ngroups_max <= 0)
1011 return errno_or_else(EOPNOTSUPP);
1012
1013 l_gids = new(gid_t, ngroups_max);
1014 if (!l_gids)
1015 return -ENOMEM;
1016
1017 if (keep_groups) {
1018 /*
1019 * Lookup the list of groups that the user belongs to, we
1020 * avoid NSS lookups here too for gid=0.
1021 */
1022 k = ngroups_max;
1023 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024 return -EINVAL;
1025 } else
1026 k = 0;
1027
1028 STRV_FOREACH(i, c->supplementary_groups) {
1029 const char *g;
1030
1031 if (k >= ngroups_max)
1032 return -E2BIG;
1033
1034 g = *i;
1035 r = get_group_creds(&g, l_gids+k, 0);
1036 if (r < 0)
1037 return r;
1038
1039 k++;
1040 }
1041
1042 /*
1043 * Sets ngids to zero to drop all supplementary groups, happens
1044 * when we are under root and SupplementaryGroups= is empty.
1045 */
1046 if (k == 0) {
1047 *ngids = 0;
1048 return 0;
1049 }
1050
1051 /* Otherwise get the final list of supplementary groups */
1052 groups = memdup(l_gids, sizeof(gid_t) * k);
1053 if (!groups)
1054 return -ENOMEM;
1055
1056 *supplementary_gids = groups;
1057 *ngids = k;
1058
1059 groups = NULL;
1060
1061 return 0;
1062 }
1063
/* Applies the group credentials: installs the supplementary group list if it is non-empty, then sets the
 * real, effective and saved GID to 'gid' if that is valid. Returns 0 on success, negative errno-style error
 * otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1082
1083 static int enforce_user(const ExecContext *context, uid_t uid) {
1084 assert(context);
1085
1086 if (!uid_is_valid(uid))
1087 return 0;
1088
1089 /* Sets (but doesn't look up) the uid and make sure we keep the
1090 * capabilities while doing so. */
1091
1092 if (context->capability_ambient_set != 0) {
1093
1094 /* First step: If we need to keep capabilities but
1095 * drop privileges we need to make sure we keep our
1096 * caps, while we drop privileges. */
1097 if (uid != 0) {
1098 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1099
1100 if (prctl(PR_GET_SECUREBITS) != sb)
1101 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1102 return -errno;
1103 }
1104 }
1105
1106 /* Second step: actually set the uids */
1107 if (setresuid(uid, uid, uid) < 0)
1108 return -errno;
1109
1110 /* At this point we should have all necessary capabilities but
1111 are otherwise a normal user. However, the caps might got
1112 corrupted due to the setresuid() so we need clean them up
1113 later. This is done outside of this call. */
1114
1115 return 0;
1116 }
1117
1118 #if HAVE_PAM
1119
1120 static int null_conv(
1121 int num_msg,
1122 const struct pam_message **msg,
1123 struct pam_response **resp,
1124 void *appdata_ptr) {
1125
1126 /* We don't support conversations */
1127
1128 return PAM_CONV_ERR;
1129 }
1130
1131 #endif
1132
1133 static int setup_pam(
1134 const char *name,
1135 const char *user,
1136 uid_t uid,
1137 gid_t gid,
1138 const char *tty,
1139 char ***env,
1140 const int fds[], size_t n_fds) {
1141
1142 #if HAVE_PAM
1143
1144 static const struct pam_conv conv = {
1145 .conv = null_conv,
1146 .appdata_ptr = NULL
1147 };
1148
1149 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1150 pam_handle_t *handle = NULL;
1151 sigset_t old_ss;
1152 int pam_code = PAM_SUCCESS, r;
1153 char **nv, **e = NULL;
1154 bool close_session = false;
1155 pid_t pam_pid = 0, parent_pid;
1156 int flags = 0;
1157
1158 assert(name);
1159 assert(user);
1160 assert(env);
1161
1162 /* We set up PAM in the parent process, then fork. The child
1163 * will then stay around until killed via PR_GET_PDEATHSIG or
1164 * systemd via the cgroup logic. It will then remove the PAM
1165 * session again. The parent process will exec() the actual
1166 * daemon. We do things this way to ensure that the main PID
1167 * of the daemon is the one we initially fork()ed. */
1168
1169 r = barrier_create(&barrier);
1170 if (r < 0)
1171 goto fail;
1172
1173 if (log_get_max_level() < LOG_DEBUG)
1174 flags |= PAM_SILENT;
1175
1176 pam_code = pam_start(name, user, &conv, &handle);
1177 if (pam_code != PAM_SUCCESS) {
1178 handle = NULL;
1179 goto fail;
1180 }
1181
1182 if (!tty) {
1183 _cleanup_free_ char *q = NULL;
1184
1185 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1186 * out if that's the case, and read the TTY off it. */
1187
1188 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1189 tty = strjoina("/dev/", q);
1190 }
1191
1192 if (tty) {
1193 pam_code = pam_set_item(handle, PAM_TTY, tty);
1194 if (pam_code != PAM_SUCCESS)
1195 goto fail;
1196 }
1197
1198 STRV_FOREACH(nv, *env) {
1199 pam_code = pam_putenv(handle, *nv);
1200 if (pam_code != PAM_SUCCESS)
1201 goto fail;
1202 }
1203
1204 pam_code = pam_acct_mgmt(handle, flags);
1205 if (pam_code != PAM_SUCCESS)
1206 goto fail;
1207
1208 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1209 if (pam_code != PAM_SUCCESS)
1210 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1211
1212 pam_code = pam_open_session(handle, flags);
1213 if (pam_code != PAM_SUCCESS)
1214 goto fail;
1215
1216 close_session = true;
1217
1218 e = pam_getenvlist(handle);
1219 if (!e) {
1220 pam_code = PAM_BUF_ERR;
1221 goto fail;
1222 }
1223
1224 /* Block SIGTERM, so that we know that it won't get lost in
1225 * the child */
1226
1227 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1228
1229 parent_pid = getpid_cached();
1230
1231 r = safe_fork("(sd-pam)", 0, &pam_pid);
1232 if (r < 0)
1233 goto fail;
1234 if (r == 0) {
1235 int sig, ret = EXIT_PAM;
1236
1237 /* The child's job is to reset the PAM session on
1238 * termination */
1239 barrier_set_role(&barrier, BARRIER_CHILD);
1240
1241 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
1242 * are open here that have been opened by PAM. */
1243 (void) close_many(fds, n_fds);
1244
1245 /* Drop privileges - we don't need any to pam_close_session
1246 * and this will make PR_SET_PDEATHSIG work in most cases.
1247 * If this fails, ignore the error - but expect sd-pam threads
1248 * to fail to exit normally */
1249
1250 r = maybe_setgroups(0, NULL);
1251 if (r < 0)
1252 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1253 if (setresgid(gid, gid, gid) < 0)
1254 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1255 if (setresuid(uid, uid, uid) < 0)
1256 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1257
1258 (void) ignore_signals(SIGPIPE, -1);
1259
1260 /* Wait until our parent died. This will only work if
1261 * the above setresuid() succeeds, otherwise the kernel
1262 * will not allow unprivileged parents kill their privileged
1263 * children this way. We rely on the control groups kill logic
1264 * to do the rest for us. */
1265 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1266 goto child_finish;
1267
1268 /* Tell the parent that our setup is done. This is especially
1269 * important regarding dropping privileges. Otherwise, unit
1270 * setup might race against our setresuid(2) call.
1271 *
1272 * If the parent aborted, we'll detect this below, hence ignore
1273 * return failure here. */
1274 (void) barrier_place(&barrier);
1275
1276 /* Check if our parent process might already have died? */
1277 if (getppid() == parent_pid) {
1278 sigset_t ss;
1279
1280 assert_se(sigemptyset(&ss) >= 0);
1281 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1282
1283 for (;;) {
1284 if (sigwait(&ss, &sig) < 0) {
1285 if (errno == EINTR)
1286 continue;
1287
1288 goto child_finish;
1289 }
1290
1291 assert(sig == SIGTERM);
1292 break;
1293 }
1294 }
1295
1296 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1297 if (pam_code != PAM_SUCCESS)
1298 goto child_finish;
1299
1300 /* If our parent died we'll end the session */
1301 if (getppid() != parent_pid) {
1302 pam_code = pam_close_session(handle, flags);
1303 if (pam_code != PAM_SUCCESS)
1304 goto child_finish;
1305 }
1306
1307 ret = 0;
1308
1309 child_finish:
1310 pam_end(handle, pam_code | flags);
1311 _exit(ret);
1312 }
1313
1314 barrier_set_role(&barrier, BARRIER_PARENT);
1315
1316 /* If the child was forked off successfully it will do all the
1317 * cleanups, so forget about the handle here. */
1318 handle = NULL;
1319
1320 /* Unblock SIGTERM again in the parent */
1321 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1322
1323 /* We close the log explicitly here, since the PAM modules
1324 * might have opened it, but we don't want this fd around. */
1325 closelog();
1326
1327 /* Synchronously wait for the child to initialize. We don't care for
1328 * errors as we cannot recover. However, warn loudly if it happens. */
1329 if (!barrier_place_and_sync(&barrier))
1330 log_error("PAM initialization failed");
1331
1332 return strv_free_and_replace(*env, e);
1333
1334 fail:
1335 if (pam_code != PAM_SUCCESS) {
1336 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1337 r = -EPERM; /* PAM errors do not map to errno */
1338 } else
1339 log_error_errno(r, "PAM failed: %m");
1340
1341 if (handle) {
1342 if (close_session)
1343 pam_code = pam_close_session(handle, flags);
1344
1345 pam_end(handle, pam_code | flags);
1346 }
1347
1348 strv_free(e);
1349 closelog();
1350
1351 return r;
1352 #else
1353 return 0;
1354 #endif
1355 }
1356
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * 'path'. At most the trailing 8 characters of that component are kept so that the result,
 * including the parentheses, fits in 10 characters (the length of "/sbin/init"). If the
 * component is empty, the fixed name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];                   /* '(' + up to 8 characters + ')' + NUL */
        const char *name, *tail;
        size_t len, keep;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        /* The end of the name is usually the more interesting part, since the beginning might
         * just be "systemd-". Hence keep the tail rather than the head when truncating. */
        len = strlen(name);
        keep = len > 8 ? 8 : len;
        tail = name + len - keep;

        buf[0] = '(';
        memcpy(buf + 1, tail, keep);
        buf[keep + 1] = ')';
        buf[keep + 2] = 0;

        rename_process(buf);
}
1387
1388 static bool context_has_address_families(const ExecContext *c) {
1389 assert(c);
1390
1391 return c->address_families_whitelist ||
1392 !set_isempty(c->address_families);
1393 }
1394
1395 static bool context_has_syscall_filters(const ExecContext *c) {
1396 assert(c);
1397
1398 return c->syscall_whitelist ||
1399 !hashmap_isempty(c->syscall_filter);
1400 }
1401
1402 static bool context_has_no_new_privileges(const ExecContext *c) {
1403 assert(c);
1404
1405 if (c->no_new_privileges)
1406 return true;
1407
1408 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1409 return false;
1410
1411 /* We need NNP if we have any form of seccomp and are unprivileged */
1412 return context_has_address_families(c) ||
1413 c->memory_deny_write_execute ||
1414 c->restrict_realtime ||
1415 c->restrict_suid_sgid ||
1416 exec_context_restrict_namespaces_set(c) ||
1417 c->protect_clock ||
1418 c->protect_kernel_tunables ||
1419 c->protect_kernel_modules ||
1420 c->protect_kernel_logs ||
1421 c->private_devices ||
1422 context_has_syscall_filters(c) ||
1423 !set_isempty(c->syscall_archs) ||
1424 c->lock_personality ||
1425 c->protect_hostname;
1426 }
1427
1428 #if HAVE_SECCOMP
1429
1430 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1431
1432 if (is_seccomp_available())
1433 return false;
1434
1435 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1436 return true;
1437 }
1438
1439 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1440 uint32_t negative_action, default_action, action;
1441 int r;
1442
1443 assert(u);
1444 assert(c);
1445
1446 if (!context_has_syscall_filters(c))
1447 return 0;
1448
1449 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1450 return 0;
1451
1452 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1453
1454 if (c->syscall_whitelist) {
1455 default_action = negative_action;
1456 action = SCMP_ACT_ALLOW;
1457 } else {
1458 default_action = SCMP_ACT_ALLOW;
1459 action = negative_action;
1460 }
1461
1462 if (needs_ambient_hack) {
1463 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1464 if (r < 0)
1465 return r;
1466 }
1467
1468 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1469 }
1470
1471 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1472 assert(u);
1473 assert(c);
1474
1475 if (set_isempty(c->syscall_archs))
1476 return 0;
1477
1478 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1479 return 0;
1480
1481 return seccomp_restrict_archs(c->syscall_archs);
1482 }
1483
1484 static int apply_address_families(const Unit* u, const ExecContext *c) {
1485 assert(u);
1486 assert(c);
1487
1488 if (!context_has_address_families(c))
1489 return 0;
1490
1491 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1492 return 0;
1493
1494 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1495 }
1496
1497 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1498 assert(u);
1499 assert(c);
1500
1501 if (!c->memory_deny_write_execute)
1502 return 0;
1503
1504 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1505 return 0;
1506
1507 return seccomp_memory_deny_write_execute();
1508 }
1509
1510 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1511 assert(u);
1512 assert(c);
1513
1514 if (!c->restrict_realtime)
1515 return 0;
1516
1517 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1518 return 0;
1519
1520 return seccomp_restrict_realtime();
1521 }
1522
1523 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1524 assert(u);
1525 assert(c);
1526
1527 if (!c->restrict_suid_sgid)
1528 return 0;
1529
1530 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1531 return 0;
1532
1533 return seccomp_restrict_suid_sgid();
1534 }
1535
1536 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1537 assert(u);
1538 assert(c);
1539
1540 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1541 * let's protect even those systems where this is left on in the kernel. */
1542
1543 if (!c->protect_kernel_tunables)
1544 return 0;
1545
1546 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1547 return 0;
1548
1549 return seccomp_protect_sysctl();
1550 }
1551
1552 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
1555
1556 /* Turn off module syscalls on ProtectKernelModules=yes */
1557
1558 if (!c->protect_kernel_modules)
1559 return 0;
1560
1561 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1562 return 0;
1563
1564 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1565 }
1566
1567 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1568 assert(u);
1569 assert(c);
1570
1571 if (!c->protect_kernel_logs)
1572 return 0;
1573
1574 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1575 return 0;
1576
1577 return seccomp_protect_syslog();
1578 }
1579
1580 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1581 assert(u);
1582 assert(c);
1583
1584 if (!c->protect_clock)
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(u, "ProtectClock="))
1588 return 0;
1589
1590 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1591 }
1592
1593 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
1596
1597 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1598
1599 if (!c->private_devices)
1600 return 0;
1601
1602 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1603 return 0;
1604
1605 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1609 assert(u);
1610 assert(c);
1611
1612 if (!exec_context_restrict_namespaces_set(c))
1613 return 0;
1614
1615 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1616 return 0;
1617
1618 return seccomp_restrict_namespaces(c->restrict_namespaces);
1619 }
1620
1621 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1622 unsigned long personality;
1623 int r;
1624
1625 assert(u);
1626 assert(c);
1627
1628 if (!c->lock_personality)
1629 return 0;
1630
1631 if (skip_seccomp_unavailable(u, "LockPersonality="))
1632 return 0;
1633
1634 personality = c->personality;
1635
1636 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1637 if (personality == PERSONALITY_INVALID) {
1638
1639 r = opinionated_personality(&personality);
1640 if (r < 0)
1641 return r;
1642 }
1643
1644 return seccomp_lock_personality(personality);
1645 }
1646
1647 #endif
1648
1649 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1650 assert(u);
1651 assert(c);
1652
1653 if (!c->protect_hostname)
1654 return 0;
1655
1656 if (ns_type_supported(NAMESPACE_UTS)) {
1657 if (unshare(CLONE_NEWUTS) < 0) {
1658 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1659 *ret_exit_status = EXIT_NAMESPACE;
1660 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1661 }
1662
1663 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1664 }
1665 } else
1666 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1667
1668 #if HAVE_SECCOMP
1669 int r;
1670
1671 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1672 return 0;
1673
1674 r = seccomp_protect_hostname();
1675 if (r < 0) {
1676 *ret_exit_status = EXIT_SECCOMP;
1677 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1678 }
1679 #endif
1680
1681 return 0;
1682 }
1683
1684 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1685 assert(idle_pipe);
1686
1687 idle_pipe[1] = safe_close(idle_pipe[1]);
1688 idle_pipe[2] = safe_close(idle_pipe[2]);
1689
1690 if (idle_pipe[0] >= 0) {
1691 int r;
1692
1693 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1694
1695 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1696 ssize_t n;
1697
1698 /* Signal systemd that we are bored and want to continue. */
1699 n = write(idle_pipe[3], "x", 1);
1700 if (n > 0)
1701 /* Wait for systemd to react to the signal above. */
1702 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1703 }
1704
1705 idle_pipe[0] = safe_close(idle_pipe[0]);
1706
1707 }
1708
1709 idle_pipe[3] = safe_close(idle_pipe[3]);
1710 }
1711
1712 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1713
1714 static int build_environment(
1715 const Unit *u,
1716 const ExecContext *c,
1717 const ExecParameters *p,
1718 size_t n_fds,
1719 const char *home,
1720 const char *username,
1721 const char *shell,
1722 dev_t journal_stream_dev,
1723 ino_t journal_stream_ino,
1724 char ***ret) {
1725
1726 _cleanup_strv_free_ char **our_env = NULL;
1727 ExecDirectoryType t;
1728 size_t n_env = 0;
1729 char *x;
1730
1731 assert(u);
1732 assert(c);
1733 assert(p);
1734 assert(ret);
1735
1736 our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1737 if (!our_env)
1738 return -ENOMEM;
1739
1740 if (n_fds > 0) {
1741 _cleanup_free_ char *joined = NULL;
1742
1743 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1744 return -ENOMEM;
1745 our_env[n_env++] = x;
1746
1747 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1748 return -ENOMEM;
1749 our_env[n_env++] = x;
1750
1751 joined = strv_join(p->fd_names, ":");
1752 if (!joined)
1753 return -ENOMEM;
1754
1755 x = strjoin("LISTEN_FDNAMES=", joined);
1756 if (!x)
1757 return -ENOMEM;
1758 our_env[n_env++] = x;
1759 }
1760
1761 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1762 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1763 return -ENOMEM;
1764 our_env[n_env++] = x;
1765
1766 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1767 return -ENOMEM;
1768 our_env[n_env++] = x;
1769 }
1770
1771 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1772 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1773 * check the database directly. */
1774 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1775 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1776 if (!x)
1777 return -ENOMEM;
1778 our_env[n_env++] = x;
1779 }
1780
1781 if (home) {
1782 x = strjoin("HOME=", home);
1783 if (!x)
1784 return -ENOMEM;
1785
1786 path_simplify(x + 5, true);
1787 our_env[n_env++] = x;
1788 }
1789
1790 if (username) {
1791 x = strjoin("LOGNAME=", username);
1792 if (!x)
1793 return -ENOMEM;
1794 our_env[n_env++] = x;
1795
1796 x = strjoin("USER=", username);
1797 if (!x)
1798 return -ENOMEM;
1799 our_env[n_env++] = x;
1800 }
1801
1802 if (shell) {
1803 x = strjoin("SHELL=", shell);
1804 if (!x)
1805 return -ENOMEM;
1806
1807 path_simplify(x + 6, true);
1808 our_env[n_env++] = x;
1809 }
1810
1811 if (!sd_id128_is_null(u->invocation_id)) {
1812 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1813 return -ENOMEM;
1814
1815 our_env[n_env++] = x;
1816 }
1817
1818 if (exec_context_needs_term(c)) {
1819 const char *tty_path, *term = NULL;
1820
1821 tty_path = exec_context_tty_path(c);
1822
1823 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1824 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1825 * container manager passes to PID 1 ends up all the way in the console login shown. */
1826
1827 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1828 term = getenv("TERM");
1829
1830 if (!term)
1831 term = default_term_for_tty(tty_path);
1832
1833 x = strjoin("TERM=", term);
1834 if (!x)
1835 return -ENOMEM;
1836 our_env[n_env++] = x;
1837 }
1838
1839 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1840 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1841 return -ENOMEM;
1842
1843 our_env[n_env++] = x;
1844 }
1845
1846 if (c->log_namespace) {
1847 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1848 if (!x)
1849 return -ENOMEM;
1850
1851 our_env[n_env++] = x;
1852 }
1853
1854 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1855 _cleanup_free_ char *pre = NULL, *joined = NULL;
1856 const char *n;
1857
1858 if (!p->prefix[t])
1859 continue;
1860
1861 if (strv_isempty(c->directories[t].paths))
1862 continue;
1863
1864 n = exec_directory_env_name_to_string(t);
1865 if (!n)
1866 continue;
1867
1868 pre = strjoin(p->prefix[t], "/");
1869 if (!pre)
1870 return -ENOMEM;
1871
1872 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1873 if (!joined)
1874 return -ENOMEM;
1875
1876 x = strjoin(n, "=", joined);
1877 if (!x)
1878 return -ENOMEM;
1879
1880 our_env[n_env++] = x;
1881 }
1882
1883 our_env[n_env++] = NULL;
1884 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1885
1886 *ret = TAKE_PTR(our_env);
1887
1888 return 0;
1889 }
1890
1891 static int build_pass_environment(const ExecContext *c, char ***ret) {
1892 _cleanup_strv_free_ char **pass_env = NULL;
1893 size_t n_env = 0, n_bufsize = 0;
1894 char **i;
1895
1896 STRV_FOREACH(i, c->pass_environment) {
1897 _cleanup_free_ char *x = NULL;
1898 char *v;
1899
1900 v = getenv(*i);
1901 if (!v)
1902 continue;
1903 x = strjoin(*i, "=", v);
1904 if (!x)
1905 return -ENOMEM;
1906
1907 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1908 return -ENOMEM;
1909
1910 pass_env[n_env++] = TAKE_PTR(x);
1911 pass_env[n_env] = NULL;
1912 }
1913
1914 *ret = TAKE_PTR(pass_env);
1915
1916 return 0;
1917 }
1918
1919 static bool exec_needs_mount_namespace(
1920 const ExecContext *context,
1921 const ExecParameters *params,
1922 const ExecRuntime *runtime) {
1923
1924 assert(context);
1925 assert(params);
1926
1927 if (context->root_image)
1928 return true;
1929
1930 if (!strv_isempty(context->read_write_paths) ||
1931 !strv_isempty(context->read_only_paths) ||
1932 !strv_isempty(context->inaccessible_paths))
1933 return true;
1934
1935 if (context->n_bind_mounts > 0)
1936 return true;
1937
1938 if (context->n_temporary_filesystems > 0)
1939 return true;
1940
1941 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1942 return true;
1943
1944 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1945 return true;
1946
1947 if (context->private_devices ||
1948 context->private_mounts ||
1949 context->protect_system != PROTECT_SYSTEM_NO ||
1950 context->protect_home != PROTECT_HOME_NO ||
1951 context->protect_kernel_tunables ||
1952 context->protect_kernel_modules ||
1953 context->protect_kernel_logs ||
1954 context->protect_control_groups)
1955 return true;
1956
1957 if (context->root_directory) {
1958 ExecDirectoryType t;
1959
1960 if (context->mount_apivfs)
1961 return true;
1962
1963 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1964 if (!params->prefix[t])
1965 continue;
1966
1967 if (!strv_isempty(context->directories[t].paths))
1968 return true;
1969 }
1970 }
1971
1972 if (context->dynamic_user &&
1973 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1974 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1975 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1976 return true;
1977
1978 if (context->log_namespace)
1979 return true;
1980
1981 return false;
1982 }
1983
/* setup_private_users(): moves the calling process into a new user namespace and has a
 * temporary child process write the UID/GID maps for it.
 *
 * ouid/ogid are the original IDs, which are always mapped to themselves. uid/gid are the
 * selected IDs, which are additionally mapped to themselves when they are valid, differ from
 * the original ones and we hold CAP_SETUID/CAP_SETGID respectively.
 *
 * Returns 0 on success and a negative errno-style value on failure (-ENOMEM, a system call
 * error, an error code reported by the child, or -EIO if the child behaved unexpectedly). */
1984 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1985 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1986 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1987 _cleanup_close_ int unshare_ready_fd = -1;
1988 _cleanup_(sigkill_waitp) pid_t pid = 0;
1989 uint64_t c = 1;
1990 ssize_t n;
1991 int r;
1992
1993 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1994 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1995 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1996 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1997 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1998 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1999 * continues execution normally.
2000 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2001 * does not need CAP_SETUID to write the single line mapping to itself. */
2002
2003 /* Can only set up multiple mappings with CAP_SETUID. */
2004 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2005 r = asprintf(&uid_map,
2006 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2007 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2008 ouid, ouid, uid, uid);
2009 else
2010 r = asprintf(&uid_map,
2011 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2012 ouid, ouid);
2013
2014 if (r < 0)
2015 return -ENOMEM;
2016
2017 /* Can only set up multiple mappings with CAP_SETGID. */
2018 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2019 r = asprintf(&gid_map,
2020 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2021 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2022 ogid, ogid, gid, gid);
2023 else
2024 r = asprintf(&gid_map,
2025 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2026 ogid, ogid);
2027
2028 if (r < 0)
2029 return -ENOMEM;
2030
2031 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2032 * namespace. */
2033 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2034 if (unshare_ready_fd < 0)
2035 return -errno;
2036
2037 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2038 * failed. */
2039 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2040 return -errno;
2041
2042 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2043 if (r < 0)
2044 return r;
2045 if (r == 0) {
2046 _cleanup_close_ int fd = -1;
2047 const char *a;
2048 pid_t ppid;
2049
2050 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2051 * here, after the parent opened its own user namespace. */
2052
2053 ppid = getppid();
/* The child only writes to the error pipe, hence close the reading end here. */
2054 errno_pipe[0] = safe_close(errno_pipe[0]);
2055
2056 /* Wait until the parent unshared the user namespace */
2057 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2058 r = -errno;
2059 goto child_fail;
2060 }
2061
2062 /* Disable the setgroups() system call in the child user namespace, for good. */
2063 a = procfs_file_alloca(ppid, "setgroups");
2064 fd = open(a, O_WRONLY|O_CLOEXEC);
2065 if (fd < 0) {
2066 if (errno != ENOENT) {
2067 r = -errno;
2068 goto child_fail;
2069 }
2070
2071 /* If the file is missing the kernel is too old, let's continue anyway. */
2072 } else {
2073 if (write(fd, "deny\n", 5) < 0) {
2074 r = -errno;
2075 goto child_fail;
2076 }
2077
2078 fd = safe_close(fd);
2079 }
2080
2081 /* First write the GID map */
2082 a = procfs_file_alloca(ppid, "gid_map");
2083 fd = open(a, O_WRONLY|O_CLOEXEC);
2084 if (fd < 0) {
2085 r = -errno;
2086 goto child_fail;
2087 }
2088 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2089 r = -errno;
2090 goto child_fail;
2091 }
2092 fd = safe_close(fd);
2093
2094 /* Then write the UID map */
2095 a = procfs_file_alloca(ppid, "uid_map");
2096 fd = open(a, O_WRONLY|O_CLOEXEC);
2097 if (fd < 0) {
2098 r = -errno;
2099 goto child_fail;
2100 }
2101 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2102 r = -errno;
2103 goto child_fail;
2104 }
2105
/* Success: exit without writing anything to the error pipe, so that the parent reads
 * zero bytes from it. */
2106 _exit(EXIT_SUCCESS);
2107
2108 child_fail:
/* Report the negative error code to the parent; the write is best-effort. */
2109 (void) write(errno_pipe[1], &r, sizeof(r));
2110 _exit(EXIT_FAILURE);
2111 }
2112
/* Parent: close our copy of the writing end, so that the read() below only depends on the
 * child's copy. */
2113 errno_pipe[1] = safe_close(errno_pipe[1]);
2114
2115 if (unshare(CLONE_NEWUSER) < 0)
2116 return -errno;
2117
2118 /* Let the child know that the namespace is ready now */
2119 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2120 return -errno;
2121
2122 /* Try to read an error code from the child */
2123 n = read(errno_pipe[0], &r, sizeof(r));
2124 if (n < 0)
2125 return -errno;
2126 if (n == sizeof(r)) { /* an error code was sent to us */
2127 if (r < 0)
2128 return r;
2129 return -EIO;
2130 }
2131 if (n != 0) /* on success we should have read 0 bytes */
2132 return -EIO;
2133
2134 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
/* NOTE(review): resetting pid presumably keeps the sigkill_waitp cleanup handler from acting
 * on the child we just waited for -- confirm against sigkill_waitp(). */
2135 pid = 0;
2136 if (r < 0)
2137 return r;
2138 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2139 return -EIO;
2140
2141 return 0;
2142 }
2143
2144 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2145 if (!context->dynamic_user)
2146 return false;
2147
2148 if (type == EXEC_DIRECTORY_CONFIGURATION)
2149 return false;
2150
2151 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2152 return false;
2153
2154 return true;
2155 }
2156
/* setup_exec_directory(): creates all directories configured for the given directory type
 * below params->prefix[type] and adjusts their access mode and ownership.
 *
 * For types handled as "private" (see exec_directory_is_private()) the directory is created
 * below "<prefix>/private/" and symlinked from its regular location; a pre-existing regular
 * directory is moved over. Otherwise a left-over symlink into "<prefix>/private/" is migrated
 * back to the regular location. An already existing configuration directory is left untouched
 * apart from a warning if its mode differs.
 *
 * Does nothing and returns 0 if no prefix is set for the type. Returns 0 on success; on
 * failure returns a negative errno-style value and stores the exit status matching the
 * directory type in *exit_status. */
2157 static int setup_exec_directory(
2158 const ExecContext *context,
2159 const ExecParameters *params,
2160 uid_t uid,
2161 gid_t gid,
2162 ExecDirectoryType type,
2163 int *exit_status) {
2164
/* Exit status reported through *exit_status on failure, indexed by directory type. */
2165 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2166 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2167 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2168 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2169 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2170 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2171 };
2172 char **rt;
2173 int r;
2174
2175 assert(context);
2176 assert(params);
2177 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2178 assert(exit_status);
2179
2180 if (!params->prefix[type])
2181 return 0;
2182
/* When directories are to be chowned, invalid IDs fall back to 0. */
2183 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2184 if (!uid_is_valid(uid))
2185 uid = 0;
2186 if (!gid_is_valid(gid))
2187 gid = 0;
2188 }
2189
2190 STRV_FOREACH(rt, context->directories[type].paths) {
/* p is the regular location of the directory; pp is its location below "private/" and is
 * only set in the private case. */
2191 _cleanup_free_ char *p = NULL, *pp = NULL;
2192
2193 p = path_join(params->prefix[type], *rt);
2194 if (!p) {
2195 r = -ENOMEM;
2196 goto fail;
2197 }
2198
2199 r = mkdir_parents_label(p, 0755);
2200 if (r < 0)
2201 goto fail;
2202
2203 if (exec_directory_is_private(context, type)) {
2204 _cleanup_free_ char *private_root = NULL;
2205
2206 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2207 * case we want to avoid leaving a directory around fully accessible that is owned by
2208 * a dynamic user whose UID is later on reused. To lock this down we use the same
2209 * trick used by container managers to prohibit host users to get access to files of
2210 * the same UID in containers: we place everything inside a directory that has an
2211 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2212 * for unprivileged host code. We then use fs namespacing to make this directory
2213 * permeable for the service itself.
2214 *
2215 * Specifically: for a service which wants a special directory "foo/" we first create
2216 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2217 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2218 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2219 * unprivileged host users can't look into it. Inside of the namespace of the unit
2220 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2221 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2222 * for the service and making sure it only gets access to the dirs it needs but no
2223 * others. Tricky? Yes, absolutely, but it works!
2224 *
2225 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2226 * to be owned by the service itself.
2227 *
2228 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2229 * for sharing files or sockets with other services. */
2230
2231 private_root = path_join(params->prefix[type], "private");
2232 if (!private_root) {
2233 r = -ENOMEM;
2234 goto fail;
2235 }
2236
2237 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2238 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2239 if (r < 0)
2240 goto fail;
2241
2242 pp = path_join(private_root, *rt);
2243 if (!pp) {
2244 r = -ENOMEM;
2245 goto fail;
2246 }
2247
2248 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2249 r = mkdir_parents_label(pp, 0755);
2250 if (r < 0)
2251 goto fail;
2252
2253 if (is_dir(p, false) > 0 &&
2254 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2255
2256 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2257 * it over. Most likely the service has been upgraded from one that didn't use
2258 * DynamicUser=1, to one that does. */
2259
2260 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2261 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2262 exec_directory_type_to_string(type), p, pp);
2263
2264 if (rename(p, pp) < 0) {
2265 r = -errno;
2266 goto fail;
2267 }
2268 } else {
2269 /* Otherwise, create the actual directory for the service */
2270
2271 r = mkdir_label(pp, context->directories[type].mode);
2272 if (r < 0 && r != -EEXIST)
2273 goto fail;
2274 }
2275
2276 /* And link it up from the original place */
2277 r = symlink_idempotent(pp, p, true);
2278 if (r < 0)
2279 goto fail;
2280
2281 } else {
2282 _cleanup_free_ char *target = NULL;
2283
2284 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2285 readlink_and_make_absolute(p, &target) >= 0) {
2286 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2287
2288 /* This already exists and is a symlink? Interesting. Maybe it's one created
2289 * by DynamicUser=1 (see above)?
2290 *
2291 * We do this for all directory types except for ConfigurationDirectory=,
2292 * since they all support the private/ symlink logic at least in some
2293 * configurations, see above. */
2294
2295 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2296 if (r < 0)
2297 goto fail;
2298
2299 q = path_join(params->prefix[type], "private", *rt);
2300 if (!q) {
2301 r = -ENOMEM;
2302 goto fail;
2303 }
2304
2305 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2306 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2307 if (r < 0)
2308 goto fail;
2309
2310 if (path_equal(q_resolved, target_resolved)) {
2311
2312 /* Hmm, apparently DynamicUser= was once turned on for this service,
2313 * but is no longer. Let's move the directory back up. */
2314
2315 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2316 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2317 exec_directory_type_to_string(type), q, p);
2318
/* Remove the symlink first, then move the private directory into its place. */
2319 if (unlink(p) < 0) {
2320 r = -errno;
2321 goto fail;
2322 }
2323
2324 if (rename(q, p) < 0) {
2325 r = -errno;
2326 goto fail;
2327 }
2328 }
2329 }
2330
2331 r = mkdir_label(p, context->directories[type].mode);
2332 if (r < 0) {
2333 if (r != -EEXIST)
2334 goto fail;
2335
2336 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2337 struct stat st;
2338
2339 /* Don't change the owner/access mode of the configuration directory,
2340 * as in the common case it is not written to by a service, and shall
2341 * not be writable. */
2342
2343 if (stat(p, &st) < 0) {
2344 r = -errno;
2345 goto fail;
2346 }
2347
2348 /* Still complain if the access mode doesn't match */
2349 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2350 log_warning("%s \'%s\' already exists but the mode is different. "
2351 "(File system: %o %sMode: %o)",
2352 exec_directory_type_to_string(type), *rt,
2353 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2354
/* Skip the chmod/chown steps below for this pre-existing configuration directory. */
2355 continue;
2356 }
2357 }
2358 }
2359
2360 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2361 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2362 * current UID/GID ownership.) */
2363 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2364 if (r < 0)
2365 goto fail;
2366
2367 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2368 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2369 * assignments to exist.*/
2370 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2371 if (r < 0)
2372 goto fail;
2373 }
2374
2375 return 0;
2376
2377 fail:
/* All error paths end up here: report the exit status matching the directory type. */
2378 *exit_status = exit_status_table[type];
2379 return r;
2380 }
2381
2382 #if ENABLE_SMACK
2383 static int setup_smack(
2384 const ExecContext *context,
2385 const ExecCommand *command) {
2386
2387 int r;
2388
2389 assert(context);
2390 assert(command);
2391
2392 if (context->smack_process_label) {
2393 r = mac_smack_apply_pid(0, context->smack_process_label);
2394 if (r < 0)
2395 return r;
2396 }
2397 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2398 else {
2399 _cleanup_free_ char *exec_label = NULL;
2400
2401 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2402 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2403 return r;
2404
2405 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2406 if (r < 0)
2407 return r;
2408 }
2409 #endif
2410
2411 return 0;
2412 }
2413 #endif
2414
2415 static int compile_bind_mounts(
2416 const ExecContext *context,
2417 const ExecParameters *params,
2418 BindMount **ret_bind_mounts,
2419 size_t *ret_n_bind_mounts,
2420 char ***ret_empty_directories) {
2421
2422 _cleanup_strv_free_ char **empty_directories = NULL;
2423 BindMount *bind_mounts;
2424 size_t n, h = 0, i;
2425 ExecDirectoryType t;
2426 int r;
2427
2428 assert(context);
2429 assert(params);
2430 assert(ret_bind_mounts);
2431 assert(ret_n_bind_mounts);
2432 assert(ret_empty_directories);
2433
2434 n = context->n_bind_mounts;
2435 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2436 if (!params->prefix[t])
2437 continue;
2438
2439 n += strv_length(context->directories[t].paths);
2440 }
2441
2442 if (n <= 0) {
2443 *ret_bind_mounts = NULL;
2444 *ret_n_bind_mounts = 0;
2445 *ret_empty_directories = NULL;
2446 return 0;
2447 }
2448
2449 bind_mounts = new(BindMount, n);
2450 if (!bind_mounts)
2451 return -ENOMEM;
2452
2453 for (i = 0; i < context->n_bind_mounts; i++) {
2454 BindMount *item = context->bind_mounts + i;
2455 char *s, *d;
2456
2457 s = strdup(item->source);
2458 if (!s) {
2459 r = -ENOMEM;
2460 goto finish;
2461 }
2462
2463 d = strdup(item->destination);
2464 if (!d) {
2465 free(s);
2466 r = -ENOMEM;
2467 goto finish;
2468 }
2469
2470 bind_mounts[h++] = (BindMount) {
2471 .source = s,
2472 .destination = d,
2473 .read_only = item->read_only,
2474 .recursive = item->recursive,
2475 .ignore_enoent = item->ignore_enoent,
2476 };
2477 }
2478
2479 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2480 char **suffix;
2481
2482 if (!params->prefix[t])
2483 continue;
2484
2485 if (strv_isempty(context->directories[t].paths))
2486 continue;
2487
2488 if (exec_directory_is_private(context, t) &&
2489 !(context->root_directory || context->root_image)) {
2490 char *private_root;
2491
2492 /* So this is for a dynamic user, and we need to make sure the process can access its own
2493 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2494 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2495
2496 private_root = path_join(params->prefix[t], "private");
2497 if (!private_root) {
2498 r = -ENOMEM;
2499 goto finish;
2500 }
2501
2502 r = strv_consume(&empty_directories, private_root);
2503 if (r < 0)
2504 goto finish;
2505 }
2506
2507 STRV_FOREACH(suffix, context->directories[t].paths) {
2508 char *s, *d;
2509
2510 if (exec_directory_is_private(context, t))
2511 s = path_join(params->prefix[t], "private", *suffix);
2512 else
2513 s = path_join(params->prefix[t], *suffix);
2514 if (!s) {
2515 r = -ENOMEM;
2516 goto finish;
2517 }
2518
2519 if (exec_directory_is_private(context, t) &&
2520 (context->root_directory || context->root_image))
2521 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2522 * directory is not created on the root directory. So, let's bind-mount the directory
2523 * on the 'non-private' place. */
2524 d = path_join(params->prefix[t], *suffix);
2525 else
2526 d = strdup(s);
2527 if (!d) {
2528 free(s);
2529 r = -ENOMEM;
2530 goto finish;
2531 }
2532
2533 bind_mounts[h++] = (BindMount) {
2534 .source = s,
2535 .destination = d,
2536 .read_only = false,
2537 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2538 .recursive = true,
2539 .ignore_enoent = false,
2540 };
2541 }
2542 }
2543
2544 assert(h == n);
2545
2546 *ret_bind_mounts = bind_mounts;
2547 *ret_n_bind_mounts = n;
2548 *ret_empty_directories = TAKE_PTR(empty_directories);
2549
2550 return (int) n;
2551
2552 finish:
2553 bind_mount_free_many(bind_mounts, h);
2554 return r;
2555 }
2556
2557 static bool insist_on_sandboxing(
2558 const ExecContext *context,
2559 const char *root_dir,
2560 const char *root_image,
2561 const BindMount *bind_mounts,
2562 size_t n_bind_mounts) {
2563
2564 size_t i;
2565
2566 assert(context);
2567 assert(n_bind_mounts == 0 || bind_mounts);
2568
2569 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2570 * would alter the view on the file system beyond making things read-only or invisble, i.e. would
2571 * rearrange stuff in a way we cannot ignore gracefully. */
2572
2573 if (context->n_temporary_filesystems > 0)
2574 return true;
2575
2576 if (root_dir || root_image)
2577 return true;
2578
2579 if (context->dynamic_user)
2580 return true;
2581
2582 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2583 * essential. */
2584 for (i = 0; i < n_bind_mounts; i++)
2585 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2586 return true;
2587
2588 if (context->log_namespace)
2589 return true;
2590
2591 return false;
2592 }
2593
2594 static int apply_mount_namespace(
2595 const Unit *u,
2596 const ExecCommand *command,
2597 const ExecContext *context,
2598 const ExecParameters *params,
2599 const ExecRuntime *runtime,
2600 char **error_path) {
2601
2602 _cleanup_strv_free_ char **empty_directories = NULL;
2603 char *tmp = NULL, *var = NULL;
2604 const char *root_dir = NULL, *root_image = NULL;
2605 NamespaceInfo ns_info;
2606 bool needs_sandboxing;
2607 BindMount *bind_mounts = NULL;
2608 size_t n_bind_mounts = 0;
2609 int r;
2610
2611 assert(context);
2612
2613 if (params->flags & EXEC_APPLY_CHROOT) {
2614 root_image = context->root_image;
2615
2616 if (!root_image)
2617 root_dir = context->root_directory;
2618 }
2619
2620 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2621 if (r < 0)
2622 return r;
2623
2624 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2625 if (needs_sandboxing) {
2626 /* The runtime struct only contains the parent of the private /tmp,
2627 * which is non-accessible to world users. Inside of it there's a /tmp
2628 * that is sticky, and that's the one we want to use here. */
2629
2630 if (context->private_tmp && runtime) {
2631 if (runtime->tmp_dir)
2632 tmp = strjoina(runtime->tmp_dir, "/tmp");
2633 if (runtime->var_tmp_dir)
2634 var = strjoina(runtime->var_tmp_dir, "/tmp");
2635 }
2636
2637 ns_info = (NamespaceInfo) {
2638 .ignore_protect_paths = false,
2639 .private_dev = context->private_devices,
2640 .protect_control_groups = context->protect_control_groups,
2641 .protect_kernel_tunables = context->protect_kernel_tunables,
2642 .protect_kernel_modules = context->protect_kernel_modules,
2643 .protect_kernel_logs = context->protect_kernel_logs,
2644 .protect_hostname = context->protect_hostname,
2645 .mount_apivfs = context->mount_apivfs,
2646 .private_mounts = context->private_mounts,
2647 };
2648 } else if (!context->dynamic_user && root_dir)
2649 /*
2650 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2651 * sandbox info, otherwise enforce it, don't ignore protected paths and
2652 * fail if we are enable to apply the sandbox inside the mount namespace.
2653 */
2654 ns_info = (NamespaceInfo) {
2655 .ignore_protect_paths = true,
2656 };
2657 else
2658 ns_info = (NamespaceInfo) {};
2659
2660 if (context->mount_flags == MS_SHARED)
2661 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2662
2663 r = setup_namespace(root_dir, root_image,
2664 &ns_info, context->read_write_paths,
2665 needs_sandboxing ? context->read_only_paths : NULL,
2666 needs_sandboxing ? context->inaccessible_paths : NULL,
2667 empty_directories,
2668 bind_mounts,
2669 n_bind_mounts,
2670 context->temporary_filesystems,
2671 context->n_temporary_filesystems,
2672 tmp,
2673 var,
2674 context->log_namespace,
2675 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2676 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2677 context->mount_flags,
2678 DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2679 error_path);
2680
2681 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2682 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2683 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2684 * completely different execution environment. */
2685 if (r == -ENOANO) {
2686 if (insist_on_sandboxing(
2687 context,
2688 root_dir, root_image,
2689 bind_mounts,
2690 n_bind_mounts)) {
2691 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2692 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2693 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2694
2695 r = -EOPNOTSUPP;
2696 } else {
2697 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2698 r = 0;
2699 }
2700 }
2701
2702 bind_mount_free_many(bind_mounts, n_bind_mounts);
2703 return r;
2704 }
2705
2706 static int apply_working_directory(
2707 const ExecContext *context,
2708 const ExecParameters *params,
2709 const char *home,
2710 int *exit_status) {
2711
2712 const char *d, *wd;
2713
2714 assert(context);
2715 assert(exit_status);
2716
2717 if (context->working_directory_home) {
2718
2719 if (!home) {
2720 *exit_status = EXIT_CHDIR;
2721 return -ENXIO;
2722 }
2723
2724 wd = home;
2725
2726 } else if (context->working_directory)
2727 wd = context->working_directory;
2728 else
2729 wd = "/";
2730
2731 if (params->flags & EXEC_APPLY_CHROOT)
2732 d = wd;
2733 else
2734 d = prefix_roota(context->root_directory, wd);
2735
2736 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2737 *exit_status = EXIT_CHDIR;
2738 return -errno;
2739 }
2740
2741 return 0;
2742 }
2743
2744 static int apply_root_directory(
2745 const ExecContext *context,
2746 const ExecParameters *params,
2747 const bool needs_mount_ns,
2748 int *exit_status) {
2749
2750 assert(context);
2751 assert(exit_status);
2752
2753 if (params->flags & EXEC_APPLY_CHROOT) {
2754 if (!needs_mount_ns && context->root_directory)
2755 if (chroot(context->root_directory) < 0) {
2756 *exit_status = EXIT_CHROOT;
2757 return -errno;
2758 }
2759 }
2760
2761 return 0;
2762 }
2763
2764 static int setup_keyring(
2765 const Unit *u,
2766 const ExecContext *context,
2767 const ExecParameters *p,
2768 uid_t uid, gid_t gid) {
2769
2770 key_serial_t keyring;
2771 int r = 0;
2772 uid_t saved_uid;
2773 gid_t saved_gid;
2774
2775 assert(u);
2776 assert(context);
2777 assert(p);
2778
2779 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2780 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2781 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2782 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2783 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2784 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2785
2786 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2787 return 0;
2788
2789 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2790 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2791 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2792 * & group is just as nasty as acquiring a reference to the user keyring. */
2793
2794 saved_uid = getuid();
2795 saved_gid = getgid();
2796
2797 if (gid_is_valid(gid) && gid != saved_gid) {
2798 if (setregid(gid, -1) < 0)
2799 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2800 }
2801
2802 if (uid_is_valid(uid) && uid != saved_uid) {
2803 if (setreuid(uid, -1) < 0) {
2804 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2805 goto out;
2806 }
2807 }
2808
2809 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2810 if (keyring == -1) {
2811 if (errno == ENOSYS)
2812 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2813 else if (IN_SET(errno, EACCES, EPERM))
2814 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2815 else if (errno == EDQUOT)
2816 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2817 else
2818 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2819
2820 goto out;
2821 }
2822
2823 /* When requested link the user keyring into the session keyring. */
2824 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2825
2826 if (keyctl(KEYCTL_LINK,
2827 KEY_SPEC_USER_KEYRING,
2828 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2829 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2830 goto out;
2831 }
2832 }
2833
2834 /* Restore uid/gid back */
2835 if (uid_is_valid(uid) && uid != saved_uid) {
2836 if (setreuid(saved_uid, -1) < 0) {
2837 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2838 goto out;
2839 }
2840 }
2841
2842 if (gid_is_valid(gid) && gid != saved_gid) {
2843 if (setregid(saved_gid, -1) < 0)
2844 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2845 }
2846
2847 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2848 if (!sd_id128_is_null(u->invocation_id)) {
2849 key_serial_t key;
2850
2851 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2852 if (key == -1)
2853 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2854 else {
2855 if (keyctl(KEYCTL_SETPERM, key,
2856 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2857 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2858 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2859 }
2860 }
2861
2862 out:
2863 /* Revert back uid & gid for the the last time, and exit */
2864 /* no extra logging, as only the first already reported error matters */
2865 if (getuid() != saved_uid)
2866 (void) setreuid(saved_uid, -1);
2867
2868 if (getgid() != saved_gid)
2869 (void) setregid(saved_gid, -1);
2870
2871 return r;
2872 }
2873
/* Appends the valid (i.e. non-negative) file descriptors of the two-element array 'pair' to
 * 'array', in order, and advances the element counter *n by the number of fds appended. The caller
 * must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t k;

        assert(array);
        assert(n);
        assert(pair);

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2884
2885 static int close_remaining_fds(
2886 const ExecParameters *params,
2887 const ExecRuntime *runtime,
2888 const DynamicCreds *dcreds,
2889 int user_lookup_fd,
2890 int socket_fd,
2891 int exec_fd,
2892 const int *fds, size_t n_fds) {
2893
2894 size_t n_dont_close = 0;
2895 int dont_close[n_fds + 12];
2896
2897 assert(params);
2898
2899 if (params->stdin_fd >= 0)
2900 dont_close[n_dont_close++] = params->stdin_fd;
2901 if (params->stdout_fd >= 0)
2902 dont_close[n_dont_close++] = params->stdout_fd;
2903 if (params->stderr_fd >= 0)
2904 dont_close[n_dont_close++] = params->stderr_fd;
2905
2906 if (socket_fd >= 0)
2907 dont_close[n_dont_close++] = socket_fd;
2908 if (exec_fd >= 0)
2909 dont_close[n_dont_close++] = exec_fd;
2910 if (n_fds > 0) {
2911 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2912 n_dont_close += n_fds;
2913 }
2914
2915 if (runtime)
2916 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2917
2918 if (dcreds) {
2919 if (dcreds->user)
2920 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2921 if (dcreds->group)
2922 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2923 }
2924
2925 if (user_lookup_fd >= 0)
2926 dont_close[n_dont_close++] = user_lookup_fd;
2927
2928 return close_all_fds(dont_close, n_dont_close);
2929 }
2930
2931 static int send_user_lookup(
2932 Unit *unit,
2933 int user_lookup_fd,
2934 uid_t uid,
2935 gid_t gid) {
2936
2937 assert(unit);
2938
2939 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2940 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2941 * specified. */
2942
2943 if (user_lookup_fd < 0)
2944 return 0;
2945
2946 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2947 return 0;
2948
2949 if (writev(user_lookup_fd,
2950 (struct iovec[]) {
2951 IOVEC_INIT(&uid, sizeof(uid)),
2952 IOVEC_INIT(&gid, sizeof(gid)),
2953 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2954 return -errno;
2955
2956 return 0;
2957 }
2958
2959 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2960 int r;
2961
2962 assert(c);
2963 assert(home);
2964 assert(buf);
2965
2966 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2967
2968 if (*home)
2969 return 0;
2970
2971 if (!c->working_directory_home)
2972 return 0;
2973
2974 r = get_home_dir(buf);
2975 if (r < 0)
2976 return r;
2977
2978 *home = *buf;
2979 return 1;
2980 }
2981
2982 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2983 _cleanup_strv_free_ char ** list = NULL;
2984 ExecDirectoryType t;
2985 int r;
2986
2987 assert(c);
2988 assert(p);
2989 assert(ret);
2990
2991 assert(c->dynamic_user);
2992
2993 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2994 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2995 * directories. */
2996
2997 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2998 char **i;
2999
3000 if (t == EXEC_DIRECTORY_CONFIGURATION)
3001 continue;
3002
3003 if (!p->prefix[t])
3004 continue;
3005
3006 STRV_FOREACH(i, c->directories[t].paths) {
3007 char *e;
3008
3009 if (exec_directory_is_private(c, t))
3010 e = path_join(p->prefix[t], "private", *i);
3011 else
3012 e = path_join(p->prefix[t], *i);
3013 if (!e)
3014 return -ENOMEM;
3015
3016 r = strv_consume(&list, e);
3017 if (r < 0)
3018 return r;
3019 }
3020 }
3021
3022 *ret = TAKE_PTR(list);
3023
3024 return 0;
3025 }
3026
3027 static char *exec_command_line(char **argv);
3028
3029 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3030 bool using_subcgroup;
3031 char *p;
3032
3033 assert(params);
3034 assert(ret);
3035
3036 if (!params->cgroup_path)
3037 return -EINVAL;
3038
3039 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3040 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3041 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3042 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3043 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3044 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3045 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3046 * flag, which is only passed for the former statements, not for the latter. */
3047
3048 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3049 if (using_subcgroup)
3050 p = path_join(params->cgroup_path, ".control");
3051 else
3052 p = strdup(params->cgroup_path);
3053 if (!p)
3054 return -ENOMEM;
3055
3056 *ret = p;
3057 return using_subcgroup;
3058 }
3059
3060 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3061 _cleanup_(cpu_set_reset) CPUSet s = {};
3062 int r;
3063
3064 assert(c);
3065 assert(ret);
3066
3067 if (!c->numa_policy.nodes.set) {
3068 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3069 return 0;
3070 }
3071
3072 r = numa_to_cpu_set(&c->numa_policy, &s);
3073 if (r < 0)
3074 return r;
3075
3076 cpu_set_reset(ret);
3077
3078 return cpu_set_add_all(ret, &s);
3079 }
3080
3081 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3082 assert(c);
3083
3084 return c->cpu_affinity_from_numa;
3085 }
3086
3087 static int exec_child(
3088 Unit *unit,
3089 const ExecCommand *command,
3090 const ExecContext *context,
3091 const ExecParameters *params,
3092 ExecRuntime *runtime,
3093 DynamicCreds *dcreds,
3094 int socket_fd,
3095 const int named_iofds[static 3],
3096 int *fds,
3097 size_t n_socket_fds,
3098 size_t n_storage_fds,
3099 char **files_env,
3100 int user_lookup_fd,
3101 int *exit_status) {
3102
3103 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3104 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3105 _cleanup_free_ gid_t *supplementary_gids = NULL;
3106 const char *username = NULL, *groupname = NULL;
3107 _cleanup_free_ char *home_buffer = NULL;
3108 const char *home = NULL, *shell = NULL;
3109 char **final_argv = NULL;
3110 dev_t journal_stream_dev = 0;
3111 ino_t journal_stream_ino = 0;
3112 bool userns_set_up = false;
3113 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3114 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3115 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3116 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3117 #if HAVE_SELINUX
3118 _cleanup_free_ char *mac_selinux_context_net = NULL;
3119 bool use_selinux = false;
3120 #endif
3121 #if ENABLE_SMACK
3122 bool use_smack = false;
3123 #endif
3124 #if HAVE_APPARMOR
3125 bool use_apparmor = false;
3126 #endif
3127 uid_t saved_uid = getuid();
3128 gid_t saved_gid = getgid();
3129 uid_t uid = UID_INVALID;
3130 gid_t gid = GID_INVALID;
3131 size_t n_fds;
3132 ExecDirectoryType dt;
3133 int secure_bits;
3134 _cleanup_free_ gid_t *gids_after_pam = NULL;
3135 int ngids_after_pam = 0;
3136
3137 assert(unit);
3138 assert(command);
3139 assert(context);
3140 assert(params);
3141 assert(exit_status);
3142
3143 rename_process_from_path(command->path);
3144
3145 /* We reset exactly these signals, since they are the
3146 * only ones we set to SIG_IGN in the main daemon. All
3147 * others we leave untouched because we set them to
3148 * SIG_DFL or a valid handler initially, both of which
3149 * will be demoted to SIG_DFL. */
3150 (void) default_signals(SIGNALS_CRASH_HANDLER,
3151 SIGNALS_IGNORE, -1);
3152
3153 if (context->ignore_sigpipe)
3154 (void) ignore_signals(SIGPIPE, -1);
3155
3156 r = reset_signal_mask();
3157 if (r < 0) {
3158 *exit_status = EXIT_SIGNAL_MASK;
3159 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3160 }
3161
3162 if (params->idle_pipe)
3163 do_idle_pipe_dance(params->idle_pipe);
3164
3165 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3166 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3167 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3168 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3169
3170 log_forget_fds();
3171 log_set_open_when_needed(true);
3172
3173 /* In case anything used libc syslog(), close this here, too */
3174 closelog();
3175
3176 n_fds = n_socket_fds + n_storage_fds;
3177 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3178 if (r < 0) {
3179 *exit_status = EXIT_FDS;
3180 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3181 }
3182
3183 if (!context->same_pgrp)
3184 if (setsid() < 0) {
3185 *exit_status = EXIT_SETSID;
3186 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3187 }
3188
3189 exec_context_tty_reset(context, params);
3190
3191 if (unit_shall_confirm_spawn(unit)) {
3192 const char *vc = params->confirm_spawn;
3193 _cleanup_free_ char *cmdline = NULL;
3194
3195 cmdline = exec_command_line(command->argv);
3196 if (!cmdline) {
3197 *exit_status = EXIT_MEMORY;
3198 return log_oom();
3199 }
3200
3201 r = ask_for_confirmation(vc, unit, cmdline);
3202 if (r != CONFIRM_EXECUTE) {
3203 if (r == CONFIRM_PRETEND_SUCCESS) {
3204 *exit_status = EXIT_SUCCESS;
3205 return 0;
3206 }
3207 *exit_status = EXIT_CONFIRM;
3208 log_unit_error(unit, "Execution cancelled by the user");
3209 return -ECANCELED;
3210 }
3211 }
3212
3213 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3214 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3215 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3216 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3217 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3218 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3219 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3220 *exit_status = EXIT_MEMORY;
3221 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3222 }
3223
3224 if (context->dynamic_user && dcreds) {
3225 _cleanup_strv_free_ char **suggested_paths = NULL;
3226
3227 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3228 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3229 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3230 *exit_status = EXIT_USER;
3231 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3232 }
3233
3234 r = compile_suggested_paths(context, params, &suggested_paths);
3235 if (r < 0) {
3236 *exit_status = EXIT_MEMORY;
3237 return log_oom();
3238 }
3239
3240 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3241 if (r < 0) {
3242 *exit_status = EXIT_USER;
3243 if (r == -EILSEQ) {
3244 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3245 return -EOPNOTSUPP;
3246 }
3247 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3248 }
3249
3250 if (!uid_is_valid(uid)) {
3251 *exit_status = EXIT_USER;
3252 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3253 return -ESRCH;
3254 }
3255
3256 if (!gid_is_valid(gid)) {
3257 *exit_status = EXIT_USER;
3258 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3259 return -ESRCH;
3260 }
3261
3262 if (dcreds->user)
3263 username = dcreds->user->name;
3264
3265 } else {
3266 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3267 if (r < 0) {
3268 *exit_status = EXIT_USER;
3269 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3270 }
3271
3272 r = get_fixed_group(context, &groupname, &gid);
3273 if (r < 0) {
3274 *exit_status = EXIT_GROUP;
3275 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3276 }
3277 }
3278
3279 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3280 r = get_supplementary_groups(context, username, groupname, gid,
3281 &supplementary_gids, &ngids);
3282 if (r < 0) {
3283 *exit_status = EXIT_GROUP;
3284 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3285 }
3286
3287 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3288 if (r < 0) {
3289 *exit_status = EXIT_USER;
3290 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3291 }
3292
3293 user_lookup_fd = safe_close(user_lookup_fd);
3294
3295 r = acquire_home(context, uid, &home, &home_buffer);
3296 if (r < 0) {
3297 *exit_status = EXIT_CHDIR;
3298 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3299 }
3300
3301 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3302 * must sure to drop O_NONBLOCK */
3303 if (socket_fd >= 0)
3304 (void) fd_nonblock(socket_fd, false);
3305
3306 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3307 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3308 if (params->cgroup_path) {
3309 _cleanup_free_ char *p = NULL;
3310
3311 r = exec_parameters_get_cgroup_path(params, &p);
3312 if (r < 0) {
3313 *exit_status = EXIT_CGROUP;
3314 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3315 }
3316
3317 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3318 if (r < 0) {
3319 *exit_status = EXIT_CGROUP;
3320 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3321 }
3322 }
3323
3324 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3325 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3326 if (r < 0) {
3327 *exit_status = EXIT_NETWORK;
3328 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3329 }
3330 }
3331
3332 r = setup_input(context, params, socket_fd, named_iofds);
3333 if (r < 0) {
3334 *exit_status = EXIT_STDIN;
3335 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3336 }
3337
3338 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3339 if (r < 0) {
3340 *exit_status = EXIT_STDOUT;
3341 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3342 }
3343
3344 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3345 if (r < 0) {
3346 *exit_status = EXIT_STDERR;
3347 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3348 }
3349
3350 if (context->oom_score_adjust_set) {
3351 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3352 * prohibit write access to this file, and we shouldn't trip up over that. */
3353 r = set_oom_score_adjust(context->oom_score_adjust);
3354 if (IN_SET(r, -EPERM, -EACCES))
3355 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3356 else if (r < 0) {
3357 *exit_status = EXIT_OOM_ADJUST;
3358 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3359 }
3360 }
3361
3362 if (context->nice_set) {
3363 r = setpriority_closest(context->nice);
3364 if (r < 0)
3365 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3366 }
3367
3368 if (context->cpu_sched_set) {
3369 struct sched_param param = {
3370 .sched_priority = context->cpu_sched_priority,
3371 };
3372
3373 r = sched_setscheduler(0,
3374 context->cpu_sched_policy |
3375 (context->cpu_sched_reset_on_fork ?
3376 SCHED_RESET_ON_FORK : 0),
3377 &param);
3378 if (r < 0) {
3379 *exit_status = EXIT_SETSCHEDULER;
3380 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3381 }
3382 }
3383
3384 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3385 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3386 const CPUSet *cpu_set;
3387
3388 if (context->cpu_affinity_from_numa) {
3389 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3390 if (r < 0) {
3391 *exit_status = EXIT_CPUAFFINITY;
3392 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3393 }
3394
3395 cpu_set = &converted_cpu_set;
3396 } else
3397 cpu_set = &context->cpu_set;
3398
3399 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3400 *exit_status = EXIT_CPUAFFINITY;
3401 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3402 }
3403 }
3404
3405 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3406 r = apply_numa_policy(&context->numa_policy);
3407 if (r == -EOPNOTSUPP)
3408 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3409 else if (r < 0) {
3410 *exit_status = EXIT_NUMA_POLICY;
3411 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3412 }
3413 }
3414
3415 if (context->ioprio_set)
3416 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3417 *exit_status = EXIT_IOPRIO;
3418 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3419 }
3420
3421 if (context->timer_slack_nsec != NSEC_INFINITY)
3422 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3423 *exit_status = EXIT_TIMERSLACK;
3424 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3425 }
3426
3427 if (context->personality != PERSONALITY_INVALID) {
3428 r = safe_personality(context->personality);
3429 if (r < 0) {
3430 *exit_status = EXIT_PERSONALITY;
3431 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3432 }
3433 }
3434
3435 if (context->utmp_id)
3436 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3437 context->tty_path,
3438 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3439 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3440 USER_PROCESS,
3441 username);
3442
3443 if (uid_is_valid(uid)) {
3444 r = chown_terminal(STDIN_FILENO, uid);
3445 if (r < 0) {
3446 *exit_status = EXIT_STDIN;
3447 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3448 }
3449 }
3450
3451 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3452 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3453 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3454 * touch a single hierarchy too. */
3455 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3456 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3457 if (r < 0) {
3458 *exit_status = EXIT_CGROUP;
3459 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3460 }
3461 }
3462
3463 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3464 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3465 if (r < 0)
3466 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3467 }
3468
3469 r = build_environment(
3470 unit,
3471 context,
3472 params,
3473 n_fds,
3474 home,
3475 username,
3476 shell,
3477 journal_stream_dev,
3478 journal_stream_ino,
3479 &our_env);
3480 if (r < 0) {
3481 *exit_status = EXIT_MEMORY;
3482 return log_oom();
3483 }
3484
3485 r = build_pass_environment(context, &pass_env);
3486 if (r < 0) {
3487 *exit_status = EXIT_MEMORY;
3488 return log_oom();
3489 }
3490
3491 accum_env = strv_env_merge(5,
3492 params->environment,
3493 our_env,
3494 pass_env,
3495 context->environment,
3496 files_env);
3497 if (!accum_env) {
3498 *exit_status = EXIT_MEMORY;
3499 return log_oom();
3500 }
3501 accum_env = strv_env_clean(accum_env);
3502
3503 (void) umask(context->umask);
3504
3505 r = setup_keyring(unit, context, params, uid, gid);
3506 if (r < 0) {
3507 *exit_status = EXIT_KEYRING;
3508 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3509 }
3510
3511 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3512 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3513
3514 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3515 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3516
3517 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3518 if (needs_ambient_hack)
3519 needs_setuid = false;
3520 else
3521 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3522
3523 if (needs_sandboxing) {
3524 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3525 * present. The actual MAC context application will happen later, as late as possible, to avoid
3526 * impacting our own code paths. */
3527
3528 #if HAVE_SELINUX
3529 use_selinux = mac_selinux_use();
3530 #endif
3531 #if ENABLE_SMACK
3532 use_smack = mac_smack_use();
3533 #endif
3534 #if HAVE_APPARMOR
3535 use_apparmor = mac_apparmor_use();
3536 #endif
3537 }
3538
3539 if (needs_sandboxing) {
3540 int which_failed;
3541
3542 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3543 * is set here. (See below.) */
3544
3545 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3546 if (r < 0) {
3547 *exit_status = EXIT_LIMITS;
3548 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3549 }
3550 }
3551
3552 if (needs_setuid) {
3553
3554 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3555 * wins here. (See above.) */
3556
3557 if (context->pam_name && username) {
3558 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3559 if (r < 0) {
3560 *exit_status = EXIT_PAM;
3561 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3562 }
3563
3564 ngids_after_pam = getgroups_alloc(&gids_after_pam);
3565 if (ngids_after_pam < 0) {
3566 *exit_status = EXIT_MEMORY;
3567 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3568 }
3569 }
3570 }
3571
3572 if (needs_sandboxing) {
3573 #if HAVE_SELINUX
3574 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3575 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3576 if (r < 0) {
3577 *exit_status = EXIT_SELINUX_CONTEXT;
3578 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3579 }
3580 }
3581 #endif
3582
3583 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3584 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3585 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3586 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3587 userns_set_up = true;
3588 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3589 if (r < 0) {
3590 *exit_status = EXIT_USER;
3591 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3592 }
3593 }
3594 }
3595
3596 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3597
3598 if (ns_type_supported(NAMESPACE_NET)) {
3599 r = setup_netns(runtime->netns_storage_socket);
3600 if (r == -EPERM)
3601 log_unit_warning_errno(unit, r,
3602 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3603 else if (r < 0) {
3604 *exit_status = EXIT_NETWORK;
3605 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3606 }
3607 } else if (context->network_namespace_path) {
3608 *exit_status = EXIT_NETWORK;
3609 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3610 "NetworkNamespacePath= is not supported, refusing.");
3611 } else
3612 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3613 }
3614
3615 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3616 if (needs_mount_namespace) {
3617 _cleanup_free_ char *error_path = NULL;
3618
3619 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3620 if (r < 0) {
3621 *exit_status = EXIT_NAMESPACE;
3622 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3623 error_path ? ": " : "", strempty(error_path));
3624 }
3625 }
3626
3627 if (needs_sandboxing) {
3628 r = apply_protect_hostname(unit, context, exit_status);
3629 if (r < 0)
3630 return r;
3631 }
3632
3633 /* Drop groups as early as possible.
3634 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3635 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3636 if (needs_setuid) {
3637 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3638 int ngids_to_enforce = 0;
3639
3640 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3641 ngids,
3642 gids_after_pam,
3643 ngids_after_pam,
3644 &gids_to_enforce);
3645 if (ngids_to_enforce < 0) {
3646 *exit_status = EXIT_MEMORY;
3647 return log_unit_error_errno(unit,
3648 ngids_to_enforce,
3649 "Failed to merge group lists. Group membership might be incorrect: %m");
3650 }
3651
3652 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3653 if (r < 0) {
3654 *exit_status = EXIT_GROUP;
3655 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3656 }
3657 }
3658
3659 /* If the user namespace was not set up above, try to do it now.
3660 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3661 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3662 * case of mount namespaces being less privileged when the mount point list is copied from a
3663 * different user namespace). */
3664
3665 if (needs_sandboxing && context->private_users && !userns_set_up) {
3666 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3667 if (r < 0) {
3668 *exit_status = EXIT_USER;
3669 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3670 }
3671 }
3672
3673 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3674 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3675 * however if we have it as we want to keep it open until the final execve(). */
3676
3677 if (params->exec_fd >= 0) {
3678 exec_fd = params->exec_fd;
3679
3680 if (exec_fd < 3 + (int) n_fds) {
3681 int moved_fd;
3682
3683 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3684 * process we are about to execute. */
3685
3686 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3687 if (moved_fd < 0) {
3688 *exit_status = EXIT_FDS;
3689 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3690 }
3691
3692 safe_close(exec_fd);
3693 exec_fd = moved_fd;
3694 } else {
3695 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3696 r = fd_cloexec(exec_fd, true);
3697 if (r < 0) {
3698 *exit_status = EXIT_FDS;
3699 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3700 }
3701 }
3702
3703 fds_with_exec_fd = newa(int, n_fds + 1);
3704 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3705 fds_with_exec_fd[n_fds] = exec_fd;
3706 n_fds_with_exec_fd = n_fds + 1;
3707 } else {
3708 fds_with_exec_fd = fds;
3709 n_fds_with_exec_fd = n_fds;
3710 }
3711
3712 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3713 if (r >= 0)
3714 r = shift_fds(fds, n_fds);
3715 if (r >= 0)
3716 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3717 if (r < 0) {
3718 *exit_status = EXIT_FDS;
3719 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3720 }
3721
3722 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3723 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3724 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3725 * came this far. */
3726
3727 secure_bits = context->secure_bits;
3728
3729 if (needs_sandboxing) {
3730 uint64_t bset;
3731
3732 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3733 * requested. (Note this is placed after the general resource limit initialization, see
3734 * above, in order to take precedence.) */
3735 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3736 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3737 *exit_status = EXIT_LIMITS;
3738 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3739 }
3740 }
3741
3742 #if ENABLE_SMACK
3743 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3744 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3745 if (use_smack) {
3746 r = setup_smack(context, command);
3747 if (r < 0) {
3748 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3749 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3750 }
3751 }
3752 #endif
3753
3754 bset = context->capability_bounding_set;
3755 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3756 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3757 * instead of us doing that */
3758 if (needs_ambient_hack)
3759 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3760 (UINT64_C(1) << CAP_SETUID) |
3761 (UINT64_C(1) << CAP_SETGID);
3762
3763 if (!cap_test_all(bset)) {
3764 r = capability_bounding_set_drop(bset, false);
3765 if (r < 0) {
3766 *exit_status = EXIT_CAPABILITIES;
3767 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3768 }
3769 }
3770
3771 /* This is done before enforce_user, but ambient set
3772 * does not survive over setresuid() if keep_caps is not set. */
3773 if (!needs_ambient_hack) {
3774 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3775 if (r < 0) {
3776 *exit_status = EXIT_CAPABILITIES;
3777 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3778 }
3779 }
3780 }
3781
3782 /* chroot to root directory first, before we lose the ability to chroot */
3783 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3784 if (r < 0)
3785 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3786
3787 if (needs_setuid) {
3788 if (uid_is_valid(uid)) {
3789 r = enforce_user(context, uid);
3790 if (r < 0) {
3791 *exit_status = EXIT_USER;
3792 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3793 }
3794
3795 if (!needs_ambient_hack &&
3796 context->capability_ambient_set != 0) {
3797
3798 /* Fix the ambient capabilities after user change. */
3799 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3800 if (r < 0) {
3801 *exit_status = EXIT_CAPABILITIES;
3802 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3803 }
3804
3805 /* If we were asked to change user and ambient capabilities
3806 * were requested, we had to add keep-caps to the securebits
3807 * so that we would maintain the inherited capability set
3808 * through the setresuid(). Make sure that the bit is added
3809 * also to the context secure_bits so that we don't try to
3810 * drop the bit away next. */
3811
3812 secure_bits |= 1<<SECURE_KEEP_CAPS;
3813 }
3814 }
3815 }
3816
3817 /* Apply working directory here, because the working directory might be on NFS and only the user running
3818 * this service might have the correct privilege to change to the working directory */
3819 r = apply_working_directory(context, params, home, exit_status);
3820 if (r < 0)
3821 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3822
3823 if (needs_sandboxing) {
3824 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3825 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3826 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3827 * are restricted. */
3828
3829 #if HAVE_SELINUX
3830 if (use_selinux) {
3831 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3832
3833 if (exec_context) {
3834 r = setexeccon(exec_context);
3835 if (r < 0) {
3836 *exit_status = EXIT_SELINUX_CONTEXT;
3837 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3838 }
3839 }
3840 }
3841 #endif
3842
3843 #if HAVE_APPARMOR
3844 if (use_apparmor && context->apparmor_profile) {
3845 r = aa_change_onexec(context->apparmor_profile);
3846 if (r < 0 && !context->apparmor_profile_ignore) {
3847 *exit_status = EXIT_APPARMOR_PROFILE;
3848 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3849 }
3850 }
3851 #endif
3852
3853 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3854 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3855 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3856 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3857 *exit_status = EXIT_SECUREBITS;
3858 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3859 }
3860
3861 if (context_has_no_new_privileges(context))
3862 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3863 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3864 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3865 }
3866
3867 #if HAVE_SECCOMP
3868 r = apply_address_families(unit, context);
3869 if (r < 0) {
3870 *exit_status = EXIT_ADDRESS_FAMILIES;
3871 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3872 }
3873
3874 r = apply_memory_deny_write_execute(unit, context);
3875 if (r < 0) {
3876 *exit_status = EXIT_SECCOMP;
3877 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3878 }
3879
3880 r = apply_restrict_realtime(unit, context);
3881 if (r < 0) {
3882 *exit_status = EXIT_SECCOMP;
3883 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3884 }
3885
3886 r = apply_restrict_suid_sgid(unit, context);
3887 if (r < 0) {
3888 *exit_status = EXIT_SECCOMP;
3889 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3890 }
3891
3892 r = apply_restrict_namespaces(unit, context);
3893 if (r < 0) {
3894 *exit_status = EXIT_SECCOMP;
3895 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3896 }
3897
3898 r = apply_protect_sysctl(unit, context);
3899 if (r < 0) {
3900 *exit_status = EXIT_SECCOMP;
3901 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3902 }
3903
3904 r = apply_protect_kernel_modules(unit, context);
3905 if (r < 0) {
3906 *exit_status = EXIT_SECCOMP;
3907 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3908 }
3909
3910 r = apply_protect_kernel_logs(unit, context);
3911 if (r < 0) {
3912 *exit_status = EXIT_SECCOMP;
3913 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3914 }
3915
3916 r = apply_protect_clock(unit, context);
3917 if (r < 0) {
3918 *exit_status = EXIT_SECCOMP;
3919 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3920 }
3921
3922 r = apply_private_devices(unit, context);
3923 if (r < 0) {
3924 *exit_status = EXIT_SECCOMP;
3925 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3926 }
3927
3928 r = apply_syscall_archs(unit, context);
3929 if (r < 0) {
3930 *exit_status = EXIT_SECCOMP;
3931 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3932 }
3933
3934 r = apply_lock_personality(unit, context);
3935 if (r < 0) {
3936 *exit_status = EXIT_SECCOMP;
3937 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3938 }
3939
3940 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3941 * by the filter as little as possible. */
3942 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3943 if (r < 0) {
3944 *exit_status = EXIT_SECCOMP;
3945 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3946 }
3947 #endif
3948 }
3949
3950 if (!strv_isempty(context->unset_environment)) {
3951 char **ee = NULL;
3952
3953 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3954 if (!ee) {
3955 *exit_status = EXIT_MEMORY;
3956 return log_oom();
3957 }
3958
3959 strv_free_and_replace(accum_env, ee);
3960 }
3961
3962 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3963 replaced_argv = replace_env_argv(command->argv, accum_env);
3964 if (!replaced_argv) {
3965 *exit_status = EXIT_MEMORY;
3966 return log_oom();
3967 }
3968 final_argv = replaced_argv;
3969 } else
3970 final_argv = command->argv;
3971
3972 if (DEBUG_LOGGING) {
3973 _cleanup_free_ char *line;
3974
3975 line = exec_command_line(final_argv);
3976 if (line)
3977 log_struct(LOG_DEBUG,
3978 "EXECUTABLE=%s", command->path,
3979 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3980 LOG_UNIT_ID(unit),
3981 LOG_UNIT_INVOCATION_ID(unit));
3982 }
3983
3984 if (exec_fd >= 0) {
3985 uint8_t hot = 1;
3986
3987 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3988 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3989
3990 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3991 *exit_status = EXIT_EXEC;
3992 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3993 }
3994 }
3995
3996 execve(command->path, final_argv, accum_env);
3997 r = -errno;
3998
3999 if (exec_fd >= 0) {
4000 uint8_t hot = 0;
4001
4002 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4003 * that POLLHUP on it no longer means execve() succeeded. */
4004
4005 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4006 *exit_status = EXIT_EXEC;
4007 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4008 }
4009 }
4010
4011 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4012 log_struct_errno(LOG_INFO, r,
4013 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4014 LOG_UNIT_ID(unit),
4015 LOG_UNIT_INVOCATION_ID(unit),
4016 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4017 command->path),
4018 "EXECUTABLE=%s", command->path);
4019 return 0;
4020 }
4021
4022 *exit_status = EXIT_EXEC;
4023 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4024 }
4025
/* Forward declarations of two file-local helpers that exec_spawn() calls; being static, they must be defined elsewhere in this file. */
4026 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4027 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4028
4029 int exec_spawn(Unit *unit,
4030 ExecCommand *command,
4031 const ExecContext *context,
4032 const ExecParameters *params,
4033 ExecRuntime *runtime,
4034 DynamicCreds *dcreds,
4035 pid_t *ret) {
4036
4037 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4038 _cleanup_free_ char *subcgroup_path = NULL;
4039 _cleanup_strv_free_ char **files_env = NULL;
4040 size_t n_storage_fds = 0, n_socket_fds = 0;
4041 _cleanup_free_ char *line = NULL;
4042 pid_t pid;
4043
4044 assert(unit);
4045 assert(command);
4046 assert(context);
4047 assert(ret);
4048 assert(params);
4049 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4050
4051 if (context->std_input == EXEC_INPUT_SOCKET ||
4052 context->std_output == EXEC_OUTPUT_SOCKET ||
4053 context->std_error == EXEC_OUTPUT_SOCKET) {
4054
4055 if (params->n_socket_fds > 1) {
4056 log_unit_error(unit, "Got more than one socket.");
4057 return -EINVAL;
4058 }
4059
4060 if (params->n_socket_fds == 0) {
4061 log_unit_error(unit, "Got no socket.");
4062 return -EINVAL;
4063 }
4064
4065 socket_fd = params->fds[0];
4066 } else {
4067 socket_fd = -1;
4068 fds = params->fds;
4069 n_socket_fds = params->n_socket_fds;
4070 n_storage_fds = params->n_storage_fds;
4071 }
4072
4073 r = exec_context_named_iofds(context, params, named_iofds);
4074 if (r < 0)
4075 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4076
4077 r = exec_context_load_environment(unit, context, &files_env);
4078 if (r < 0)
4079 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4080
4081 line = exec_command_line(command->argv);
4082 if (!line)
4083 return log_oom();
4084
4085 log_struct(LOG_DEBUG,
4086 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4087 "EXECUTABLE=%s", command->path,
4088 LOG_UNIT_ID(unit),
4089 LOG_UNIT_INVOCATION_ID(unit));
4090
4091 if (params->cgroup_path) {
4092 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4093 if (r < 0)
4094 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4095 if (r > 0) { /* We are using a child cgroup */
4096 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4097 if (r < 0)
4098 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4099 }
4100 }
4101
4102 pid = fork();
4103 if (pid < 0)
4104 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4105
4106 if (pid == 0) {
4107 int exit_status = EXIT_SUCCESS;
4108
4109 r = exec_child(unit,
4110 command,
4111 context,
4112 params,
4113 runtime,
4114 dcreds,
4115 socket_fd,
4116 named_iofds,
4117 fds,
4118 n_socket_fds,
4119 n_storage_fds,
4120 files_env,
4121 unit->manager->user_lookup_fds[1],
4122 &exit_status);
4123
4124 if (r < 0) {
4125 const char *status =
4126 exit_status_to_string(exit_status,
4127 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4128
4129 log_struct_errno(LOG_ERR, r,
4130 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4131 LOG_UNIT_ID(unit),
4132 LOG_UNIT_INVOCATION_ID(unit),
4133 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4134 status, command->path),
4135 "EXECUTABLE=%s", command->path);
4136 }
4137
4138 _exit(exit_status);
4139 }
4140
4141 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4142
4143 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4144 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4145 * process will be killed too). */
4146 if (subcgroup_path)
4147 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4148
4149 exec_status_start(&command->exec_status, pid);
4150
4151 *ret = pid;
4152 return 0;
4153 }
4154
4155 void exec_context_init(ExecContext *c) {
4156 ExecDirectoryType i;
4157
4158 assert(c);
4159
4160 c->umask = 0022;
4161 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4162 c->cpu_sched_policy = SCHED_OTHER;
4163 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4164 c->syslog_level_prefix = true;
4165 c->ignore_sigpipe = true;
4166 c->timer_slack_nsec = NSEC_INFINITY;
4167 c->personality = PERSONALITY_INVALID;
4168 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4169 c->directories[i].mode = 0755;
4170 c->timeout_clean_usec = USEC_INFINITY;
4171 c->capability_bounding_set = CAP_ALL;
4172 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4173 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4174 c->log_level_max = -1;
4175 numa_policy_reset(&c->numa_policy);
4176 }
4177
4178 void exec_context_done(ExecContext *c) {
4179 ExecDirectoryType i;
4180 size_t l;
4181
4182 assert(c);
4183
4184 c->environment = strv_free(c->environment);
4185 c->environment_files = strv_free(c->environment_files);
4186 c->pass_environment = strv_free(c->pass_environment);
4187 c->unset_environment = strv_free(c->unset_environment);
4188
4189 rlimit_free_all(c->rlimit);
4190
4191 for (l = 0; l < 3; l++) {
4192 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4193 c->stdio_file[l] = mfree(c->stdio_file[l]);
4194 }
4195
4196 c->working_directory = mfree(c->working_directory);
4197 c->root_directory = mfree(c->root_directory);
4198 c->root_image = mfree(c->root_image);
4199 c->tty_path = mfree(c->tty_path);
4200 c->syslog_identifier = mfree(c->syslog_identifier);
4201 c->user = mfree(c->user);
4202 c->group = mfree(c->group);
4203
4204 c->supplementary_groups = strv_free(c->supplementary_groups);
4205
4206 c->pam_name = mfree(c->pam_name);
4207
4208 c->read_only_paths = strv_free(c->read_only_paths);
4209 c->read_write_paths = strv_free(c->read_write_paths);
4210 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4211
4212 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4213 c->bind_mounts = NULL;
4214 c->n_bind_mounts = 0;
4215 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4216 c->temporary_filesystems = NULL;
4217 c->n_temporary_filesystems = 0;
4218
4219 cpu_set_reset(&c->cpu_set);
4220 numa_policy_reset(&c->numa_policy);
4221
4222 c->utmp_id = mfree(c->utmp_id);
4223 c->selinux_context = mfree(c->selinux_context);
4224 c->apparmor_profile = mfree(c->apparmor_profile);
4225 c->smack_process_label = mfree(c->smack_process_label);
4226
4227 c->syscall_filter = hashmap_free(c->syscall_filter);
4228 c->syscall_archs = set_free(c->syscall_archs);
4229 c->address_families = set_free(c->address_families);
4230
4231 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4232 c->directories[i].paths = strv_free(c->directories[i].paths);
4233
4234 c->log_level_max = -1;
4235
4236 exec_context_free_log_extra_fields(c);
4237
4238 c->log_ratelimit_interval_usec = 0;
4239 c->log_ratelimit_burst = 0;
4240
4241 c->stdin_data = mfree(c->stdin_data);
4242 c->stdin_data_size = 0;
4243
4244 c->network_namespace_path = mfree(c->network_namespace_path);
4245
4246 c->log_namespace = mfree(c->log_namespace);
4247 }
4248
4249 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4250 char **i;
4251
4252 assert(c);
4253
4254 if (!runtime_prefix)
4255 return 0;
4256
4257 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4258 _cleanup_free_ char *p;
4259
4260 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4261 p = path_join(runtime_prefix, "private", *i);
4262 else
4263 p = path_join(runtime_prefix, *i);
4264 if (!p)
4265 return -ENOMEM;
4266
4267 /* We execute this synchronously, since we need to be sure this is gone when we start the
4268 * service next. */
4269 (void) rm_rf(p, REMOVE_ROOT);
4270 }
4271
4272 return 0;
4273 }
4274
4275 static void exec_command_done(ExecCommand *c) {
4276 assert(c);
4277
4278 c->path = mfree(c->path);
4279 c->argv = strv_free(c->argv);
4280 }
4281
4282 void exec_command_done_array(ExecCommand *c, size_t n) {
4283 size_t i;
4284
4285 for (i = 0; i < n; i++)
4286 exec_command_done(c+i);
4287 }
4288
4289 ExecCommand* exec_command_free_list(ExecCommand *c) {
4290 ExecCommand *i;
4291
4292 while ((i = c)) {
4293 LIST_REMOVE(command, c, i);
4294 exec_command_done(i);
4295 free(i);
4296 }
4297
4298 return NULL;
4299 }
4300
4301 void exec_command_free_array(ExecCommand **c, size_t n) {
4302 size_t i;
4303
4304 for (i = 0; i < n; i++)
4305 c[i] = exec_command_free_list(c[i]);
4306 }
4307
4308 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4309 size_t i;
4310
4311 for (i = 0; i < n; i++)
4312 exec_status_reset(&c[i].exec_status);
4313 }
4314
4315 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4316 size_t i;
4317
4318 for (i = 0; i < n; i++) {
4319 ExecCommand *z;
4320
4321 LIST_FOREACH(command, z, c[i])
4322 exec_status_reset(&z->exec_status);
4323 }
4324 }
4325
4326 typedef struct InvalidEnvInfo {
4327 const Unit *unit;
4328 const char *path;
4329 } InvalidEnvInfo;
4330
4331 static void invalid_env(const char *p, void *userdata) {
4332 InvalidEnvInfo *info = userdata;
4333
4334 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4335 }
4336
4337 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4338 assert(c);
4339
4340 switch (fd_index) {
4341
4342 case STDIN_FILENO:
4343 if (c->std_input != EXEC_INPUT_NAMED_FD)
4344 return NULL;
4345
4346 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4347
4348 case STDOUT_FILENO:
4349 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4350 return NULL;
4351
4352 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4353
4354 case STDERR_FILENO:
4355 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4356 return NULL;
4357
4358 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4359
4360 default:
4361 return NULL;
4362 }
4363 }
4364
4365 static int exec_context_named_iofds(
4366 const ExecContext *c,
4367 const ExecParameters *p,
4368 int named_iofds[static 3]) {
4369
4370 size_t i, targets;
4371 const char* stdio_fdname[3];
4372 size_t n_fds;
4373
4374 assert(c);
4375 assert(p);
4376 assert(named_iofds);
4377
4378 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4379 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4380 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4381
4382 for (i = 0; i < 3; i++)
4383 stdio_fdname[i] = exec_context_fdname(c, i);
4384
4385 n_fds = p->n_storage_fds + p->n_socket_fds;
4386
4387 for (i = 0; i < n_fds && targets > 0; i++)
4388 if (named_iofds[STDIN_FILENO] < 0 &&
4389 c->std_input == EXEC_INPUT_NAMED_FD &&
4390 stdio_fdname[STDIN_FILENO] &&
4391 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4392
4393 named_iofds[STDIN_FILENO] = p->fds[i];
4394 targets--;
4395
4396 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4397 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4398 stdio_fdname[STDOUT_FILENO] &&
4399 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4400
4401 named_iofds[STDOUT_FILENO] = p->fds[i];
4402 targets--;
4403
4404 } else if (named_iofds[STDERR_FILENO] < 0 &&
4405 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4406 stdio_fdname[STDERR_FILENO] &&
4407 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4408
4409 named_iofds[STDERR_FILENO] = p->fds[i];
4410 targets--;
4411 }
4412
4413 return targets == 0 ? 0 : -ENOENT;
4414 }
4415
4416 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4417 char **i, **r = NULL;
4418
4419 assert(c);
4420 assert(l);
4421
4422 STRV_FOREACH(i, c->environment_files) {
4423 char *fn;
4424 int k;
4425 unsigned n;
4426 bool ignore = false;
4427 char **p;
4428 _cleanup_globfree_ glob_t pglob = {};
4429
4430 fn = *i;
4431
4432 if (fn[0] == '-') {
4433 ignore = true;
4434 fn++;
4435 }
4436
4437 if (!path_is_absolute(fn)) {
4438 if (ignore)
4439 continue;
4440
4441 strv_free(r);
4442 return -EINVAL;
4443 }
4444
4445 /* Filename supports globbing, take all matching files */
4446 k = safe_glob(fn, 0, &pglob);
4447 if (k < 0) {
4448 if (ignore)
4449 continue;
4450
4451 strv_free(r);
4452 return k;
4453 }
4454
4455 /* When we don't match anything, -ENOENT should be returned */
4456 assert(pglob.gl_pathc > 0);
4457
4458 for (n = 0; n < pglob.gl_pathc; n++) {
4459 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4460 if (k < 0) {
4461 if (ignore)
4462 continue;
4463
4464 strv_free(r);
4465 return k;
4466 }
4467 /* Log invalid environment variables with filename */
4468 if (p) {
4469 InvalidEnvInfo info = {
4470 .unit = unit,
4471 .path = pglob.gl_pathv[n]
4472 };
4473
4474 p = strv_env_clean_with_callback(p, invalid_env, &info);
4475 }
4476
4477 if (!r)
4478 r = p;
4479 else {
4480 char **m;
4481
4482 m = strv_env_merge(2, r, p);
4483 strv_free(r);
4484 strv_free(p);
4485 if (!m)
4486 return -ENOMEM;
4487
4488 r = m;
4489 }
4490 }
4491 }
4492
4493 *l = r;
4494
4495 return 0;
4496 }
4497
4498 static bool tty_may_match_dev_console(const char *tty) {
4499 _cleanup_free_ char *resolved = NULL;
4500
4501 if (!tty)
4502 return true;
4503
4504 tty = skip_dev_prefix(tty);
4505
4506 /* trivial identity? */
4507 if (streq(tty, "console"))
4508 return true;
4509
4510 if (resolve_dev_console(&resolved) < 0)
4511 return true; /* if we could not resolve, assume it may */
4512
4513 /* "tty0" means the active VC, so it may be the same sometimes */
4514 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4515 }
4516
4517 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4518 assert(ec);
4519
4520 return ec->tty_reset ||
4521 ec->tty_vhangup ||
4522 ec->tty_vt_disallocate ||
4523 is_terminal_input(ec->std_input) ||
4524 is_terminal_output(ec->std_output) ||
4525 is_terminal_output(ec->std_error);
4526 }
4527
4528 bool exec_context_may_touch_console(const ExecContext *ec) {
4529
4530 return exec_context_may_touch_tty(ec) &&
4531 tty_may_match_dev_console(exec_context_tty_path(ec));
4532 }
4533
4534 static void strv_fprintf(FILE *f, char **l) {
4535 char **g;
4536
4537 assert(f);
4538
4539 STRV_FOREACH(g, l)
4540 fprintf(f, " %s", *g);
4541 }
4542
4543 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4544 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4545 ExecDirectoryType dt;
4546 unsigned i;
4547 int r;
4548
4549 assert(c);
4550 assert(f);
4551
4552 prefix = strempty(prefix);
4553
4554 fprintf(f,
4555 "%sUMask: %04o\n"
4556 "%sWorkingDirectory: %s\n"
4557 "%sRootDirectory: %s\n"
4558 "%sNonBlocking: %s\n"
4559 "%sPrivateTmp: %s\n"
4560 "%sPrivateDevices: %s\n"
4561 "%sProtectKernelTunables: %s\n"
4562 "%sProtectKernelModules: %s\n"
4563 "%sProtectKernelLogs: %s\n"
4564 "%sProtectClock: %s\n"
4565 "%sProtectControlGroups: %s\n"
4566 "%sPrivateNetwork: %s\n"
4567 "%sPrivateUsers: %s\n"
4568 "%sProtectHome: %s\n"
4569 "%sProtectSystem: %s\n"
4570 "%sMountAPIVFS: %s\n"
4571 "%sIgnoreSIGPIPE: %s\n"
4572 "%sMemoryDenyWriteExecute: %s\n"
4573 "%sRestrictRealtime: %s\n"
4574 "%sRestrictSUIDSGID: %s\n"
4575 "%sKeyringMode: %s\n"
4576 "%sProtectHostname: %s\n",
4577 prefix, c->umask,
4578 prefix, c->working_directory ? c->working_directory : "/",
4579 prefix, c->root_directory ? c->root_directory : "/",
4580 prefix, yes_no(c->non_blocking),
4581 prefix, yes_no(c->private_tmp),
4582 prefix, yes_no(c->private_devices),
4583 prefix, yes_no(c->protect_kernel_tunables),
4584 prefix, yes_no(c->protect_kernel_modules),
4585 prefix, yes_no(c->protect_kernel_logs),
4586 prefix, yes_no(c->protect_clock),
4587 prefix, yes_no(c->protect_control_groups),
4588 prefix, yes_no(c->private_network),
4589 prefix, yes_no(c->private_users),
4590 prefix, protect_home_to_string(c->protect_home),
4591 prefix, protect_system_to_string(c->protect_system),
4592 prefix, yes_no(c->mount_apivfs),
4593 prefix, yes_no(c->ignore_sigpipe),
4594 prefix, yes_no(c->memory_deny_write_execute),
4595 prefix, yes_no(c->restrict_realtime),
4596 prefix, yes_no(c->restrict_suid_sgid),
4597 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4598 prefix, yes_no(c->protect_hostname));
4599
4600 if (c->root_image)
4601 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4602
4603 STRV_FOREACH(e, c->environment)
4604 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4605
4606 STRV_FOREACH(e, c->environment_files)
4607 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4608
4609 STRV_FOREACH(e, c->pass_environment)
4610 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4611
4612 STRV_FOREACH(e, c->unset_environment)
4613 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4614
4615 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4616
4617 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4618 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4619
4620 STRV_FOREACH(d, c->directories[dt].paths)
4621 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4622 }
4623
4624 fprintf(f,
4625 "%sTimeoutCleanSec: %s\n",
4626 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4627
4628 if (c->nice_set)
4629 fprintf(f,
4630 "%sNice: %i\n",
4631 prefix, c->nice);
4632
4633 if (c->oom_score_adjust_set)
4634 fprintf(f,
4635 "%sOOMScoreAdjust: %i\n",
4636 prefix, c->oom_score_adjust);
4637
4638 for (i = 0; i < RLIM_NLIMITS; i++)
4639 if (c->rlimit[i]) {
4640 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4641 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4642 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4643 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4644 }
4645
4646 if (c->ioprio_set) {
4647 _cleanup_free_ char *class_str = NULL;
4648
4649 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4650 if (r >= 0)
4651 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4652
4653 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4654 }
4655
4656 if (c->cpu_sched_set) {
4657 _cleanup_free_ char *policy_str = NULL;
4658
4659 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4660 if (r >= 0)
4661 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4662
4663 fprintf(f,
4664 "%sCPUSchedulingPriority: %i\n"
4665 "%sCPUSchedulingResetOnFork: %s\n",
4666 prefix, c->cpu_sched_priority,
4667 prefix, yes_no(c->cpu_sched_reset_on_fork));
4668 }
4669
4670 if (c->cpu_set.set) {
4671 _cleanup_free_ char *affinity = NULL;
4672
4673 affinity = cpu_set_to_range_string(&c->cpu_set);
4674 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4675 }
4676
4677 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4678 _cleanup_free_ char *nodes = NULL;
4679
4680 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4681 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4682 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4683 }
4684
4685 if (c->timer_slack_nsec != NSEC_INFINITY)
4686 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4687
4688 fprintf(f,
4689 "%sStandardInput: %s\n"
4690 "%sStandardOutput: %s\n"
4691 "%sStandardError: %s\n",
4692 prefix, exec_input_to_string(c->std_input),
4693 prefix, exec_output_to_string(c->std_output),
4694 prefix, exec_output_to_string(c->std_error));
4695
4696 if (c->std_input == EXEC_INPUT_NAMED_FD)
4697 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4698 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4699 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4700 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4701 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4702
4703 if (c->std_input == EXEC_INPUT_FILE)
4704 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4705 if (c->std_output == EXEC_OUTPUT_FILE)
4706 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4707 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4708 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4709 if (c->std_error == EXEC_OUTPUT_FILE)
4710 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4711 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4712 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4713
4714 if (c->tty_path)
4715 fprintf(f,
4716 "%sTTYPath: %s\n"
4717 "%sTTYReset: %s\n"
4718 "%sTTYVHangup: %s\n"
4719 "%sTTYVTDisallocate: %s\n",
4720 prefix, c->tty_path,
4721 prefix, yes_no(c->tty_reset),
4722 prefix, yes_no(c->tty_vhangup),
4723 prefix, yes_no(c->tty_vt_disallocate));
4724
4725 if (IN_SET(c->std_output,
4726 EXEC_OUTPUT_SYSLOG,
4727 EXEC_OUTPUT_KMSG,
4728 EXEC_OUTPUT_JOURNAL,
4729 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4730 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4731 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4732 IN_SET(c->std_error,
4733 EXEC_OUTPUT_SYSLOG,
4734 EXEC_OUTPUT_KMSG,
4735 EXEC_OUTPUT_JOURNAL,
4736 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4737 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4738 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4739
4740 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4741
4742 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4743 if (r >= 0)
4744 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4745
4746 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4747 if (r >= 0)
4748 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4749 }
4750
4751 if (c->log_level_max >= 0) {
4752 _cleanup_free_ char *t = NULL;
4753
4754 (void) log_level_to_string_alloc(c->log_level_max, &t);
4755
4756 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4757 }
4758
4759 if (c->log_ratelimit_interval_usec > 0) {
4760 char buf_timespan[FORMAT_TIMESPAN_MAX];
4761
4762 fprintf(f,
4763 "%sLogRateLimitIntervalSec: %s\n",
4764 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4765 }
4766
4767 if (c->log_ratelimit_burst > 0)
4768 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4769
4770 if (c->n_log_extra_fields > 0) {
4771 size_t j;
4772
4773 for (j = 0; j < c->n_log_extra_fields; j++) {
4774 fprintf(f, "%sLogExtraFields: ", prefix);
4775 fwrite(c->log_extra_fields[j].iov_base,
4776 1, c->log_extra_fields[j].iov_len,
4777 f);
4778 fputc('\n', f);
4779 }
4780 }
4781
4782 if (c->log_namespace)
4783 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4784
4785 if (c->secure_bits) {
4786 _cleanup_free_ char *str = NULL;
4787
4788 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4789 if (r >= 0)
4790 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4791 }
4792
4793 if (c->capability_bounding_set != CAP_ALL) {
4794 _cleanup_free_ char *str = NULL;
4795
4796 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4797 if (r >= 0)
4798 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4799 }
4800
4801 if (c->capability_ambient_set != 0) {
4802 _cleanup_free_ char *str = NULL;
4803
4804 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4805 if (r >= 0)
4806 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4807 }
4808
4809 if (c->user)
4810 fprintf(f, "%sUser: %s\n", prefix, c->user);
4811 if (c->group)
4812 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4813
4814 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4815
4816 if (!strv_isempty(c->supplementary_groups)) {
4817 fprintf(f, "%sSupplementaryGroups:", prefix);
4818 strv_fprintf(f, c->supplementary_groups);
4819 fputs("\n", f);
4820 }
4821
4822 if (c->pam_name)
4823 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4824
4825 if (!strv_isempty(c->read_write_paths)) {
4826 fprintf(f, "%sReadWritePaths:", prefix);
4827 strv_fprintf(f, c->read_write_paths);
4828 fputs("\n", f);
4829 }
4830
4831 if (!strv_isempty(c->read_only_paths)) {
4832 fprintf(f, "%sReadOnlyPaths:", prefix);
4833 strv_fprintf(f, c->read_only_paths);
4834 fputs("\n", f);
4835 }
4836
4837 if (!strv_isempty(c->inaccessible_paths)) {
4838 fprintf(f, "%sInaccessiblePaths:", prefix);
4839 strv_fprintf(f, c->inaccessible_paths);
4840 fputs("\n", f);
4841 }
4842
4843 if (c->n_bind_mounts > 0)
4844 for (i = 0; i < c->n_bind_mounts; i++)
4845 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4846 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4847 c->bind_mounts[i].ignore_enoent ? "-": "",
4848 c->bind_mounts[i].source,
4849 c->bind_mounts[i].destination,
4850 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4851
4852 if (c->n_temporary_filesystems > 0)
4853 for (i = 0; i < c->n_temporary_filesystems; i++) {
4854 TemporaryFileSystem *t = c->temporary_filesystems + i;
4855
4856 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4857 t->path,
4858 isempty(t->options) ? "" : ":",
4859 strempty(t->options));
4860 }
4861
4862 if (c->utmp_id)
4863 fprintf(f,
4864 "%sUtmpIdentifier: %s\n",
4865 prefix, c->utmp_id);
4866
4867 if (c->selinux_context)
4868 fprintf(f,
4869 "%sSELinuxContext: %s%s\n",
4870 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4871
4872 if (c->apparmor_profile)
4873 fprintf(f,
4874 "%sAppArmorProfile: %s%s\n",
4875 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4876
4877 if (c->smack_process_label)
4878 fprintf(f,
4879 "%sSmackProcessLabel: %s%s\n",
4880 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4881
4882 if (c->personality != PERSONALITY_INVALID)
4883 fprintf(f,
4884 "%sPersonality: %s\n",
4885 prefix, strna(personality_to_string(c->personality)));
4886
4887 fprintf(f,
4888 "%sLockPersonality: %s\n",
4889 prefix, yes_no(c->lock_personality));
4890
4891 if (c->syscall_filter) {
4892 #if HAVE_SECCOMP
4893 Iterator j;
4894 void *id, *val;
4895 bool first = true;
4896 #endif
4897
4898 fprintf(f,
4899 "%sSystemCallFilter: ",
4900 prefix);
4901
4902 if (!c->syscall_whitelist)
4903 fputc('~', f);
4904
4905 #if HAVE_SECCOMP
4906 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4907 _cleanup_free_ char *name = NULL;
4908 const char *errno_name = NULL;
4909 int num = PTR_TO_INT(val);
4910
4911 if (first)
4912 first = false;
4913 else
4914 fputc(' ', f);
4915
4916 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4917 fputs(strna(name), f);
4918
4919 if (num >= 0) {
4920 errno_name = errno_to_name(num);
4921 if (errno_name)
4922 fprintf(f, ":%s", errno_name);
4923 else
4924 fprintf(f, ":%d", num);
4925 }
4926 }
4927 #endif
4928
4929 fputc('\n', f);
4930 }
4931
4932 if (c->syscall_archs) {
4933 #if HAVE_SECCOMP
4934 Iterator j;
4935 void *id;
4936 #endif
4937
4938 fprintf(f,
4939 "%sSystemCallArchitectures:",
4940 prefix);
4941
4942 #if HAVE_SECCOMP
4943 SET_FOREACH(id, c->syscall_archs, j)
4944 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4945 #endif
4946 fputc('\n', f);
4947 }
4948
4949 if (exec_context_restrict_namespaces_set(c)) {
4950 _cleanup_free_ char *s = NULL;
4951
4952 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4953 if (r >= 0)
4954 fprintf(f, "%sRestrictNamespaces: %s\n",
4955 prefix, strna(s));
4956 }
4957
4958 if (c->network_namespace_path)
4959 fprintf(f,
4960 "%sNetworkNamespacePath: %s\n",
4961 prefix, c->network_namespace_path);
4962
4963 if (c->syscall_errno > 0) {
4964 const char *errno_name;
4965
4966 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4967
4968 errno_name = errno_to_name(c->syscall_errno);
4969 if (errno_name)
4970 fprintf(f, "%s\n", errno_name);
4971 else
4972 fprintf(f, "%d\n", c->syscall_errno);
4973 }
4974 }
4975
4976 bool exec_context_maintains_privileges(const ExecContext *c) {
4977 assert(c);
4978
4979 /* Returns true if the process forked off would run under
4980 * an unchanged UID or as root. */
4981
4982 if (!c->user)
4983 return true;
4984
4985 if (streq(c->user, "root") || streq(c->user, "0"))
4986 return true;
4987
4988 return false;
4989 }
4990
4991 int exec_context_get_effective_ioprio(const ExecContext *c) {
4992 int p;
4993
4994 assert(c);
4995
4996 if (c->ioprio_set)
4997 return c->ioprio;
4998
4999 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5000 if (p < 0)
5001 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5002
5003 return p;
5004 }
5005
5006 void exec_context_free_log_extra_fields(ExecContext *c) {
5007 size_t l;
5008
5009 assert(c);
5010
5011 for (l = 0; l < c->n_log_extra_fields; l++)
5012 free(c->log_extra_fields[l].iov_base);
5013 c->log_extra_fields = mfree(c->log_extra_fields);
5014 c->n_log_extra_fields = 0;
5015 }
5016
5017 void exec_context_revert_tty(ExecContext *c) {
5018 int r;
5019
5020 assert(c);
5021
5022 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5023 exec_context_tty_reset(c, NULL);
5024
5025 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5026 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5027 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5028
5029 if (exec_context_may_touch_tty(c)) {
5030 const char *path;
5031
5032 path = exec_context_tty_path(c);
5033 if (path) {
5034 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5035 if (r < 0 && r != -ENOENT)
5036 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5037 }
5038 }
5039 }
5040
5041 int exec_context_get_clean_directories(
5042 ExecContext *c,
5043 char **prefix,
5044 ExecCleanMask mask,
5045 char ***ret) {
5046
5047 _cleanup_strv_free_ char **l = NULL;
5048 ExecDirectoryType t;
5049 int r;
5050
5051 assert(c);
5052 assert(prefix);
5053 assert(ret);
5054
5055 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5056 char **i;
5057
5058 if (!FLAGS_SET(mask, 1U << t))
5059 continue;
5060
5061 if (!prefix[t])
5062 continue;
5063
5064 STRV_FOREACH(i, c->directories[t].paths) {
5065 char *j;
5066
5067 j = path_join(prefix[t], *i);
5068 if (!j)
5069 return -ENOMEM;
5070
5071 r = strv_consume(&l, j);
5072 if (r < 0)
5073 return r;
5074
5075 /* Also remove private directories unconditionally. */
5076 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5077 j = path_join(prefix[t], "private", *i);
5078 if (!j)
5079 return -ENOMEM;
5080
5081 r = strv_consume(&l, j);
5082 if (r < 0)
5083 return r;
5084 }
5085 }
5086 }
5087
5088 *ret = TAKE_PTR(l);
5089 return 0;
5090 }
5091
5092 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5093 ExecCleanMask mask = 0;
5094
5095 assert(c);
5096 assert(ret);
5097
5098 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5099 if (!strv_isempty(c->directories[t].paths))
5100 mask |= 1U << t;
5101
5102 *ret = mask;
5103 return 0;
5104 }
5105
5106 void exec_status_start(ExecStatus *s, pid_t pid) {
5107 assert(s);
5108
5109 *s = (ExecStatus) {
5110 .pid = pid,
5111 };
5112
5113 dual_timestamp_get(&s->start_timestamp);
5114 }
5115
5116 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5117 assert(s);
5118
5119 if (s->pid != pid) {
5120 *s = (ExecStatus) {
5121 .pid = pid,
5122 };
5123 }
5124
5125 dual_timestamp_get(&s->exit_timestamp);
5126
5127 s->code = code;
5128 s->status = status;
5129
5130 if (context && context->utmp_id)
5131 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5132 }
5133
5134 void exec_status_reset(ExecStatus *s) {
5135 assert(s);
5136
5137 *s = (ExecStatus) {};
5138 }
5139
5140 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5141 char buf[FORMAT_TIMESTAMP_MAX];
5142
5143 assert(s);
5144 assert(f);
5145
5146 if (s->pid <= 0)
5147 return;
5148
5149 prefix = strempty(prefix);
5150
5151 fprintf(f,
5152 "%sPID: "PID_FMT"\n",
5153 prefix, s->pid);
5154
5155 if (dual_timestamp_is_set(&s->start_timestamp))
5156 fprintf(f,
5157 "%sStart Timestamp: %s\n",
5158 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5159
5160 if (dual_timestamp_is_set(&s->exit_timestamp))
5161 fprintf(f,
5162 "%sExit Timestamp: %s\n"
5163 "%sExit Code: %s\n"
5164 "%sExit Status: %i\n",
5165 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5166 prefix, sigchld_code_to_string(s->code),
5167 prefix, s->status);
5168 }
5169
5170 static char *exec_command_line(char **argv) {
5171 size_t k;
5172 char *n, *p, **a;
5173 bool first = true;
5174
5175 assert(argv);
5176
5177 k = 1;
5178 STRV_FOREACH(a, argv)
5179 k += strlen(*a)+3;
5180
5181 n = new(char, k);
5182 if (!n)
5183 return NULL;
5184
5185 p = n;
5186 STRV_FOREACH(a, argv) {
5187
5188 if (!first)
5189 *(p++) = ' ';
5190 else
5191 first = false;
5192
5193 if (strpbrk(*a, WHITESPACE)) {
5194 *(p++) = '\'';
5195 p = stpcpy(p, *a);
5196 *(p++) = '\'';
5197 } else
5198 p = stpcpy(p, *a);
5199
5200 }
5201
5202 *p = 0;
5203
5204 /* FIXME: this doesn't really handle arguments that have
5205 * spaces and ticks in them */
5206
5207 return n;
5208 }
5209
5210 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5211 _cleanup_free_ char *cmd = NULL;
5212 const char *prefix2;
5213
5214 assert(c);
5215 assert(f);
5216
5217 prefix = strempty(prefix);
5218 prefix2 = strjoina(prefix, "\t");
5219
5220 cmd = exec_command_line(c->argv);
5221 fprintf(f,
5222 "%sCommand Line: %s\n",
5223 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5224
5225 exec_status_dump(&c->exec_status, f, prefix2);
5226 }
5227
5228 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5229 assert(f);
5230
5231 prefix = strempty(prefix);
5232
5233 LIST_FOREACH(command, c, c)
5234 exec_command_dump(c, f, prefix);
5235 }
5236
5237 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5238 ExecCommand *end;
5239
5240 assert(l);
5241 assert(e);
5242
5243 if (*l) {
5244 /* It's kind of important, that we keep the order here */
5245 LIST_FIND_TAIL(command, *l, end);
5246 LIST_INSERT_AFTER(command, *l, end, e);
5247 } else
5248 *l = e;
5249 }
5250
5251 int exec_command_set(ExecCommand *c, const char *path, ...) {
5252 va_list ap;
5253 char **l, *p;
5254
5255 assert(c);
5256 assert(path);
5257
5258 va_start(ap, path);
5259 l = strv_new_ap(path, ap);
5260 va_end(ap);
5261
5262 if (!l)
5263 return -ENOMEM;
5264
5265 p = strdup(path);
5266 if (!p) {
5267 strv_free(l);
5268 return -ENOMEM;
5269 }
5270
5271 free_and_replace(c->path, p);
5272
5273 return strv_free_and_replace(c->argv, l);
5274 }
5275
5276 int exec_command_append(ExecCommand *c, const char *path, ...) {
5277 _cleanup_strv_free_ char **l = NULL;
5278 va_list ap;
5279 int r;
5280
5281 assert(c);
5282 assert(path);
5283
5284 va_start(ap, path);
5285 l = strv_new_ap(path, ap);
5286 va_end(ap);
5287
5288 if (!l)
5289 return -ENOMEM;
5290
5291 r = strv_extend_strv(&c->argv, l, false);
5292 if (r < 0)
5293 return r;
5294
5295 return 0;
5296 }
5297
5298 static void *remove_tmpdir_thread(void *p) {
5299 _cleanup_free_ char *path = p;
5300
5301 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5302 return NULL;
5303 }
5304
5305 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5306 int r;
5307
5308 if (!rt)
5309 return NULL;
5310
5311 if (rt->manager)
5312 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5313
5314 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5315 if (destroy && rt->tmp_dir) {
5316 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5317
5318 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5319 if (r < 0) {
5320 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5321 free(rt->tmp_dir);
5322 }
5323
5324 rt->tmp_dir = NULL;
5325 }
5326
5327 if (destroy && rt->var_tmp_dir) {
5328 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5329
5330 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5331 if (r < 0) {
5332 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5333 free(rt->var_tmp_dir);
5334 }
5335
5336 rt->var_tmp_dir = NULL;
5337 }
5338
5339 rt->id = mfree(rt->id);
5340 rt->tmp_dir = mfree(rt->tmp_dir);
5341 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5342 safe_close_pair(rt->netns_storage_socket);
5343 return mfree(rt);
5344 }
5345
5346 static void exec_runtime_freep(ExecRuntime **rt) {
5347 (void) exec_runtime_free(*rt, false);
5348 }
5349
5350 static int exec_runtime_allocate(ExecRuntime **ret) {
5351 ExecRuntime *n;
5352
5353 assert(ret);
5354
5355 n = new(ExecRuntime, 1);
5356 if (!n)
5357 return -ENOMEM;
5358
5359 *n = (ExecRuntime) {
5360 .netns_storage_socket = { -1, -1 },
5361 };
5362
5363 *ret = n;
5364 return 0;
5365 }
5366
5367 static int exec_runtime_add(
5368 Manager *m,
5369 const char *id,
5370 const char *tmp_dir,
5371 const char *var_tmp_dir,
5372 const int netns_storage_socket[2],
5373 ExecRuntime **ret) {
5374
5375 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5376 int r;
5377
5378 assert(m);
5379 assert(id);
5380
5381 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5382 if (r < 0)
5383 return r;
5384
5385 r = exec_runtime_allocate(&rt);
5386 if (r < 0)
5387 return r;
5388
5389 rt->id = strdup(id);
5390 if (!rt->id)
5391 return -ENOMEM;
5392
5393 if (tmp_dir) {
5394 rt->tmp_dir = strdup(tmp_dir);
5395 if (!rt->tmp_dir)
5396 return -ENOMEM;
5397
5398 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5399 assert(var_tmp_dir);
5400 rt->var_tmp_dir = strdup(var_tmp_dir);
5401 if (!rt->var_tmp_dir)
5402 return -ENOMEM;
5403 }
5404
5405 if (netns_storage_socket) {
5406 rt->netns_storage_socket[0] = netns_storage_socket[0];
5407 rt->netns_storage_socket[1] = netns_storage_socket[1];
5408 }
5409
5410 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5411 if (r < 0)
5412 return r;
5413
5414 rt->manager = m;
5415
5416 if (ret)
5417 *ret = rt;
5418
5419 /* do not remove created ExecRuntime object when the operation succeeds. */
5420 rt = NULL;
5421 return 0;
5422 }
5423
5424 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5425 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5426 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5427 int r;
5428
5429 assert(m);
5430 assert(c);
5431 assert(id);
5432
5433 /* It is not necessary to create ExecRuntime object. */
5434 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5435 return 0;
5436
5437 if (c->private_tmp) {
5438 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5439 if (r < 0)
5440 return r;
5441 }
5442
5443 if (c->private_network || c->network_namespace_path) {
5444 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5445 return -errno;
5446 }
5447
5448 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5449 if (r < 0)
5450 return r;
5451
5452 /* Avoid cleanup */
5453 netns_storage_socket[0] = netns_storage_socket[1] = -1;
5454 return 1;
5455 }
5456
5457 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5458 ExecRuntime *rt;
5459 int r;
5460
5461 assert(m);
5462 assert(id);
5463 assert(ret);
5464
5465 rt = hashmap_get(m->exec_runtime_by_id, id);
5466 if (rt)
5467 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5468 goto ref;
5469
5470 if (!create)
5471 return 0;
5472
5473 /* If not found, then create a new object. */
5474 r = exec_runtime_make(m, c, id, &rt);
5475 if (r <= 0)
5476 /* When r == 0, it is not necessary to create ExecRuntime object. */
5477 return r;
5478
5479 ref:
5480 /* increment reference counter. */
5481 rt->n_ref++;
5482 *ret = rt;
5483 return 1;
5484 }
5485
5486 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5487 if (!rt)
5488 return NULL;
5489
5490 assert(rt->n_ref > 0);
5491
5492 rt->n_ref--;
5493 if (rt->n_ref > 0)
5494 return NULL;
5495
5496 return exec_runtime_free(rt, destroy);
5497 }
5498
5499 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5500 ExecRuntime *rt;
5501 Iterator i;
5502
5503 assert(m);
5504 assert(f);
5505 assert(fds);
5506
5507 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5508 fprintf(f, "exec-runtime=%s", rt->id);
5509
5510 if (rt->tmp_dir)
5511 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5512
5513 if (rt->var_tmp_dir)
5514 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5515
5516 if (rt->netns_storage_socket[0] >= 0) {
5517 int copy;
5518
5519 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5520 if (copy < 0)
5521 return copy;
5522
5523 fprintf(f, " netns-socket-0=%i", copy);
5524 }
5525
5526 if (rt->netns_storage_socket[1] >= 0) {
5527 int copy;
5528
5529 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5530 if (copy < 0)
5531 return copy;
5532
5533 fprintf(f, " netns-socket-1=%i", copy);
5534 }
5535
5536 fputc('\n', f);
5537 }
5538
5539 return 0;
5540 }
5541
5542 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5543 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5544 ExecRuntime *rt;
5545 int r;
5546
5547 /* This is for the migration from old (v237 or earlier) deserialization text.
5548 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5549 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5550 * so or not from the serialized text, then we always creates a new object owned by this. */
5551
5552 assert(u);
5553 assert(key);
5554 assert(value);
5555
5556 /* Manager manages ExecRuntime objects by the unit id.
5557 * So, we omit the serialized text when the unit does not have id (yet?)... */
5558 if (isempty(u->id)) {
5559 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5560 return 0;
5561 }
5562
5563 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5564 if (r < 0) {
5565 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5566 return 0;
5567 }
5568
5569 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5570 if (!rt) {
5571 r = exec_runtime_allocate(&rt_create);
5572 if (r < 0)
5573 return log_oom();
5574
5575 rt_create->id = strdup(u->id);
5576 if (!rt_create->id)
5577 return log_oom();
5578
5579 rt = rt_create;
5580 }
5581
5582 if (streq(key, "tmp-dir")) {
5583 char *copy;
5584
5585 copy = strdup(value);
5586 if (!copy)
5587 return log_oom();
5588
5589 free_and_replace(rt->tmp_dir, copy);
5590
5591 } else if (streq(key, "var-tmp-dir")) {
5592 char *copy;
5593
5594 copy = strdup(value);
5595 if (!copy)
5596 return log_oom();
5597
5598 free_and_replace(rt->var_tmp_dir, copy);
5599
5600 } else if (streq(key, "netns-socket-0")) {
5601 int fd;
5602
5603 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5604 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5605 return 0;
5606 }
5607
5608 safe_close(rt->netns_storage_socket[0]);
5609 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5610
5611 } else if (streq(key, "netns-socket-1")) {
5612 int fd;
5613
5614 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5615 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5616 return 0;
5617 }
5618
5619 safe_close(rt->netns_storage_socket[1]);
5620 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5621 } else
5622 return 0;
5623
5624 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5625 if (rt_create) {
5626 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5627 if (r < 0) {
5628 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5629 return 0;
5630 }
5631
5632 rt_create->manager = u->manager;
5633
5634 /* Avoid cleanup */
5635 rt_create = NULL;
5636 }
5637
5638 return 1;
5639 }
5640
5641 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5642 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5643 int r, fd0 = -1, fd1 = -1;
5644 const char *p, *v = value;
5645 size_t n;
5646
5647 assert(m);
5648 assert(value);
5649 assert(fds);
5650
5651 n = strcspn(v, " ");
5652 id = strndupa(v, n);
5653 if (v[n] != ' ')
5654 goto finalize;
5655 p = v + n + 1;
5656
5657 v = startswith(p, "tmp-dir=");
5658 if (v) {
5659 n = strcspn(v, " ");
5660 tmp_dir = strndupa(v, n);
5661 if (v[n] != ' ')
5662 goto finalize;
5663 p = v + n + 1;
5664 }
5665
5666 v = startswith(p, "var-tmp-dir=");
5667 if (v) {
5668 n = strcspn(v, " ");
5669 var_tmp_dir = strndupa(v, n);
5670 if (v[n] != ' ')
5671 goto finalize;
5672 p = v + n + 1;
5673 }
5674
5675 v = startswith(p, "netns-socket-0=");
5676 if (v) {
5677 char *buf;
5678
5679 n = strcspn(v, " ");
5680 buf = strndupa(v, n);
5681 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5682 log_debug("Unable to process exec-runtime netns fd specification.");
5683 return;
5684 }
5685 fd0 = fdset_remove(fds, fd0);
5686 if (v[n] != ' ')
5687 goto finalize;
5688 p = v + n + 1;
5689 }
5690
5691 v = startswith(p, "netns-socket-1=");
5692 if (v) {
5693 char *buf;
5694
5695 n = strcspn(v, " ");
5696 buf = strndupa(v, n);
5697 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5698 log_debug("Unable to process exec-runtime netns fd specification.");
5699 return;
5700 }
5701 fd1 = fdset_remove(fds, fd1);
5702 }
5703
5704 finalize:
5705
5706 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5707 if (r < 0)
5708 log_debug_errno(r, "Failed to add exec-runtime: %m");
5709 }
5710
5711 void exec_runtime_vacuum(Manager *m) {
5712 ExecRuntime *rt;
5713 Iterator i;
5714
5715 assert(m);
5716
5717 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5718
5719 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5720 if (rt->n_ref > 0)
5721 continue;
5722
5723 (void) exec_runtime_free(rt, false);
5724 }
5725 }
5726
5727 void exec_params_clear(ExecParameters *p) {
5728 if (!p)
5729 return;
5730
5731 strv_free(p->environment);
5732 }
5733
5734 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5735 [EXEC_INPUT_NULL] = "null",
5736 [EXEC_INPUT_TTY] = "tty",
5737 [EXEC_INPUT_TTY_FORCE] = "tty-force",
5738 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5739 [EXEC_INPUT_SOCKET] = "socket",
5740 [EXEC_INPUT_NAMED_FD] = "fd",
5741 [EXEC_INPUT_DATA] = "data",
5742 [EXEC_INPUT_FILE] = "file",
5743 };
5744
5745 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5746
5747 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5748 [EXEC_OUTPUT_INHERIT] = "inherit",
5749 [EXEC_OUTPUT_NULL] = "null",
5750 [EXEC_OUTPUT_TTY] = "tty",
5751 [EXEC_OUTPUT_SYSLOG] = "syslog",
5752 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5753 [EXEC_OUTPUT_KMSG] = "kmsg",
5754 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5755 [EXEC_OUTPUT_JOURNAL] = "journal",
5756 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5757 [EXEC_OUTPUT_SOCKET] = "socket",
5758 [EXEC_OUTPUT_NAMED_FD] = "fd",
5759 [EXEC_OUTPUT_FILE] = "file",
5760 [EXEC_OUTPUT_FILE_APPEND] = "append",
5761 };
5762
5763 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5764
5765 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5766 [EXEC_UTMP_INIT] = "init",
5767 [EXEC_UTMP_LOGIN] = "login",
5768 [EXEC_UTMP_USER] = "user",
5769 };
5770
5771 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5772
5773 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5774 [EXEC_PRESERVE_NO] = "no",
5775 [EXEC_PRESERVE_YES] = "yes",
5776 [EXEC_PRESERVE_RESTART] = "restart",
5777 };
5778
5779 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5780
5781 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5782 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5783 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5784 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5785 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5786 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5787 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5788 };
5789
5790 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5791
5792 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5793 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5794 * directories, specifically .timer units with their timestamp touch file. */
5795 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5796 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5797 [EXEC_DIRECTORY_STATE] = "state",
5798 [EXEC_DIRECTORY_CACHE] = "cache",
5799 [EXEC_DIRECTORY_LOGS] = "logs",
5800 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5801 };
5802
5803 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5804
5805 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5806 * the service payload in. */
5807 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5808 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5809 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5810 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5811 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5812 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5813 };
5814
5815 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5816
5817 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5818 [EXEC_KEYRING_INHERIT] = "inherit",
5819 [EXEC_KEYRING_PRIVATE] = "private",
5820 [EXEC_KEYRING_SHARED] = "shared",
5821 };
5822
5823 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);