1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
37 #include <sys/epoll.h>
39 #include <sys/signalfd.h>
43 #include <sys/socket.h>
46 #include <attr/xattr.h>
49 #include <systemd/sd-daemon.h>
57 #include "cgroup-util.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
62 #include "dev-setup.h"
71 typedef enum LinkJournal
{
/* Command-line settings. Each arg_* variable below starts at the default
 * shown here; the option parser later in this file overwrites them from
 * optarg where the corresponding option is given. */
/* Container root directory; set from canonicalize_file_name(optarg). */
78 static char *arg_directory
= NULL
;
/* User to run as inside the container; a strdup() of optarg. */
79 static char *arg_user
= NULL
;
/* Extra cgroup controllers, split from a comma-separated optarg. */
80 static char **arg_controllers
= NULL
;
/* Machine UUID; later used for the "container_uuid=%s" environment entry
 * and passed to save_attributes(). */
81 static char *arg_uuid
= NULL
;
/* Machine name; passed to sethostname() and cg_get_machine_path(). */
82 static char *arg_machine
= NULL
;
/* When true, CLONE_NEWNET is added to the clone() flags and
 * setup_resolv_conf() takes its early branch. */
83 static bool arg_private_network
= false;
84 static bool arg_read_only
= false;
85 static bool arg_boot
= false;
/* Journal linking mode; compared against LINK_NO/LINK_AUTO/LINK_GUEST/
 * LINK_HOST in setup_journal(). */
86 static LinkJournal arg_link_journal
= LINK_AUTO
;
/* Bitmask of capabilities to keep, one bit per CAP_* number. The option
 * parser ORs further bits in (arg_retain |= 1ULL << cap), and
 * drop_capabilities() hands the complement to
 * capability_bounding_set_drop(). */
87 static uint64_t arg_retain
=
89 (1ULL << CAP_DAC_OVERRIDE
) |
90 (1ULL << CAP_DAC_READ_SEARCH
) |
91 (1ULL << CAP_FOWNER
) |
92 (1ULL << CAP_FSETID
) |
93 (1ULL << CAP_IPC_OWNER
) |
96 (1ULL << CAP_LINUX_IMMUTABLE
) |
97 (1ULL << CAP_NET_BIND_SERVICE
) |
98 (1ULL << CAP_NET_BROADCAST
) |
99 (1ULL << CAP_NET_RAW
) |
100 (1ULL << CAP_SETGID
) |
101 (1ULL << CAP_SETFCAP
) |
102 (1ULL << CAP_SETPCAP
) |
103 (1ULL << CAP_SETUID
) |
104 (1ULL << CAP_SYS_ADMIN
) |
105 (1ULL << CAP_SYS_CHROOT
) |
106 (1ULL << CAP_SYS_NICE
) |
107 (1ULL << CAP_SYS_PTRACE
) |
108 (1ULL << CAP_SYS_TTY_CONFIG
) |
109 (1ULL << CAP_SYS_RESOURCE
) |
110 (1ULL << CAP_SYS_BOOT
) |
111 (1ULL << CAP_AUDIT_WRITE
) |
112 (1ULL << CAP_AUDIT_CONTROL
);
/* Bind mount lists. The option parser appends two entries per option
 * (strv_extend() with the source, then with the destination), and
 * mount_binds() walks them with STRV_FOREACH_PAIR. */
113 static char **arg_bind
= NULL
;
114 static char **arg_bind_ro
= NULL
;
/* Prints the usage summary and option list with a single printf(); the
 * leading %s is filled with program_invocation_short_name.
 *
 * NOTE(review): the "-j" line says it is equivalent to
 * --link-journal=host, but the assignment that directly precedes the
 * --link-journal case in the option parser sets LINK_GUEST. If that
 * assignment is the -j handler (its case label is not visible in this
 * view), the help text and the behaviour disagree -- confirm and align.
 *
 * NOTE(review): the --capability and --bind descriptions end
 * mid-sentence here; their continuation lines are not present in this
 * view. */
116 static int help(void) {
118 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120 " -h --help Show this help\n"
121 " --version Print version string\n"
122 " -D --directory=NAME Root directory for the container\n"
123 " -b --boot Boot up full system (i.e. invoke init)\n"
124 " -u --user=USER Run the command under specified user or uid\n"
125 " -C --controllers=LIST Put the container in specified comma-separated\n"
126 " cgroup hierarchies\n"
127 " --uuid=UUID Set a specific machine UUID for the container\n"
128 " -M --machine=NAME Set the machine name for the container\n"
129 " --private-network Disable network in container\n"
130 " --read-only Mount the root directory read-only\n"
131 " --capability=CAP In addition to the default, retain specified\n"
133 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host\n"
134 " -j Equivalent to --link-journal=host\n"
135 " --bind=PATH[:PATH] Bind mount a file or directory from the host into\n"
137 " --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
138 program_invocation_short_name
);
143 static int parse_argv(int argc
, char *argv
[]) {
156 static const struct option options
[] = {
157 { "help", no_argument
, NULL
, 'h' },
158 { "version", no_argument
, NULL
, ARG_VERSION
},
159 { "directory", required_argument
, NULL
, 'D' },
160 { "user", required_argument
, NULL
, 'u' },
161 { "controllers", required_argument
, NULL
, 'C' },
162 { "private-network", no_argument
, NULL
, ARG_PRIVATE_NETWORK
},
163 { "boot", no_argument
, NULL
, 'b' },
164 { "uuid", required_argument
, NULL
, ARG_UUID
},
165 { "read-only", no_argument
, NULL
, ARG_READ_ONLY
},
166 { "capability", required_argument
, NULL
, ARG_CAPABILITY
},
167 { "link-journal", required_argument
, NULL
, ARG_LINK_JOURNAL
},
168 { "bind", required_argument
, NULL
, ARG_BIND
},
169 { "bind-ro", required_argument
, NULL
, ARG_BIND_RO
},
170 { "machine", required_argument
, NULL
, 'M' },
179 while ((c
= getopt_long(argc
, argv
, "+hD:u:C:bM:j", options
, NULL
)) >= 0) {
188 puts(PACKAGE_STRING
);
189 puts(SYSTEMD_FEATURES
);
194 arg_directory
= canonicalize_file_name(optarg
);
195 if (!arg_directory
) {
196 log_error("Failed to canonicalize root directory.");
204 arg_user
= strdup(optarg
);
211 strv_free(arg_controllers
);
212 arg_controllers
= strv_split(optarg
, ",");
213 if (!arg_controllers
)
216 cg_shorten_controllers(arg_controllers
);
219 case ARG_PRIVATE_NETWORK
:
220 arg_private_network
= true;
228 if (!id128_is_valid(optarg
)) {
229 log_error("Invalid UUID: %s", optarg
);
237 if (!hostname_is_valid(optarg
)) {
238 log_error("Invalid machine name: %s", optarg
);
243 arg_machine
= strdup(optarg
);
250 arg_read_only
= true;
253 case ARG_CAPABILITY
: {
257 FOREACH_WORD_SEPARATOR(word
, length
, optarg
, ",", state
) {
261 t
= strndup(word
, length
);
265 if (cap_from_name(t
, &cap
) < 0) {
266 log_error("Failed to parse capability %s.", t
);
272 arg_retain
|= 1ULL << (uint64_t) cap
;
279 arg_link_journal
= LINK_GUEST
;
282 case ARG_LINK_JOURNAL
:
283 if (streq(optarg
, "auto"))
284 arg_link_journal
= LINK_AUTO
;
285 else if (streq(optarg
, "no"))
286 arg_link_journal
= LINK_NO
;
287 else if (streq(optarg
, "guest"))
288 arg_link_journal
= LINK_GUEST
;
289 else if (streq(optarg
, "host"))
290 arg_link_journal
= LINK_HOST
;
292 log_error("Failed to parse link journal mode %s", optarg
);
300 _cleanup_free_
char *a
= NULL
, *b
= NULL
;
305 x
= c
== ARG_BIND
? &arg_bind
: &arg_bind_ro
;
307 e
= strchr(optarg
, ':');
309 a
= strndup(optarg
, e
- optarg
);
319 if (!path_is_absolute(a
) || !path_is_absolute(b
)) {
320 log_error("Invalid bind mount specification: %s", optarg
);
324 r
= strv_extend(x
, a
);
328 r
= strv_extend(x
, b
);
339 log_error("Unknown option code %c", c
);
/* Mounts the fixed set of API file systems listed in mount_table below
 * underneath the container root `dest`.
 *
 * For each table row the target path is built as dest + "/" + row.where.
 * path_is_mount_point() is consulted first; a row that has a source
 * (`what` non-NULL) and whose target already is a mount point is skipped.
 * Rows with a NULL `what` are the MS_REMOUNT rows that make the preceding
 * bind mount read-only, so they are never skipped by that test.
 * The target directory is created with mkdir_p(where, 0755); its return
 * value is not checked on the visible line. A failing mount() is only
 * reported through log_error() when the row's `fatal` flag is true (the
 * two /sys/fs/selinux rows have it set to false).
 *
 * NOTE(review): the MountPoint struct members are not visible in this
 * view; the column order (source, target, fs type, options, flags,
 * fatal) is inferred from the member names used in the loop -- confirm
 * against the struct definition. */
347 static int mount_all(const char *dest
) {
349 typedef struct MountPoint
{
358 static const MountPoint mount_table
[] = {
359 { "proc", "/proc", "proc", NULL
, MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
360 { "/proc/sys", "/proc/sys", NULL
, NULL
, MS_BIND
, true }, /* Bind mount first */
361 { NULL
, "/proc/sys", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, true }, /* Then, make it r/o */
362 { "sysfs", "/sys", "sysfs", NULL
, MS_RDONLY
|MS_NOSUID
|MS_NOEXEC
|MS_NODEV
, true },
363 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID
|MS_STRICTATIME
, true },
364 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID
), MS_NOSUID
|MS_NOEXEC
, true },
365 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
366 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID
|MS_NODEV
|MS_STRICTATIME
, true },
368 { "/sys/fs/selinux", "/sys/fs/selinux", NULL
, NULL
, MS_BIND
, false }, /* Bind mount first */
369 { NULL
, "/sys/fs/selinux", NULL
, NULL
, MS_BIND
|MS_RDONLY
|MS_REMOUNT
, false }, /* Then, make it r/o */
376 for (k
= 0; k
< ELEMENTSOF(mount_table
); k
++) {
377 _cleanup_free_
char *where
= NULL
;
380 where
= strjoin(dest
, "/", mount_table
[k
].where
, NULL
);
384 t
= path_is_mount_point(where
, true);
386 log_error("Failed to detect whether %s is a mount point: %s", where
, strerror(-t
));
394 /* Skip this entry if it is not a remount. */
395 if (mount_table
[k
].what
&& t
> 0)
398 mkdir_p(where
, 0755);
400 if (mount(mount_table
[k
].what
,
403 mount_table
[k
].flags
,
404 mount_table
[k
].options
) < 0 &&
405 mount_table
[k
].fatal
) {
407 log_error("mount(%s) failed: %m", where
);
/* Applies the user-requested bind mounts to the container tree.
 *
 * dest:  container root directory.
 * l:     list walked with STRV_FOREACH_PAIR as (x, y) pairs; *x is used
 *        as the mount source and *y as the path below dest. The callers
 *        visible in this file pass arg_bind and arg_bind_ro.
 * flags: extra mount flags. When non-zero, a second mount() call
 *        remounts the target with MS_REMOUNT|MS_BIND|flags (the caller
 *        passes MS_RDONLY for the read-only list, 0 otherwise).
 *
 * The target is created with mkdir_p_label(where, 0755); its result is
 * not checked on the visible line. Both mount() failures log the same
 * "mount(%s) failed" message with the target path. */
417 static int mount_binds(const char *dest
, char **l
, unsigned long flags
) {
420 STRV_FOREACH_PAIR(x
, y
, l
) {
421 _cleanup_free_
char *where
= NULL
;
423 where
= strjoin(dest
, "/", *y
, NULL
);
427 mkdir_p_label(where
, 0755);
429 if (mount(*x
, where
, "bind", MS_BIND
, NULL
) < 0) {
430 log_error("mount(%s) failed: %m", where
);
434 if (flags
&& mount(NULL
, where
, NULL
, MS_REMOUNT
|MS_BIND
|flags
, NULL
) < 0) {
435 log_error("mount(%s) failed: %m", where
);
/* Points the container's /etc/localtime at the same zone the host uses.
 *
 * Visible steps:
 *  - read the host's /etc/localtime symlink into p; a warning is logged
 *    when it is not a symlink;
 *  - derive the zone name z by stripping either "../usr/share/zoneinfo/"
 *    or "/usr/share/zoneinfo/" from p; a warning is logged when neither
 *    prefix matches;
 *  - read the container's own <dest>/etc/localtime into q and derive y
 *    the same way; if y equals z nothing needs to change;
 *  - check that <dest>/usr/share/zoneinfo/<z> exists via access(F_OK),
 *    warning when it does not;
 *  - create <dest>/etc/localtime as a symlink to
 *    "../usr/share/zoneinfo/<z>", logging an error on failure.
 *
 * NOTE(review): the branches that connect these steps (result checks,
 * returns, and anything that removes an existing <dest>/etc/localtime
 * before symlink()) are not present in this view; the warning-level
 * messages suggest most failures are treated as non-fatal, but confirm
 * against the full source. */
443 static int setup_timezone(const char *dest
) {
444 _cleanup_free_
char *where
= NULL
, *p
= NULL
, *q
= NULL
, *check
= NULL
, *what
= NULL
;
450 /* Fix the timezone, if possible */
451 r
= readlink_malloc("/etc/localtime", &p
);
453 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
457 z
= path_startswith(p
, "../usr/share/zoneinfo/");
459 z
= path_startswith(p
, "/usr/share/zoneinfo/");
461 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
465 where
= strappend(dest
, "/etc/localtime");
469 r
= readlink_malloc(where
, &q
);
471 y
= path_startswith(q
, "../usr/share/zoneinfo/");
473 y
= path_startswith(q
, "/usr/share/zoneinfo/");
476 /* Already pointing to the right place? Then do nothing .. */
477 if (y
&& streq(y
, z
))
481 check
= strjoin(dest
, "/usr/share/zoneinfo/", z
, NULL
);
485 if (access(check
, F_OK
) < 0) {
486 log_warning("Timezone %s does not exist in container, not updating container timezone.", z
);
490 what
= strappend("../usr/share/zoneinfo/", z
);
495 if (symlink(what
, where
) < 0) {
496 log_error("Failed to correct timezone of container: %m");
/* Makes the host's /etc/resolv.conf visible, read-only, inside the
 * container at <dest>/etc/resolv.conf.
 *
 * The function first tests arg_private_network; the statement guarded by
 * that test is not visible here (presumably an early return, since a
 * container without network has no use for the host resolver -- confirm).
 *
 * open() with O_CREAT|O_EXCL creates the target file when it does not
 * exist yet, so there is something to bind mount onto; the descriptor is
 * declared _cleanup_close_. The host file is then bind mounted over it
 * (failure only logs a warning) and remounted with
 * MS_BIND|MS_REMOUNT|MS_RDONLY (failure logs an error). */
503 static int setup_resolv_conf(const char *dest
) {
504 char _cleanup_free_
*where
= NULL
;
505 _cleanup_close_
int fd
= -1;
509 if (arg_private_network
)
512 /* Fix resolv.conf, if possible */
513 where
= strappend(dest
, "/etc/resolv.conf");
517 fd
= open(where
, O_WRONLY
|O_CREAT
|O_EXCL
|O_CLOEXEC
|O_NOCTTY
|O_NOFOLLOW
, 0644);
519 /* We don't really care for the results of this really. If it
520 * fails, it fails, but meh... */
521 if (mount("/etc/resolv.conf", where
, "bind", MS_BIND
, NULL
) < 0)
522 log_warning("Failed to bind mount /etc/resolv.conf: %m");
524 if (mount("/etc/resolv.conf", where
, "bind",
525 MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
) < 0) {
526 log_error("Failed to remount /etc/resolv.conf readonly: %m");
/* Gives the container its own random boot ID.
 *
 * A random 128-bit ID is obtained from sd_id128_randomize(), formatted
 * into as_uuid in 8-4-4-4-12 hex form, and written with
 * write_string_file() to the scratch file
 * <dest>/dev/proc-sys-kernel-random-boot-id. That file is then bind
 * mounted over <dest>/proc/sys/kernel/random/boot_id and remounted
 * read-only.
 *
 * Failure of the first bind mount is logged as an error; failure of the
 * read-only remount is only a warning. The remount test checks for a
 * non-zero result rather than "< 0". */
533 static int setup_boot_id(const char *dest
) {
534 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
541 /* Generate a new randomized boot ID, so that each boot-up of
542 * the container gets a new one */
544 from
= strappend(dest
, "/dev/proc-sys-kernel-random-boot-id");
545 to
= strappend(dest
, "/proc/sys/kernel/random/boot_id");
549 r
= sd_id128_randomize(&rnd
);
551 log_error("Failed to generate random boot id: %s", strerror(-r
));
555 snprintf(as_uuid
, sizeof(as_uuid
),
556 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
557 SD_ID128_FORMAT_VAL(rnd
));
558 char_array_0(as_uuid
);
560 r
= write_string_file(from
, as_uuid
);
562 log_error("Failed to write boot id: %s", strerror(-r
));
566 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
567 log_error("Failed to bind mount boot id: %m");
569 } else if (mount(from
, to
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
, NULL
))
570 log_warning("Failed to make boot id read-only: %m");
/* Recreates a fixed list of host device nodes inside the container.
 *
 * For every name d in the NUL-separated `devnodes` list (its contents are
 * not visible in this view), the host node /dev/<d> is stat()ed and a
 * node with the same st_mode and st_rdev is created at <dest>/dev/<d>
 * with mknod().
 *
 * A stat() failure is only logged when errno is not ENOENT, so a device
 * that does not exist on the host is silently tolerated. A source that
 * is neither a character nor a block device is reported as an error.
 *
 * NOTE(review): the results of the two asprintf() calls are not checked
 * on the visible lines; confirm that from/to are validated before use. */
576 static int copy_devnodes(const char *dest
) {
578 static const char devnodes
[] =
588 _cleanup_umask_ mode_t u
;
594 NULSTR_FOREACH(d
, devnodes
) {
596 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
598 asprintf(&from
, "/dev/%s", d
);
599 asprintf(&to
, "%s/dev/%s", dest
, d
);
610 if (stat(from
, &st
) < 0) {
612 if (errno
!= ENOENT
) {
613 log_error("Failed to stat %s: %m", from
);
618 } else if (!S_ISCHR(st
.st_mode
) && !S_ISBLK(st
.st_mode
)) {
620 log_error("%s is not a char or block device, cannot copy", from
);
624 } else if (mknod(to
, st
.st_mode
, st
.st_rdev
) < 0) {
/* NOTE(review): the path given to mknod() above is `to`, but the
 * message below prints `dest` (the container root), so the log does
 * not identify the node that failed. Likely meant to print `to`. */
626 log_error("mknod(%s) failed: %m", dest
);
/* Creates <dest>/dev/ptmx as a relative symlink to "pts/ptmx", i.e. to
 * the ptmx node of the devpts instance that mount_all() mounts at
 * <dest>/dev/pts. A symlink() failure is logged as an error. */
635 static int setup_ptmx(const char *dest
) {
636 _cleanup_free_
char *p
= NULL
;
638 p
= strappend(dest
, "/dev/ptmx");
642 if (symlink("pts/ptmx", p
) < 0) {
643 log_error("Failed to create /dev/ptmx symlink: %m");
/* Makes the pty given by `console` appear as /dev/console inside the
 * container rooted at `dest`.
 *
 * Visible steps:
 *  - stat() the console path and require it to be a character device;
 *  - chmod_and_chown(console, 0600, 0, 0) to restrict it to mode 0600,
 *    uid 0, gid 0;
 *  - build the target path <dest>/dev/console;
 *  - mknod() a placeholder node there, using the console's file type
 *    bits with permission bits replaced by 0600 and the same st_rdev;
 *  - bind mount the real console over the placeholder.
 * Each failing step logs an error.
 *
 * NOTE(review): the explanatory comment before the mknod() call is cut
 * off in this view (its closing line is missing); it is left as found. */
650 static int setup_dev_console(const char *dest
, const char *console
) {
652 _cleanup_free_
char *to
= NULL
;
654 _cleanup_umask_ mode_t u
;
661 if (stat(console
, &st
) < 0) {
662 log_error("Failed to stat %s: %m", console
);
665 } else if (!S_ISCHR(st
.st_mode
)) {
666 log_error("/dev/console is not a char device");
670 r
= chmod_and_chown(console
, 0600, 0, 0);
672 log_error("Failed to correct access mode for TTY: %s", strerror(-r
));
676 if (asprintf(&to
, "%s/dev/console", dest
) < 0)
679 /* We need to bind mount the right tty to /dev/console since
680 * ptys can only exist on pts file systems. To have something
681 * to bind mount things on we create a device node first, that
682 * has the right major/minor (note that the major minor
683 * doesn't actually matter here, since we mount it over
686 if (mknod(to
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
687 log_error("mknod() for /dev/console failed: %m");
691 if (mount(console
, to
, "bind", MS_BIND
, NULL
) < 0) {
692 log_error("Bind mount for /dev/console failed: %m");
699 static int setup_kmsg(const char *dest
, int kmsg_socket
) {
700 _cleanup_free_
char *from
= NULL
, *to
= NULL
;
702 _cleanup_umask_ mode_t u
;
704 struct cmsghdr cmsghdr
;
705 uint8_t buf
[CMSG_SPACE(sizeof(int))];
708 .msg_control
= &control
,
709 .msg_controllen
= sizeof(control
),
711 struct cmsghdr
*cmsg
;
714 assert(kmsg_socket
>= 0);
718 /* We create the kmsg FIFO as /dev/kmsg, but immediately
719 * delete it after bind mounting it to /proc/kmsg. While FIFOs
720 * on the reading side behave very similar to /proc/kmsg,
721 * their writing side behaves differently from /dev/kmsg in
722 * that writing blocks when nothing is reading. In order to
723 * avoid any problems with containers deadlocking due to this
724 * we simply make /dev/kmsg unavailable to the container. */
725 if (asprintf(&from
, "%s/dev/kmsg", dest
) < 0 ||
726 asprintf(&to
, "%s/proc/kmsg", dest
) < 0)
729 if (mkfifo(from
, 0600) < 0) {
730 log_error("mkfifo() for /dev/kmsg failed: %m");
734 r
= chmod_and_chown(from
, 0600, 0, 0);
736 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r
));
740 if (mount(from
, to
, "bind", MS_BIND
, NULL
) < 0) {
741 log_error("Bind mount for /proc/kmsg failed: %m");
745 fd
= open(from
, O_RDWR
|O_NDELAY
|O_CLOEXEC
);
747 log_error("Failed to open fifo: %m");
751 cmsg
= CMSG_FIRSTHDR(&mh
);
752 cmsg
->cmsg_level
= SOL_SOCKET
;
753 cmsg
->cmsg_type
= SCM_RIGHTS
;
754 cmsg
->cmsg_len
= CMSG_LEN(sizeof(int));
755 memcpy(CMSG_DATA(cmsg
), &fd
, sizeof(int));
757 mh
.msg_controllen
= cmsg
->cmsg_len
;
759 /* Store away the fd in the socket, so that it stays open as
760 * long as we run the child */
761 k
= sendmsg(kmsg_socket
, &mh
, MSG_DONTWAIT
|MSG_NOSIGNAL
);
762 close_nointr_nofail(fd
);
765 log_error("Failed to send FIFO fd: %m");
769 /* And now make the FIFO unavailable as /dev/kmsg... */
/* Sets the hostname to arg_machine via sethostname(), passing
 * strlen(arg_machine) as the length. arg_machine must therefore be
 * non-NULL when this is called. The statement executed on failure is
 * not visible in this view. */
774 static int setup_hostname(void) {
776 if (sethostname(arg_machine
, strlen(arg_machine
)) < 0)
782 static int setup_journal(const char *directory
) {
783 sd_id128_t machine_id
;
784 _cleanup_free_
char *p
= NULL
, *b
= NULL
, *q
= NULL
, *d
= NULL
;
788 if (arg_link_journal
== LINK_NO
)
791 p
= strappend(directory
, "/etc/machine-id");
795 r
= read_one_line_file(p
, &b
);
796 if (r
== -ENOENT
&& arg_link_journal
== LINK_AUTO
)
799 log_error("Failed to read machine ID from %s: %s", p
, strerror(-r
));
804 if (isempty(id
) && arg_link_journal
== LINK_AUTO
)
807 /* Verify validity */
808 r
= sd_id128_from_string(id
, &machine_id
);
810 log_error("Failed to parse machine ID from %s: %s", p
, strerror(-r
));
815 p
= strappend("/var/log/journal/", id
);
816 q
= strjoin(directory
, "/var/log/journal/", id
, NULL
);
820 if (path_is_mount_point(p
, false) > 0) {
821 if (arg_link_journal
!= LINK_AUTO
) {
822 log_error("%s: already a mount point, refusing to use for journal", p
);
829 if (path_is_mount_point(q
, false) > 0) {
830 if (arg_link_journal
!= LINK_AUTO
) {
831 log_error("%s: already a mount point, refusing to use for journal", q
);
838 r
= readlink_and_make_absolute(p
, &d
);
840 if ((arg_link_journal
== LINK_GUEST
||
841 arg_link_journal
== LINK_AUTO
) &&
844 r
= mkdir_p(q
, 0755);
846 log_warning("failed to create directory %s: %m", q
);
851 log_error("Failed to remove symlink %s: %m", p
);
854 } else if (r
== -EINVAL
) {
856 if (arg_link_journal
== LINK_GUEST
&&
859 if (errno
== ENOTDIR
) {
860 log_error("%s already exists and is neither a symlink nor a directory", p
);
863 log_error("Failed to remove %s: %m", p
);
867 } else if (r
!= -ENOENT
) {
868 log_error("readlink(%s) failed: %m", p
);
872 if (arg_link_journal
== LINK_GUEST
) {
874 if (symlink(q
, p
) < 0) {
875 log_error("Failed to symlink %s to %s: %m", q
, p
);
879 r
= mkdir_p(q
, 0755);
881 log_warning("failed to create directory %s: %m", q
);
885 if (arg_link_journal
== LINK_HOST
) {
886 r
= mkdir_p(p
, 0755);
888 log_error("Failed to create %s: %m", p
);
892 } else if (access(p
, F_OK
) < 0)
895 if (dir_is_empty(q
) == 0) {
896 log_error("%s not empty.", q
);
900 r
= mkdir_p(q
, 0755);
902 log_error("Failed to create %s: %m", q
);
906 if (mount(p
, q
, "bind", MS_BIND
, NULL
) < 0) {
907 log_error("Failed to bind mount journal from host into guest: %m");
/* Creates the cgroup `path` and attaches to it, first in the
 * SYSTEMD_CGROUP_CONTROLLER hierarchy and then in every controller
 * listed in arg_controllers.
 *
 * A failure in the systemd hierarchy is logged as an error; a failure in
 * one of the additional controllers is only logged as a warning.
 *
 * NOTE(review): the third argument to cg_create_and_attach() is the
 * literal 1 in both calls; its meaning is defined by that helper, which
 * is not part of this view. */
914 static int setup_cgroup(const char *path
) {
918 r
= cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER
, path
, 1);
920 log_error("Failed to create cgroup: %s", strerror(-r
));
924 STRV_FOREACH(c
, arg_controllers
) {
925 r
= cg_create_and_attach(*c
, path
, 1);
927 log_warning("Failed to create cgroup in controller %s: %s", *c
, strerror(-r
));
/* Records container metadata as extended attributes on the container's
 * cgroup directory.
 *
 * cgroup:    cgroup path, resolved to a file system path with
 *            cg_get_path() in the SYSTEMD_CGROUP_CONTROLLER hierarchy.
 * pid:       stored, formatted as decimal text, in "trusted.init_pid".
 * uuid:      stored in "trusted.machine_id".
 * directory: stored in "trusted.root_directory".
 *
 * All three attributes are written with XATTR_CREATE. A setxattr()
 * failure is only logged as a warning.
 *
 * NOTE(review): the assert below checks the global arg_directory, while
 * the value actually passed to strlen()/setxattr() is the `directory`
 * parameter -- confirm which one was meant to be asserted.
 * NOTE(review): strlen(uuid) requires uuid to be non-NULL; the caller
 * passes arg_uuid, which defaults to NULL. Any guard around that call is
 * not visible in this view -- confirm it exists. */
933 static int save_attributes(const char *cgroup
, pid_t pid
, const char *uuid
, const char *directory
) {
935 _cleanup_free_
char *path
= NULL
;
936 char buf
[DECIMAL_STR_MAX(pid_t
)];
941 assert(arg_directory
);
943 assert_se(snprintf(buf
, sizeof(buf
), "%lu", (unsigned long) pid
) < (int) sizeof(buf
));
945 r
= cg_get_path(SYSTEMD_CGROUP_CONTROLLER
, cgroup
, NULL
, &path
);
947 log_error("Failed to get path: %s", strerror(-r
));
951 r
= setxattr(path
, "trusted.init_pid", buf
, strlen(buf
), XATTR_CREATE
);
953 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path
);
956 k
= setxattr(path
, "trusted.machine_id", uuid
, strlen(uuid
), XATTR_CREATE
);
958 log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path
);
964 k
= setxattr(path
, "trusted.root_directory", directory
, strlen(directory
), XATTR_CREATE
);
966 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path
);
/* Passes the bitwise complement of arg_retain -- i.e. every capability
 * bit that is NOT in the retain mask -- to
 * capability_bounding_set_drop() and returns its result unchanged.
 * The meaning of the second argument (false) is defined by that helper,
 * which is not part of this view. */
976 static int drop_capabilities(void) {
977 return capability_bounding_set_drop(~arg_retain
, false);
980 static int process_pty(int master
, pid_t pid
, sigset_t
*mask
) {
982 char in_buffer
[LINE_MAX
], out_buffer
[LINE_MAX
];
983 size_t in_buffer_full
= 0, out_buffer_full
= 0;
984 struct epoll_event stdin_ev
, stdout_ev
, master_ev
, signal_ev
;
985 bool stdin_readable
= false, stdout_writable
= false, master_readable
= false, master_writable
= false;
986 int ep
= -1, signal_fd
= -1, r
;
987 bool tried_orderly_shutdown
= false;
993 fd_nonblock(STDIN_FILENO
, 1);
994 fd_nonblock(STDOUT_FILENO
, 1);
995 fd_nonblock(master
, 1);
997 signal_fd
= signalfd(-1, mask
, SFD_NONBLOCK
|SFD_CLOEXEC
);
999 log_error("signalfd(): %m");
1004 ep
= epoll_create1(EPOLL_CLOEXEC
);
1006 log_error("Failed to create epoll: %m");
1011 /* We read from STDIN only if this is actually a TTY,
1012 * otherwise we assume non-interactivity. */
1013 if (isatty(STDIN_FILENO
)) {
1015 stdin_ev
.events
= EPOLLIN
|EPOLLET
;
1016 stdin_ev
.data
.fd
= STDIN_FILENO
;
1018 if (epoll_ctl(ep
, EPOLL_CTL_ADD
, STDIN_FILENO
, &stdin_ev
) < 0) {
1019 log_error("Failed to register STDIN in epoll: %m");
1026 stdout_ev
.events
= EPOLLOUT
|EPOLLET
;
1027 stdout_ev
.data
.fd
= STDOUT_FILENO
;
1030 master_ev
.events
= EPOLLIN
|EPOLLOUT
|EPOLLET
;
1031 master_ev
.data
.fd
= master
;
1034 signal_ev
.events
= EPOLLIN
;
1035 signal_ev
.data
.fd
= signal_fd
;
1037 if (epoll_ctl(ep
, EPOLL_CTL_ADD
, STDOUT_FILENO
, &stdout_ev
) < 0) {
1038 if (errno
!= EPERM
) {
1039 log_error("Failed to register stdout in epoll: %m");
1043 /* stdout without epoll support. Likely redirected to regular file. */
1044 stdout_writable
= true;
1047 if (epoll_ctl(ep
, EPOLL_CTL_ADD
, master
, &master_ev
) < 0 ||
1048 epoll_ctl(ep
, EPOLL_CTL_ADD
, signal_fd
, &signal_ev
) < 0) {
1049 log_error("Failed to register fds in epoll: %m");
1055 struct epoll_event ev
[16];
1059 nfds
= epoll_wait(ep
, ev
, ELEMENTSOF(ev
), -1);
1062 if (errno
== EINTR
|| errno
== EAGAIN
)
1065 log_error("epoll_wait(): %m");
1072 for (i
= 0; i
< nfds
; i
++) {
1073 if (ev
[i
].data
.fd
== STDIN_FILENO
) {
1075 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
1076 stdin_readable
= true;
1078 } else if (ev
[i
].data
.fd
== STDOUT_FILENO
) {
1080 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
1081 stdout_writable
= true;
1083 } else if (ev
[i
].data
.fd
== master
) {
1085 if (ev
[i
].events
& (EPOLLIN
|EPOLLHUP
))
1086 master_readable
= true;
1088 if (ev
[i
].events
& (EPOLLOUT
|EPOLLHUP
))
1089 master_writable
= true;
1091 } else if (ev
[i
].data
.fd
== signal_fd
) {
1092 struct signalfd_siginfo sfsi
;
1095 n
= read(signal_fd
, &sfsi
, sizeof(sfsi
));
1096 if (n
!= sizeof(sfsi
)) {
1099 log_error("Failed to read from signalfd: invalid block size");
1104 if (errno
!= EINTR
&& errno
!= EAGAIN
) {
1105 log_error("Failed to read from signalfd: %m");
1111 if (sfsi
.ssi_signo
== SIGWINCH
) {
1114 /* The window size changed, let's forward that. */
1115 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
1116 ioctl(master
, TIOCSWINSZ
, &ws
);
1117 } else if (sfsi
.ssi_signo
== SIGTERM
&& arg_boot
&& !tried_orderly_shutdown
) {
1119 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1121 /* This only works for systemd... */
1122 tried_orderly_shutdown
= true;
1123 kill(pid
, SIGRTMIN
+3);
1133 while ((stdin_readable
&& in_buffer_full
<= 0) ||
1134 (master_writable
&& in_buffer_full
> 0) ||
1135 (master_readable
&& out_buffer_full
<= 0) ||
1136 (stdout_writable
&& out_buffer_full
> 0)) {
1138 if (stdin_readable
&& in_buffer_full
< LINE_MAX
) {
1140 k
= read(STDIN_FILENO
, in_buffer
+ in_buffer_full
, LINE_MAX
- in_buffer_full
);
1143 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
1144 stdin_readable
= false;
1146 log_error("read(): %m");
1151 in_buffer_full
+= (size_t) k
;
1154 if (master_writable
&& in_buffer_full
> 0) {
1156 k
= write(master
, in_buffer
, in_buffer_full
);
1159 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
1160 master_writable
= false;
1162 log_error("write(): %m");
1168 assert(in_buffer_full
>= (size_t) k
);
1169 memmove(in_buffer
, in_buffer
+ k
, in_buffer_full
- k
);
1170 in_buffer_full
-= k
;
1174 if (master_readable
&& out_buffer_full
< LINE_MAX
) {
1176 k
= read(master
, out_buffer
+ out_buffer_full
, LINE_MAX
- out_buffer_full
);
1179 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
1180 master_readable
= false;
1182 log_error("read(): %m");
1187 out_buffer_full
+= (size_t) k
;
1190 if (stdout_writable
&& out_buffer_full
> 0) {
1192 k
= write(STDOUT_FILENO
, out_buffer
, out_buffer_full
);
1195 if (errno
== EAGAIN
|| errno
== EPIPE
|| errno
== ECONNRESET
|| errno
== EIO
)
1196 stdout_writable
= false;
1198 log_error("write(): %m");
1204 assert(out_buffer_full
>= (size_t) k
);
1205 memmove(out_buffer
, out_buffer
+ k
, out_buffer_full
- k
);
1206 out_buffer_full
-= k
;
1214 close_nointr_nofail(ep
);
1217 close_nointr_nofail(signal_fd
);
1222 int main(int argc
, char *argv
[]) {
1224 int r
= EXIT_FAILURE
, k
;
1225 _cleanup_free_
char *newcg
= NULL
;
1226 _cleanup_close_
int master
= -1;
1228 const char *console
= NULL
;
1229 struct termios saved_attr
, raw_attr
;
1231 bool saved_attr_valid
= false;
1233 int kmsg_socket_pair
[2] = { -1, -1 };
1236 log_parse_environment();
1239 k
= parse_argv(argc
, argv
);
1247 if (arg_directory
) {
1250 p
= path_make_absolute_cwd(arg_directory
);
1251 free(arg_directory
);
1254 arg_directory
= get_current_dir_name();
1256 if (!arg_directory
) {
1257 log_error("Failed to determine path, please use -D.");
1261 path_kill_slashes(arg_directory
);
1264 arg_machine
= strdup(path_get_file_name(arg_directory
));
1270 hostname_cleanup(arg_machine
, false);
1271 if (isempty(arg_machine
)) {
1272 log_error("Failed to determine machine name automatically, please use -M.");
1277 if (geteuid() != 0) {
1278 log_error("Need to be root.");
1282 if (sd_booted() <= 0) {
1283 log_error("Not running on a systemd system.");
1287 if (path_equal(arg_directory
, "/")) {
1288 log_error("Spawning container on root directory not supported.");
1292 if (path_is_os_tree(arg_directory
) <= 0) {
1293 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory
);
1298 n_fd_passed
= sd_listen_fds(false);
1299 if (n_fd_passed
> 0) {
1300 k
= fdset_new_listen_fds(&fds
, false);
1302 log_error("Failed to collect file descriptors: %s", strerror(-k
));
1306 fdset_close_others(fds
);
1309 k
= cg_get_machine_path(arg_machine
, &newcg
);
1311 log_error("Failed to determine machine cgroup path: %s", strerror(-k
));
1315 k
= cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER
, newcg
, true);
1316 if (k
<= 0 && k
!= -ENOENT
) {
1317 log_error("Container already running.");
1325 master
= posix_openpt(O_RDWR
|O_NOCTTY
|O_CLOEXEC
|O_NDELAY
);
1327 log_error("Failed to acquire pseudo tty: %m");
1331 console
= ptsname(master
);
1333 log_error("Failed to determine tty name: %m");
1337 log_info("Spawning namespace container on %s (console is %s).", arg_directory
, console
);
1339 if (ioctl(STDIN_FILENO
, TIOCGWINSZ
, &ws
) >= 0)
1340 ioctl(master
, TIOCSWINSZ
, &ws
);
1342 if (unlockpt(master
) < 0) {
1343 log_error("Failed to unlock tty: %m");
1347 if (tcgetattr(STDIN_FILENO
, &saved_attr
) >= 0) {
1348 saved_attr_valid
= true;
1350 raw_attr
= saved_attr
;
1351 cfmakeraw(&raw_attr
);
1352 raw_attr
.c_lflag
&= ~ECHO
;
1355 if (socketpair(AF_UNIX
, SOCK_DGRAM
|SOCK_NONBLOCK
|SOCK_CLOEXEC
, 0, kmsg_socket_pair
) < 0) {
1356 log_error("Failed to create kmsg socket pair.");
1360 sd_notify(0, "READY=1");
1362 assert_se(sigemptyset(&mask
) == 0);
1363 sigset_add_many(&mask
, SIGCHLD
, SIGWINCH
, SIGTERM
, SIGINT
, -1);
1364 assert_se(sigprocmask(SIG_BLOCK
, &mask
, NULL
) == 0);
1368 int pipefd
[2], pipefd2
[2];
1370 if (pipe2(pipefd
, O_NONBLOCK
|O_CLOEXEC
) < 0) {
1371 log_error("pipe2(): %m");
1375 if (pipe2(pipefd2
, O_NONBLOCK
|O_CLOEXEC
) < 0) {
1376 log_error("pipe2(): %m");
1381 pid
= syscall(__NR_clone
, SIGCHLD
|CLONE_NEWIPC
|CLONE_NEWNS
|CLONE_NEWPID
|CLONE_NEWUTS
|(arg_private_network
? CLONE_NEWNET
: 0), NULL
);
1383 if (errno
== EINVAL
)
1384 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1386 log_error("clone() failed: %m");
1393 const char *home
= NULL
;
1394 uid_t uid
= (uid_t
) -1;
1395 gid_t gid
= (gid_t
) -1;
1397 const char *envp
[] = {
1398 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1399 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1404 NULL
, /* container_uuid */
1405 NULL
, /* LISTEN_FDS */
1406 NULL
, /* LISTEN_PID */
1410 envp
[n_env
] = strv_find_prefix(environ
, "TERM=");
1414 /* Wait for the parent process to log our PID */
1415 close_nointr_nofail(pipefd
[1]);
1416 fd_wait_for_event(pipefd
[0], POLLHUP
, -1);
1417 close_nointr_nofail(pipefd
[0]);
1419 close_nointr_nofail(master
);
1422 if (saved_attr_valid
) {
1423 if (tcsetattr(STDIN_FILENO
, TCSANOW
, &raw_attr
) < 0) {
1424 log_error("Failed to set terminal attributes: %m");
1429 close_nointr(STDIN_FILENO
);
1430 close_nointr(STDOUT_FILENO
);
1431 close_nointr(STDERR_FILENO
);
1433 close_nointr_nofail(kmsg_socket_pair
[0]);
1434 kmsg_socket_pair
[0] = -1;
1436 reset_all_signal_handlers();
1438 assert_se(sigemptyset(&mask
) == 0);
1439 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
1441 k
= open_terminal(console
, O_RDWR
);
1442 if (k
!= STDIN_FILENO
) {
1444 close_nointr_nofail(k
);
1448 log_error("Failed to open console: %s", strerror(-k
));
1452 if (dup2(STDIN_FILENO
, STDOUT_FILENO
) != STDOUT_FILENO
||
1453 dup2(STDIN_FILENO
, STDERR_FILENO
) != STDERR_FILENO
) {
1454 log_error("Failed to duplicate console: %m");
1459 log_error("setsid() failed: %m");
1463 if (prctl(PR_SET_PDEATHSIG
, SIGKILL
) < 0) {
1464 log_error("PR_SET_PDEATHSIG failed: %m");
1468 if (setup_cgroup(newcg
) < 0)
1471 close_pipe(pipefd2
);
1473 /* Mark everything as slave, so that we still
1474 * receive mounts from the real root, but don't
1475 * propagate mounts to the real root. */
1476 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0) {
1477 log_error("MS_SLAVE|MS_REC failed: %m");
1481 /* Turn directory into bind mount */
1482 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REC
, NULL
) < 0) {
1483 log_error("Failed to make bind mount.");
1488 if (mount(arg_directory
, arg_directory
, "bind", MS_BIND
|MS_REMOUNT
|MS_RDONLY
|MS_REC
, NULL
) < 0) {
1489 log_error("Failed to make read-only.");
1493 if (mount_all(arg_directory
) < 0)
1496 if (copy_devnodes(arg_directory
) < 0)
1499 if (setup_ptmx(arg_directory
) < 0)
1502 dev_setup(arg_directory
);
1504 if (setup_dev_console(arg_directory
, console
) < 0)
1507 if (setup_kmsg(arg_directory
, kmsg_socket_pair
[1]) < 0)
1510 close_nointr_nofail(kmsg_socket_pair
[1]);
1511 kmsg_socket_pair
[1] = -1;
1513 if (setup_boot_id(arg_directory
) < 0)
1516 if (setup_timezone(arg_directory
) < 0)
1519 if (setup_resolv_conf(arg_directory
) < 0)
1522 if (setup_journal(arg_directory
) < 0)
1525 if (mount_binds(arg_directory
, arg_bind
, 0) < 0)
1528 if (mount_binds(arg_directory
, arg_bind_ro
, MS_RDONLY
) < 0)
1531 if (chdir(arg_directory
) < 0) {
1532 log_error("chdir(%s) failed: %m", arg_directory
);
1536 if (mount(arg_directory
, "/", NULL
, MS_MOVE
, NULL
) < 0) {
1537 log_error("mount(MS_MOVE) failed: %m");
1541 if (chroot(".") < 0) {
1542 log_error("chroot() failed: %m");
1546 if (chdir("/") < 0) {
1547 log_error("chdir() failed: %m");
1555 if (drop_capabilities() < 0) {
1556 log_error("drop_capabilities() failed: %m");
1562 /* Note that this resolves user names
1563 * inside the container, and hence
1564 * accesses the NSS modules from the
1565 * container and not the host. This is
1568 if (get_user_creds((const char**)&arg_user
, &uid
, &gid
, &home
, NULL
) < 0) {
1569 log_error("get_user_creds() failed: %m");
1573 if (mkdir_parents_label(home
, 0775) < 0) {
1574 log_error("mkdir_parents_label() failed: %m");
1578 if (mkdir_safe_label(home
, 0775, uid
, gid
) < 0) {
1579 log_error("mkdir_safe_label() failed: %m");
1583 if (initgroups((const char*)arg_user
, gid
) < 0) {
1584 log_error("initgroups() failed: %m");
1588 if (setresgid(gid
, gid
, gid
) < 0) {
1589 log_error("setregid() failed: %m");
1593 if (setresuid(uid
, uid
, uid
) < 0) {
1594 log_error("setreuid() failed: %m");
1598 /* Reset everything fully to 0, just in case */
1600 if (setgroups(0, NULL
) < 0) {
1601 log_error("setgroups() failed: %m");
1605 if (setresgid(0, 0, 0) < 0) {
1606 log_error("setregid() failed: %m");
1610 if (setresuid(0, 0, 0) < 0) {
1611 log_error("setreuid() failed: %m");
1616 if ((asprintf((char**)(envp
+ n_env
++), "HOME=%s", home
? home
: "/root") < 0) ||
1617 (asprintf((char**)(envp
+ n_env
++), "USER=%s", arg_user
? arg_user
: "root") < 0) ||
1618 (asprintf((char**)(envp
+ n_env
++), "LOGNAME=%s", arg_user
? arg_user
: "root") < 0)) {
1624 if (asprintf((char**)(envp
+ n_env
++), "container_uuid=%s", arg_uuid
) < 0) {
1630 if (fdset_size(fds
) > 0) {
1631 k
= fdset_cloexec(fds
, false);
1633 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1637 if ((asprintf((char **)(envp
+ n_env
++), "LISTEN_FDS=%u", n_fd_passed
) < 0) ||
1638 (asprintf((char **)(envp
+ n_env
++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1650 /* Automatically search for the init system */
1652 l
= 1 + argc
- optind
;
1653 a
= newa(char*, l
+ 1);
1654 memcpy(a
+ 1, argv
+ optind
, l
* sizeof(char*));
1656 a
[0] = (char*) "/usr/lib/systemd/systemd";
1657 execve(a
[0], a
, (char**) envp
);
1659 a
[0] = (char*) "/lib/systemd/systemd";
1660 execve(a
[0], a
, (char**) envp
);
1662 a
[0] = (char*) "/sbin/init";
1663 execve(a
[0], a
, (char**) envp
);
1664 } else if (argc
> optind
)
1665 execvpe(argv
[optind
], argv
+ optind
, (char**) envp
);
1667 chdir(home
? home
: "/root");
1668 execle("/bin/bash", "-bash", NULL
, (char**) envp
);
1671 log_error("execv() failed: %m");
1674 _exit(EXIT_FAILURE
);
1677 log_info("Init process in the container running as PID %lu.", (unsigned long) pid
);
1678 close_nointr_nofail(pipefd
[0]);
1679 close_nointr_nofail(pipefd
[1]);
1681 /* Wait for the child process to establish cgroup hierarchy */
1682 close_nointr_nofail(pipefd2
[1]);
1683 fd_wait_for_event(pipefd2
[0], POLLHUP
, -1);
1684 close_nointr_nofail(pipefd2
[0]);
1686 save_attributes(newcg
, pid
, arg_uuid
, arg_directory
);
1691 if (process_pty(master
, pid
, &mask
) < 0)
1694 if (saved_attr_valid
)
1695 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1697 k
= wait_for_terminate(pid
, &status
);
1703 if (status
.si_code
== CLD_EXITED
) {
1704 r
= status
.si_status
;
1705 if (status
.si_status
!= 0) {
1706 log_error("Container failed with error code %i.", status
.si_status
);
1710 log_debug("Container exited successfully.");
1712 } else if (status
.si_code
== CLD_KILLED
&&
1713 status
.si_status
== SIGINT
) {
1714 log_info("Container has been shut down.");
1717 } else if (status
.si_code
== CLD_KILLED
&&
1718 status
.si_status
== SIGHUP
) {
1719 log_info("Container is being rebooted.");
1721 } else if (status
.si_code
== CLD_KILLED
||
1722 status
.si_code
== CLD_DUMPED
) {
1724 log_error("Container terminated by signal %s.", signal_to_string(status
.si_status
));
1728 log_error("Container failed due to unknown reason.");
1735 if (saved_attr_valid
)
1736 tcsetattr(STDIN_FILENO
, TCSANOW
, &saved_attr
);
1738 close_pipe(kmsg_socket_pair
);
1741 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER
, newcg
, true);
1743 free(arg_directory
);
1745 strv_free(arg_controllers
);