}
#endif
+/* Parses the LISTEN_FDS environment variable value.
+ * The returned value is the highest fd number up to which the
+ * file descriptors must be passed to the container process.
+ *
+ * Passed file descriptors are numbered consecutively starting right after
+ * the standard streams, i.e. at fd 3. So LISTEN_FDS=N covers the fds
+ * 3 .. N + 2 and the highest fd number to preserve is N + 2.
+ *
+ * For example, if LISTEN_FDS=2 then 4 is returned and file descriptors 3 and 4
+ * MUST be passed to the container process (in addition to the standard streams)
+ * to support socket activation (see sd_listen_fds(3)).
+ *
+ * Returns 0 if LISTEN_FDS is unset or cannot be parsed as an unsigned
+ * integer. Callers compare an fd against the result with "<=".
+ */
+static unsigned int get_listen_fds_max(void)
+{
+	int ret;
+	unsigned int num_fds;
+	const char *val;
+
+	val = getenv("LISTEN_FDS");
+	if (!val)
+		return 0;
+
+	/* A negative return is treated as a parse failure. */
+	ret = lxc_safe_uint(val, &num_fds);
+	if (ret < 0)
+		return syserror_ret(0, "Failed to parse \"LISTEN_FDS=%s\" environment variable", val);
+
+	/*
+	 * Convert the fd count into the highest fd number: the first passed
+	 * fd is 3, so the last one is num_fds + 2. Returning the bare count
+	 * would cause the passed fds to be treated as ordinary inherited fds.
+	 */
+	return log_trace(num_fds + 2, "Parsed \"LISTEN_FDS=%s\" environment variable", val);
+}
+
+
int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
int *fds_to_ignore, size_t len_fds)
{
size_t i;
DIR *dir;
struct dirent *direntp;
+ unsigned int listen_fds_max;
if (conf && conf->close_all_fds)
closeall = true;
+ listen_fds_max = get_listen_fds_max();
+
/*
* Disable syslog at this point to avoid the above logging
* function to open a new fd and make the check_inherited function
continue;
#endif
+
+ if (fd <= listen_fds_max) {
+ INFO("Inheriting fd %d (using the LISTEN_FDS environment variable)", fd);
+ continue;
+ }
+
if (closeall) {
if (close(fd))
SYSINFO("Closed inherited fd %d", fd);
}
static int signal_handler(int fd, uint32_t events, void *data,
- struct lxc_epoll_descr *descr)
+ struct lxc_async_descr *descr)
{
int ret;
siginfo_t info;
if (ret == 0 && info.si_pid == hdlr->pid)
hdlr->init_died = true;
+ TRACE("Received signal ssi_signo(%d) for ssi_pid(%d), si_signo(%d), si_pid(%d)",
+ siginfo.ssi_signo, siginfo.ssi_pid, info.si_signo, info.si_pid);
+
/* Try to figure out a reasonable exit status to report. */
if (hdlr->init_died) {
switch (info.si_code) {
int lxc_poll(const char *name, struct lxc_handler *handler)
{
int ret;
- bool has_console = true;
- struct lxc_epoll_descr descr, descr_console;
+ struct lxc_terminal *console = &handler->conf->console;
+ struct lxc_async_descr descr, descr_console;
- if (handler->conf->console.path &&
- strequal(handler->conf->console.path, "none"))
- has_console = false;
+ if (!wants_console(console))
+ console = NULL;
ret = lxc_mainloop_open(&descr);
if (ret < 0) {
goto out_sigfd;
}
- if (has_console) {
+ if (console) {
ret = lxc_mainloop_open(&descr_console);
if (ret < 0) {
ERROR("Failed to create console mainloop");
}
}
- ret = lxc_mainloop_add_handler(&descr, handler->sigfd, signal_handler, handler);
+ ret = lxc_mainloop_add_handler(&descr, handler->sigfd,
+ signal_handler,
+ default_cleanup_handler,
+ handler, "signal_handler");
if (ret < 0) {
ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
goto out_mainloop_console;
goto out_mainloop_console;
}
- if (has_console) {
- struct lxc_terminal *console = &handler->conf->console;
-
+ if (console) {
ret = lxc_terminal_mainloop_add(&descr, console);
if (ret < 0) {
ERROR("Failed to add console handlers to mainloop");
goto out_mainloop_console;
}
-
- ret = lxc_terminal_mainloop_add(&descr_console, console);
- if (ret < 0) {
- ERROR("Failed to add console handlers to console mainloop");
- goto out_mainloop_console;
- }
-
- handler->conf->console.descr = &descr;
}
ret = lxc_cmd_mainloop_add(name, &descr, handler);
if (ret < 0 || !handler->init_died)
goto out_mainloop_console;
- if (has_console)
- ret = lxc_mainloop(&descr_console, 0);
+ if (console) {
+ ret = lxc_terminal_mainloop_add(&descr_console, console);
+ if (ret == 0)
+ ret = lxc_mainloop(&descr_console, 0);
+ }
out_mainloop_console:
- if (has_console) {
+ if (console) {
lxc_mainloop_close(&descr_console);
TRACE("Closed console mainloop");
}
}
if (handler->conf->reboot == REBOOT_NONE) {
- handler->conf->maincmd_fd = lxc_cmd_init(name, lxcpath, "command");
+ handler->conf->maincmd_fd = lxc_server_init(name, lxcpath, "command");
if (handler->conf->maincmd_fd < 0) {
ERROR("Failed to set up command socket");
goto on_error;
return log_error(-1, "Failed to run lxc.hook.pre-start for container \"%s\"", name);
TRACE("Ran pre-start hooks");
+ ret = lxc_terminal_parent(conf);
+ if (ret < 0)
+ return log_error(-1, "Failed to allocate terminal");
+
/* The signal fd has to be created before forking otherwise if the child
* process exits before we setup the signal fd, the event will be lost
* and the command will be stuck.
return log_error(-1, "Failed to setup SIGCHLD fd handler.");
TRACE("Set up signal fd");
- /* Do this after setting up signals since it might unblock SIGWINCH. */
- ret = lxc_terminal_setup(conf);
- if (ret < 0) {
- ERROR("Failed to create console");
- goto out_restore_sigmask;
- }
- TRACE("Created console");
-
handler->cgroup_ops = cgroup_init(handler->conf);
if (!handler->cgroup_ops) {
ERROR("Failed to initialize cgroup driver");
- goto out_delete_terminal;
+ goto out_restore_sigmask;
}
TRACE("Initialized cgroup driver");
ret = lxc_read_seccomp_config(conf);
- if (ret < 0)
- return log_error(-1, "Failed loading seccomp policy");
+ if (ret < 0) {
+ ERROR("Failed to read seccomp policy");
+ goto out_restore_sigmask;
+ }
TRACE("Read seccomp policy");
ret = handler->lsm_ops->prepare(handler->lsm_ops, conf, handler->lxcpath);
if (ret < 0) {
ERROR("Failed to initialize LSM");
- goto out_delete_terminal;
+ goto out_restore_sigmask;
}
TRACE("Initialized LSM");
handler->monitor_status_fd = move_fd(status_fd);
return 0;
-out_delete_terminal:
- lxc_terminal_delete(&handler->conf->console);
-
out_restore_sigmask:
(void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
lxc_sync_fini_parent(handler);
- if (lxc_abstract_unix_recv_fds(data_sock1, &status_fd, 1, NULL, 0) < 0) {
- ERROR("Failed to receive status file descriptor to child process");
+ if (lxc_abstract_unix_recv_one_fd(data_sock1, &status_fd, NULL, 0) < 0) {
+ ERROR("Failed to receive status file descriptor from parent process");
goto out_warn_father;
}
INFO("Unshared CLONE_NEWNET");
}
- /* Tell the parent task it can begin to configure the container and wait
- * for it to finish.
- */
- if (!lxc_sync_barrier_parent(handler, START_SYNC_CONFIGURE))
- goto out_error;
-
- if (handler->ns_clone_flags & CLONE_NEWNET) {
- ret = lxc_network_recv_from_parent(handler);
- if (ret < 0) {
- ERROR("Failed to receive veth names from parent");
- goto out_warn_father;
- }
- }
-
/* If we are in a new user namespace, become root there to have
* privilege over our namespace.
*/
}
}
- /* Ask father to setup cgroups and wait for him to finish. */
- if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP))
+ /*
+ * Tell the parent task it can begin to configure the container and wait
+ * for it to finish.
+ */
+ if (!lxc_sync_wake_parent(handler, START_SYNC_CONFIGURE))
goto out_error;
/* Unshare cgroup namespace after we have setup our cgroups. If we do it
}
ret = setns(timens_fd, CLONE_NEWTIME);
- if (ret) {
+ if (ret) {
SYSERROR("Failed to setns(%d(\"/proc/self/ns/time_for_children\"))", timens_fd);
goto out_warn_father;
}
}
}
+ if (!lxc_sync_wait_parent(handler, START_SYNC_POST_CONFIGURE))
+ goto out_warn_father;
+
/* Setup the container, ip, names, utsname, ... */
ret = lxc_setup(handler);
if (ret < 0) {
if (ret < 0)
goto out_warn_father;
- ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
- if (ret < 0) {
- SYSERROR("Failed to send seccomp notify fd to parent");
- goto out_warn_father;
- }
-
ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
if (ret < 0) {
ERROR("Failed to run lxc.hook.start for container \"%s\"",
if (!lxc_sync_barrier_parent(handler, START_SYNC_CGROUP_LIMITS))
goto out_warn_father;
+ ret = lxc_sync_fds_child(handler);
+ if (ret < 0) {
+ SYSERROR("Failed to sync file descriptors with parent");
+ goto out_warn_father;
+ }
+
+ if (!lxc_sync_wait_parent(handler, START_SYNC_READY_START))
+ goto out_warn_father;
+
/* Reset the environment variables the user requested in a clear
* environment.
*/
return -1;
}
-static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
-{
- int i;
- struct lxc_terminal_info *tty;
- int ret = -1;
- int sock = handler->data_sock[1];
- struct lxc_conf *conf = handler->conf;
- struct lxc_tty_info *ttys = &conf->ttys;
-
- if (!conf->ttys.max)
- return 0;
-
- ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
- if (!ttys->tty)
- return -1;
-
- for (i = 0; i < conf->ttys.max; i++) {
- int ttyfds[2];
-
- ret = lxc_abstract_unix_recv_fds(sock, ttyfds, 2, NULL, 0);
- if (ret < 0)
- break;
-
- tty = &ttys->tty[i];
- tty->busy = -1;
- tty->ptx = ttyfds[0];
- tty->pty = ttyfds[1];
- TRACE("Received pty with ptx fd %d and pty fd %d from child", tty->ptx, tty->pty);
- }
-
- if (ret < 0)
- SYSERROR("Failed to receive %zu ttys from child", ttys->max);
- else
- TRACE("Received %zu ttys from child", ttys->max);
-
- return ret;
-}
-
int resolve_clone_flags(struct lxc_handler *handler)
{
int i;
* newer glibc versions where the getpid() cache is removed and the pid/tid is
* not reset anymore.
* However, if for whatever reason you - dear committer - somehow need to get the
- * pid of the dummy intermediate process for do_share_ns() you need to call
- * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
- * will be fine.
+ * pid of the placeholder intermediate process for do_share_ns() you need to
+ * call lxc_raw_getpid(). The next lxc_raw_clone() call does not employ
+ * CLONE_VM and will be fine.
*/
static inline int do_share_ns(void *arg)
{
goto out_delete_net;
}
- /* If the rootfs is not a blockdev, prevent the container from marking
- * it readonly.
- * If the container is unprivileged then skip rootfs pinning.
- */
- ret = lxc_rootfs_prepare(&conf->rootfs, wants_to_map_ids);
- if (ret) {
- ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
- goto out_delete_net;
- }
-
/* Create a process in a new set of namespaces. */
if (share_ns) {
pid_t attacher_pid;
/* Kernel might be too old for clone3(). */
if (handler->pid < 0) {
SYSTRACE("Failed to spawn container via clone3()");
+
+ /*
+ * In contrast to all other architectures arm64 verifies that
+ * the argument we use to retrieve the pidfd with is
+ * initialized to 0. But we need to be able to initialize it to
+ * a negative value such as our customary -EBADF so we can
+ * detect whether this kernel supports pidfds. If the syscall
+ * returns and the pidfd variable is set to something >= 0 then
+ * we know this is a kernel supporting pidfds. But if we can't
+ * set it to -EBADF then this won't work since 0 is a valid
+ * file descriptor too. And since legacy clone silently ignores
+ * unknown flags we are left without any way to detect support
+ * for pidfds. So let's special-case arm64 to not fail starting
+ * containers.
+ */
+ #if defined(__aarch64__)
+ handler->pid = lxc_raw_legacy_clone(handler->clone_flags & ~CLONE_PIDFD, NULL);
+ #else
handler->pid = lxc_raw_legacy_clone(handler->clone_flags, &handler->pidfd);
+ #endif
}
if (handler->pid < 0) {
}
}
- if (!lxc_sync_wake_child(handler, START_SYNC_STARTUP))
- goto out_delete_net;
-
- if (!lxc_sync_wait_child(handler, START_SYNC_CONFIGURE))
- goto out_delete_net;
-
if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
ERROR("Failed to setup cgroup limits for container \"%s\"", name);
goto out_delete_net;
if (!cgroup_ops->chown(cgroup_ops, handler->conf))
goto out_delete_net;
+ if (!lxc_sync_barrier_child(handler, START_SYNC_STARTUP))
+ goto out_delete_net;
+
/* If not done yet, we're now ready to preserve the network namespace */
if (handler->nsfd[LXC_NS_NET] < 0) {
ret = lxc_try_preserve_namespace(handler, LXC_NS_NET, "net");
ERROR("Failed to create the network");
goto out_delete_net;
}
-
- ret = lxc_network_send_to_child(handler);
- if (ret < 0) {
- ERROR("Failed to send veth names to child");
- goto out_delete_net;
- }
}
if (!lxc_list_empty(&conf->procs)) {
goto out_delete_net;
}
- /* Tell the child to continue its initialization. We'll get
- * START_SYNC_CGROUP when it is ready for us to setup cgroups.
- */
- if (!lxc_sync_barrier_child(handler, START_SYNC_POST_CONFIGURE))
- goto out_delete_net;
-
if (!lxc_list_empty(&conf->limits)) {
ret = setup_resource_limits(&conf->limits, handler->pid);
if (ret < 0) {
}
}
- if (!lxc_sync_barrier_child(handler, START_SYNC_CGROUP_UNSHARE))
+ /* Tell the child to continue its initialization. */
+ if (!lxc_sync_wake_child(handler, START_SYNC_POST_CONFIGURE))
+ goto out_delete_net;
+
+ ret = lxc_rootfs_prepare_parent(handler);
+ if (ret) {
+ ERROR("Failed to prepare rootfs");
+ goto out_delete_net;
+ }
+
+ if (handler->ns_clone_flags & CLONE_NEWNET) {
+ ret = lxc_network_send_to_child(handler);
+ if (ret < 0) {
+ SYSERROR("Failed to send veth names to child");
+ goto out_delete_net;
+ }
+ }
+
+ if (!lxc_sync_wait_child(handler, START_SYNC_IDMAPPED_MOUNTS))
+ goto out_delete_net;
+
+ ret = lxc_idmapped_mounts_parent(handler);
+ if (ret) {
+ ERROR("Failed to setup mount entries");
+ goto out_delete_net;
+ }
+
+ if (!lxc_sync_wait_child(handler, START_SYNC_CGROUP_LIMITS))
goto out_delete_net;
/*
}
TRACE("Set up cgroup2 device controller limits");
+ cgroup_ops->finalize(cgroup_ops);
+ TRACE("Finished setting up cgroups");
+
+ /* Run any host-side start hooks */
+ ret = run_lxc_hooks(name, "start-host", conf, NULL);
+ if (ret < 0) {
+ ERROR("Failed to run lxc.hook.start-host");
+ goto out_delete_net;
+ }
+
+ if (!lxc_sync_wake_child(handler, START_SYNC_FDS))
+ goto out_delete_net;
+
if (handler->ns_unshare_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */
ret = lxc_try_preserve_namespace(handler, LXC_NS_CGROUP, "cgroup");
}
}
- cgroup_ops->payload_finalize(cgroup_ops);
- TRACE("Finished setting up cgroups");
-
if (handler->ns_unshare_flags & CLONE_NEWTIME) {
/* Now we're ready to preserve the time namespace */
ret = lxc_try_preserve_namespace(handler, LXC_NS_TIME, "time");
}
}
- /* Run any host-side start hooks */
- ret = run_lxc_hooks(name, "start-host", conf, NULL);
+ ret = lxc_sync_fds_parent(handler);
if (ret < 0) {
- ERROR("Failed to run lxc.hook.start-host");
+ SYSERROR("Failed to sync file descriptors with child");
goto out_delete_net;
}
- /* Tell the child to complete its initialization and wait for it to exec
- * or return an error. (The child will never return
- * START_SYNC_READY_START+1. It will either close the sync pipe, causing
- * lxc_sync_barrier_child to return success, or return a different
- * value, causing us to error out).
- */
- if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
+ ret = lxc_terminal_setup(conf);
+ if (ret < 0) {
+ SYSERROR("Failed to create console");
goto out_delete_net;
-
- if (handler->ns_clone_flags & CLONE_NEWNET) {
- ret = lxc_network_recv_name_and_ifindex_from_child(handler);
- if (ret < 0) {
- ERROR("Failed to receive names and ifindices for network devices from child");
- goto out_delete_net;
- }
}
- ret = lxc_setup_devpts_parent(handler);
- if (ret < 0) {
- SYSERROR("Failed to receive devpts fd from child");
+ /*
+ * Tell the child to complete its initialization and wait for it to
+ * exec or return an error. (The child will never return
+ * START_SYNC_READY_START+1. It will either close the sync pipe,
+ * causing lxc_sync_barrier_child to return success, or return a
+ * different value, causing us to error out).
+ */
+ if (!lxc_sync_barrier_child(handler, START_SYNC_READY_START))
goto out_delete_net;
- }
/* Now all networks are created, network devices are moved into place,
* and the correct names and ifindices in the respective namespaces have
*/
lxc_log_configured_netdevs(conf);
- /* Read tty fds allocated by child. */
- ret = lxc_recv_ttys_from_child(handler);
- if (ret < 0) {
- ERROR("Failed to receive tty info from child process");
- goto out_delete_net;
- }
-
- ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
- if (ret < 0) {
- SYSERROR("Failed to receive seccomp notify fd from child");
- goto out_delete_net;
- }
-
ret = handler->ops->post_start(handler, handler->data);
if (ret < 0)
goto out_abort;
goto out_abort;
}
+ /* If the rootfs is not a blockdev, prevent the container from marking
+ * it readonly.
+ * If the container is unprivileged then skip rootfs pinning.
+ */
+ ret = lxc_rootfs_init(conf, !lxc_list_empty(&conf->id_map));
+ if (ret) {
+ ERROR("Failed to handle rootfs pinning for container \"%s\"", handler->name);
+ ret = -1;
+ goto out_abort;
+ }
+
if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
- /* If the backing store is a device, mount it here and now. */
+ /*
+ * Most filesystems can't be mounted inside a userns so handle them here.
+ */
if (rootfs_is_blockdev(conf)) {
ret = unshare(CLONE_NEWNS);
if (ret < 0) {
}
INFO("Unshared CLONE_NEWNS");
- turn_into_dependent_mounts();
ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
if (ret < 0) {
ERROR("Error setting up rootfs mount as root before spawn");