#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
-#include <alloca.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include "lxcseccomp.h"
#include "macro.h"
#include "mainloop.h"
+#include "memory_utils.h"
#include "monitor.h"
#include "namespace.h"
#include "network.h"
static void print_top_failing_dir(const char *path)
{
+ __do_free char *copy = NULL;
int ret;
- size_t len;
- char *copy, *e, *p, saved;
-
- len = strlen(path);
- copy = alloca(len + 1);
- (void)strlcpy(copy, path, len + 1);
+ char *e, *p, saved;
+ copy = must_copy_string(path);
p = copy;
- e = copy + len;
+ e = copy + strlen(path);
while (p < e) {
while (p < e && *p == '/')
if (matched)
continue;
-#ifdef HAVE_DLOG
- if (match_dlog_fds(direntp))
- continue;
-
-#endif
if (current_config && fd == current_config->logfd)
continue;
if (match_stdfds(fd))
continue;
+#ifdef HAVE_DLOG
+ if (match_dlog_fds(direntp))
+ continue;
+
+#endif
if (closeall) {
close(fd);
closedir(dir);
}
if (siginfo.ssi_signo == SIGHUP) {
- kill(hdlr->pid, SIGTERM);
+ if (hdlr->pidfd >= 0)
+ lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
+ else if (hdlr->proc_pidfd >= 0)
+ lxc_raw_pidfd_send_signal(hdlr->proc_pidfd, SIGTERM, NULL, 0);
+ else
+ kill(hdlr->pid, SIGTERM);
INFO("Killing %d since terminal hung up", hdlr->pid);
return hdlr->init_died ? LXC_MAINLOOP_CLOSE
: LXC_MAINLOOP_CONTINUE;
}
if (siginfo.ssi_signo != SIGCHLD) {
- kill(hdlr->pid, siginfo.ssi_signo);
+ if (hdlr->pidfd >= 0)
+ lxc_raw_pidfd_send_signal(hdlr->pidfd,
+ siginfo.ssi_signo, NULL, 0);
+ else if (hdlr->proc_pidfd >= 0)
+ lxc_raw_pidfd_send_signal(hdlr->proc_pidfd,
+ siginfo.ssi_signo, NULL, 0);
+ else
+ kill(hdlr->pid, siginfo.ssi_signo);
INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
return hdlr->init_died ? LXC_MAINLOOP_CLOSE
: LXC_MAINLOOP_CONTINUE;
size_t retlen;
ssize_t ret;
struct lxc_list *cur, *next;
- struct lxc_state_client *client;
struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
if (state == THAWED)
return -E2BIG;
lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
- client = cur->elem;
+ struct lxc_state_client *client = cur->elem;
if (client->states[state] == 0) {
TRACE("State %s not registered for state client %d",
goto out_mainloop_console;
}
+ ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
+ if (ret < 0) {
+ ERROR("Failed to setup seccomp proxy");
+ goto out_mainloop_console;
+ }
+
if (has_console) {
struct lxc_terminal *console = &handler->conf->console;
handler->pinfd = -1;
+ handler->pidfd = -EBADF;
+
+ handler->proc_pidfd = -EBADF;
+
handler->sigfd = -1;
for (i = 0; i < LXC_NS_MAX; i++)
if (handler->pinfd >= 0)
close(handler->pinfd);
+ if (handler->pidfd >= 0)
+ close(handler->pidfd);
+
+ if (handler->proc_pidfd >= 0)
+ close(handler->proc_pidfd);
+
if (handler->sigfd >= 0)
close(handler->sigfd);
handler->conf = conf;
handler->lxcpath = lxcpath;
handler->pinfd = -1;
+ handler->pidfd = -EBADF;
+ handler->proc_pidfd = -EBADF;
handler->sigfd = -EBADF;
handler->init_died = false;
handler->state_socket_pair[0] = handler->state_socket_pair[1] = -1;
lxc_set_state(name, handler, STOPPED);
}
+ /* Avoid lingering namespace references. */
+ lxc_put_nsfds(handler);
+
ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
if (ret < 0) {
ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
/*
 * lxc_abort - forcibly stop the container's init process and reap children.
 *
 * Sets the container state to ABORTING, sends SIGKILL to the process tracked
 * by @handler, logs an error if sending the signal failed, and then calls
 * waitpid(-1, ...) repeatedly until it no longer returns a positive pid.
 *
 * In the new ('+') version the signal is delivered through the first usable
 * handle, tried in this order: handler->pidfd, handler->proc_pidfd, and
 * finally a plain kill() on handler->pid. The removed ('-') version only
 * ever used kill() on handler->pid.
 */
void lxc_abort(const char *name, struct lxc_handler *handler)
{
- int ret, status;
+ int ret = 0;
+ int status;
lxc_set_state(name, handler, ABORTING);
/*
 * ret is pre-initialised to 0 in the new version because the error check
 * below now runs even when none of the three branches sends a signal.
 *
 * NOTE(review): these guards test the descriptors with `> 0`, whereas other
 * places in this file test handler->pidfd / handler->proc_pidfd with `>= 0`
 * and initialise them to -EBADF. A descriptor value of 0 would be skipped
 * here - confirm whether `>= 0` was intended.
 */
- if (handler->pid > 0) {
+ if (handler->pidfd > 0)
+ ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
+ else if (handler->proc_pidfd > 0)
+ ret = lxc_raw_pidfd_send_signal(handler->proc_pidfd, SIGKILL, NULL, 0);
+ else if (handler->pid > 0)
ret = kill(handler->pid, SIGKILL);
- if (ret < 0)
- SYSERROR("Failed to send SIGKILL to %d", handler->pid);
- }
+ if (ret < 0)
+ SYSERROR("Failed to send SIGKILL to %d", handler->pid);
/* Reap children until waitpid() stops returning a positive pid. */
- while ((ret = waitpid(-1, &status, 0)) > 0) {
- ;
- }
+ do {
+ ret = waitpid(-1, &status, 0);
+ } while (ret > 0);
}
static int do_start(void *data)
{
+ struct lxc_handler *handler = data;
+ ATTR_UNUSED __do_close_prot_errno int data_sock0 = handler->data_sock[0],
+ data_sock1 = handler->data_sock[1];
int ret;
- char path[PATH_MAX];
uid_t new_uid;
gid_t new_gid;
struct lxc_list *iterator;
uid_t nsuid = 0;
gid_t nsgid = 0;
int devnull_fd = -1;
- struct lxc_handler *handler = data;
lxc_sync_fini_parent(handler);
* exit before we set the pdeath signal leading to an unsupervised
* container.
*/
- ret = lxc_set_death_signal(SIGKILL, 0);
+ ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid);
if (ret < 0) {
SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
goto out_warn_father;
goto out_warn_father;
/* set{g,u}id() clears deathsignal */
- ret = lxc_set_death_signal(SIGKILL, 0);
+ ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid);
if (ret < 0) {
SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
goto out_warn_father;
goto out_warn_father;
}
- ret = snprintf(path, sizeof(path), "%s/dev/null",
- handler->conf->rootfs.mount);
- if (ret < 0 || ret >= sizeof(path))
- goto out_warn_father;
-
/* In order to checkpoint restore, we need to have everything in the
* same mount namespace. However, some containers may not have a
* reasonable /dev (in particular, they may not have /dev/null), so we
* where it isn't wanted.
*/
if (handler->daemonize && !handler->conf->autodev) {
+ char path[PATH_MAX];
+
+ ret = snprintf(path, sizeof(path), "%s/dev/null",
+ handler->conf->rootfs.mount);
+ if (ret < 0 || ret >= sizeof(path))
+ goto out_warn_father;
+
ret = access(path, F_OK);
if (ret != 0) {
devnull_fd = open_devnull();
if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
ret = unshare(CLONE_NEWCGROUP);
if (ret < 0) {
- INFO("Failed to unshare CLONE_NEWCGROUP");
- goto out_warn_father;
+ if (errno != EINVAL) {
+ SYSERROR("Failed to unshare CLONE_NEWCGROUP");
+ goto out_warn_father;
+ }
+
+ handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
+ SYSINFO("Kernel does not support CLONE_NEWCGROUP");
+ } else {
+ INFO("Unshared CLONE_NEWCGROUP");
}
- INFO("Unshared CLONE_NEWCGROUP");
}
/* Add the requested environment variables to the current environment to
/* Setup the container, ip, names, utsname, ... */
ret = lxc_setup(handler);
- close(handler->data_sock[1]);
- close(handler->data_sock[0]);
if (ret < 0) {
ERROR("Failed to setup container \"%s\"", handler->name);
goto out_warn_father;
if (ret < 0)
goto out_warn_father;
+ ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
+ if (ret < 0) {
+ SYSERROR("Failed to send seccomp notify fd to parent");
+ goto out_warn_father;
+ }
+
ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
if (ret < 0) {
ERROR("Failed to run lxc.hook.start for container \"%s\"",
}
if (handler->conf->monitor_signal_pdeath != SIGKILL) {
- ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath, 0);
+ ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath, handler->monitor_pid);
if (ret < 0) {
SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
handler->conf->monitor_signal_pdeath);
struct lxc_conf *conf = handler->conf;
for (i = 0; i < LXC_NS_MAX; i++) {
- if (conf->ns_keep > 0) {
- if ((conf->ns_keep & ns_info[i].clone_flag) == 0)
+ if (conf->ns_keep) {
+ if (!(conf->ns_keep & ns_info[i].clone_flag))
handler->ns_clone_flags |= ns_info[i].clone_flag;
- } else if (conf->ns_clone > 0) {
- if ((conf->ns_clone & ns_info[i].clone_flag) > 0)
+ } else if (conf->ns_clone) {
+ if ((conf->ns_clone & ns_info[i].clone_flag))
handler->ns_clone_flags |= ns_info[i].clone_flag;
} else {
if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
* getpid() in the child would return the parent's pid. This is all fixed in
* newer glibc versions where the getpid() cache is removed and the pid/tid is
* not reset anymore.
- * However, if for whatever reason you - dear commiter - somehow need to get the
+ * However, if for whatever reason you - dear committer - somehow need to get the
* pid of the dummy intermediate process for do_share_ns() you need to call
* lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
* will be fine.
flags = handler->ns_on_clone_flags;
flags |= CLONE_PARENT;
- handler->pid = lxc_raw_clone_cb(do_start, handler, flags);
+ handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
+ &handler->pidfd);
if (handler->pid < 0)
return -1;
return 0;
}
/*
 * This hunk removes lxc_setup_shmount() ('-' lines) and introduces
 * proc_pidfd_open() ('+' lines) in its place.
 *
 * proc_pidfd_open - open "/proc/<pid>" as a directory file descriptor and
 * verify that signals can be sent through it.
 *
 * @pid: process id used to build the "/proc/<pid>" path.
 *
 * Returns the result of move_fd(proc_pidfd) on success, or -1 if either the
 * open() call or the signal-0 probe via lxc_raw_pidfd_send_signal() fails.
 * Both failure paths log through SYSERROR before returning.
 *
 * NOTE(review): __do_close_prot_errno presumably attaches a scope-exit
 * cleanup that closes the descriptor while preserving errno, and move_fd()
 * presumably hands the descriptor to the caller so that cleanup does not
 * close it - neither helper is visible here, confirm against their
 * definitions.
 *
 * NOTE(review): the caller inspects errno after a negative return. Whether
 * errno survives the SYSERROR call on the failure paths depends on that
 * macro - confirm.
 */
-static int lxc_setup_shmount(struct lxc_conf *conf)
+static int proc_pidfd_open(pid_t pid)
{
- size_t len_cont;
- char *full_cont_path;
- int ret = -1;
+ __do_close_prot_errno int proc_pidfd = -EBADF;
+ char path[100];
- /* Construct the shmount path under the container root. */
- len_cont = strlen(conf->rootfs.mount) + 1 + strlen(conf->shmount.path_cont);
- /* +1 for the terminating '\0' */
- full_cont_path = malloc(len_cont + 1);
- if (!full_cont_path) {
- SYSERROR("Not enough memory");
- return -ENOMEM;
- }
-
- ret = snprintf(full_cont_path, len_cont + 1, "%s/%s",
- conf->rootfs.mount, conf->shmount.path_cont);
- if (ret < 0 || ret >= len_cont + 1) {
- SYSERROR("Failed to create filename");
- free(full_cont_path);
/* The snprintf() result is not checked; path holds 100 bytes for "/proc/" plus a decimal pid. */
+ snprintf(path, sizeof(path), "/proc/%d", pid);
+ proc_pidfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ if (proc_pidfd < 0) {
+ SYSERROR("Failed to open %s", path);
return -1;
}
- /* Check if shmount point is already set up. */
- if (is_shared_mountpoint(conf->shmount.path_host)) {
- INFO("Path \"%s\" is already MS_SHARED. Reusing",
- conf->shmount.path_host);
- free(full_cont_path);
- return 0;
- }
-
- /* Create host and cont mount paths */
- ret = mkdir_p(conf->shmount.path_host, 0711);
- if (ret < 0 && errno != EEXIST) {
- SYSERROR("Failed to create directory \"%s\"",
- conf->shmount.path_host);
- free(full_cont_path);
- return ret;
- }
-
- ret = mkdir_p(full_cont_path, 0711);
- if (ret < 0 && errno != EEXIST) {
- SYSERROR("Failed to create directory \"%s\"", full_cont_path);
- free(full_cont_path);
- return ret;
- }
-
- /* Prepare host mountpoint */
- ret = mount("tmpfs", conf->shmount.path_host, "tmpfs", 0,
- "size=100k,mode=0711");
- if (ret < 0) {
- SYSERROR("Failed to mount \"%s\"", conf->shmount.path_host);
- free(full_cont_path);
- return ret;
- }
-
- ret = mount(conf->shmount.path_host, conf->shmount.path_host, "none",
- MS_REC | MS_SHARED, "");
- if (ret < 0) {
- SYSERROR("Failed to make shared \"%s\"", conf->shmount.path_host);
- free(full_cont_path);
- return ret;
+ /* Test whether we can send signals. */
+ if (lxc_raw_pidfd_send_signal(proc_pidfd, 0, NULL, 0)) {
+ SYSERROR("Failed to send signal through pidfd");
+ return -1;
}
- INFO("Setup shared mount point \"%s\"", conf->shmount.path_host);
- free(full_cont_path);
- return 0;
+ return move_fd(proc_pidfd);
}
/* lxc_spawn() performs crucial setup tasks and clone()s the new process which
*/
static int lxc_spawn(struct lxc_handler *handler)
{
+ __do_close_prot_errno int data_sock0 = -EBADF, data_sock1 = -EBADF;
int i, ret;
char pidstr[20];
bool wants_to_map_ids;
handler->data_sock);
if (ret < 0)
goto out_sync_fini;
+ data_sock0 = handler->data_sock[0];
+ data_sock1 = handler->data_sock[1];
ret = resolve_clone_flags(handler);
if (ret < 0)
goto out_sync_fini;
- if (conf->shmount.path_host) {
- if (!conf->shmount.path_cont)
- goto out_sync_fini;
-
- ret = lxc_setup_shmount(conf);
- if (ret < 0) {
- ERROR("Failed to setup shared mount point");
- goto out_sync_fini;
- }
- }
-
if (handler->ns_clone_flags & CLONE_NEWNET) {
if (!lxc_list_empty(&conf->network)) {
pid_t attacher_pid;
attacher_pid = lxc_clone(do_share_ns, handler,
- CLONE_VFORK | CLONE_VM | CLONE_FILES);
+ CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
if (attacher_pid < 0) {
SYSERROR(LXC_CLONE_ERROR);
goto out_delete_net;
}
} else {
handler->pid = lxc_raw_clone_cb(do_start, handler,
- handler->ns_on_clone_flags);
+ CLONE_PIDFD | handler->ns_on_clone_flags,
+ &handler->pidfd);
}
if (handler->pid < 0) {
SYSERROR(LXC_CLONE_ERROR);
}
TRACE("Cloned child process %d", handler->pid);
+ if (handler->pidfd < 0) {
+ handler->proc_pidfd = proc_pidfd_open(handler->pid);
+ if (handler->proc_pidfd < 0 && (errno != ENOSYS))
+ goto out_delete_net;
+ }
+
for (i = 0; i < LXC_NS_MAX; i++)
if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
INFO("Cloned %s", ns_info[i].flag_name);
if (!cgroup_ops->chown(cgroup_ops, handler->conf))
goto out_delete_net;
- /* Now we're ready to preserve the network namespace */
- ret = lxc_try_preserve_ns(handler->pid, "net");
- if (ret < 0) {
- if (ret != -EOPNOTSUPP) {
- SYSERROR("Failed to preserve net namespace");
- goto out_delete_net;
+ /* If not done yet, we're now ready to preserve the network namespace */
+ if (handler->nsfd[LXC_NS_NET] < 0) {
+ ret = lxc_try_preserve_ns(handler->pid, "net");
+ if (ret < 0) {
+ if (ret != -EOPNOTSUPP) {
+ SYSERROR("Failed to preserve net namespace");
+ goto out_delete_net;
+ }
+ } else {
+ handler->nsfd[LXC_NS_NET] = ret;
+ DEBUG("Preserved net namespace via fd %d", ret);
}
- } else {
- handler->nsfd[LXC_NS_NET] = ret;
- DEBUG("Preserved net namespace via fd %d", ret);
-
- ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
- if (ret < 0)
- SYSWARN("Failed to allocate new network namespace id");
- else
- TRACE("Allocated new network namespace id");
}
+ ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
+ if (ret < 0)
+ SYSWARN("Failed to allocate new network namespace id");
+ else
+ TRACE("Allocated new network namespace id");
/* Create the network configuration. */
if (handler->ns_clone_flags & CLONE_NEWNET) {
}
/* Now all networks are created, network devices are moved into place,
- * and the correct names and ifindeces in the respective namespaces have
+ * and the correct names and ifindices in the respective namespaces have
* been recorded. The corresponding structs have now all been filled. So
* log them for debugging purposes.
*/
goto out_delete_net;
}
+ ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
+ if (ret < 0) {
+ SYSERROR("Failed to receive seccomp notify fd from child");
+ goto out_delete_net;
+ }
+
ret = handler->ops->post_start(handler, handler->data);
if (ret < 0)
goto out_abort;
ret = lxc_init(name, handler);
if (ret < 0) {
ERROR("Failed to initialize container \"%s\"", name);
- return -1;
+ goto out_fini_nonet;
}
handler->ops = ops;
handler->data = data;
if (!attach_block_device(handler->conf)) {
ERROR("Failed to attach block device");
+ ret = -1;
goto out_fini_nonet;
}
if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
ERROR("Failed to create monitor cgroup");
+ ret = -1;
goto out_fini_nonet;
}
if (!cgroup_ops->monitor_enter(cgroup_ops, handler->monitor_pid)) {
ERROR("Failed to enter monitor cgroup");
+ ret = -1;
goto out_fini_nonet;
}
ERROR("Failed to spawn container \"%s\"", name);
goto out_detach_blockdev;
}
- /* close parent side of data socket */
- close(handler->data_sock[0]);
- handler->data_sock[0] = -1;
- close(handler->data_sock[1]);
- handler->data_sock[1] = -1;
handler->conf->reboot = REBOOT_NONE;
if (!handler->init_died && handler->pid > 0) {
ERROR("Child process is not killed");
+ ret = -1;
goto out_abort;
}