]> git.proxmox.com Git - mirror_lxc.git/blame - src/lxc/start.c
lxc_monitord: use lxc_safe_int() && use exit()
[mirror_lxc.git] / src / lxc / start.c
CommitLineData
0ad19a3f 1/*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
9afe19d6 7 * Daniel Lezcano <daniel.lezcano at free.fr>
0ad19a3f 8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
250b1eec 21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
0ad19a3f 22 */
23
f3787121 24#define _GNU_SOURCE
f549edcc
GK
25#include "config.h"
26
f3787121 27#include <alloca.h>
0ad19a3f 28#include <dirent.h>
29#include <errno.h>
b0a33c1e 30#include <fcntl.h>
c476bdce 31#include <grp.h>
37515ebd 32#include <poll.h>
f3787121
CB
33#include <signal.h>
34#include <stdio.h>
35#include <stdlib.h>
36#include <string.h>
37#include <unistd.h>
0ad19a3f 38#include <sys/file.h>
f4d507d5 39#include <sys/mount.h>
f3787121 40#include <sys/param.h>
0ad19a3f 41#include <sys/prctl.h>
f3787121
CB
42#include <sys/socket.h>
43#include <sys/stat.h>
44#include <sys/syscall.h>
ddceb1f9 45#include <sys/types.h>
b0a33c1e 46#include <sys/un.h>
f3787121 47#include <sys/wait.h>
ff218c25 48
495d2046
SG
49#if HAVE_SYS_CAPABILITY_H
50#include <sys/capability.h>
51#endif
52
955e2a02 53#ifndef HAVE_DECL_PR_CAPBSET_DROP
656994bb
MH
54#define PR_CAPBSET_DROP 24
55#endif
56
955e2a02
CB
57#ifndef HAVE_DECL_PR_SET_NO_NEW_PRIVS
58#define PR_SET_NO_NEW_PRIVS 38
59#endif
60
61#ifndef HAVE_DECL_PR_GET_NO_NEW_PRIVS
62#define PR_GET_NO_NEW_PRIVS 39
63#endif
64
f3787121 65#include "af_unix.h"
d8e48992 66#include "bdev.h"
f3787121 67#include "caps.h"
563f2f2c 68#include "cgroup.h"
f3787121
CB
69#include "commands.h"
70#include "conf.h"
71#include "console.h"
e2bcd7db 72#include "error.h"
f3787121
CB
73#include "log.h"
74#include "lxclock.h"
75#include "lxcseccomp.h"
565c2d76 76#include "lxcutmp.h"
f3787121 77#include "mainloop.h"
63376d7d 78#include "monitor.h"
f549edcc 79#include "namespace.h"
f3787121
CB
80#include "start.h"
81#include "sync.h"
82#include "utils.h"
fe4de9a6 83#include "lsm/lsm.h"
36eb9bde
CLG
84
/* Logging setup for this file via the project's lxc_log_define() macro
 * (presumably registers the "lxc_start" log category under "lxc" -- confirm
 * against log.h).
 */
lxc_log_define(lxc_start, lxc);

/* Defined in another translation unit. */
extern void mod_all_rdeps(struct lxc_container *c, bool inc);

/* File-local helpers that are used before their definitions further down. */
static bool do_destroy_container(struct lxc_conf *conf);
static int lxc_rmdir_onedev_wrapper(void *data);
static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
					    const char *name);
92
c8154066
SH
/* Walk @path one component at a time and log the first (topmost) prefix for
 * which access(X_OK) fails. Nothing is logged if every prefix is accessible.
 */
static void print_top_failing_dir(const char *path)
{
	size_t path_len = strlen(path);
	char *buf = alloca(path_len + 1);
	char *cur = buf;
	char *end = buf + path_len;

	strcpy(buf, path);

	while (cur < end) {
		char keep;

		/* Skip separators, then move to the end of the component. */
		for (; cur < end && *cur == '/'; cur++)
			;
		for (; cur < end && *cur != '/'; cur++)
			;

		/* Temporarily terminate the string after this component. */
		keep = *cur;
		*cur = '\0';
		if (access(buf, X_OK) != 0) {
			SYSERROR("Could not access %s. Please grant it x "
				 "access, or add an ACL for the container "
				 "root.", buf);
			return;
		}
		*cur = keep;
	}
}
117
408da065
CB
118static void close_ns(int ns_fd[LXC_NS_MAX])
119{
9f30a190
MM
120 int i;
121
9f30a190
MM
122 for (i = 0; i < LXC_NS_MAX; i++) {
123 if (ns_fd[i] > -1) {
124 close(ns_fd[i]);
125 ns_fd[i] = -1;
126 }
127 }
9f30a190
MM
128}
129
4d8ac866 130/* preserve_ns: open /proc/@pid/ns/@ns for each namespace specified
62d05d9b 131 * in clone_flags.
4d8ac866 132 * Return true on success, false on failure.
62d05d9b 133 */
4d8ac866 134static bool preserve_ns(int ns_fd[LXC_NS_MAX], int clone_flags, pid_t pid)
f3787121 135{
62d05d9b 136 int i, ret;
9f30a190 137
9f30a190
MM
138 for (i = 0; i < LXC_NS_MAX; i++)
139 ns_fd[i] = -1;
140
4d8ac866
CB
141 ret = lxc_preserve_ns(pid, "");
142 if (ret < 0) {
143 SYSERROR("Kernel does not support attaching to namespaces.");
62d05d9b 144 return false;
4d8ac866
CB
145 } else {
146 close(ret);
cd43d2d1
SH
147 }
148
9f30a190
MM
149 for (i = 0; i < LXC_NS_MAX; i++) {
150 if ((clone_flags & ns_info[i].clone_flag) == 0)
151 continue;
4d8ac866 152 ns_fd[i] = lxc_preserve_ns(pid, ns_info[i].proc_name);
9f30a190
MM
153 if (ns_fd[i] < 0)
154 goto error;
155 }
156
62d05d9b 157 return true;
9f30a190
MM
158
159error:
4d8ac866
CB
160 if (errno == ENOENT)
161 SYSERROR("Kernel does not support attaching to %s namespaces.", ns_info[i].proc_name);
162 else
163 SYSERROR("Failed to open file descriptor for %s namespace: %s.", ns_info[i].proc_name, strerror(errno));
9f30a190 164 close_ns(ns_fd);
62d05d9b 165 return false;
9f30a190
MM
166}
167
168static int attach_ns(const int ns_fd[LXC_NS_MAX]) {
169 int i;
170
171 for (i = 0; i < LXC_NS_MAX; i++) {
172 if (ns_fd[i] < 0)
173 continue;
174
175 if (setns(ns_fd[i], 0) != 0)
176 goto error;
177 }
178 return 0;
179
180error:
408da065 181 SYSERROR("Failed to attach %s namespace.", ns_info[i].proc_name);
9f30a190
MM
182 return -1;
183}
184
80090207
CLG
/* Return 1 if @fd is one of the descriptors 0, 1 or 2, and 0 otherwise. */
static int match_fd(int fd)
{
	switch (fd) {
	case 0:
	case 1:
	case 2:
		return 1;
	default:
		return 0;
	}
}
189
408da065
CB
190/* Check for any fds we need to close.
191 * - If fd_to_ignore != -1, then if we find that fd open we will ignore it.
192 * - By default we warn about open fds we find.
193 * - If closeall is true, we will close open fds.
194 * - If lxc-start was passed "-C", then conf->close_all_fds will be true, in
195 * which case we also close all open fds.
196 * - A daemonized container will always pass closeall=true.
d2cf4c37
SH
197 */
198int lxc_check_inherited(struct lxc_conf *conf, bool closeall, int fd_to_ignore)
80090207 199{
74f96976 200 struct dirent *direntp;
80090207
CLG
201 int fd, fddir;
202 DIR *dir;
80090207 203
d2cf4c37
SH
204 if (conf && conf->close_all_fds)
205 closeall = true;
206
b119f362 207restart:
80090207
CLG
208 dir = opendir("/proc/self/fd");
209 if (!dir) {
408da065 210 WARN("Failed to open directory: %m.");
80090207
CLG
211 return -1;
212 }
213
214 fddir = dirfd(dir);
215
74f96976 216 while ((direntp = readdir(dir))) {
80090207
CLG
217 if (!direntp)
218 break;
219
220 if (!strcmp(direntp->d_name, "."))
221 continue;
222
223 if (!strcmp(direntp->d_name, ".."))
224 continue;
225
226 fd = atoi(direntp->d_name);
227
f2faa8fa 228 if (fd == fddir || fd == lxc_log_fd || fd == fd_to_ignore)
80090207
CLG
229 continue;
230
858377e4
SH
231 if (current_config && fd == current_config->logfd)
232 continue;
233
80090207
CLG
234 if (match_fd(fd))
235 continue;
80090207 236
d2cf4c37 237 if (closeall) {
b119f362
SH
238 close(fd);
239 closedir(dir);
408da065 240 INFO("Closed inherited fd: %d.", fd);
b119f362
SH
241 goto restart;
242 }
408da065 243 WARN("Inherited fd: %d.", fd);
80090207
CLG
244 }
245
408da065 246 /* Only enable syslog at this point to avoid the above logging function
64c57ea1
BD
247 * to open a new fd and make the check_inherited function enter an
248 * infinite loop.
249 */
250 lxc_log_enable_syslog();
251
92c7f629
GK
252 closedir(dir); /* cannot fail */
253 return 0;
80090207
CLG
254}
255
83ee7875 256static int setup_signal_fd(sigset_t *oldmask)
b0a33c1e 257{
258 sigset_t mask;
259 int fd;
260
408da065 261 /* Block everything except serious error signals. */
f3304a29
FW
262 if (sigfillset(&mask) ||
263 sigdelset(&mask, SIGILL) ||
264 sigdelset(&mask, SIGSEGV) ||
265 sigdelset(&mask, SIGBUS) ||
b5159817 266 sigdelset(&mask, SIGWINCH) ||
f3304a29 267 sigprocmask(SIG_BLOCK, &mask, oldmask)) {
408da065 268 SYSERROR("Failed to set signal mask.");
b0a33c1e 269 return -1;
270 }
271
272 fd = signalfd(-1, &mask, 0);
273 if (fd < 0) {
408da065 274 SYSERROR("Failed to create signal file descriptor.");
b0a33c1e 275 return -1;
276 }
277
278 if (fcntl(fd, F_SETFD, FD_CLOEXEC)) {
408da065 279 SYSERROR("Failed to set FD_CLOEXEC on the signal file descriptor: %d.", fd);
b0a33c1e 280 close(fd);
281 return -1;
282 }
283
408da065 284 DEBUG("Set SIGCHLD handler with file descriptor: %d.", fd);
1ac470c0 285
b0a33c1e 286 return fd;
287}
288
84c92abd 289static int signal_handler(int fd, uint32_t events, void *data,
f3787121 290 struct lxc_epoll_descr *descr)
b0a33c1e 291{
15cd25fd 292 struct signalfd_siginfo siginfo;
80507ee8 293 siginfo_t info;
15cd25fd 294 int ret;
82d89dce 295 pid_t *pid = data;
80507ee8 296 bool init_died = false;
15cd25fd
DL
297
298 ret = read(fd, &siginfo, sizeof(siginfo));
299 if (ret < 0) {
408da065 300 ERROR("Failed to read signal info from signal file descriptor: %d.", fd);
15cd25fd
DL
301 return -1;
302 }
303
304 if (ret != sizeof(siginfo)) {
408da065 305 ERROR("Unexpected size for siginfo struct.");
15cd25fd
DL
306 return -1;
307 }
308
408da065 309 /* Check whether init is running. */
80507ee8
SH
310 info.si_pid = 0;
311 ret = waitid(P_PID, *pid, &info, WEXITED | WNOWAIT | WNOHANG);
408da065 312 if (ret == 0 && info.si_pid == *pid)
80507ee8 313 init_died = true;
80507ee8 314
f3304a29
FW
315 if (siginfo.ssi_signo != SIGCHLD) {
316 kill(*pid, siginfo.ssi_signo);
408da065 317 INFO("Forwarded signal %d to pid %d.", siginfo.ssi_signo, *pid);
80507ee8 318 return init_died ? 1 : 0;
f3304a29
FW
319 }
320
408da065
CB
321 if (siginfo.ssi_code == CLD_STOPPED) {
322 INFO("Container init process was stopped.");
323 return init_died ? 1 : 0;
324 } else if (siginfo.ssi_code == CLD_CONTINUED) {
325 INFO("Container init process was continued.");
80507ee8 326 return init_died ? 1 : 0;
15cd25fd 327 }
1ac470c0 328
408da065
CB
329 /* More robustness, protect ourself from a SIGCHLD sent
330 * by a process different from the container init.
82d89dce
DL
331 */
332 if (siginfo.ssi_pid != *pid) {
408da065 333 WARN("Invalid pid for SIGCHLD. Received pid %d, expected pid %d.", siginfo.ssi_pid, *pid);
80507ee8 334 return init_died ? 1 : 0;
82d89dce
DL
335 }
336
408da065 337 DEBUG("Container init process %d exited.", *pid);
b0a33c1e 338 return 1;
339}
340
735f2c6e 341int lxc_set_state(const char *name, struct lxc_handler *handler, lxc_state_t state)
66aeffc7
DL
342{
343 handler->state = state;
9123e471 344 lxc_monitor_send_state(name, state, handler->lxcpath);
66aeffc7
DL
345 return 0;
346}
347
735f2c6e 348int lxc_poll(const char *name, struct lxc_handler *handler)
b0a33c1e 349{
ca5f7926
DL
350 int sigfd = handler->sigfd;
351 int pid = handler->pid;
b0a33c1e 352 struct lxc_epoll_descr descr;
353
a9e61274 354 if (lxc_mainloop_open(&descr)) {
408da065 355 ERROR("Failed to create LXC mainloop.");
50c8bf05 356 goto out_sigfd;
b0a33c1e 357 }
358
83ee7875 359 if (lxc_mainloop_add_handler(&descr, sigfd, signal_handler, &pid)) {
408da065 360 ERROR("Failed to add signal handler with file descriptor %d to LXC mainloop.", sigfd);
b0a33c1e 361 goto out_mainloop_open;
362 }
363
da41561c 364 if (lxc_console_mainloop_add(&descr, handler->conf)) {
408da065 365 ERROR("Failed to add console handler to LXC mainloop.");
63376d7d
DL
366 goto out_mainloop_open;
367 }
368
ef6e34ee 369 if (lxc_cmd_mainloop_add(name, &descr, handler)) {
408da065 370 ERROR("Failed to add command handler to LXC mainloop.");
96fa1ff0 371 goto out_mainloop_open;
563f2f2c
DL
372 }
373
828695d9 374 if (handler->conf->need_utmp_watch) {
495d2046 375 #if HAVE_SYS_CAPABILITY_H
828695d9 376 if (lxc_utmp_mainloop_add(&descr, handler)) {
408da065 377 ERROR("Failed to add utmp handler to LXC mainloop.");
828695d9
SH
378 goto out_mainloop_open;
379 }
495d2046 380 #else
408da065 381 DEBUG("Not starting utmp handler as CAP_SYS_BOOT cannot be dropped without capabilities support.");
495d2046 382 #endif
563f2f2c 383 }
b0a33c1e 384
e51d4895 385 return lxc_mainloop(&descr, -1);
b0a33c1e 386
387out_mainloop_open:
388 lxc_mainloop_close(&descr);
408da065 389
b0a33c1e 390out_sigfd:
391 close(sigfd);
408da065 392
c3e13372 393 return -1;
b0a33c1e 394}
395
13f5be62 396struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char *lxcpath)
59eb99ba 397{
b6b2b194 398 int i;
3a0f472d
DL
399 struct lxc_handler *handler;
400
401 handler = malloc(sizeof(*handler));
402 if (!handler)
403 return NULL;
59eb99ba
DL
404
405 memset(handler, 0, sizeof(*handler));
406
e8bd4e43 407 handler->ttysock[0] = handler->ttysock[1] = -1;
fae349da 408 handler->conf = conf;
9123e471 409 handler->lxcpath = lxcpath;
5c068da9 410 handler->pinfd = -1;
fae349da 411
b6b2b194
WB
412 for (i = 0; i < LXC_NS_MAX; i++)
413 handler->nsfd[i] = -1;
414
fe4de9a6
DE
415 lsm_init();
416
3bdf52d7
DL
417 handler->name = strdup(name);
418 if (!handler->name) {
408da065 419 ERROR("Failed to allocate memory.");
3bdf52d7
DL
420 goto out_free;
421 }
422
ef6e34ee 423 if (lxc_cmd_init(name, handler, lxcpath))
d2e30e99
DE
424 goto out_free_name;
425
8f2c3a70 426 if (lxc_read_seccomp_config(conf) != 0) {
408da065 427 ERROR("Failed loading seccomp policy.");
d2e30e99 428 goto out_close_maincmd_fd;
8f2c3a70
SH
429 }
430
408da065 431 /* Begin by setting the state to STARTING. */
25c2aca5 432 if (lxc_set_state(name, handler, STARTING)) {
408da065 433 ERROR("Failed to set state for container \"%s\" to \"%s\".", name, lxc_state2str(STARTING));
051151de 434 goto out_close_maincmd_fd;
0ad19a3f 435 }
436
408da065
CB
437 /* Start of environment variable setup for hooks. */
438 if (name && setenv("LXC_NAME", name, 1))
439 SYSERROR("Failed to set environment variable: LXC_NAME=%s.", name);
440
441 if (conf->rcfile && setenv("LXC_CONFIG_FILE", conf->rcfile, 1))
442 SYSERROR("Failed to set environment variable: LXC_CONFIG_FILE=%s.", conf->rcfile);
443
444 if (conf->rootfs.mount && setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1))
445 SYSERROR("Failed to set environment variable: LXC_ROOTFS_MOUNT=%s.", conf->rootfs.mount);
446
447 if (conf->rootfs.path && setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1))
448 SYSERROR("Failed to set environment variable: LXC_ROOTFS_PATH=%s.", conf->rootfs.path);
449
450 if (conf->console.path && setenv("LXC_CONSOLE", conf->console.path, 1))
451 SYSERROR("Failed to set environment variable: LXC_CONSOLE=%s.", conf->console.path);
452
453 if (conf->console.log_path && setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1))
454 SYSERROR("Failed to set environment variable: LXC_CONSOLE_LOGPATH=%s.", conf->console.log_path);
455
456 if (setenv("LXC_CGNS_AWARE", "1", 1))
457 SYSERROR("Failed to set environment variable LXC_CGNS_AWARE=1.");
458 /* End of environment variable setup for hooks. */
f7bee6c6 459
283678ed 460 if (run_lxc_hooks(name, "pre-start", conf, handler->lxcpath, NULL)) {
408da065 461 ERROR("Failed to run lxc.hook.pre-start for container \"%s\".", name);
773fb9ca
SH
462 goto out_aborting;
463 }
26ddeedd 464
408da065
CB
465 /* The signal fd has to be created before forking otherwise if the child
466 * process exits before we setup the signal fd, the event will be lost
467 * and the command will be stuck.
468 */
83ee7875 469 handler->sigfd = setup_signal_fd(&handler->oldmask);
59eb99ba 470 if (handler->sigfd < 0) {
408da065 471 ERROR("Failed to setup SIGCHLD fd handler.");
b5159817
DE
472 goto out_delete_tty;
473 }
474
408da065 475 /* Do this after setting up signals since it might unblock SIGWINCH. */
b5159817 476 if (lxc_console_create(conf)) {
408da065 477 ERROR("Failed to create console for container \"%s\".", name);
b5159817 478 goto out_restore_sigmask;
b0a33c1e 479 }
480
c4d10a05 481 if (ttys_shift_ids(conf) < 0) {
408da065 482 ERROR("Failed to shift tty into container.");
c4d10a05
SH
483 goto out_restore_sigmask;
484 }
485
408da065 486 INFO("Container \"%s\" is initialized.", name);
3a0f472d 487 return handler;
59eb99ba 488
b5159817
DE
489out_restore_sigmask:
490 sigprocmask(SIG_SETMASK, &handler->oldmask, NULL);
59eb99ba 491out_delete_tty:
fae349da 492 lxc_delete_tty(&conf->tty_info);
59eb99ba 493out_aborting:
25c2aca5 494 lxc_set_state(name, handler, ABORTING);
d2e30e99
DE
495out_close_maincmd_fd:
496 close(conf->maincmd_fd);
497 conf->maincmd_fd = -1;
3bdf52d7
DL
498out_free_name:
499 free(handler->name);
500 handler->name = NULL;
3a0f472d
DL
501out_free:
502 free(handler);
c3e13372 503 return NULL;
59eb99ba
DL
504}
505
/* Tear down a container handler.
 *
 * Moves the container through STOPPING and STOPPED, runs the "stop" and
 * "post-stop" hooks, closes the preserved namespace descriptors, restores the
 * signal mask saved in handler->oldmask, deletes console and ttys, closes the
 * command socket and finally frees @handler. @handler must not be used after
 * this function returns.
 */
void lxc_fini(const char *name, struct lxc_handler *handler)
{
	int i, rc;
	pid_t self = getpid();
	/* One slot per namespace plus the terminating NULL. */
	char *namespaces[LXC_NS_MAX+1];
	size_t namespace_count = 0;

	/* The STOPPING state is there for future cleanup code which can take
	 * awhile.
	 */
	lxc_set_state(name, handler, STOPPING);

	/* Build "<ns>:/proc/<our pid>/fd/<fd>" strings for every preserved
	 * namespace; they are passed to the stop hook below.
	 */
	for (i = 0; i < LXC_NS_MAX; i++) {
		if (handler->nsfd[i] != -1) {
			rc = asprintf(&namespaces[namespace_count], "%s:/proc/%d/fd/%d",
			              ns_info[i].proc_name, self, handler->nsfd[i]);
			if (rc == -1) {
				SYSERROR("Failed to allocate memory.");
				break;
			}
			++namespace_count;
		}
	}
	/* Also overwrites the slot left undefined by a failed asprintf(). */
	namespaces[namespace_count] = NULL;

	if (handler->conf->reboot && setenv("LXC_TARGET", "reboot", 1))
		SYSERROR("Failed to set environment variable: LXC_TARGET=reboot.");

	if (!handler->conf->reboot && setenv("LXC_TARGET", "stop", 1))
		SYSERROR("Failed to set environment variable: LXC_TARGET=stop.");

	if (run_lxc_hooks(name, "stop", handler->conf, handler->lxcpath, namespaces))
		ERROR("Failed to run lxc.hook.stop for container \"%s\".", name);

	/* The hook has run; release the strings and the namespace fds. */
	while (namespace_count--)
		free(namespaces[namespace_count]);
	for (i = 0; i < LXC_NS_MAX; i++) {
		if (handler->nsfd[i] != -1) {
			close(handler->nsfd[i]);
			handler->nsfd[i] = -1;
		}
	}

	if (handler->netnsfd >= 0) {
		close(handler->netnsfd);
		handler->netnsfd = -1;
	}

	lxc_set_state(name, handler, STOPPED);

	if (run_lxc_hooks(name, "post-stop", handler->conf, handler->lxcpath, NULL)) {
		ERROR("Failed to run lxc.hook.post-stop for container \"%s\".", name);
		/* A failing post-stop hook downgrades a reboot to a stop. */
		if (handler->conf->reboot) {
			WARN("Container will be stopped instead of rebooted.");
			handler->conf->reboot = 0;
			if (setenv("LXC_TARGET", "stop", 1))
				WARN("Failed to set environment variable: LXC_TARGET=stop.");
		}
	}

	/* Reset mask set by setup_signal_fd. */
	if (sigprocmask(SIG_SETMASK, &handler->oldmask, NULL))
		WARN("Failed to restore signal mask.");

	lxc_console_delete(&handler->conf->console);
	lxc_delete_tty(&handler->conf->tty_info);
	close(handler->conf->maincmd_fd);
	handler->conf->maincmd_fd = -1;
	free(handler->name);
	/* Both ends are closed together; only the first one is checked. */
	if (handler->ttysock[0] != -1) {
		close(handler->ttysock[0]);
		close(handler->ttysock[1]);
	}

	/* Ephemeral containers are destroyed on stop, but not on reboot. */
	if (handler->conf->ephemeral == 1 && handler->conf->reboot != 1)
		lxc_destroy_container_on_signal(handler, name);

	cgroup_destroy(handler);
	free(handler);
}
586
735f2c6e 587void lxc_abort(const char *name, struct lxc_handler *handler)
59eb99ba 588{
73e608b2
SH
589 int ret, status;
590
25c2aca5 591 lxc_set_state(name, handler, ABORTING);
7d9fb3e9
DL
592 if (handler->pid > 0)
593 kill(handler->pid, SIGKILL);
408da065
CB
594 while ((ret = waitpid(-1, &status, 0)) > 0) {
595 ;
596 }
59eb99ba
DL
597}
598
828695d9
SH
599#include <sys/reboot.h>
600#include <linux/reboot.h>
601
408da065
CB
/* reboot(LINUX_REBOOT_CMD_CAD_ON) will return -EINVAL in a child pid namespace
 * if container reboot support exists. Otherwise, it will either succeed or
 * return -EPERM.
 *
 * @arg points to the reboot command to issue.
 * Return 1 if container reboot is supported, 0 otherwise.
 */
static int container_reboot_supported(void *arg)
{
	const int *cmd = arg;

	if (reboot(*cmd) != -1)
		return 0;

	return errno == EINVAL ? 1 : 0;
}
616
b60ed720 617static int must_drop_cap_sys_boot(struct lxc_conf *conf)
e2fa1520 618{
025ed0f3 619 FILE *f;
b60ed720 620 int ret, cmd, v, flags;
d028235d
SG
621 long stack_size = 4096;
622 void *stack = alloca(stack_size);
623 int status;
624 pid_t pid;
e2fa1520 625
025ed0f3 626 f = fopen("/proc/sys/kernel/ctrl-alt-del", "r");
e2fa1520
SH
627 if (!f) {
628 DEBUG("failed to open /proc/sys/kernel/ctrl-alt-del");
828695d9 629 return 1;
e2fa1520 630 }
828695d9
SH
631
632 ret = fscanf(f, "%d", &v);
633 fclose(f);
e2fa1520 634 if (ret != 1) {
408da065 635 DEBUG("Failed to read /proc/sys/kernel/ctrl-alt-del.");
828695d9 636 return 1;
e2fa1520
SH
637 }
638 cmd = v ? LINUX_REBOOT_CMD_CAD_ON : LINUX_REBOOT_CMD_CAD_OFF;
639
b60ed720
SH
640 flags = CLONE_NEWPID | SIGCHLD;
641 if (!lxc_list_empty(&conf->id_map))
642 flags |= CLONE_NEWUSER;
643
7f145a6d 644#ifdef __ia64__
959aee9c 645 pid = __clone2(container_reboot_supported, stack, stack_size, flags, &cmd);
7f145a6d 646#else
959aee9c
SG
647 stack += stack_size;
648 pid = clone(container_reboot_supported, stack, flags, &cmd);
7f145a6d 649#endif
959aee9c 650 if (pid < 0) {
c08220e9 651 if (flags & CLONE_NEWUSER)
408da065 652 ERROR("Failed to clone (%#x): %s (includes CLONE_NEWUSER).", flags, strerror(errno));
c08220e9 653 else
408da065 654 ERROR("Failed to clone (%#x): %s.", flags, strerror(errno));
959aee9c
SG
655 return -1;
656 }
657 if (wait(&status) < 0) {
408da065 658 SYSERROR("Unexpected wait error: %m.");
959aee9c
SG
659 return -1;
660 }
e2fa1520
SH
661
662 if (WEXITSTATUS(status) != 1)
828695d9 663 return 1;
e2fa1520 664
828695d9
SH
665 return 0;
666}
667
408da065
CB
/* netpipe is used in the unprivileged case to transfer the ifindexes from
 * parent to child. read_unpriv_netifindex() reads one IFNAMSIZ-sized record
 * per veth device from it. A value of -1 means no pipe is in use.
 */
static int netpipe = -1;
672
673static inline int count_veths(struct lxc_list *network)
674{
675 struct lxc_list *iterator;
676 struct lxc_netdev *netdev;
677 int count = 0;
678
679 lxc_list_for_each(iterator, network) {
680 netdev = iterator->elem;
681 if (netdev->type != LXC_NET_VETH)
682 continue;
683 count++;
684 }
685 return count;
686}
687
688static int read_unpriv_netifindex(struct lxc_list *network)
689{
690 struct lxc_list *iterator;
691 struct lxc_netdev *netdev;
692
693 if (netpipe == -1)
694 return 0;
695 lxc_list_for_each(iterator, network) {
696 netdev = iterator->elem;
697 if (netdev->type != LXC_NET_VETH)
698 continue;
699 if (!(netdev->name = malloc(IFNAMSIZ))) {
408da065 700 ERROR("Out of memory.");
658979c5
SH
701 close(netpipe);
702 return -1;
703 }
704 if (read(netpipe, netdev->name, IFNAMSIZ) != IFNAMSIZ) {
705 close(netpipe);
706 return -1;
707 }
708 }
709 close(netpipe);
710 return 0;
711}
712
/* Entry point of the container's init task.
 *
 * @data is the struct lxc_handler for the container. The function
 * synchronizes with the parent at several points, sets up the container, and
 * finally calls handler->ops->start(), which is expected to exec and not
 * return.
 *
 * Return -1 in every case where it returns at all. On most failures the
 * parent is woken with LXC_SYNC_ERROR first.
 */
static int do_start(void *data)
{
	struct lxc_list *iterator;
	struct lxc_handler *handler = data;
	int devnull_fd = -1, ret;
	char path[PATH_MAX];

	/* Restore the signal mask that was saved in handler->oldmask. */
	if (sigprocmask(SIG_SETMASK, &handler->oldmask, NULL)) {
		SYSERROR("Failed to set signal mask.");
		return -1;
	}

	/* This prctl must be before the synchro, so if the parent dies before
	 * we set the parent death signal, we will detect its death with the
	 * synchro right after, otherwise we have a window where the parent can
	 * exit before we set the pdeath signal leading to an unsupervised
	 * container.
	 */
	if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) {
		SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL.");
		return -1;
	}

	lxc_sync_fini_parent(handler);

	/* Don't leak the pinfd to the container. */
	if (handler->pinfd >= 0) {
		close(handler->pinfd);
	}

	if (lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP))
		return -1;

	/* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
	 * https://github.com/lxc/lxd/issues/1978.
	 */
	if ((handler->clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
	    (CLONE_NEWNET | CLONE_NEWUSER)) {
		ret = unshare(CLONE_NEWNET);
		if (ret < 0) {
			SYSERROR("Failed to unshare CLONE_NEWNET.");
			goto out_warn_father;
		}
		INFO("Unshared CLONE_NEWNET.");
	}

	/* Tell the parent task it can begin to configure the container and wait
	 * for it to finish.
	 */
	if (lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE))
		return -1;

	if (read_unpriv_netifindex(&handler->conf->network) < 0)
		goto out_warn_father;

	/* If we are in a new user namespace, become root there to have
	 * privilege over our namespace. When using lxc-execute we default to
	 * root, but this can be overridden using the lxc.init_uid and
	 * lxc.init_gid configuration options.
	 */
	if (!lxc_list_empty(&handler->conf->id_map)) {
		gid_t new_gid = 0;
		if (handler->conf->is_execute && handler->conf->init_gid)
			new_gid = handler->conf->init_gid;

		uid_t new_uid = 0;
		if (handler->conf->is_execute && handler->conf->init_uid)
			new_uid = handler->conf->init_uid;

		NOTICE("Switching to uid=%d and gid=%d in new user namespace.", new_uid, new_gid);
		if (setgid(new_gid)) {
			SYSERROR("Failed to setgid().");
			goto out_warn_father;
		}
		if (setuid(new_uid)) {
			SYSERROR("Failed to setuid().");
			goto out_warn_father;
		}
		if (setgroups(0, NULL)) {
			SYSERROR("Failed to setgroups().");
			goto out_warn_father;
		}
	}

	/* With the final ids in place, make sure lxcpath can be traversed and
	 * report the first inaccessible directory if it cannot.
	 */
	if (access(handler->lxcpath, X_OK)) {
		print_top_failing_dir(handler->lxcpath);
		goto out_warn_father;
	}

	#if HAVE_SYS_CAPABILITY_H
	if (handler->conf->need_utmp_watch) {
		if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0)) {
			SYSERROR("Failed to remove the CAP_SYS_BOOT capability.");
			goto out_warn_father;
		}
		DEBUG("Dropped the CAP_SYS_BOOT capability.");
	}
	#endif

	/* Path of /dev/null below the container's rootfs mount point. */
	ret = snprintf(path, sizeof(path), "%s/dev/null", handler->conf->rootfs.mount);
	if (ret < 0 || ret >= sizeof(path))
		goto out_warn_father;

	/* In order to checkpoint restore, we need to have everything in the
	 * same mount namespace. However, some containers may not have a
	 * reasonable /dev (in particular, they may not have /dev/null), so we
	 * can't set init's std fds to /dev/null by opening it from inside the
	 * container.
	 *
	 * If that's the case, fall back to using the host's /dev/null. This
	 * means that migration won't work, but at least we won't spew output
	 * where it isn't wanted.
	 */
	if (handler->backgrounded && !handler->conf->autodev && access(path, F_OK) < 0) {
		devnull_fd = open_devnull();

		if (devnull_fd < 0)
			goto out_warn_father;
		WARN("Using /dev/null from the host for container init's "
		     "standard file descriptors. Migration will not work.");
	}

	/* Setup the container, ip, names, utsname, ... */
	if (lxc_setup(handler)) {
		ERROR("Failed to setup container \"%s\".", handler->name);
		goto out_warn_father;
	}

	/* Ask father to setup cgroups and wait for him to finish. Note that
	 * this failure path skips waking the parent.
	 */
	if (lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP))
		goto out_error;

	/* Unshare cgroup namespace after we have setup our cgroups. If we do it
	 * earlier we end up with a wrong view of /proc/self/cgroup. For
	 * example, assume we unshare(CLONE_NEWCGROUP) first, and then create
	 * the cgroup for the container, say /sys/fs/cgroup/cpuset/lxc/c, then
	 * /proc/self/cgroup would show us:
	 *
	 *	8:cpuset:/lxc/c
	 *
	 * whereas it should actually show
	 *
	 *	8:cpuset:/
	 */
	if (cgns_supported()) {
		if (unshare(CLONE_NEWCGROUP) < 0) {
			INFO("Failed to unshare CLONE_NEWCGROUP.");
			goto out_warn_father;
		}
		INFO("Unshared CLONE_NEWCGROUP.");
	}

	/* Set the label to change to when we exec(2) the container's init. */
	if (lsm_process_label_set(NULL, handler->conf, 1, 1) < 0)
		goto out_warn_father;

	/* Set PR_SET_NO_NEW_PRIVS after we changed the lsm label. If we do it
	 * before we aren't allowed anymore.
	 */
	if (handler->conf->no_new_privs) {
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
			SYSERROR("Could not set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges.");
			goto out_warn_father;
		}
		DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable privileges.");
	}

	/* Some init's such as busybox will set sane tty settings on stdin,
	 * stdout, stderr which it thinks is the console. We already set them
	 * the way we wanted on the real terminal, and we want init to do its
	 * setup on its console ie. the pty allocated in lxc_console_create() so
	 * make sure that that pty is stdin,stdout,stderr.
	 */
	if (lxc_console_set_stdfds(handler->conf->console.slave) < 0)
		goto out_warn_father;

	/* If we mounted a temporary proc, then unmount it now. */
	tmp_proc_unmount(handler->conf);

	if (lxc_seccomp_load(handler->conf) != 0)
		goto out_warn_father;

	if (run_lxc_hooks(handler->name, "start", handler->conf, handler->lxcpath, NULL)) {
		ERROR("Failed to run lxc.hook.start for container \"%s\".", handler->name);
		goto out_warn_father;
	}

	/* The clearenv() and putenv() calls have been moved here to allow us to
	 * use environment variables passed to the various hooks, such as the
	 * start hook above. Not all of the variables like CONFIG_PATH or ROOTFS
	 * are valid in this context but others are.
	 */
	if (clearenv()) {
		SYSERROR("Failed to clear environment.");
		/* Don't error out though. */
	}

	/* putenv() keeps a reference to the string it is given rather than
	 * copying it.
	 */
	lxc_list_for_each(iterator, &handler->conf->environment) {
		if (putenv((char *)iterator->elem)) {
			SYSERROR("Failed to set environment variable: %s.", (char *)iterator->elem);
			goto out_warn_father;
		}
	}

	if (putenv("container=lxc")) {
		SYSERROR("Failed to set environment variable: container=lxc.");
		goto out_warn_father;
	}

	if (handler->conf->pty_names) {
		if (putenv(handler->conf->pty_names)) {
			SYSERROR("Failed to set environment variable for container ptys.");
			goto out_warn_father;
		}
	}

	close(handler->sigfd);

	/* No host /dev/null was opened above, so open one now. */
	if (devnull_fd < 0) {
		devnull_fd = open_devnull();

		if (devnull_fd < 0)
			goto out_warn_father;
	}

	if (handler->backgrounded && set_stdfds(devnull_fd))
		goto out_warn_father;

	if (devnull_fd >= 0) {
		close(devnull_fd);
		devnull_fd = -1;
	}

	setsid();

	/* After this call, we are in error because this ops should not return
	 * as it execs.
	 */
	handler->ops->start(handler, handler->data);

out_warn_father:
	/* We want the parent to know something went wrong, so we return a
	 * special error code.
	 */
	lxc_sync_wake_parent(handler, LXC_SYNC_ERROR);

out_error:
	if (devnull_fd >= 0)
		close(devnull_fd);

	return -1;
}
965
74a3920a 966static int save_phys_nics(struct lxc_conf *conf)
7b35f3d6
SH
967{
968 struct lxc_list *iterator;
40f2f8a2 969 int am_root = (getuid() == 0);
7b35f3d6 970
40f2f8a2
LQ
971 if (!am_root)
972 return 0;
2c5f2ede 973
7b35f3d6
SH
974 lxc_list_for_each(iterator, &conf->network) {
975 struct lxc_netdev *netdev = iterator->elem;
976
977 if (netdev->type != LXC_NET_PHYS)
978 continue;
979 conf->saved_nics = realloc(conf->saved_nics,
980 (conf->num_savednics+1)*sizeof(struct saved_nic));
408da065 981 if (!conf->saved_nics)
7b35f3d6 982 return -1;
7b35f3d6
SH
983 conf->saved_nics[conf->num_savednics].ifindex = netdev->ifindex;
984 conf->saved_nics[conf->num_savednics].orig_name = strdup(netdev->link);
408da065 985 if (!conf->saved_nics[conf->num_savednics].orig_name)
7b35f3d6 986 return -1;
408da065 987 INFO("Stored saved_nic #%d idx %d name %s.", conf->num_savednics,
7b35f3d6
SH
988 conf->saved_nics[conf->num_savednics].ifindex,
989 conf->saved_nics[conf->num_savednics].orig_name);
990 conf->num_savednics++;
991 }
992
993 return 0;
994}
995
e8bd4e43
SH
/*
 * Receive a single file descriptor over the unix socket @sock into @fd.
 *
 * Returns 0 when a descriptor other than -1 was received, -1 otherwise.
 */
static int recv_fd(int sock, int *fd)
{
	int ret;

	ret = lxc_abstract_unix_recv_fd(sock, fd, NULL, 0);
	if (ret < 0) {
		SYSERROR("Error receiving tty file descriptor from child process.");
		return -1;
	}

	/* The call succeeded but no usable descriptor was delivered. */
	return (*fd == -1) ? -1 : 0;
}
1006
1007static int recv_ttys_from_child(struct lxc_handler *handler)
1008{
1009 struct lxc_conf *conf = handler->conf;
1010 int i, sock = handler->ttysock[1];
1011 struct lxc_tty_info *tty_info = &conf->tty_info;
1012
1013 if (!conf->tty)
1014 return 0;
1015
408da065
CB
1016 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
1017 if (!tty_info->pty_info)
e8bd4e43 1018 return -1;
e8bd4e43
SH
1019
1020 for (i = 0; i < conf->tty; i++) {
1021 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
1022 pty_info->busy = 0;
1023 if (recv_fd(sock, &pty_info->slave) < 0 ||
408da065
CB
1024 recv_fd(sock, &pty_info->master) < 0) {
1025 ERROR("Error receiving tty info from child process.");
e8bd4e43
SH
1026 return -1;
1027 }
1028 }
1029 tty_info->nbtty = conf->tty;
1030
1031 return 0;
1032}
1033
f813849c
TA
1034void resolve_clone_flags(struct lxc_handler *handler)
1035{
1036 handler->clone_flags = CLONE_NEWPID | CLONE_NEWNS;
1037
408da065 1038 if (!lxc_list_empty(&handler->conf->id_map))
f813849c 1039 handler->clone_flags |= CLONE_NEWUSER;
f813849c
TA
1040
1041 if (handler->conf->inherit_ns_fd[LXC_NS_NET] == -1) {
408da065 1042 if (!lxc_requests_empty_network(handler))
f813849c
TA
1043 handler->clone_flags |= CLONE_NEWNET;
1044 } else {
198cbbaa 1045 INFO("Inheriting a NET namespace.");
f813849c
TA
1046 }
1047
408da065 1048 if (handler->conf->inherit_ns_fd[LXC_NS_IPC] == -1)
f813849c 1049 handler->clone_flags |= CLONE_NEWIPC;
408da065 1050 else
198cbbaa 1051 INFO("Inheriting an IPC namespace.");
f813849c 1052
408da065 1053 if (handler->conf->inherit_ns_fd[LXC_NS_UTS] == -1)
f813849c 1054 handler->clone_flags |= CLONE_NEWUTS;
408da065 1055 else
198cbbaa 1056 INFO("Inheriting a UTS namespace.");
f813849c
TA
1057}
1058
74a3920a 1059static int lxc_spawn(struct lxc_handler *handler)
59eb99ba 1060{
b98f7d6e 1061 int failed_before_rename = 0;
ffe1e01a 1062 const char *name = handler->name;
7e4dfe0b 1063 bool cgroups_connected = false;
9f30a190 1064 int saved_ns_fd[LXC_NS_MAX];
5b1e83cb 1065 int preserve_mask = 0, i, flags;
658979c5 1066 int netpipepair[2], nveths;
9f30a190 1067
2f2623ec
SH
1068 netpipe = -1;
1069
9f30a190 1070 for (i = 0; i < LXC_NS_MAX; i++)
6c544cb3 1071 if (handler->conf->inherit_ns_fd[i] != -1)
9f30a190 1072 preserve_mask |= ns_info[i].clone_flag;
50e98013 1073
3c22086f 1074 if (lxc_sync_init(handler))
9d7f9e52 1075 return -1;
0ad19a3f 1076
e8bd4e43
SH
1077 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, handler->ttysock) < 0) {
1078 lxc_sync_fini(handler);
1079 return -1;
1080 }
1081
f813849c 1082 resolve_clone_flags(handler);
9f30a190 1083
f813849c 1084 if (handler->clone_flags & CLONE_NEWNET) {
26b797f3
SH
1085 if (!lxc_list_empty(&handler->conf->network)) {
1086
9f30a190
MM
1087 /* Find gateway addresses from the link device, which is
1088 * no longer accessible inside the container. Do this
1089 * before creating network interfaces, since goto
408da065
CB
1090 * out_delete_net does not work before lxc_clone.
1091 */
9f30a190 1092 if (lxc_find_gateway_addresses(handler)) {
408da065 1093 ERROR("Failed to find gateway addresses.");
9f30a190
MM
1094 lxc_sync_fini(handler);
1095 return -1;
1096 }
1097
408da065
CB
1098 /* That should be done before the clone because we will
1099 * fill the netdev index and use them in the child.
9f30a190
MM
1100 */
1101 if (lxc_create_network(handler)) {
408da065 1102 ERROR("Failed to create the network.");
9f30a190
MM
1103 lxc_sync_fini(handler);
1104 return -1;
1105 }
19a26f82
MK
1106 }
1107
9f30a190 1108 if (save_phys_nics(handler->conf)) {
408da065 1109 ERROR("Failed to save physical nic info.");
9f30a190 1110 goto out_abort;
82d5ae15
DL
1111 }
1112 }
1113
d4ef7c50 1114 if (!cgroup_init(handler)) {
408da065 1115 ERROR("Failed initializing cgroup support.");
33ad9f1a
CS
1116 goto out_delete_net;
1117 }
1118
7e4dfe0b
SH
1119 cgroups_connected = true;
1120
d4ef7c50 1121 if (!cgroup_create(handler)) {
408da065 1122 ERROR("Failed creating cgroups.");
47d8fb3b
CS
1123 goto out_delete_net;
1124 }
1125
408da065
CB
1126 /* If the rootfs is not a blockdev, prevent the container from marking
1127 * it readonly.
1128 * If the container is unprivileged then skip rootfs pinning.
0c547523 1129 */
5e32a990
ÇO
1130 if (lxc_list_empty(&handler->conf->id_map)) {
1131 handler->pinfd = pin_rootfs(handler->conf->rootfs.path);
1132 if (handler->pinfd == -1)
408da065 1133 INFO("Failed to pin the rootfs for container \"%s\".", handler->name);
5e32a990 1134 }
0c547523 1135
4d8ac866 1136 if (!preserve_ns(saved_ns_fd, preserve_mask, getpid()))
cd43d2d1 1137 goto out_delete_net;
4d8ac866 1138
cd43d2d1
SH
1139 if (attach_ns(handler->conf->inherit_ns_fd) < 0)
1140 goto out_delete_net;
9f30a190 1141
658979c5
SH
1142 if (am_unpriv() && (nveths = count_veths(&handler->conf->network))) {
1143 if (pipe(netpipepair) < 0) {
408da065 1144 SYSERROR("Failed to create pipe.");
658979c5
SH
1145 goto out_delete_net;
1146 }
408da065 1147 /* Store netpipe in the global var for do_start's use. */
658979c5
SH
1148 netpipe = netpipepair[0];
1149 }
1150
408da065 1151 /* Create a process in a new set of namespaces. */
5b1e83cb 1152 flags = handler->clone_flags;
408da065
CB
1153 if (handler->clone_flags & CLONE_NEWUSER) {
1154 /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
1155 * clone a new user namespace first and only later unshare our
1156 * network namespace to ensure that network devices ownership is
1157 * set up correctly.
1158 */
5b1e83cb 1159 flags &= ~CLONE_NEWNET;
408da065 1160 }
9fac8fbb 1161 handler->pid = lxc_clone(do_start, handler, flags);
59eb99ba 1162 if (handler->pid < 0) {
408da065 1163 SYSERROR("Failed to clone a new set of namespaces.");
7fef7a06 1164 goto out_delete_net;
0ad19a3f 1165 }
9662e444
CB
1166 for (i = 0; i < LXC_NS_MAX; i++)
1167 if (flags & ns_info[i].clone_flag)
1168 INFO("Cloned %s.", ns_info[i].flag_name);
0ad19a3f 1169
4d8ac866
CB
1170 if (!preserve_ns(handler->nsfd, handler->clone_flags | preserve_mask, handler->pid))
1171 INFO("Failed to preserve namespace for lxc.hook.stop.");
b6b2b194 1172
cd43d2d1 1173 if (attach_ns(saved_ns_fd))
408da065 1174 WARN("Failed to restore saved namespaces.");
9f30a190 1175
3c22086f
CLG
1176 lxc_sync_fini_child(handler);
1177
408da065
CB
1178 /* Map the container uids. The container became an invalid userid the
1179 * moment it was cloned with CLONE_NEWUSER. This call doesn't change
1180 * anything immediately, but allows the container to setuid(0) (0 being
1181 * mapped to something else on the host.) later to become a valid uid
1182 * again.
1183 */
5b1e83cb 1184 if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
408da065 1185 ERROR("Failed to set up id mapping.");
5b1e83cb
SH
1186 goto out_delete_net;
1187 }
1188
1189 if (lxc_sync_wake_child(handler, LXC_SYNC_STARTUP)) {
99a6af52 1190 failed_before_rename = 1;
5b1e83cb
SH
1191 goto out_delete_net;
1192 }
1193
1194 if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE)) {
1195 failed_before_rename = 1;
1196 goto out_delete_net;
1197 }
0ad19a3f 1198
d4ef7c50 1199 if (!cgroup_create_legacy(handler)) {
408da065 1200 ERROR("Failed to setup legacy cgroups for container \"%s\".", name);
ae5c8b8e 1201 goto out_delete_net;
33ad9f1a 1202 }
9daf6f5d 1203 if (!cgroup_setup_limits(handler, false)) {
408da065 1204 ERROR("Failed to setup cgroup limits for container \"%s\".", name);
6031a6e5
DE
1205 goto out_delete_net;
1206 }
1207
d4ef7c50 1208 if (!cgroup_enter(handler))
7fef7a06 1209 goto out_delete_net;
218d4250 1210
0996e18a
SH
1211 if (!cgroup_chown(handler))
1212 goto out_delete_net;
1213
99a6af52
MN
1214 if (failed_before_rename)
1215 goto out_delete_net;
1216
408da065 1217 /* Create the network configuration. */
d5088cf2 1218 if (handler->clone_flags & CLONE_NEWNET) {
c43cbc04 1219 if (lxc_assign_network(handler->lxcpath, handler->name,
408da065
CB
1220 &handler->conf->network, handler->pid)) {
1221 ERROR("Failed to create the configured network.");
7fef7a06 1222 goto out_delete_net;
82d5ae15 1223 }
0ad19a3f 1224 }
1225
658979c5
SH
1226 if (netpipe != -1) {
1227 struct lxc_list *iterator;
1228 struct lxc_netdev *netdev;
1229
1230 close(netpipe);
1231 lxc_list_for_each(iterator, &handler->conf->network) {
1232 netdev = iterator->elem;
1233 if (netdev->type != LXC_NET_VETH)
1234 continue;
1235 if (write(netpipepair[1], netdev->name, IFNAMSIZ) != IFNAMSIZ) {
408da065 1236 ERROR("Error writing veth name to container.");
658979c5
SH
1237 goto out_delete_net;
1238 }
1239 }
1240 close(netpipepair[1]);
1241 }
1242
408da065
CB
1243 /* Tell the child to continue its initialization. We'll get
1244 * LXC_SYNC_CGROUP when it is ready for us to setup cgroups.
3c22086f
CLG
1245 */
1246 if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE))
544a48a0
SH
1247 goto out_delete_net;
1248
9daf6f5d 1249 if (!cgroup_setup_limits(handler, true)) {
408da065 1250 ERROR("Failed to setup the devices cgroup for container \"%s\".", name);
b98f7d6e 1251 goto out_delete_net;
544a48a0
SH
1252 }
1253
73d28d42 1254 cgroup_disconnect();
7e4dfe0b 1255 cgroups_connected = false;
73d28d42 1256
408da065 1257 /* Read tty fds allocated by child. */
e8bd4e43 1258 if (recv_ttys_from_child(handler) < 0) {
408da065 1259 ERROR("Failed to receive tty info from child process.");
e8bd4e43
SH
1260 goto out_delete_net;
1261 }
1262
408da065
CB
1263 /* Tell the child to complete its initialization and wait for it to exec
1264 * or return an error. (The child will never return
1265 * LXC_SYNC_POST_CGROUP+1. It will either close the sync pipe, causing
1266 * lxc_sync_barrier_child to return success, or return a different
1267 * value, causing us to error out).
544a48a0
SH
1268 */
1269 if (lxc_sync_barrier_child(handler, LXC_SYNC_POST_CGROUP))
3c22086f 1270 return -1;
0ad19a3f 1271
cc28d0b0
SH
1272 if (detect_shared_rootfs())
1273 umount2(handler->conf->rootfs.mount, MNT_DETACH);
1274
23c53af9 1275 if (handler->ops->post_start(handler, handler->data))
e6126dbe
MN
1276 goto out_abort;
1277
25c2aca5 1278 if (lxc_set_state(name, handler, RUNNING)) {
408da065
CB
1279 ERROR("Failed to set state for container \"%s\" to \"%s\".", name,
1280 lxc_state2str(RUNNING));
59eb99ba 1281 goto out_abort;
3f21c114 1282 }
22ebac19 1283
3c22086f 1284 lxc_sync_fini(handler);
738d0deb 1285 handler->netnsfd = lxc_preserve_ns(handler->pid, "net");
0c547523 1286
e6126dbe 1287 return 0;
1ac470c0 1288
7fef7a06 1289out_delete_net:
7e4dfe0b
SH
1290 if (cgroups_connected)
1291 cgroup_disconnect();
d5088cf2 1292 if (handler->clone_flags & CLONE_NEWNET)
74a2b586 1293 lxc_delete_network(handler);
59eb99ba
DL
1294out_abort:
1295 lxc_abort(name, handler);
3c22086f 1296 lxc_sync_fini(handler);
5c068da9
SH
1297 if (handler->pinfd >= 0) {
1298 close(handler->pinfd);
1299 handler->pinfd = -1;
1300 }
1301
b79fcd86 1302 return -1;
59eb99ba 1303}
0ad19a3f 1304
ee70bf78 1305int __lxc_start(const char *name, struct lxc_conf *conf,
507cee36
TA
1306 struct lxc_operations* ops, void *data, const char *lxcpath,
1307 bool backgrounded)
59eb99ba 1308{
3a0f472d 1309 struct lxc_handler *handler;
e043236e 1310 int err = -1;
59eb99ba 1311 int status;
358daf49 1312 bool removed_all_netdevs = true;
80090207 1313
13f5be62 1314 handler = lxc_init(name, conf, lxcpath);
3a0f472d 1315 if (!handler) {
408da065 1316 ERROR("Failed to initialize container \"%s\".", name);
66aeffc7 1317 return -1;
0ad19a3f 1318 }
ee70bf78
CLG
1319 handler->ops = ops;
1320 handler->data = data;
507cee36 1321 handler->backgrounded = backgrounded;
738d0deb 1322 handler->netnsfd = -1;
e6126dbe 1323
b60ed720 1324 if (must_drop_cap_sys_boot(handler->conf)) {
495d2046 1325 #if HAVE_SYS_CAPABILITY_H
408da065 1326 DEBUG("Dropping CAP_SYS_BOOT capability.");
495d2046 1327 #else
408da065 1328 DEBUG("Not dropping CAP_SYS_BOOT capability as capabilities aren't supported.");
495d2046 1329 #endif
69182a31 1330 } else {
408da065 1331 DEBUG("Not dropping CAP_SYS_BOOT or watching utmp.");
69182a31
SH
1332 handler->conf->need_utmp_watch = 0;
1333 }
1334
76a26f55 1335 if (!attach_block_device(handler->conf)) {
408da065 1336 ERROR("Failed to attach block device.");
76a26f55
SH
1337 goto out_fini_nonet;
1338 }
1339
35120d9c 1340 if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
408da065 1341 /* If the backing store is a device, mount it here and now. */
35120d9c
SH
1342 if (rootfs_is_blockdev(conf)) {
1343 if (unshare(CLONE_NEWNS) < 0) {
408da065 1344 ERROR("Failed to unshare CLONE_NEWNS.");
35120d9c
SH
1345 goto out_fini_nonet;
1346 }
408da065
CB
1347 INFO("Unshared CLONE_NEWNS.");
1348
6a0c909a 1349 remount_all_slave();
35120d9c 1350 if (do_rootfs_setup(conf, name, lxcpath) < 0) {
408da065 1351 ERROR("Error setting up rootfs mount as root before spawn.");
35120d9c
SH
1352 goto out_fini_nonet;
1353 }
408da065 1354 INFO("Set up container rootfs as host root.");
35120d9c
SH
1355 }
1356 }
1357
23c53af9 1358 err = lxc_spawn(handler);
59eb99ba 1359 if (err) {
408da065 1360 ERROR("Failed to spawn container \"%s\".", name);
76a26f55 1361 goto out_detach_blockdev;
0ad19a3f 1362 }
1363
8bee8851
WB
1364 handler->conf->reboot = 0;
1365
3a0f472d 1366 err = lxc_poll(name, handler);
e043236e 1367 if (err) {
408da065 1368 ERROR("LXC mainloop exited with error: %d.", err);
738d0deb
CB
1369 if (handler->netnsfd >= 0) {
1370 close(handler->netnsfd);
1371 handler->netnsfd = -1;
1372 }
59eb99ba
DL
1373 goto out_abort;
1374 }
0ad19a3f 1375
3a0f472d 1376 while (waitpid(handler->pid, &status, 0) < 0 && errno == EINTR)
1bc5cc8c 1377 continue;
e043236e 1378
408da065
CB
1379 /* If the child process exited but was not signaled, it didn't call
1380 * reboot. This should mean it was an lxc-execute which simply exited.
1381 * In any case, treat it as a 'halt'.
8b004f07 1382 */
d028235d 1383 if (WIFSIGNALED(status)) {
8b004f07
SH
1384 switch(WTERMSIG(status)) {
1385 case SIGINT: /* halt */
408da065 1386 DEBUG("Container \"%s\" is halting.", name);
8b004f07
SH
1387 break;
1388 case SIGHUP: /* reboot */
408da065 1389 DEBUG("Container \"%s\" is rebooting.", name);
8b004f07
SH
1390 handler->conf->reboot = 1;
1391 break;
c2b9bd9e 1392 case SIGSYS: /* seccomp */
408da065 1393 DEBUG("Container \"%s\" violated its seccomp policy.", name);
c2b9bd9e 1394 break;
8b004f07 1395 default:
408da065 1396 DEBUG("Unknown exit status for container \"%s\" init %d.", name, WTERMSIG(status));
8b004f07
SH
1397 break;
1398 }
d028235d 1399 }
828695d9 1400
ce5782df 1401 DEBUG("Pushing physical nics back to host namespace");
738d0deb 1402 lxc_restore_phys_nics_to_netns(handler->netnsfd, handler->conf);
ce5782df 1403
408da065 1404 DEBUG("Tearing down virtual network devices used by container \"%s\".", name);
358daf49 1405 removed_all_netdevs = lxc_delete_network(handler);
ce5782df 1406
5c068da9
SH
1407 if (handler->pinfd >= 0) {
1408 close(handler->pinfd);
1409 handler->pinfd = -1;
1410 }
1411
1787abca 1412 lxc_monitor_send_exit_code(name, status, handler->lxcpath);
3a0f472d 1413 err = lxc_error_set_and_log(handler->pid, status);
9d7f9e52 1414out_fini:
358daf49 1415 if (!removed_all_netdevs) {
408da065 1416 DEBUG("Failed tearing down network devices used by container. Trying again!");
358daf49
CB
1417 removed_all_netdevs = lxc_delete_network(handler);
1418 if (!removed_all_netdevs)
1419 DEBUG("Failed tearing down network devices used by container. Not trying again!");
1420 }
74a2b586 1421
76a26f55
SH
1422out_detach_blockdev:
1423 detach_block_device(handler->conf);
1424
74a2b586 1425out_fini_nonet:
3a0f472d 1426 lxc_fini(name, handler);
0ad19a3f 1427 return err;
1428
59eb99ba 1429out_abort:
3a0f472d 1430 lxc_abort(name, handler);
9d7f9e52 1431 goto out_fini;
0ad19a3f 1432}
ee70bf78
CLG
1433
/* Payload passed as the data pointer to the start_ops callbacks. */
struct start_args {
	/* Argument vector; argv[0] is the program handed to execvp(). */
	char *const *argv;
};
1437
1438static int start(struct lxc_handler *handler, void* data)
1439{
1440 struct start_args *arg = data;
1441
408da065 1442 NOTICE("Exec'ing \"%s\".", arg->argv[0]);
ee70bf78
CLG
1443
1444 execvp(arg->argv[0], arg->argv);
408da065 1445 SYSERROR("Failed to exec \"%s\".", arg->argv[0]);
ee70bf78
CLG
1446 return 0;
1447}
1448
1449static int post_start(struct lxc_handler *handler, void* data)
1450{
1451 struct start_args *arg = data;
1452
408da065 1453 NOTICE("Started \"%s\" with pid \"%d\".", arg->argv[0], handler->pid);
ee70bf78
CLG
1454 return 0;
1455}
1456
1457static struct lxc_operations start_ops = {
1458 .start = start,
1459 .post_start = post_start
1460};
1461
13f5be62 1462int lxc_start(const char *name, char *const argv[], struct lxc_conf *conf,
507cee36 1463 const char *lxcpath, bool backgrounded)
ee70bf78
CLG
1464{
1465 struct start_args start_arg = {
1466 .argv = argv,
1467 };
1468
828695d9 1469 conf->need_utmp_watch = 1;
507cee36 1470 return __lxc_start(name, conf, &start_ops, &start_arg, lxcpath, backgrounded);
ee70bf78 1471}
28272964
CB
1472
1473static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
1474 const char *name)
1475{
1476 char destroy[MAXPATHLEN];
1477 bool bret = true;
1478 int ret = 0;
f01f7975 1479 struct lxc_container *c;
df31363a 1480 if (handler->conf->rootfs.path && handler->conf->rootfs.mount) {
28272964
CB
1481 bret = do_destroy_container(handler->conf);
1482 if (!bret) {
408da065 1483 ERROR("Error destroying rootfs for container \"%s\".", name);
28272964
CB
1484 return;
1485 }
1486 }
408da065 1487 INFO("Destroyed rootfs for container \"%s\".", name);
28272964
CB
1488
1489 ret = snprintf(destroy, MAXPATHLEN, "%s/%s", handler->lxcpath, name);
1490 if (ret < 0 || ret >= MAXPATHLEN) {
408da065 1491 ERROR("Error destroying directory for container \"%s\".", name);
28272964
CB
1492 return;
1493 }
1494
f01f7975
CB
1495 c = lxc_container_new(name, handler->lxcpath);
1496 if (c) {
1497 if (container_disk_lock(c)) {
408da065 1498 INFO("Could not update lxc_snapshots file.");
f01f7975
CB
1499 lxc_container_put(c);
1500 } else {
1501 mod_all_rdeps(c, false);
1502 container_disk_unlock(c);
1503 lxc_container_put(c);
1504 }
1505 }
1506
28272964
CB
1507 if (am_unpriv())
1508 ret = userns_exec_1(handler->conf, lxc_rmdir_onedev_wrapper, destroy);
1509 else
1510 ret = lxc_rmdir_onedev(destroy, NULL);
1511
1512 if (ret < 0) {
408da065 1513 ERROR("Error destroying directory for container \"%s\".", name);
28272964
CB
1514 return;
1515 }
408da065 1516 INFO("Destroyed directory for container \"%s\".", name);
28272964
CB
1517}
1518
/*
 * Adapter with an int (*)(void *) signature so lxc_rmdir_onedev() can be run
 * through userns_exec_1(); @data is the path to remove.
 */
static int lxc_rmdir_onedev_wrapper(void *data)
{
	char *path = data;

	return lxc_rmdir_onedev(path, NULL);
}
1524
1525static bool do_destroy_container(struct lxc_conf *conf) {
d028235d
SG
1526 if (am_unpriv()) {
1527 if (userns_exec_1(conf, bdev_destroy_wrapper, conf) < 0)
1528 return false;
1529 return true;
1530 }
1531 return bdev_destroy(conf);
28272964 1532}