]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/start.c
start: fix container killing logic
[mirror_lxc.git] / src / lxc / start.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #ifndef _GNU_SOURCE
4 #define _GNU_SOURCE 1
5 #endif
6 #include <dirent.h>
7 #include <errno.h>
8 #include <fcntl.h>
9 #include <grp.h>
10 #include <poll.h>
11 #include <pthread.h>
12 #include <signal.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <sys/file.h>
17 #include <sys/mount.h>
18 #include <sys/param.h>
19 #include <sys/prctl.h>
20 #include <sys/socket.h>
21 #include <sys/stat.h>
22 #include <sys/syscall.h>
23 #include <sys/types.h>
24 #include <sys/un.h>
25 #include <sys/wait.h>
26 #include <unistd.h>
27
28 #include "af_unix.h"
29 #include "caps.h"
30 #include "cgroup.h"
31 #include "commands.h"
32 #include "commands_utils.h"
33 #include "conf.h"
34 #include "config.h"
35 #include "confile_utils.h"
36 #include "error.h"
37 #include "file_utils.h"
38 #include "list.h"
39 #include "log.h"
40 #include "lsm/lsm.h"
41 #include "lxccontainer.h"
42 #include "lxclock.h"
43 #include "lxcseccomp.h"
44 #include "macro.h"
45 #include "mainloop.h"
46 #include "memory_utils.h"
47 #include "monitor.h"
48 #include "namespace.h"
49 #include "network.h"
50 #include "raw_syscalls.h"
51 #include "start.h"
52 #include "storage/storage.h"
53 #include "storage/storage_utils.h"
54 #include "sync.h"
55 #include "syscall_wrappers.h"
56 #include "terminal.h"
57 #include "utils.h"
58
59 #if HAVE_LIBCAP
60 #include <sys/capability.h>
61 #endif
62
63 #ifndef HAVE_STRLCPY
64 #include "include/strlcpy.h"
65 #endif
66
67 lxc_log_define(start, lxc);
68
69 extern void mod_all_rdeps(struct lxc_container *c, bool inc);
70 static bool do_destroy_container(struct lxc_handler *handler);
71 static int lxc_rmdir_onedev_wrapper(void *data);
72 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
73 const char *name);
74
75 static void print_top_failing_dir(const char *path)
76 {
77 __do_free char *copy = NULL;
78 int ret;
79 char *e, *p, saved;
80
81 copy = must_copy_string(path);
82 p = copy;
83 e = copy + strlen(path);
84
85 while (p < e) {
86 while (p < e && *p == '/')
87 p++;
88
89 while (p < e && *p != '/')
90 p++;
91
92 saved = *p;
93 *p = '\0';
94
95 ret = access(copy, X_OK);
96 if (ret != 0) {
97 SYSERROR("Could not access %s. Please grant it x "
98 "access, or add an ACL for the container "
99 "root", copy);
100 return;
101 }
102 *p = saved;
103 }
104 }
105
106 static void lxc_put_nsfds(struct lxc_handler *handler)
107 {
108 int i;
109
110 for (i = 0; i < LXC_NS_MAX; i++) {
111 if (handler->nsfd[i] < 0)
112 continue;
113
114 close(handler->nsfd[i]);
115 handler->nsfd[i] = -EBADF;
116 }
117 }
118
/* lxc_try_preserve_ns: obtain a file descriptor for namespace @ns of @pid.
 * Returns the file descriptor on success, -EOPNOTSUPP when the lookup failed
 * with ENOENT (reported as missing kernel support) and -EINVAL on any other
 * failure.
 */
static int lxc_try_preserve_ns(const int pid, const char *ns)
{
	int nsfd = lxc_preserve_ns(pid, ns);

	if (nsfd >= 0)
		return nsfd;

	if (errno == ENOENT) {
		SYSWARN("Kernel does not support preserving %s namespaces", ns);
		return -EOPNOTSUPP;
	}

	SYSERROR("Failed to preserve %s namespace", ns);
	return -EINVAL;
}
136
137 /* lxc_try_preserve_namespaces: open /proc/@pid/ns/@ns for each namespace
138 * specified in ns_clone_flags.
139 * Return true on success, false on failure.
140 */
141 static bool lxc_try_preserve_namespaces(struct lxc_handler *handler,
142 int ns_clone_flags, pid_t pid)
143 {
144 int i;
145
146 for (i = 0; i < LXC_NS_MAX; i++)
147 handler->nsfd[i] = -EBADF;
148
149 for (i = 0; i < LXC_NS_MAX; i++) {
150 int fd;
151
152 if ((ns_clone_flags & ns_info[i].clone_flag) == 0)
153 continue;
154
155 fd = lxc_try_preserve_ns(pid, ns_info[i].proc_name);
156 if (fd < 0) {
157 /* Do not fail to start container on kernels that do
158 * not support interacting with namespaces through
159 * /proc.
160 */
161 if (fd == -EOPNOTSUPP)
162 continue;
163
164 lxc_put_nsfds(handler);
165 return false;
166 }
167
168 handler->nsfd[i] = fd;
169 DEBUG("Preserved %s namespace via fd %d", ns_info[i].proc_name,
170 handler->nsfd[i]);
171 }
172
173 return true;
174 }
175
/* match_stdfds: return true when @fd is one of the three standard streams
 * (stdin, stdout, stderr), false otherwise.
 */
static inline bool match_stdfds(int fd)
{
	switch (fd) {
	case STDIN_FILENO:
	case STDOUT_FILENO:
	case STDERR_FILENO:
		return true;
	default:
		return false;
	}
}
180
#ifdef HAVE_DLOG
/* match_dlog_fds: resolve the /proc/self/fd entry named by @direntp and
 * return true when it links to one of the dlog devices listed below. Any
 * failure to build the path or read the link yields false.
 */
static bool match_dlog_fds(struct dirent *direntp)
{
	static const char *const dlog_devices[] = {
		"/dev/log_main",
		"/dev/log_system",
		"/dev/log_radio",
	};
	char fdpath[PATH_MAX] = {0};
	char target[PATH_MAX] = {0};
	ssize_t target_len;
	int written;

	written = snprintf(fdpath, PATH_MAX, "/proc/self/fd/%s", direntp->d_name);
	if (written < 0 || written >= PATH_MAX) {
		ERROR("Failed to create file descriptor name");
		return false;
	}

	target_len = readlink(fdpath, target, PATH_MAX);
	if (target_len < 0) {
		SYSERROR("Failed to read link path - \"%s\"", fdpath);
		return false;
	}

	/* A completely filled buffer means the target may have been cut off;
	 * the zero-initialized buffer is only guaranteed to be terminated
	 * below that length.
	 */
	if (target_len >= PATH_MAX) {
		ERROR("The name of link path is too long - \"%s\"", fdpath);
		return false;
	}

	for (size_t i = 0; i < sizeof(dlog_devices) / sizeof(dlog_devices[0]); i++)
		if (strcmp(target, dlog_devices[i]) == 0)
			return true;

	return false;
}
#endif
212
213 int lxc_check_inherited(struct lxc_conf *conf, bool closeall,
214 int *fds_to_ignore, size_t len_fds)
215 {
216 int fd, fddir;
217 size_t i;
218 DIR *dir;
219 struct dirent *direntp;
220
221 if (conf && conf->close_all_fds)
222 closeall = true;
223
224 restart:
225 dir = opendir("/proc/self/fd");
226 if (!dir) {
227 SYSWARN("Failed to open directory");
228 return -1;
229 }
230
231 fddir = dirfd(dir);
232
233 while ((direntp = readdir(dir))) {
234 int ret;
235 struct lxc_list *cur;
236 bool matched = false;
237
238 if (strcmp(direntp->d_name, ".") == 0)
239 continue;
240
241 if (strcmp(direntp->d_name, "..") == 0)
242 continue;
243
244 ret = lxc_safe_int(direntp->d_name, &fd);
245 if (ret < 0) {
246 INFO("Could not parse file descriptor for \"%s\"", direntp->d_name);
247 continue;
248 }
249
250 for (i = 0; i < len_fds; i++)
251 if (fds_to_ignore[i] == fd)
252 break;
253
254 if (fd == fddir || fd == lxc_log_fd ||
255 (i < len_fds && fd == fds_to_ignore[i]))
256 continue;
257
258 /* Keep state clients that wait on reboots. */
259 if (conf) {
260 lxc_list_for_each(cur, &conf->state_clients) {
261 struct lxc_state_client *client = cur->elem;
262
263 if (client->clientfd != fd)
264 continue;
265
266 matched = true;
267 break;
268 }
269 }
270
271 if (matched)
272 continue;
273
274 if (current_config && fd == current_config->logfd)
275 continue;
276
277 if (match_stdfds(fd))
278 continue;
279
280 #ifdef HAVE_DLOG
281 if (match_dlog_fds(direntp))
282 continue;
283
284 #endif
285 if (closeall) {
286 close(fd);
287 closedir(dir);
288 INFO("Closed inherited fd %d", fd);
289 goto restart;
290 }
291 WARN("Inherited fd %d", fd);
292 }
293
294 /* Only enable syslog at this point to avoid the above logging function
295 * to open a new fd and make the check_inherited function enter an
296 * infinite loop.
297 */
298 lxc_log_enable_syslog();
299
300 closedir(dir); /* cannot fail */
301 return 0;
302 }
303
304 static int setup_signal_fd(sigset_t *oldmask)
305 {
306 int ret;
307 sigset_t mask;
308 const int signals[] = {SIGBUS, SIGILL, SIGSEGV, SIGWINCH};
309
310 /* Block everything except serious error signals. */
311 ret = sigfillset(&mask);
312 if (ret < 0)
313 return -EBADF;
314
315 for (int sig = 0; sig < (sizeof(signals) / sizeof(signals[0])); sig++) {
316 ret = sigdelset(&mask, signals[sig]);
317 if (ret < 0)
318 return -EBADF;
319 }
320
321 ret = pthread_sigmask(SIG_BLOCK, &mask, oldmask);
322 if (ret < 0) {
323 SYSERROR("Failed to set signal mask");
324 return -EBADF;
325 }
326
327 ret = signalfd(-1, &mask, SFD_CLOEXEC);
328 if (ret < 0) {
329 SYSERROR("Failed to create signal file descriptor");
330 return -EBADF;
331 }
332
333 TRACE("Created signal file descriptor %d", ret);
334
335 return ret;
336 }
337
338 static int signal_handler(int fd, uint32_t events, void *data,
339 struct lxc_epoll_descr *descr)
340 {
341 int ret;
342 siginfo_t info;
343 struct signalfd_siginfo siginfo;
344 struct lxc_handler *hdlr = data;
345
346 ret = lxc_read_nointr(fd, &siginfo, sizeof(siginfo));
347 if (ret < 0) {
348 ERROR("Failed to read signal info from signal file descriptor %d", fd);
349 return LXC_MAINLOOP_ERROR;
350 }
351
352 if (ret != sizeof(siginfo)) {
353 ERROR("Unexpected size for struct signalfd_siginfo");
354 return -EINVAL;
355 }
356
357 /* Check whether init is running. */
358 info.si_pid = 0;
359 ret = waitid(P_PID, hdlr->pid, &info, WEXITED | WNOWAIT | WNOHANG);
360 if (ret == 0 && info.si_pid == hdlr->pid)
361 hdlr->init_died = true;
362
363 /* Try to figure out a reasonable exit status to report. */
364 if (hdlr->init_died) {
365 switch (info.si_code) {
366 case CLD_EXITED:
367 hdlr->exit_status = info.si_status << 8;
368 break;
369 case CLD_KILLED:
370 case CLD_DUMPED:
371 case CLD_STOPPED:
372 hdlr->exit_status = info.si_status << 8 | 0x7f;
373 break;
374 case CLD_CONTINUED:
375 /* Huh? The waitid() told us it's dead *and* continued? */
376 WARN("Init %d dead and continued?", hdlr->pid);
377 hdlr->exit_status = 1;
378 break;
379 default:
380 ERROR("Unknown si_code: %d", info.si_code);
381 hdlr->exit_status = 1;
382 }
383 }
384
385 if (siginfo.ssi_signo == SIGHUP) {
386 if (hdlr->pidfd >= 0)
387 lxc_raw_pidfd_send_signal(hdlr->pidfd, SIGTERM, NULL, 0);
388 else
389 kill(hdlr->pid, SIGTERM);
390 INFO("Killing %d since terminal hung up", hdlr->pid);
391 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
392 : LXC_MAINLOOP_CONTINUE;
393 }
394
395 if (siginfo.ssi_signo != SIGCHLD) {
396 if (hdlr->pidfd >= 0)
397 lxc_raw_pidfd_send_signal(hdlr->pidfd,
398 siginfo.ssi_signo, NULL, 0);
399 else
400 kill(hdlr->pid, siginfo.ssi_signo);
401 INFO("Forwarded signal %d to pid %d", siginfo.ssi_signo, hdlr->pid);
402 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
403 : LXC_MAINLOOP_CONTINUE;
404 }
405
406 /* More robustness, protect ourself from a SIGCHLD sent
407 * by a process different from the container init.
408 */
409 if (siginfo.ssi_pid != hdlr->pid) {
410 NOTICE("Received %d from pid %d instead of container init %d",
411 siginfo.ssi_signo, siginfo.ssi_pid, hdlr->pid);
412 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
413 : LXC_MAINLOOP_CONTINUE;
414 }
415
416 if (siginfo.ssi_code == CLD_STOPPED) {
417 INFO("Container init process was stopped");
418 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
419 : LXC_MAINLOOP_CONTINUE;
420 }
421
422 if (siginfo.ssi_code == CLD_CONTINUED) {
423 INFO("Container init process was continued");
424 return hdlr->init_died ? LXC_MAINLOOP_CLOSE
425 : LXC_MAINLOOP_CONTINUE;
426 }
427
428 DEBUG("Container init process %d exited", hdlr->pid);
429
430 return LXC_MAINLOOP_CLOSE;
431 }
432
433 int lxc_serve_state_clients(const char *name, struct lxc_handler *handler,
434 lxc_state_t state)
435 {
436 size_t retlen;
437 ssize_t ret;
438 struct lxc_list *cur, *next;
439 struct lxc_msg msg = {.type = lxc_msg_state, .value = state};
440
441 if (state == THAWED)
442 handler->state = RUNNING;
443 else
444 handler->state = state;
445
446 TRACE("Set container state to %s", lxc_state2str(state));
447
448 if (lxc_list_empty(&handler->conf->state_clients)) {
449 TRACE("No state clients registered");
450 return 0;
451 }
452
453 retlen = strlcpy(msg.name, name, sizeof(msg.name));
454 if (retlen >= sizeof(msg.name))
455 return -E2BIG;
456
457 lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
458 struct lxc_state_client *client = cur->elem;
459
460 if (client->states[state] == 0) {
461 TRACE("State %s not registered for state client %d",
462 lxc_state2str(state), client->clientfd);
463 continue;
464 }
465
466 TRACE("Sending state %s to state client %d",
467 lxc_state2str(state), client->clientfd);
468
469 ret = lxc_send_nointr(client->clientfd, &msg, sizeof(msg), MSG_NOSIGNAL);
470 if (ret <= 0)
471 SYSERROR("Failed to send message to client");
472
473 /* kick client from list */
474 lxc_list_del(cur);
475 close(client->clientfd);
476 free(cur->elem);
477 free(cur);
478 }
479
480 return 0;
481 }
482
483 static int lxc_serve_state_socket_pair(const char *name,
484 struct lxc_handler *handler,
485 lxc_state_t state)
486 {
487 ssize_t ret;
488
489 if (!handler->daemonize ||
490 handler->state_socket_pair[1] < 0 ||
491 state == STARTING)
492 return 0;
493
494 /* Close read end of the socket pair. */
495 close(handler->state_socket_pair[0]);
496 handler->state_socket_pair[0] = -1;
497
498 again:
499 ret = lxc_abstract_unix_send_credential(handler->state_socket_pair[1],
500 &(int){state}, sizeof(int));
501 if (ret < 0) {
502 SYSERROR("Failed to send state to %d", handler->state_socket_pair[1]);
503
504 if (errno == EINTR)
505 goto again;
506
507 return -1;
508 }
509
510 if (ret != sizeof(int)) {
511 ERROR("Message too long : %d", handler->state_socket_pair[1]);
512 return -1;
513 }
514
515 TRACE("Sent container state \"%s\" to %d", lxc_state2str(state),
516 handler->state_socket_pair[1]);
517
518 /* Close write end of the socket pair. */
519 close(handler->state_socket_pair[1]);
520 handler->state_socket_pair[1] = -1;
521
522 return 0;
523 }
524
525 int lxc_set_state(const char *name, struct lxc_handler *handler,
526 lxc_state_t state)
527 {
528 int ret;
529
530 ret = lxc_serve_state_socket_pair(name, handler, state);
531 if (ret < 0) {
532 ERROR("Failed to synchronize via anonymous pair of unix sockets");
533 return -1;
534 }
535
536 ret = lxc_serve_state_clients(name, handler, state);
537 if (ret < 0)
538 return -1;
539
540 /* This function will try to connect to the legacy lxc-monitord state
541 * server and only exists for backwards compatibility.
542 */
543 lxc_monitor_send_state(name, state, handler->lxcpath);
544
545 return 0;
546 }
547
548 int lxc_poll(const char *name, struct lxc_handler *handler)
549 {
550 int ret;
551 bool has_console = true;
552 struct lxc_epoll_descr descr, descr_console;
553
554 if (handler->conf->console.path &&
555 strcmp(handler->conf->console.path, "none") == 0)
556 has_console = false;
557
558 ret = lxc_mainloop_open(&descr);
559 if (ret < 0) {
560 ERROR("Failed to create mainloop");
561 goto out_sigfd;
562 }
563
564 if (has_console) {
565 ret = lxc_mainloop_open(&descr_console);
566 if (ret < 0) {
567 ERROR("Failed to create console mainloop");
568 goto out_mainloop;
569 }
570 }
571
572 ret = lxc_mainloop_add_handler(&descr, handler->sigfd, signal_handler, handler);
573 if (ret < 0) {
574 ERROR("Failed to add signal handler for %d to mainloop", handler->sigfd);
575 goto out_mainloop_console;
576 }
577
578 ret = lxc_seccomp_setup_proxy(&handler->conf->seccomp, &descr, handler);
579 if (ret < 0) {
580 ERROR("Failed to setup seccomp proxy");
581 goto out_mainloop_console;
582 }
583
584 if (has_console) {
585 struct lxc_terminal *console = &handler->conf->console;
586
587 ret = lxc_terminal_mainloop_add(&descr, console);
588 if (ret < 0) {
589 ERROR("Failed to add console handlers to mainloop");
590 goto out_mainloop_console;
591 }
592
593 ret = lxc_terminal_mainloop_add(&descr_console, console);
594 if (ret < 0) {
595 ERROR("Failed to add console handlers to console mainloop");
596 goto out_mainloop_console;
597 }
598
599 handler->conf->console.descr = &descr;
600 }
601
602 ret = lxc_cmd_mainloop_add(name, &descr, handler);
603 if (ret < 0) {
604 ERROR("Failed to add command handler to mainloop");
605 goto out_mainloop_console;
606 }
607
608 TRACE("Mainloop is ready");
609
610 ret = lxc_mainloop(&descr, -1);
611 close(descr.epfd);
612 descr.epfd = -EBADF;
613 if (ret < 0 || !handler->init_died)
614 goto out_mainloop_console;
615
616 if (has_console)
617 ret = lxc_mainloop(&descr_console, 0);
618
619 out_mainloop_console:
620 if (has_console) {
621 lxc_mainloop_close(&descr_console);
622 TRACE("Closed console mainloop");
623 }
624
625 out_mainloop:
626 lxc_mainloop_close(&descr);
627 TRACE("Closed mainloop");
628
629 out_sigfd:
630 close(handler->sigfd);
631 TRACE("Closed signal file descriptor %d", handler->sigfd);
632 handler->sigfd = -EBADF;
633
634 return ret;
635 }
636
637 void lxc_zero_handler(struct lxc_handler *handler)
638 {
639 int i;
640
641 memset(handler, 0, sizeof(struct lxc_handler));
642
643 handler->pinfd = -1;
644
645 handler->pidfd = -EBADF;
646
647 handler->sigfd = -1;
648
649 for (i = 0; i < LXC_NS_MAX; i++)
650 handler->nsfd[i] = -1;
651
652 handler->data_sock[0] = -1;
653 handler->data_sock[1] = -1;
654
655 handler->state_socket_pair[0] = -1;
656 handler->state_socket_pair[1] = -1;
657
658 handler->sync_sock[0] = -1;
659 handler->sync_sock[1] = -1;
660 }
661
662 void lxc_free_handler(struct lxc_handler *handler)
663 {
664 if (handler->pinfd >= 0)
665 close(handler->pinfd);
666
667 if (handler->pidfd >= 0)
668 close(handler->pidfd);
669
670 if (handler->sigfd >= 0)
671 close(handler->sigfd);
672
673 lxc_put_nsfds(handler);
674
675 if (handler->conf && handler->conf->reboot == REBOOT_NONE)
676 if (handler->conf->maincmd_fd >= 0)
677 lxc_abstract_unix_close(handler->conf->maincmd_fd);
678
679 if (handler->monitor_status_fd >= 0)
680 close(handler->monitor_status_fd);
681
682 if (handler->state_socket_pair[0] >= 0)
683 close(handler->state_socket_pair[0]);
684
685 if (handler->state_socket_pair[1] >= 0)
686 close(handler->state_socket_pair[1]);
687
688 if (handler->cgroup_ops)
689 cgroup_exit(handler->cgroup_ops);
690
691 handler->conf = NULL;
692 free(handler);
693 handler = NULL;
694 }
695
696 struct lxc_handler *lxc_init_handler(const char *name, struct lxc_conf *conf,
697 const char *lxcpath, bool daemonize)
698 {
699 int i, ret;
700 struct lxc_handler *handler;
701
702 handler = malloc(sizeof(*handler));
703 if (!handler)
704 return NULL;
705
706 memset(handler, 0, sizeof(*handler));
707
708 /* Note that am_guest_unpriv() checks the effective uid. We
709 * probably don't care if we are real root only if we are running
710 * as root so this should be fine.
711 */
712 handler->am_root = !am_guest_unpriv();
713 handler->data_sock[0] = handler->data_sock[1] = -1;
714 handler->conf = conf;
715 handler->lxcpath = lxcpath;
716 handler->monitor_status_fd = -EBADF;
717 handler->pinfd = -1;
718 handler->pidfd = -EBADF;
719 handler->sigfd = -EBADF;
720 handler->init_died = false;
721 handler->state_socket_pair[0] = handler->state_socket_pair[1] = -1;
722 if (handler->conf->reboot == REBOOT_NONE)
723 lxc_list_init(&handler->conf->state_clients);
724
725 for (i = 0; i < LXC_NS_MAX; i++)
726 handler->nsfd[i] = -1;
727
728 handler->name = name;
729 if (daemonize)
730 handler->transient_pid = lxc_raw_getpid();
731 else
732 handler->transient_pid = -1;
733
734 if (daemonize && handler->conf->reboot == REBOOT_NONE) {
735 /* Create socketpair() to synchronize on daemonized startup.
736 * When the container reboots we don't need to synchronize
737 * again currently so don't open another socketpair().
738 */
739 ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
740 handler->state_socket_pair);
741 if (ret < 0) {
742 ERROR("Failed to create anonymous pair of unix sockets");
743 goto on_error;
744 }
745
746 TRACE("Created anonymous pair {%d,%d} of unix sockets",
747 handler->state_socket_pair[0],
748 handler->state_socket_pair[1]);
749 }
750
751 if (handler->conf->reboot == REBOOT_NONE) {
752 handler->conf->maincmd_fd = lxc_cmd_init(name, lxcpath, "command");
753 if (handler->conf->maincmd_fd < 0) {
754 ERROR("Failed to set up command socket");
755 goto on_error;
756 }
757 }
758
759 TRACE("Unix domain socket %d for command server is ready",
760 handler->conf->maincmd_fd);
761
762 return handler;
763
764 on_error:
765 lxc_free_handler(handler);
766
767 return NULL;
768 }
769
770 int lxc_init(const char *name, struct lxc_handler *handler)
771 {
772 __do_close_prot_errno int status_fd = -EBADF;
773 int ret;
774 const char *loglevel;
775 struct lxc_conf *conf = handler->conf;
776
777 handler->monitor_pid = lxc_raw_getpid();
778 status_fd = open("/proc/self/status", O_RDONLY | O_CLOEXEC);
779 if (status_fd < 0) {
780 SYSERROR("Failed to open monitor status fd");
781 goto out_close_maincmd_fd;
782 }
783
784 lsm_init();
785 TRACE("Initialized LSM");
786
787 ret = lxc_read_seccomp_config(conf);
788 if (ret < 0) {
789 ERROR("Failed loading seccomp policy");
790 goto out_close_maincmd_fd;
791 }
792 TRACE("Read seccomp policy");
793
794 /* Begin by setting the state to STARTING. */
795 ret = lxc_set_state(name, handler, STARTING);
796 if (ret < 0) {
797 ERROR("Failed to set state to \"%s\"", lxc_state2str(STARTING));
798 goto out_close_maincmd_fd;
799 }
800 TRACE("Set container state to \"STARTING\"");
801
802 /* Start of environment variable setup for hooks. */
803 ret = setenv("LXC_NAME", name, 1);
804 if (ret < 0)
805 SYSERROR("Failed to set environment variable: LXC_NAME=%s", name);
806
807 if (conf->rcfile) {
808 ret = setenv("LXC_CONFIG_FILE", conf->rcfile, 1);
809 if (ret < 0)
810 SYSERROR("Failed to set environment variable: "
811 "LXC_CONFIG_FILE=%s", conf->rcfile);
812 }
813
814 if (conf->rootfs.mount) {
815 ret = setenv("LXC_ROOTFS_MOUNT", conf->rootfs.mount, 1);
816 if (ret < 0)
817 SYSERROR("Failed to set environment variable: "
818 "LXC_ROOTFS_MOUNT=%s", conf->rootfs.mount);
819 }
820
821 if (conf->rootfs.path) {
822 ret = setenv("LXC_ROOTFS_PATH", conf->rootfs.path, 1);
823 if (ret < 0)
824 SYSERROR("Failed to set environment variable: "
825 "LXC_ROOTFS_PATH=%s", conf->rootfs.path);
826 }
827
828 if (conf->console.path) {
829 ret = setenv("LXC_CONSOLE", conf->console.path, 1);
830 if (ret < 0)
831 SYSERROR("Failed to set environment variable: "
832 "LXC_CONSOLE=%s", conf->console.path);
833 }
834
835 if (conf->console.log_path) {
836 ret = setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1);
837 if (ret < 0)
838 SYSERROR("Failed to set environment variable: "
839 "LXC_CONSOLE_LOGPATH=%s", conf->console.log_path);
840 }
841
842 if (cgns_supported()) {
843 ret = setenv("LXC_CGNS_AWARE", "1", 1);
844 if (ret < 0)
845 SYSERROR("Failed to set environment variable "
846 "LXC_CGNS_AWARE=1");
847 }
848
849 loglevel = lxc_log_priority_to_string(lxc_log_get_level());
850 ret = setenv("LXC_LOG_LEVEL", loglevel, 1);
851 if (ret < 0)
852 SYSERROR("Set environment variable LXC_LOG_LEVEL=%s",
853 loglevel);
854
855 if (conf->hooks_version == 0)
856 ret = setenv("LXC_HOOK_VERSION", "0", 1);
857 else
858 ret = setenv("LXC_HOOK_VERSION", "1", 1);
859 if (ret < 0)
860 SYSERROR("Failed to set environment variable LXC_HOOK_VERSION=%u", conf->hooks_version);
861 /* End of environment variable setup for hooks. */
862
863 TRACE("Set environment variables");
864
865 ret = run_lxc_hooks(name, "pre-start", conf, NULL);
866 if (ret < 0) {
867 ERROR("Failed to run lxc.hook.pre-start for container \"%s\"", name);
868 goto out_aborting;
869 }
870 TRACE("Ran pre-start hooks");
871
872 /* The signal fd has to be created before forking otherwise if the child
873 * process exits before we setup the signal fd, the event will be lost
874 * and the command will be stuck.
875 */
876 handler->sigfd = setup_signal_fd(&handler->oldmask);
877 if (handler->sigfd < 0) {
878 ERROR("Failed to setup SIGCHLD fd handler.");
879 goto out_delete_tty;
880 }
881 TRACE("Set up signal fd");
882
883 /* Do this after setting up signals since it might unblock SIGWINCH. */
884 ret = lxc_terminal_setup(conf);
885 if (ret < 0) {
886 ERROR("Failed to create console");
887 goto out_restore_sigmask;
888 }
889 TRACE("Created console");
890
891 ret = lxc_terminal_map_ids(conf, &conf->console);
892 if (ret < 0) {
893 ERROR("Failed to chown console");
894 goto out_delete_terminal;
895 }
896 TRACE("Chowned console");
897
898 handler->cgroup_ops = cgroup_init(handler->conf);
899 if (!handler->cgroup_ops) {
900 ERROR("Failed to initialize cgroup driver");
901 goto out_delete_terminal;
902 }
903 TRACE("Initialized cgroup driver");
904
905 ret = lsm_process_prepare(conf, handler->lxcpath);
906 if (ret < 0) {
907 ERROR("Failed to initialize LSM");
908 goto out_delete_terminal;
909 }
910 TRACE("Initialized LSM");
911
912 INFO("Container \"%s\" is initialized", name);
913 handler->monitor_status_fd = move_fd(status_fd);
914 return 0;
915
916 out_delete_terminal:
917 lxc_terminal_delete(&handler->conf->console);
918
919 out_restore_sigmask:
920 (void)pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
921
922 out_delete_tty:
923 lxc_delete_tty(&conf->ttys);
924
925 out_aborting:
926 (void)lxc_set_state(name, handler, ABORTING);
927
928 out_close_maincmd_fd:
929 lxc_abstract_unix_close(conf->maincmd_fd);
930 conf->maincmd_fd = -1;
931 return -1;
932 }
933
934 void lxc_fini(const char *name, struct lxc_handler *handler)
935 {
936 int i, ret;
937 pid_t self;
938 struct lxc_list *cur, *next;
939 char *namespaces[LXC_NS_MAX + 1];
940 size_t namespace_count = 0;
941 struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
942
943 /* The STOPPING state is there for future cleanup code which can take
944 * awhile.
945 */
946 lxc_set_state(name, handler, STOPPING);
947
948 self = lxc_raw_getpid();
949 for (i = 0; i < LXC_NS_MAX; i++) {
950 if (handler->nsfd[i] < 0)
951 continue;
952
953 if (handler->conf->hooks_version == 0)
954 ret = asprintf(&namespaces[namespace_count],
955 "%s:/proc/%d/fd/%d", ns_info[i].proc_name,
956 self, handler->nsfd[i]);
957 else
958 ret = asprintf(&namespaces[namespace_count],
959 "/proc/%d/fd/%d", self, handler->nsfd[i]);
960 if (ret == -1) {
961 SYSERROR("Failed to allocate memory");
962 break;
963 }
964
965 if (handler->conf->hooks_version == 0) {
966 namespace_count++;
967 continue;
968 }
969
970 ret = setenv(ns_info[i].env_name, namespaces[namespace_count], 1);
971 if (ret < 0)
972 SYSERROR("Failed to set environment variable %s=%s",
973 ns_info[i].env_name, namespaces[namespace_count]);
974 else
975 TRACE("Set environment variable %s=%s",
976 ns_info[i].env_name, namespaces[namespace_count]);
977
978 namespace_count++;
979 }
980 namespaces[namespace_count] = NULL;
981
982 if (handler->conf->reboot > REBOOT_NONE) {
983 ret = setenv("LXC_TARGET", "reboot", 1);
984 if (ret < 0)
985 SYSERROR("Failed to set environment variable: "
986 "LXC_TARGET=reboot");
987 }
988
989 if (handler->conf->reboot == REBOOT_NONE) {
990 ret = setenv("LXC_TARGET", "stop", 1);
991 if (ret < 0)
992 SYSERROR("Failed to set environment variable: "
993 "LXC_TARGET=stop");
994 }
995
996 if (handler->conf->hooks_version == 0)
997 ret = run_lxc_hooks(name, "stop", handler->conf, namespaces);
998 else
999 ret = run_lxc_hooks(name, "stop", handler->conf, NULL);
1000 if (ret < 0)
1001 ERROR("Failed to run \"lxc.hook.stop\" hook");
1002
1003 while (namespace_count--)
1004 free(namespaces[namespace_count]);
1005
1006 lsm_process_cleanup(handler->conf, handler->lxcpath);
1007
1008 if (cgroup_ops) {
1009 cgroup_ops->payload_destroy(cgroup_ops, handler);
1010 cgroup_ops->monitor_destroy(cgroup_ops, handler);
1011 }
1012
1013 if (handler->conf->reboot == REBOOT_NONE) {
1014 /* For all new state clients simply close the command socket.
1015 * This will inform all state clients that the container is
1016 * STOPPED and also prevents a race between a open()/close() on
1017 * the command socket causing a new process to get ECONNREFUSED
1018 * because we haven't yet closed the command socket.
1019 */
1020 lxc_abstract_unix_close(handler->conf->maincmd_fd);
1021 handler->conf->maincmd_fd = -1;
1022 TRACE("Closed command socket");
1023
1024 /* This function will try to connect to the legacy lxc-monitord
1025 * state server and only exists for backwards compatibility.
1026 */
1027 lxc_monitor_send_state(name, STOPPED, handler->lxcpath);
1028
1029 /* The command socket is closed so no one can acces the command
1030 * socket anymore so there's no need to lock it.
1031 */
1032 handler->state = STOPPED;
1033 TRACE("Set container state to \"STOPPED\"");
1034 } else {
1035 lxc_set_state(name, handler, STOPPED);
1036 }
1037
1038 /* Avoid lingering namespace references. */
1039 lxc_put_nsfds(handler);
1040
1041 ret = run_lxc_hooks(name, "post-stop", handler->conf, NULL);
1042 if (ret < 0) {
1043 ERROR("Failed to run lxc.hook.post-stop for container \"%s\"", name);
1044 if (handler->conf->reboot > REBOOT_NONE) {
1045 WARN("Container will be stopped instead of rebooted");
1046 handler->conf->reboot = REBOOT_NONE;
1047
1048 ret = setenv("LXC_TARGET", "stop", 1);
1049 if (ret < 0)
1050 WARN("Failed to set environment variable: "
1051 "LXC_TARGET=stop");
1052 }
1053 }
1054
1055 /* Reset mask set by setup_signal_fd. */
1056 ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
1057 if (ret < 0)
1058 SYSWARN("Failed to restore signal mask");
1059
1060 lxc_terminal_delete(&handler->conf->console);
1061 lxc_delete_tty(&handler->conf->ttys);
1062
1063 /* The command socket is now closed, no more state clients can register
1064 * themselves from now on. So free the list of state clients.
1065 */
1066 lxc_list_for_each_safe(cur, &handler->conf->state_clients, next) {
1067 struct lxc_state_client *client = cur->elem;
1068
1069 /* Keep state clients that want to be notified about reboots. */
1070 if ((handler->conf->reboot > REBOOT_NONE) &&
1071 (client->states[RUNNING] == 2))
1072 continue;
1073
1074 /* close state client socket */
1075 lxc_list_del(cur);
1076 close(client->clientfd);
1077 free(cur->elem);
1078 free(cur);
1079 }
1080
1081 if (handler->conf->ephemeral == 1 && handler->conf->reboot != REBOOT_REQ)
1082 lxc_destroy_container_on_signal(handler, name);
1083
1084 lxc_free_handler(handler);
1085 }
1086
1087 void lxc_abort(const char *name, struct lxc_handler *handler)
1088 {
1089 int ret = 0;
1090 int status;
1091
1092 lxc_set_state(name, handler, ABORTING);
1093
1094 if (handler->pidfd >= 0) {
1095 ret = lxc_raw_pidfd_send_signal(handler->pidfd, SIGKILL, NULL, 0);
1096 if (ret)
1097 SYSWARN("Failed to send SIGKILL via pidfd %d for process %d",
1098 handler->pidfd, handler->pid);
1099 }
1100
1101 if (!ret || errno != ESRCH)
1102 if (kill(handler->pid, SIGKILL))
1103 SYSWARN("Failed to send SIGKILL to %d", handler->pid);
1104
1105 do {
1106 ret = waitpid(-1, &status, 0);
1107 } while (ret > 0);
1108 }
1109
/* do_start() is the callback handed to lxc_raw_clone_cb() (see do_share_ns()
 * and lxc_spawn()), so it executes in the newly cloned child.
 *
 * @data is the struct lxc_handler describing the container being started.
 *
 * The function walks through the child's half of the start-up handshake with
 * the parent (lxc_sync_wait_parent()/lxc_sync_barrier_parent()), sets up the
 * container via lxc_setup(), applies LSM label, no_new_privs, seccomp, stdio
 * redirection, environment and uid/gid, and finally calls
 * handler->ops->start(), which is expected to exec.
 *
 * Every path that returns from this function returns -1. Failures that jump
 * to out_warn_father first wake the parent with LXC_SYNC_ERROR; failures that
 * jump to out_error skip that notification.
 */
1110 static int do_start(void *data)
1111 {
1112 struct lxc_handler *handler = data;
1113 __lxc_unused __do_close_prot_errno int data_sock0 = handler->data_sock[0],
1114 data_sock1 = handler->data_sock[1];
1115 __do_close_prot_errno int status_fd = -EBADF;
1116 int ret;
1117 uid_t new_uid;
1118 gid_t new_gid;
1119 struct lxc_list *iterator;
1120 uid_t nsuid = 0;
1121 gid_t nsgid = 0;
1122 int devnull_fd = -1;
1123
1124 lxc_sync_fini_parent(handler);
1125
/* The parent sends handler->monitor_status_fd over data_sock[0] in
 * lxc_spawn(); pick it up here so it can be passed to
 * lxc_set_death_signal().
 */
1126 if (lxc_abstract_unix_recv_fds(data_sock1, &status_fd, 1, NULL, 0) < 0) {
1127 ERROR("Failed to receive status file descriptor to child process");
1128 goto out_warn_father;
1129 }
1130
1131 /* This prctl must be before the synchro, so if the parent dies before
1132 * we set the parent death signal, we will detect its death with the
1133 * synchro right after, otherwise we have a window where the parent can
1134 * exit before we set the pdeath signal leading to an unsupervised
1135 * container.
1136 */
1137 ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1138 if (ret < 0) {
1139 SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1140 goto out_warn_father;
1141 }
1142
1143 ret = lxc_ambient_caps_up();
1144 if (ret < 0) {
1145 ERROR("Failed to raise ambient capabilities");
1146 goto out_warn_father;
1147 }
1148
/* Restore the signal mask saved in handler->oldmask.
 * NOTE(review): POSIX specifies that pthread_sigmask() returns a positive
 * error number on failure, so the `ret < 0` test below may never trigger
 * and errno may not be set for SYSERROR() - confirm.
 */
1149 ret = pthread_sigmask(SIG_SETMASK, &handler->oldmask, NULL);
1150 if (ret < 0) {
1151 SYSERROR("Failed to set signal mask");
1152 goto out_warn_father;
1153 }
1154
1155 /* Don't leak the pinfd to the container. */
1156 if (handler->pinfd >= 0)
1157 close(handler->pinfd);
1158
1159 ret = lxc_sync_wait_parent(handler, LXC_SYNC_STARTUP);
1160 if (ret < 0)
1161 goto out_warn_father;
1162
1163 /* Unshare CLONE_NEWNET after CLONE_NEWUSER. See
1164 * https://github.com/lxc/lxd/issues/1978.
1165 */
1166 if ((handler->ns_clone_flags & (CLONE_NEWNET | CLONE_NEWUSER)) ==
1167 (CLONE_NEWNET | CLONE_NEWUSER)) {
1168 ret = unshare(CLONE_NEWNET);
1169 if (ret < 0) {
1170 SYSERROR("Failed to unshare CLONE_NEWNET");
1171 goto out_warn_father;
1172 }
1173 INFO("Unshared CLONE_NEWNET");
1174 }
1175
1176 /* Tell the parent task it can begin to configure the container and wait
1177 * for it to finish.
1178 */
1179 ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CONFIGURE);
1180 if (ret < 0)
1181 goto out_error;
1182
1183 if (handler->ns_clone_flags & CLONE_NEWNET) {
1184 ret = lxc_network_recv_from_parent(handler);
1185 if (ret < 0) {
1186 ERROR("Failed to receive veth names from parent");
1187 goto out_warn_father;
1188 }
1189 }
1190
1191 /* If we are in a new user namespace, become root there to have
1192 * privilege over our namespace.
1193 */
1194 if (!lxc_list_empty(&handler->conf->id_map)) {
/* nsuid/nsgid default to 0; fall back to the configured init
 * uid/gid when root_nsuid_map/root_nsgid_map is not set.
 */
1195 if (!handler->conf->root_nsuid_map)
1196 nsuid = handler->conf->init_uid;
1197
1198 if (!handler->conf->root_nsgid_map)
1199 nsgid = handler->conf->init_gid;
1200
1201 if (!lxc_switch_uid_gid(nsuid, nsgid))
1202 goto out_warn_father;
1203
1204 /* Drop groups only after we switched to a valid gid in the new
1205 * user namespace.
1206 */
/* A setgroups failure is tolerated only when we are not root and
 * the error is EPERM.
 */
1207 if (!lxc_setgroups(0, NULL) &&
1208 (handler->am_root || errno != EPERM))
1209 goto out_warn_father;
1210
1211 ret = prctl(PR_SET_DUMPABLE, prctl_arg(1), prctl_arg(0),
1212 prctl_arg(0), prctl_arg(0));
1213 if (ret < 0)
1214 goto out_warn_father;
1215
1216 /* set{g,u}id() clears deathsignal */
1217 ret = lxc_set_death_signal(SIGKILL, handler->monitor_pid, status_fd);
1218 if (ret < 0) {
1219 SYSERROR("Failed to set PR_SET_PDEATHSIG to SIGKILL");
1220 goto out_warn_father;
1221 }
1222 }
1223
1224 ret = access(handler->lxcpath, X_OK);
1225 if (ret != 0) {
1226 print_top_failing_dir(handler->lxcpath);
1227 goto out_warn_father;
1228 }
1229
1230 /* In order to checkpoint restore, we need to have everything in the
1231 * same mount namespace. However, some containers may not have a
1232 * reasonable /dev (in particular, they may not have /dev/null), so we
1233 * can't set init's std fds to /dev/null by opening it from inside the
1234 * container.
1235 *
1236 * If that's the case, fall back to using the host's /dev/null. This
1237 * means that migration won't work, but at least we won't spew output
1238 * where it isn't wanted.
1239 */
1240 if (handler->daemonize && !handler->conf->autodev) {
1241 char path[PATH_MAX];
1242
1243 ret = snprintf(path, sizeof(path), "%s/dev/null",
1244 handler->conf->rootfs.mount);
1245 if (ret < 0 || ret >= sizeof(path))
1246 goto out_warn_father;
1247
1248 ret = access(path, F_OK);
1249 if (ret != 0) {
1250 devnull_fd = open_devnull();
1251
1252 if (devnull_fd < 0)
1253 goto out_warn_father;
1254 WARN("Using /dev/null from the host for container "
1255 "init's standard file descriptors. Migration will "
1256 "not work");
1257 }
1258 }
1259
1260 /* Ask father to setup cgroups and wait for him to finish. */
1261 ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP);
1262 if (ret < 0)
1263 goto out_error;
1264
1265 /* Unshare cgroup namespace after we have setup our cgroups. If we do it
1266 * earlier we end up with a wrong view of /proc/self/cgroup. For
1267 * example, assume we unshare(CLONE_NEWCGROUP) first, and then create
1268 * the cgroup for the container, say /sys/fs/cgroup/cpuset/lxc/c, then
1269 * /proc/self/cgroup would show us:
1270 *
1271 * 8:cpuset:/lxc/c
1272 *
1273 * whereas it should actually show
1274 *
1275 * 8:cpuset:/
1276 */
1277 if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
1278 ret = unshare(CLONE_NEWCGROUP);
1279 if (ret < 0) {
/* EINVAL is treated as "not supported": clear the flag and
 * carry on instead of failing the start.
 */
1280 if (errno != EINVAL) {
1281 SYSERROR("Failed to unshare CLONE_NEWCGROUP");
1282 goto out_warn_father;
1283 }
1284
1285 handler->ns_clone_flags &= ~CLONE_NEWCGROUP;
1286 SYSINFO("Kernel does not support CLONE_NEWCGROUP");
1287 } else {
1288 INFO("Unshared CLONE_NEWCGROUP");
1289 }
1290 }
1291
1292 /* Add the requested environment variables to the current environment to
1293 * allow them to be used by the various hooks, such as the start hook
1294 * below.
1295 */
1296 lxc_list_for_each(iterator, &handler->conf->environment) {
1297 ret = putenv((char *)iterator->elem);
1298 if (ret < 0) {
1299 SYSERROR("Failed to set environment variable: %s",
1300 (char *)iterator->elem);
1301 goto out_warn_father;
1302 }
1303 }
1304
1305 /* Setup the container, ip, names, utsname, ... */
1306 ret = lxc_setup(handler);
1307 if (ret < 0) {
1308 ERROR("Failed to setup container \"%s\"", handler->name);
1309 goto out_warn_father;
1310 }
1311
1312 /* Set the label to change to when we exec(2) the container's init. */
1313 ret = lsm_process_label_set(NULL, handler->conf, true);
1314 if (ret < 0)
1315 goto out_warn_father;
1316
1317 /* Set PR_SET_NO_NEW_PRIVS after we changed the lsm label. If we do it
1318 * before we aren't allowed anymore.
1319 */
1320 if (handler->conf->no_new_privs) {
1321 ret = prctl(PR_SET_NO_NEW_PRIVS, prctl_arg(1), prctl_arg(0),
1322 prctl_arg(0), prctl_arg(0));
1323 if (ret < 0) {
1324 SYSERROR("Could not set PR_SET_NO_NEW_PRIVS to block "
1325 "execve() gainable privileges");
1326 goto out_warn_father;
1327 }
1328 DEBUG("Set PR_SET_NO_NEW_PRIVS to block execve() gainable "
1329 "privileges");
1330 }
1331
1332 /* Some inits such as busybox will set sane tty settings on stdin,
1333 * stdout, stderr which it thinks is the console. We already set them
1334 * the way we wanted on the real terminal, and we want init to do its
1335 * setup on its console ie. the pty allocated in lxc_terminal_setup() so
1336 * make sure that that pty is stdin,stdout,stderr.
1337 */
1338 if (handler->conf->console.slave >= 0) {
1339 if (handler->daemonize || !handler->conf->is_execute)
1340 ret = set_stdfds(handler->conf->console.slave);
1341 else
1342 ret = lxc_terminal_set_stdfds(handler->conf->console.slave);
1343 if (ret < 0) {
1344 ERROR("Failed to redirect std{in,out,err} to pty file "
1345 "descriptor %d", handler->conf->console.slave);
1346 goto out_warn_father;
1347 }
1348 }
1349
1350 /* If we mounted a temporary proc, then unmount it now. */
1351 tmp_proc_unmount(handler->conf);
1352
1353 ret = lxc_seccomp_load(handler->conf);
1354 if (ret < 0)
1355 goto out_warn_father;
1356
/* The parent receives this fd in lxc_spawn() via
 * lxc_seccomp_recv_notifier_fd() on the other end of the socketpair.
 */
1357 ret = lxc_seccomp_send_notifier_fd(&handler->conf->seccomp, data_sock0);
1358 if (ret < 0) {
1359 SYSERROR("Failed to send seccomp notify fd to parent");
1360 goto out_warn_father;
1361 }
1362
1363 ret = run_lxc_hooks(handler->name, "start", handler->conf, NULL);
1364 if (ret < 0) {
1365 ERROR("Failed to run lxc.hook.start for container \"%s\"",
1366 handler->name);
1367 goto out_warn_father;
1368 }
1369
1370 close(handler->sigfd);
1371
/* Daemonized container without a console: point std{in,out,err} at
 * /dev/null, reusing the host fd opened earlier if there is one.
 */
1372 if (handler->conf->console.slave < 0 && handler->daemonize) {
1373 if (devnull_fd < 0) {
1374 devnull_fd = open_devnull();
1375 if (devnull_fd < 0)
1376 goto out_warn_father;
1377 }
1378
1379 ret = set_stdfds(devnull_fd);
1380 if (ret < 0) {
1381 ERROR("Failed to redirect std{in,out,err} to \"/dev/null\"");
1382 goto out_warn_father;
1383 }
1384 }
1385
1386 if (devnull_fd >= 0) {
1387 close(devnull_fd);
1388 devnull_fd = -1;
1389 }
1390
/* The return value of setsid() is deliberately not checked here. */
1391 setsid();
1392
1393 if (handler->conf->init_cwd) {
1394 ret = chdir(handler->conf->init_cwd);
1395 if (ret < 0) {
1396 SYSERROR("Could not change directory to \"%s\"",
1397 handler->conf->init_cwd);
1398 goto out_warn_father;
1399 }
1400 }
1401
1402 ret = lxc_sync_barrier_parent(handler, LXC_SYNC_CGROUP_LIMITS);
1403 if (ret < 0)
1404 goto out_warn_father;
1405
1406 /* Reset the environment variables the user requested in a clear
1407 * environment.
1408 */
1409 ret = clearenv();
1410 /* Don't error out though. */
1411 if (ret < 0)
1412 SYSERROR("Failed to clear environment.");
1413
1414 lxc_list_for_each(iterator, &handler->conf->environment) {
1415 ret = putenv((char *)iterator->elem);
1416 if (ret < 0) {
1417 SYSERROR("Failed to set environment variable: %s",
1418 (char *)iterator->elem);
1419 goto out_warn_father;
1420 }
1421 }
1422
1423 ret = putenv("container=lxc");
1424 if (ret < 0) {
1425 SYSERROR("Failed to set environment variable: container=lxc");
1426 goto out_warn_father;
1427 }
1428
1429 if (handler->conf->ttys.tty_names) {
1430 ret = putenv(handler->conf->ttys.tty_names);
1431 if (ret < 0) {
1432 SYSERROR("Failed to set environment variable for container ptys");
1433 goto out_warn_father;
1434 }
1435 }
1436
1437 /* The container has been setup. We can now switch to an unprivileged
1438 * uid/gid.
1439 */
1440 new_uid = handler->conf->init_uid;
1441 new_gid = handler->conf->init_gid;
1442
1443 /* Avoid unnecessary syscalls. */
1444 if (new_uid == nsuid)
1445 new_uid = LXC_INVALID_UID;
1446
1447 if (new_gid == nsgid)
1448 new_gid = LXC_INVALID_GID;
1449
1450 if (!lxc_switch_uid_gid(new_uid, new_gid))
1451 goto out_warn_father;
1452
1453 /* If we are in a new user namespace we already dropped all groups when
1454 * we switched to root in the new user namespace further above. Only
1455 * drop groups if we can, so ensure that we have necessary privilege.
1456 */
/* Note the nesting: the CAP_SETGID test only exists when built with
 * HAVE_LIBCAP; otherwise lxc_setgroups() is attempted unconditionally
 * whenever no id map is configured.
 */
1457 if (lxc_list_empty(&handler->conf->id_map))
1458 #if HAVE_LIBCAP
1459 if (lxc_proc_cap_is_set(CAP_SETGID, CAP_EFFECTIVE))
1460 #endif
1461 if (!lxc_setgroups(0, NULL))
1462 goto out_warn_father;
1463
1464 ret = lxc_ambient_caps_down();
1465 if (ret < 0) {
1466 ERROR("Failed to clear ambient capabilities");
1467 goto out_warn_father;
1468 }
1469
/* SIGKILL was already installed as the parent death signal above; only
 * re-arm it when the configuration asks for a different signal.
 */
1470 if (handler->conf->monitor_signal_pdeath != SIGKILL) {
1471 ret = lxc_set_death_signal(handler->conf->monitor_signal_pdeath,
1472 handler->monitor_pid, status_fd);
1473 if (ret < 0) {
1474 SYSERROR("Failed to set PR_SET_PDEATHSIG to %d",
1475 handler->conf->monitor_signal_pdeath);
1476 goto out_warn_father;
1477 }
1478 }
1479
1480 /* After this call, we are in error because this ops should not return
1481 * as it execs.
1482 */
1483 handler->ops->start(handler, handler->data);
1484
1485 out_warn_father:
1486 /* We want the parent to know something went wrong, so we return a
1487 * special error code.
1488 */
1489 lxc_sync_wake_parent(handler, LXC_SYNC_ERROR);
1490
/* Reached directly (without waking the parent) when one of the
 * LXC_SYNC_CONFIGURE / LXC_SYNC_CGROUP barriers failed.
 */
1491 out_error:
1492 if (devnull_fd >= 0)
1493 close(devnull_fd);
1494
1495 return -1;
1496 }
1497
1498 static int lxc_recv_ttys_from_child(struct lxc_handler *handler)
1499 {
1500 int i;
1501 struct lxc_terminal_info *tty;
1502 int ret = -1;
1503 int sock = handler->data_sock[1];
1504 struct lxc_conf *conf = handler->conf;
1505 struct lxc_tty_info *ttys = &conf->ttys;
1506
1507 if (!conf->ttys.max)
1508 return 0;
1509
1510 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
1511 if (!ttys->tty)
1512 return -1;
1513
1514 for (i = 0; i < conf->ttys.max; i++) {
1515 int ttyfds[2];
1516
1517 ret = lxc_abstract_unix_recv_fds(sock, ttyfds, 2, NULL, 0);
1518 if (ret < 0)
1519 break;
1520
1521 tty = &ttys->tty[i];
1522 tty->busy = -1;
1523 tty->master = ttyfds[0];
1524 tty->slave = ttyfds[1];
1525 TRACE("Received pty with master fd %d and slave fd %d from "
1526 "child", tty->master, tty->slave);
1527 }
1528
1529 if (ret < 0)
1530 SYSERROR("Failed to receive %zu ttys from child", ttys->max);
1531 else
1532 TRACE("Received %zu ttys from child", ttys->max);
1533
1534 return ret;
1535 }
1536
1537 int resolve_clone_flags(struct lxc_handler *handler)
1538 {
1539 int i;
1540 struct lxc_conf *conf = handler->conf;
1541
1542 for (i = 0; i < LXC_NS_MAX; i++) {
1543 if (conf->ns_keep) {
1544 if (!(conf->ns_keep & ns_info[i].clone_flag))
1545 handler->ns_clone_flags |= ns_info[i].clone_flag;
1546 } else if (conf->ns_clone) {
1547 if ((conf->ns_clone & ns_info[i].clone_flag))
1548 handler->ns_clone_flags |= ns_info[i].clone_flag;
1549 } else {
1550 if (i == LXC_NS_USER && lxc_list_empty(&handler->conf->id_map))
1551 continue;
1552
1553 if (i == LXC_NS_NET && lxc_requests_empty_network(handler))
1554 continue;
1555
1556 if (i == LXC_NS_CGROUP && !cgns_supported())
1557 continue;
1558
1559 handler->ns_clone_flags |= ns_info[i].clone_flag;
1560 }
1561
1562 if (!conf->ns_share[i])
1563 continue;
1564
1565 handler->ns_clone_flags &= ~ns_info[i].clone_flag;
1566 TRACE("Sharing %s namespace", ns_info[i].proc_name);
1567 }
1568
1569 return 0;
1570 }
1571
1572 /* Note that this function is used with clone(CLONE_VM). Some glibc versions
1573 * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD.
1574 * But since the memory between parent and child is shared on CLONE_VM this
1575 * would invalidate the getpid() cache that glibc used to maintain and so
1576 * getpid() in the child would return the parent's pid. This is all fixed in
1577 * newer glibc versions where the getpid() cache is removed and the pid/tid is
1578 * not reset anymore.
1579 * However, if for whatever reason you - dear committer - somehow need to get the
1580 * pid of the dummy intermediate process for do_share_ns() you need to call
1581 * lxc_raw_getpid(). The next lxc_raw_clone() call does not employ CLONE_VM and
1582 * will be fine.
1583 */
1584 static inline int do_share_ns(void *arg)
1585 {
1586 int i, flags, ret;
1587 struct lxc_handler *handler = arg;
1588
1589 for (i = 0; i < LXC_NS_MAX; i++) {
1590 if (handler->nsfd[i] < 0)
1591 continue;
1592
1593 ret = setns(handler->nsfd[i], 0);
1594 if (ret < 0) {
1595 /*
1596 * Note that joining a user and/or mount namespace
1597 * requires the process is not multithreaded otherwise
1598 * setns() will fail here.
1599 */
1600 SYSERROR("Failed to inherit %s namespace",
1601 ns_info[i].proc_name);
1602 return -1;
1603 }
1604
1605 DEBUG("Inherited %s namespace", ns_info[i].proc_name);
1606 }
1607
1608 flags = handler->ns_on_clone_flags;
1609 flags |= CLONE_PARENT;
1610 handler->pid = lxc_raw_clone_cb(do_start, handler, CLONE_PIDFD | flags,
1611 &handler->pidfd);
1612 if (handler->pid < 0)
1613 return -1;
1614
1615 return 0;
1616 }
1617
1618 /* lxc_spawn() performs crucial setup tasks and clone()s the new process which
1619 * exec()s the requested container binary.
1620 * Note that lxc_spawn() runs in the parent namespaces. Any operations performed
1621 * right here should be double checked if they'd pose a security risk. (For
1622 * example, any {u}mount() operations performed here will be reflected on the
1623 * host!)
 *
 * The function is the parent-side counterpart of do_start(): after cloning the
 * child it walks through the lxc_sync_* handshake, configuring id mappings,
 * cgroups, networking, proc settings and resource limits in between, then
 * collects the tty and seccomp notifier fds from the child and marks the
 * container RUNNING.
 *
 * Returns 0 on success and -1 on failure. On failure the error labels run
 * lxc_delete_network() (when a network namespace was requested), lxc_abort(),
 * lxc_sync_fini() and close handler->pinfd, depending on where the failure
 * occurred.
1624 */
1625 static int lxc_spawn(struct lxc_handler *handler)
1626 {
1627 __do_close_prot_errno int data_sock0 = -EBADF, data_sock1 = -EBADF;
1628 int i, ret;
1629 char pidstr[20];
1630 bool wants_to_map_ids;
1631 struct lxc_list *id_map;
1632 const char *name = handler->name;
1633 const char *lxcpath = handler->lxcpath;
1634 bool share_ns = false;
1635 struct lxc_conf *conf = handler->conf;
1636 struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
1637
1638 id_map = &conf->id_map;
1639 wants_to_map_ids = !lxc_list_empty(id_map);
1640
/* Open a handle for every namespace that is to be shared with another
 * container; share_ns records whether there is at least one.
 */
1641 for (i = 0; i < LXC_NS_MAX; i++) {
1642 if (!conf->ns_share[i])
1643 continue;
1644
1645 handler->nsfd[i] = lxc_inherit_namespace(conf->ns_share[i], lxcpath, ns_info[i].proc_name);
1646 if (handler->nsfd[i] < 0)
1647 return -1;
1648
1649 share_ns = true;
1650 }
1651
1652 ret = lxc_sync_init(handler);
1653 if (ret < 0)
1654 return -1;
1655
/* Data channel between monitor and child, used below for the status
 * fd, tty fds and the seccomp notifier fd.
 */
1656 ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0,
1657 handler->data_sock);
1658 if (ret < 0)
1659 goto out_sync_fini;
1660 data_sock0 = handler->data_sock[0];
1661 data_sock1 = handler->data_sock[1];
1662
1663 ret = resolve_clone_flags(handler);
1664 if (ret < 0)
1665 goto out_sync_fini;
1666
1667 if (handler->ns_clone_flags & CLONE_NEWNET) {
1668 ret = lxc_find_gateway_addresses(handler);
1669 if (ret) {
1670 ERROR("Failed to find gateway addresses");
1671 goto out_sync_fini;
1672 }
1673 }
1674
1675 if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
1676 ERROR("Failed creating cgroups");
1677 goto out_delete_net;
1678 }
1679
1680 /* If the rootfs is not a blockdev, prevent the container from marking
1681 * it readonly.
1682 * If the container is unprivileged then skip rootfs pinning.
1683 */
1684 if (!wants_to_map_ids) {
1685 handler->pinfd = pin_rootfs(conf->rootfs.path);
1686 if (handler->pinfd == -1)
1687 INFO("Failed to pin the rootfs for container \"%s\"", handler->name);
1688 }
1689
1690 /* Create a process in a new set of namespaces. */
1691 handler->ns_on_clone_flags = handler->ns_clone_flags;
1692 if (handler->ns_clone_flags & CLONE_NEWUSER) {
1693 /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to
1694 * clone a new user namespace first and only later unshare our
1695 * network namespace to ensure that network devices ownership is
1696 * set up correctly.
1697 */
1698 handler->ns_on_clone_flags &= ~CLONE_NEWNET;
1699 }
1700 /* The cgroup namespace gets unshare()ed not clone()ed. */
1701 handler->ns_on_clone_flags &= ~CLONE_NEWCGROUP;
1702
1703 if (share_ns) {
1704 pid_t attacher_pid;
1705
/* Namespaces are inherited: do_share_ns() runs in an intermediate
 * task, joins them via setns() and clones do_start() itself, filling
 * in handler->pid and handler->pidfd.
 */
1706 attacher_pid = lxc_clone(do_share_ns, handler,
1707 CLONE_VFORK | CLONE_VM | CLONE_FILES, NULL);
1708 if (attacher_pid < 0) {
1709 SYSERROR(LXC_CLONE_ERROR);
1710 goto out_delete_net;
1711 }
1712
1713 ret = wait_for_pid(attacher_pid);
1714 if (ret < 0) {
1715 SYSERROR("Intermediate process failed");
1716 goto out_delete_net;
1717 }
1718 } else {
1719 handler->pid = lxc_raw_clone_cb(do_start, handler,
1720 CLONE_PIDFD | handler->ns_on_clone_flags,
1721 &handler->pidfd);
1722 }
1723 if (handler->pid < 0) {
1724 SYSERROR(LXC_CLONE_ERROR);
1725 goto out_delete_net;
1726 }
1727 TRACE("Cloned child process %d", handler->pid);
1728
/* Export the child's pid as LXC_PID; failing to set it is logged but
 * not fatal.
 */
1729 ret = snprintf(pidstr, 20, "%d", handler->pid);
1730 if (ret < 0 || ret >= 20)
1731 goto out_delete_net;
1732
1733 ret = setenv("LXC_PID", pidstr, 1);
1734 if (ret < 0)
1735 SYSERROR("Failed to set environment variable: LXC_PID=%s", pidstr);
1736
1737 for (i = 0; i < LXC_NS_MAX; i++)
1738 if (handler->ns_on_clone_flags & ns_info[i].clone_flag)
1739 INFO("Cloned %s", ns_info[i].flag_name);
1740
1741 if (!lxc_try_preserve_namespaces(handler, handler->ns_on_clone_flags, handler->pid)) {
1742 ERROR("Failed to preserve cloned namespaces for lxc.hook.stop");
1743 goto out_delete_net;
1744 }
1745
1746 lxc_sync_fini_child(handler);
1747
/* Hand the monitor status fd to the child (received at the top of
 * do_start()) and drop our own copy.
 */
1748 if (lxc_abstract_unix_send_fds(handler->data_sock[0], &handler->monitor_status_fd, 1, NULL, 0) < 0) {
1749 ERROR("Failed to send status file descriptor to child process");
1750 goto out_delete_net;
1751 }
1752 close_prot_errno_disarm(handler->monitor_status_fd);
1753
1754 /* Map the container uids. The container became an invalid userid the
1755 * moment it was cloned with CLONE_NEWUSER. This call doesn't change
1756 * anything immediately, but allows the container to setuid(0) (0 being
1757 * mapped to something else on the host.) later to become a valid uid
1758 * again.
1759 */
1760 if (wants_to_map_ids) {
/* Skip the mapping when the user namespace is shared or kept,
 * i.e. when no new user namespace was created for this container.
 */
1761 if (!handler->conf->ns_share[LXC_NS_USER] &&
1762 (handler->conf->ns_keep & CLONE_NEWUSER) == 0) {
1763 ret = lxc_map_ids(id_map, handler->pid);
1764 if (ret < 0) {
1765 ERROR("Failed to set up id mapping.");
1766 goto out_delete_net;
1767 }
1768 }
1769 }
1770
1771 ret = lxc_sync_wake_child(handler, LXC_SYNC_STARTUP);
1772 if (ret < 0)
1773 goto out_delete_net;
1774
1775 ret = lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE);
1776 if (ret < 0)
1777 goto out_delete_net;
1778
1779 if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, false)) {
1780 ERROR("Failed to setup cgroup limits for container \"%s\"", name);
1781 goto out_delete_net;
1782 }
1783
1784 if (!cgroup_ops->payload_enter(cgroup_ops, handler)) {
1785 goto out_delete_net;
1786 }
1787
1788 if (!cgroup_ops->payload_delegate_controllers(cgroup_ops)) {
1789 ERROR("Failed to delegate controllers to payload cgroup");
1790 goto out_delete_net;
1791 }
1792
1793 if (!cgroup_ops->setup_limits(cgroup_ops, handler)) {
1794 ERROR("Failed to setup cgroup limits for container \"%s\"", name);
1795 goto out_delete_net;
1796 }
1797
1798 if (!cgroup_ops->chown(cgroup_ops, handler->conf))
1799 goto out_delete_net;
1800
1801 /* If not done yet, we're now ready to preserve the network namespace */
1802 if (handler->nsfd[LXC_NS_NET] < 0) {
1803 ret = lxc_try_preserve_ns(handler->pid, "net");
1804 if (ret < 0) {
1805 if (ret != -EOPNOTSUPP) {
1806 SYSERROR("Failed to preserve net namespace");
1807 goto out_delete_net;
1808 }
1809 } else {
1810 handler->nsfd[LXC_NS_NET] = ret;
1811 DEBUG("Preserved net namespace via fd %d", ret);
1812 }
1813 }
/* NOTE(review): handler->nsfd[LXC_NS_NET] is still negative here when
 * preserving returned -EOPNOTSUPP; a failure of the call below is only
 * logged as a warning.
 */
1814 ret = lxc_netns_set_nsid(handler->nsfd[LXC_NS_NET]);
1815 if (ret < 0)
1816 SYSWARN("Failed to allocate new network namespace id");
1817 else
1818 TRACE("Allocated new network namespace id");
1819
1820 /* Create the network configuration. */
1821 if (handler->ns_clone_flags & CLONE_NEWNET) {
1822 ret = lxc_create_network(handler);
1823 if (ret < 0) {
1824 ERROR("Failed to create the network");
1825 goto out_delete_net;
1826 }
1827
1828 ret = lxc_network_send_to_child(handler);
1829 if (ret < 0) {
1830 ERROR("Failed to send veth names to child");
1831 goto out_delete_net;
1832 }
1833 }
1834
1835 if (!lxc_list_empty(&conf->procs)) {
1836 ret = setup_proc_filesystem(&conf->procs, handler->pid);
1837 if (ret < 0)
1838 goto out_delete_net;
1839 }
1840
1841 /* Tell the child to continue its initialization. We'll get
1842 * LXC_SYNC_CGROUP when it is ready for us to setup cgroups.
1843 */
1844 ret = lxc_sync_barrier_child(handler, LXC_SYNC_POST_CONFIGURE);
1845 if (ret < 0)
1846 goto out_delete_net;
1847
1848 if (!lxc_list_empty(&conf->limits)) {
1849 ret = setup_resource_limits(&conf->limits, handler->pid);
1850 if (ret < 0) {
1851 ERROR("Failed to setup resource limits");
1852 goto out_delete_net;
1853 }
1854 }
1855
1856 ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
1857 if (ret < 0)
1858 goto out_delete_net;
1859
1860 if (!cgroup_ops->setup_limits_legacy(cgroup_ops, handler->conf, true)) {
1861 ERROR("Failed to setup legacy device cgroup controller limits");
1862 goto out_delete_net;
1863 }
1864 TRACE("Set up legacy device cgroup controller limits");
1865
1866 if (!cgroup_ops->devices_activate(cgroup_ops, handler)) {
1867 ERROR("Failed to setup cgroup2 device controller limits");
1868 goto out_delete_net;
1869 }
1870 TRACE("Set up cgroup2 device controller limits");
1871
1872 if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
1873 /* Now we're ready to preserve the cgroup namespace */
1874 ret = lxc_try_preserve_ns(handler->pid, "cgroup");
1875 if (ret < 0) {
1876 if (ret != -EOPNOTSUPP) {
1877 SYSERROR("Failed to preserve cgroup namespace");
1878 goto out_delete_net;
1879 }
1880 } else {
1881 handler->nsfd[LXC_NS_CGROUP] = ret;
1882 DEBUG("Preserved cgroup namespace via fd %d", ret);
1883 }
1884 }
1885
1886 cgroup_ops->payload_finalize(cgroup_ops);
1887 TRACE("Finished setting up cgroups");
1888
1889 /* Run any host-side start hooks */
1890 ret = run_lxc_hooks(name, "start-host", conf, NULL);
1891 if (ret < 0) {
1892 ERROR("Failed to run lxc.hook.start-host");
1893 goto out_delete_net;
1894 }
1895
1896 /* Tell the child to complete its initialization and wait for it to exec
1897 * or return an error. (The child will never return
1898 * LXC_SYNC_READY_START+1. It will either close the sync pipe, causing
1899 * lxc_sync_barrier_child to return success, or return a different
1900 * value, causing us to error out).
1901 */
1902 ret = lxc_sync_barrier_child(handler, LXC_SYNC_READY_START);
1903 if (ret < 0)
1904 goto out_delete_net;
1905
1906 if (handler->ns_clone_flags & CLONE_NEWNET) {
1907 ret = lxc_network_recv_name_and_ifindex_from_child(handler);
1908 if (ret < 0) {
1909 ERROR("Failed to receive names and ifindices for network devices from child");
1910 goto out_delete_net;
1911 }
1912 }
1913
1914 /* Now all networks are created, network devices are moved into place,
1915 * and the correct names and ifindices in the respective namespaces have
1916 * been recorded. The corresponding structs have now all been filled. So
1917 * log them for debugging purposes.
1918 */
1919 lxc_log_configured_netdevs(conf);
1920
1921 /* Read tty fds allocated by child. */
1922 ret = lxc_recv_ttys_from_child(handler);
1923 if (ret < 0) {
1924 ERROR("Failed to receive tty info from child process");
1925 goto out_delete_net;
1926 }
1927
1928 ret = lxc_seccomp_recv_notifier_fd(&handler->conf->seccomp, data_sock1);
1929 if (ret < 0) {
1930 SYSERROR("Failed to receive seccomp notify fd from child");
1931 goto out_delete_net;
1932 }
1933
/* From here on failures skip lxc_delete_network() and go straight to
 * out_abort.
 */
1934 ret = handler->ops->post_start(handler, handler->data);
1935 if (ret < 0)
1936 goto out_abort;
1937
1938 ret = lxc_set_state(name, handler, RUNNING);
1939 if (ret < 0) {
1940 ERROR("Failed to set state to \"%s\"", lxc_state2str(RUNNING));
1941 goto out_abort;
1942 }
1943
1944 lxc_sync_fini(handler);
1945
1946 return 0;
1947
/* Error unwinding: each label falls through into the next one. */
1948 out_delete_net:
1949 if (handler->ns_clone_flags & CLONE_NEWNET)
1950 lxc_delete_network(handler);
1951
1952 out_abort:
1953 lxc_abort(name, handler);
1954
1955 out_sync_fini:
1956 lxc_sync_fini(handler);
1957 if (handler->pinfd >= 0) {
1958 close(handler->pinfd);
1959 handler->pinfd = -1;
1960 }
1961
1962 return -1;
1963 }
1964
/* __lxc_start() drives one complete run of a container: lxc_init(), block
 * device attach, monitor cgroup setup, an optional early rootfs mount,
 * lxc_spawn(), lxc_poll(), and finally collecting the status of the init
 * process and tearing everything down again.
 *
 * @name       container name
 * @handler    handler to run; @ops, @data and @daemonize are stored in it
 * @ops        start/post_start callbacks used by do_start() and lxc_spawn()
 * @data       opaque argument passed to the @ops callbacks
 * @lxcpath    passed to lxc_setup_rootfs_prepare_root() for the early mount
 * @daemonize  stored in handler->daemonize
 * @error_num  if non-NULL, receives handler->exit_status; it is only written
 *             on the regular exit path, not on the error paths
 *
 * Returns the value of the step that failed. On the regular exit path the
 * returned value is the result of lxc_restore_phys_nics_to_netns().
 */
1965 int __lxc_start(const char *name, struct lxc_handler *handler,
1966 struct lxc_operations* ops, void *data, const char *lxcpath,
1967 bool daemonize, int *error_num)
1968 {
1969 int ret, status;
1970 struct lxc_conf *conf = handler->conf;
1971 struct cgroup_ops *cgroup_ops;
1972
1973 ret = lxc_init(name, handler);
1974 if (ret < 0) {
1975 ERROR("Failed to initialize container \"%s\"", name);
1976 goto out_fini_nonet;
1977 }
1978 handler->ops = ops;
1979 handler->data = data;
1980 handler->daemonize = daemonize;
1981 cgroup_ops = handler->cgroup_ops;
1982
1983 if (!attach_block_device(handler->conf)) {
1984 ERROR("Failed to attach block device");
1985 ret = -1;
1986 goto out_fini_nonet;
1987 }
1988
/* NOTE(review): the failure paths between here and lxc_spawn() jump to
 * out_fini_nonet, which does not call detach_block_device() although
 * attach_block_device() has already succeeded - confirm this is
 * intended.
 */
1989 if (!cgroup_ops->monitor_create(cgroup_ops, handler)) {
1990 ERROR("Failed to create monitor cgroup");
1991 ret = -1;
1992 goto out_fini_nonet;
1993 }
1994
1995 if (!cgroup_ops->monitor_enter(cgroup_ops, handler)) {
1996 ERROR("Failed to enter monitor cgroup");
1997 ret = -1;
1998 goto out_fini_nonet;
1999 }
2000
2001 if (!cgroup_ops->monitor_delegate_controllers(cgroup_ops)) {
2002 ERROR("Failed to delegate controllers to monitor cgroup");
2003 ret = -1;
2004 goto out_fini_nonet;
2005 }
2006
2007 if (geteuid() == 0 && !lxc_list_empty(&conf->id_map)) {
2008 /* If the backing store is a device, mount it here and now. */
2009 if (rootfs_is_blockdev(conf)) {
/* The mount is done in a private mount namespace of the
 * monitor, created right here.
 */
2010 ret = unshare(CLONE_NEWNS);
2011 if (ret < 0) {
2012 ERROR("Failed to unshare CLONE_NEWNS");
2013 goto out_fini_nonet;
2014 }
2015 INFO("Unshared CLONE_NEWNS");
2016
2017 remount_all_slave();
2018 ret = lxc_setup_rootfs_prepare_root(conf, name, lxcpath);
2019 if (ret < 0) {
2020 ERROR("Error setting up rootfs mount as root before spawn");
2021 goto out_fini_nonet;
2022 }
2023 INFO("Set up container rootfs as host root");
2024 }
2025 }
2026
2027 ret = lxc_spawn(handler);
2028 if (ret < 0) {
2029 ERROR("Failed to spawn container \"%s\"", name);
2030 goto out_detach_blockdev;
2031 }
2032
2033 handler->conf->reboot = REBOOT_NONE;
2034
2035 ret = lxc_poll(name, handler);
2036 if (ret) {
2037 ERROR("LXC mainloop exited with error: %d", ret);
2038 goto out_abort;
2039 }
2040
/* lxc_poll() returned without error but init was not seen dying:
 * treat this as a failure and abort the container.
 */
2041 if (!handler->init_died && handler->pid > 0) {
2042 ERROR("Child process is not killed");
2043 ret = -1;
2044 goto out_abort;
2045 }
2046
2047 status = lxc_wait_for_pid_status(handler->pid);
2048 if (status < 0)
2049 SYSERROR("Failed to retrieve status for %d", handler->pid);
2050
2051 /* If the child process exited but was not signaled, it didn't call
2052 * reboot. This should mean it was an lxc-execute which simply exited.
2053 * In any case, treat it as a 'halt'.
2054 */
2055 if (WIFSIGNALED(status)) {
2056 switch(WTERMSIG(status)) {
2057 case SIGINT: /* halt */
2058 DEBUG("Container \"%s\" is halting", name);
2059 break;
2060 case SIGHUP: /* reboot */
2061 DEBUG("Container \"%s\" is rebooting", name);
2062 handler->conf->reboot = REBOOT_REQ;
2063 break;
2064 case SIGSYS: /* seccomp */
2065 DEBUG("Container \"%s\" violated its seccomp policy", name);
2066 break;
2067 default:
2068 DEBUG("Unknown exit status for container \"%s\" init %d", name, WTERMSIG(status));
2069 break;
2070 }
2071 }
2072
/* A failure here is logged but does not stop the teardown; note that
 * `ret` from this call is what the function returns on this path.
 */
2073 ret = lxc_restore_phys_nics_to_netns(handler);
2074 if (ret < 0)
2075 ERROR("Failed to move physical network devices back to parent "
2076 "network namespace");
2077
2078 if (handler->pinfd >= 0) {
2079 close(handler->pinfd);
2080 handler->pinfd = -1;
2081 }
2082
2083 lxc_monitor_send_exit_code(name, status, handler->lxcpath);
2084 lxc_error_set_and_log(handler->pid, status);
2085 if (error_num)
2086 *error_num = handler->exit_status;
2087
/* Teardown: each label falls through into the next one. */
2088 out_fini:
2089 lxc_delete_network(handler);
2090
2091 out_detach_blockdev:
2092 detach_block_device(handler->conf);
2093
2094 out_fini_nonet:
2095 lxc_fini(name, handler);
2096 return ret;
2097
/* Placed after the return: abort the container first, then jump back up
 * into the regular teardown chain at out_fini.
 */
2098 out_abort:
2099 lxc_abort(name, handler);
2100 goto out_fini;
2101 }
2102
/* Argument bundle passed as the opaque `data` pointer to start() and
 * post_start(): the argv vector of the command to exec in the container.
 */
2103 struct start_args {
2104 char *const *argv;
2105 };
2106
2107 static int start(struct lxc_handler *handler, void* data)
2108 {
2109 struct start_args *arg = data;
2110
2111 NOTICE("Exec'ing \"%s\"", arg->argv[0]);
2112
2113 execvp(arg->argv[0], arg->argv);
2114 SYSERROR("Failed to exec \"%s\"", arg->argv[0]);
2115 return 0;
2116 }
2117
2118 static int post_start(struct lxc_handler *handler, void* data)
2119 {
2120 struct start_args *arg = data;
2121
2122 NOTICE("Started \"%s\" with pid \"%d\"", arg->argv[0], handler->pid);
2123 return 0;
2124 }
2125
/* Operations used by lxc_start(): start() execs the requested command in the
 * child, post_start() logs the started command in the parent.
 */
2126 static struct lxc_operations start_ops = {
2127 .start = start,
2128 .post_start = post_start
2129 };
2130
2131 int lxc_start(const char *name, char *const argv[], struct lxc_handler *handler,
2132 const char *lxcpath, bool daemonize, int *error_num)
2133 {
2134 struct start_args start_arg = {
2135 .argv = argv,
2136 };
2137
2138 TRACE("Doing lxc_start");
2139 return __lxc_start(name, handler, &start_ops, &start_arg, lxcpath, daemonize, error_num);
2140 }
2141
2142 static void lxc_destroy_container_on_signal(struct lxc_handler *handler,
2143 const char *name)
2144 {
2145 char destroy[PATH_MAX];
2146 struct lxc_container *c;
2147 int ret = 0;
2148 bool bret = true;
2149
2150 if (handler->conf->rootfs.path && handler->conf->rootfs.mount) {
2151 bret = do_destroy_container(handler);
2152 if (!bret) {
2153 ERROR("Error destroying rootfs for container \"%s\"", name);
2154 return;
2155 }
2156 }
2157 INFO("Destroyed rootfs for container \"%s\"", name);
2158
2159 ret = snprintf(destroy, PATH_MAX, "%s/%s", handler->lxcpath, name);
2160 if (ret < 0 || ret >= PATH_MAX) {
2161 ERROR("Error destroying directory for container \"%s\"", name);
2162 return;
2163 }
2164
2165 c = lxc_container_new(name, handler->lxcpath);
2166 if (c) {
2167 if (container_disk_lock(c)) {
2168 INFO("Could not update lxc_snapshots file");
2169 lxc_container_put(c);
2170 } else {
2171 mod_all_rdeps(c, false);
2172 container_disk_unlock(c);
2173 lxc_container_put(c);
2174 }
2175 }
2176
2177 if (!handler->am_root)
2178 ret = userns_exec_full(handler->conf, lxc_rmdir_onedev_wrapper,
2179 destroy, "lxc_rmdir_onedev_wrapper");
2180 else
2181 ret = lxc_rmdir_onedev(destroy, NULL);
2182
2183 if (ret < 0) {
2184 ERROR("Error destroying directory for container \"%s\"", name);
2185 return;
2186 }
2187 INFO("Destroyed directory for container \"%s\"", name);
2188 }
2189
/* Adapter with the `int (*)(void *)` shape expected by userns_exec_full():
 * @data is the path to remove, forwarded to lxc_rmdir_onedev().
 */
static int lxc_rmdir_onedev_wrapper(void *data)
{
	return lxc_rmdir_onedev((char *)data, NULL);
}
2195
2196 static bool do_destroy_container(struct lxc_handler *handler)
2197 {
2198 int ret;
2199
2200 if (!handler->am_root) {
2201 ret = userns_exec_full(handler->conf, storage_destroy_wrapper,
2202 handler->conf, "storage_destroy_wrapper");
2203 if (ret < 0)
2204 return false;
2205
2206 return true;
2207 }
2208
2209 return storage_destroy(handler->conf);
2210 }