]> git.proxmox.com Git - systemd.git/blob - src/nspawn/nspawn.c
Imported Upstream version 204
[systemd.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #ifdef HAVE_XATTR
46 #include <attr/xattr.h>
47 #endif
48
49 #include <systemd/sd-daemon.h>
50
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "sd-id128.h"
62 #include "dev-setup.h"
63 #include "fdset.h"
64 #include "build.h"
65 #include "fileio.h"
66
67 #ifndef TTY_GID
68 #define TTY_GID 5
69 #endif
70
/* How the journal directories of host and container are linked together.
 * The mode is chosen with --link-journal= (or -j) in parse_argv() and
 * acted upon by setup_journal(). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static char **arg_controllers = NULL;
81 static char *arg_uuid = NULL;
82 static char *arg_machine = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88 (1ULL << CAP_CHOWN) |
89 (1ULL << CAP_DAC_OVERRIDE) |
90 (1ULL << CAP_DAC_READ_SEARCH) |
91 (1ULL << CAP_FOWNER) |
92 (1ULL << CAP_FSETID) |
93 (1ULL << CAP_IPC_OWNER) |
94 (1ULL << CAP_KILL) |
95 (1ULL << CAP_LEASE) |
96 (1ULL << CAP_LINUX_IMMUTABLE) |
97 (1ULL << CAP_NET_BIND_SERVICE) |
98 (1ULL << CAP_NET_BROADCAST) |
99 (1ULL << CAP_NET_RAW) |
100 (1ULL << CAP_SETGID) |
101 (1ULL << CAP_SETFCAP) |
102 (1ULL << CAP_SETPCAP) |
103 (1ULL << CAP_SETUID) |
104 (1ULL << CAP_SYS_ADMIN) |
105 (1ULL << CAP_SYS_CHROOT) |
106 (1ULL << CAP_SYS_NICE) |
107 (1ULL << CAP_SYS_PTRACE) |
108 (1ULL << CAP_SYS_TTY_CONFIG) |
109 (1ULL << CAP_SYS_RESOURCE) |
110 (1ULL << CAP_SYS_BOOT) |
111 (1ULL << CAP_AUDIT_WRITE) |
112 (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
/* Prints the command line usage summary to stdout.
 *
 * Always returns 0. */
static int help(void) {

        /* Keep this text in sync with parse_argv(): in particular -j selects
         * LINK_GUEST there, i.e. it is a shortcut for --link-journal=guest.
         * All descriptions start in the same column so that the continuation
         * lines line up. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
142
143 static int parse_argv(int argc, char *argv[]) {
144
145 enum {
146 ARG_VERSION = 0x100,
147 ARG_PRIVATE_NETWORK,
148 ARG_UUID,
149 ARG_READ_ONLY,
150 ARG_CAPABILITY,
151 ARG_LINK_JOURNAL,
152 ARG_BIND,
153 ARG_BIND_RO
154 };
155
156 static const struct option options[] = {
157 { "help", no_argument, NULL, 'h' },
158 { "version", no_argument, NULL, ARG_VERSION },
159 { "directory", required_argument, NULL, 'D' },
160 { "user", required_argument, NULL, 'u' },
161 { "controllers", required_argument, NULL, 'C' },
162 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
163 { "boot", no_argument, NULL, 'b' },
164 { "uuid", required_argument, NULL, ARG_UUID },
165 { "read-only", no_argument, NULL, ARG_READ_ONLY },
166 { "capability", required_argument, NULL, ARG_CAPABILITY },
167 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
168 { "bind", required_argument, NULL, ARG_BIND },
169 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
170 { "machine", required_argument, NULL, 'M' },
171 { NULL, 0, NULL, 0 }
172 };
173
174 int c;
175
176 assert(argc >= 0);
177 assert(argv);
178
179 while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
180
181 switch (c) {
182
183 case 'h':
184 help();
185 return 0;
186
187 case ARG_VERSION:
188 puts(PACKAGE_STRING);
189 puts(SYSTEMD_FEATURES);
190 return 0;
191
192 case 'D':
193 free(arg_directory);
194 arg_directory = canonicalize_file_name(optarg);
195 if (!arg_directory) {
196 log_error("Failed to canonicalize root directory.");
197 return -ENOMEM;
198 }
199
200 break;
201
202 case 'u':
203 free(arg_user);
204 arg_user = strdup(optarg);
205 if (!arg_user)
206 return log_oom();
207
208 break;
209
210 case 'C':
211 strv_free(arg_controllers);
212 arg_controllers = strv_split(optarg, ",");
213 if (!arg_controllers)
214 return log_oom();
215
216 cg_shorten_controllers(arg_controllers);
217 break;
218
219 case ARG_PRIVATE_NETWORK:
220 arg_private_network = true;
221 break;
222
223 case 'b':
224 arg_boot = true;
225 break;
226
227 case ARG_UUID:
228 if (!id128_is_valid(optarg)) {
229 log_error("Invalid UUID: %s", optarg);
230 return -EINVAL;
231 }
232
233 arg_uuid = optarg;
234 break;
235
236 case 'M':
237 if (!hostname_is_valid(optarg)) {
238 log_error("Invalid machine name: %s", optarg);
239 return -EINVAL;
240 }
241
242 free(arg_machine);
243 arg_machine = strdup(optarg);
244 if (!arg_machine)
245 return log_oom();
246
247 break;
248
249 case ARG_READ_ONLY:
250 arg_read_only = true;
251 break;
252
253 case ARG_CAPABILITY: {
254 char *state, *word;
255 size_t length;
256
257 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258 cap_value_t cap;
259 char *t;
260
261 t = strndup(word, length);
262 if (!t)
263 return log_oom();
264
265 if (cap_from_name(t, &cap) < 0) {
266 log_error("Failed to parse capability %s.", t);
267 free(t);
268 return -EINVAL;
269 }
270
271 free(t);
272 arg_retain |= 1ULL << (uint64_t) cap;
273 }
274
275 break;
276 }
277
278 case 'j':
279 arg_link_journal = LINK_GUEST;
280 break;
281
282 case ARG_LINK_JOURNAL:
283 if (streq(optarg, "auto"))
284 arg_link_journal = LINK_AUTO;
285 else if (streq(optarg, "no"))
286 arg_link_journal = LINK_NO;
287 else if (streq(optarg, "guest"))
288 arg_link_journal = LINK_GUEST;
289 else if (streq(optarg, "host"))
290 arg_link_journal = LINK_HOST;
291 else {
292 log_error("Failed to parse link journal mode %s", optarg);
293 return -EINVAL;
294 }
295
296 break;
297
298 case ARG_BIND:
299 case ARG_BIND_RO: {
300 _cleanup_free_ char *a = NULL, *b = NULL;
301 char *e;
302 char ***x;
303 int r;
304
305 x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
306
307 e = strchr(optarg, ':');
308 if (e) {
309 a = strndup(optarg, e - optarg);
310 b = strdup(e + 1);
311 } else {
312 a = strdup(optarg);
313 b = strdup(optarg);
314 }
315
316 if (!a || !b)
317 return log_oom();
318
319 if (!path_is_absolute(a) || !path_is_absolute(b)) {
320 log_error("Invalid bind mount specification: %s", optarg);
321 return -EINVAL;
322 }
323
324 r = strv_extend(x, a);
325 if (r < 0)
326 return r;
327
328 r = strv_extend(x, b);
329 if (r < 0)
330 return r;
331
332 break;
333 }
334
335 case '?':
336 return -EINVAL;
337
338 default:
339 log_error("Unknown option code %c", c);
340 return -EINVAL;
341 }
342 }
343
344 return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349 typedef struct MountPoint {
350 const char *what;
351 const char *where;
352 const char *type;
353 const char *options;
354 unsigned long flags;
355 bool fatal;
356 } MountPoint;
357
358 static const MountPoint mount_table[] = {
359 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
360 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true }, /* Bind mount first */
361 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, true }, /* Then, make it r/o */
362 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true },
363 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true },
364 { "devpts", "/dev/pts", "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
366 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
367 #ifdef HAVE_SELINUX
368 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false }, /* Bind mount first */
369 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
370 #endif
371 };
372
373 unsigned k;
374 int r = 0;
375
376 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377 _cleanup_free_ char *where = NULL;
378 int t;
379
380 where = strjoin(dest, "/", mount_table[k].where, NULL);
381 if (!where)
382 return log_oom();
383
384 t = path_is_mount_point(where, true);
385 if (t < 0) {
386 log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388 if (r == 0)
389 r = t;
390
391 continue;
392 }
393
394 /* Skip this entry if it is not a remount. */
395 if (mount_table[k].what && t > 0)
396 continue;
397
398 mkdir_p(where, 0755);
399
400 if (mount(mount_table[k].what,
401 where,
402 mount_table[k].type,
403 mount_table[k].flags,
404 mount_table[k].options) < 0 &&
405 mount_table[k].fatal) {
406
407 log_error("mount(%s) failed: %m", where);
408
409 if (r == 0)
410 r = -errno;
411 }
412 }
413
414 return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418 char **x, **y;
419
420 STRV_FOREACH_PAIR(x, y, l) {
421 _cleanup_free_ char *where = NULL;
422
423 where = strjoin(dest, "/", *y, NULL);
424 if (!where)
425 return log_oom();
426
427 mkdir_p_label(where, 0755);
428
429 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
430 log_error("mount(%s) failed: %m", where);
431 return -errno;
432 }
433
434 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
435 log_error("mount(%s) failed: %m", where);
436 return -errno;
437 }
438 }
439
440 return 0;
441 }
442
443 static int setup_timezone(const char *dest) {
444 _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
445 char *z, *y;
446 int r;
447
448 assert(dest);
449
450 /* Fix the timezone, if possible */
451 r = readlink_malloc("/etc/localtime", &p);
452 if (r < 0) {
453 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
454 return 0;
455 }
456
457 z = path_startswith(p, "../usr/share/zoneinfo/");
458 if (!z)
459 z = path_startswith(p, "/usr/share/zoneinfo/");
460 if (!z) {
461 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462 return 0;
463 }
464
465 where = strappend(dest, "/etc/localtime");
466 if (!where)
467 return log_oom();
468
469 r = readlink_malloc(where, &q);
470 if (r >= 0) {
471 y = path_startswith(q, "../usr/share/zoneinfo/");
472 if (!y)
473 y = path_startswith(q, "/usr/share/zoneinfo/");
474
475
476 /* Already pointing to the right place? Then do nothing .. */
477 if (y && streq(y, z))
478 return 0;
479 }
480
481 check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482 if (!check)
483 return log_oom();
484
485 if (access(check, F_OK) < 0) {
486 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
487 return 0;
488 }
489
490 what = strappend("../usr/share/zoneinfo/", z);
491 if (!what)
492 return log_oom();
493
494 unlink(where);
495 if (symlink(what, where) < 0) {
496 log_error("Failed to correct timezone of container: %m");
497 return 0;
498 }
499
500 return 0;
501 }
502
503 static int setup_resolv_conf(const char *dest) {
504 char _cleanup_free_ *where = NULL;
505 _cleanup_close_ int fd = -1;
506
507 assert(dest);
508
509 if (arg_private_network)
510 return 0;
511
512 /* Fix resolv.conf, if possible */
513 where = strappend(dest, "/etc/resolv.conf");
514 if (!where)
515 return log_oom();
516
517 fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
518
519 /* We don't really care for the results of this really. If it
520 * fails, it fails, but meh... */
521 if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
522 log_warning("Failed to bind mount /etc/resolv.conf: %m");
523 else
524 if (mount("/etc/resolv.conf", where, "bind",
525 MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
526 log_error("Failed to remount /etc/resolv.conf readonly: %m");
527 return -errno;
528 }
529
530 return 0;
531 }
532
533 static int setup_boot_id(const char *dest) {
534 _cleanup_free_ char *from = NULL, *to = NULL;
535 sd_id128_t rnd;
536 char as_uuid[37];
537 int r;
538
539 assert(dest);
540
541 /* Generate a new randomized boot ID, so that each boot-up of
542 * the container gets a new one */
543
544 from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
545 to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546 if (!from || !to)
547 return log_oom();
548
549 r = sd_id128_randomize(&rnd);
550 if (r < 0) {
551 log_error("Failed to generate random boot id: %s", strerror(-r));
552 return r;
553 }
554
555 snprintf(as_uuid, sizeof(as_uuid),
556 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
557 SD_ID128_FORMAT_VAL(rnd));
558 char_array_0(as_uuid);
559
560 r = write_string_file(from, as_uuid);
561 if (r < 0) {
562 log_error("Failed to write boot id: %s", strerror(-r));
563 return r;
564 }
565
566 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
567 log_error("Failed to bind mount boot id: %m");
568 r = -errno;
569 } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
570 log_warning("Failed to make boot id read-only: %m");
571
572 unlink(from);
573 return r;
574 }
575
576 static int copy_devnodes(const char *dest) {
577
578 static const char devnodes[] =
579 "null\0"
580 "zero\0"
581 "full\0"
582 "random\0"
583 "urandom\0"
584 "tty\0";
585
586 const char *d;
587 int r = 0;
588 _cleanup_umask_ mode_t u;
589
590 assert(dest);
591
592 u = umask(0000);
593
594 NULSTR_FOREACH(d, devnodes) {
595 struct stat st;
596 _cleanup_free_ char *from = NULL, *to = NULL;
597
598 asprintf(&from, "/dev/%s", d);
599 asprintf(&to, "%s/dev/%s", dest, d);
600
601 if (!from || !to) {
602 log_oom();
603
604 if (r == 0)
605 r = -ENOMEM;
606
607 break;
608 }
609
610 if (stat(from, &st) < 0) {
611
612 if (errno != ENOENT) {
613 log_error("Failed to stat %s: %m", from);
614 if (r == 0)
615 r = -errno;
616 }
617
618 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
619
620 log_error("%s is not a char or block device, cannot copy", from);
621 if (r == 0)
622 r = -EIO;
623
624 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
625
626 log_error("mknod(%s) failed: %m", dest);
627 if (r == 0)
628 r = -errno;
629 }
630 }
631
632 return r;
633 }
634
635 static int setup_ptmx(const char *dest) {
636 _cleanup_free_ char *p = NULL;
637
638 p = strappend(dest, "/dev/ptmx");
639 if (!p)
640 return log_oom();
641
642 if (symlink("pts/ptmx", p) < 0) {
643 log_error("Failed to create /dev/ptmx symlink: %m");
644 return -errno;
645 }
646
647 return 0;
648 }
649
650 static int setup_dev_console(const char *dest, const char *console) {
651 struct stat st;
652 _cleanup_free_ char *to = NULL;
653 int r;
654 _cleanup_umask_ mode_t u;
655
656 assert(dest);
657 assert(console);
658
659 u = umask(0000);
660
661 if (stat(console, &st) < 0) {
662 log_error("Failed to stat %s: %m", console);
663 return -errno;
664
665 } else if (!S_ISCHR(st.st_mode)) {
666 log_error("/dev/console is not a char device");
667 return -EIO;
668 }
669
670 r = chmod_and_chown(console, 0600, 0, 0);
671 if (r < 0) {
672 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
673 return r;
674 }
675
676 if (asprintf(&to, "%s/dev/console", dest) < 0)
677 return log_oom();
678
679 /* We need to bind mount the right tty to /dev/console since
680 * ptys can only exist on pts file systems. To have something
681 * to bind mount things on we create a device node first, that
682 * has the right major/minor (note that the major minor
683 * doesn't actually matter here, since we mount it over
684 * anyway). */
685
686 if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
687 log_error("mknod() for /dev/console failed: %m");
688 return -errno;
689 }
690
691 if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
692 log_error("Bind mount for /dev/console failed: %m");
693 return -errno;
694 }
695
696 return 0;
697 }
698
699 static int setup_kmsg(const char *dest, int kmsg_socket) {
700 _cleanup_free_ char *from = NULL, *to = NULL;
701 int r, fd, k;
702 _cleanup_umask_ mode_t u;
703 union {
704 struct cmsghdr cmsghdr;
705 uint8_t buf[CMSG_SPACE(sizeof(int))];
706 } control = {};
707 struct msghdr mh = {
708 .msg_control = &control,
709 .msg_controllen = sizeof(control),
710 };
711 struct cmsghdr *cmsg;
712
713 assert(dest);
714 assert(kmsg_socket >= 0);
715
716 u = umask(0000);
717
718 /* We create the kmsg FIFO as /dev/kmsg, but immediately
719 * delete it after bind mounting it to /proc/kmsg. While FIFOs
720 * on the reading side behave very similar to /proc/kmsg,
721 * their writing side behaves differently from /dev/kmsg in
722 * that writing blocks when nothing is reading. In order to
723 * avoid any problems with containers deadlocking due to this
724 * we simply make /dev/kmsg unavailable to the container. */
725 if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
726 asprintf(&to, "%s/proc/kmsg", dest) < 0)
727 return log_oom();
728
729 if (mkfifo(from, 0600) < 0) {
730 log_error("mkfifo() for /dev/kmsg failed: %m");
731 return -errno;
732 }
733
734 r = chmod_and_chown(from, 0600, 0, 0);
735 if (r < 0) {
736 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737 return r;
738 }
739
740 if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
741 log_error("Bind mount for /proc/kmsg failed: %m");
742 return -errno;
743 }
744
745 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
746 if (fd < 0) {
747 log_error("Failed to open fifo: %m");
748 return -errno;
749 }
750
751 cmsg = CMSG_FIRSTHDR(&mh);
752 cmsg->cmsg_level = SOL_SOCKET;
753 cmsg->cmsg_type = SCM_RIGHTS;
754 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
755 memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
756
757 mh.msg_controllen = cmsg->cmsg_len;
758
759 /* Store away the fd in the socket, so that it stays open as
760 * long as we run the child */
761 k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
762 close_nointr_nofail(fd);
763
764 if (k < 0) {
765 log_error("Failed to send FIFO fd: %m");
766 return -errno;
767 }
768
769 /* And now make the FIFO unavailable as /dev/kmsg... */
770 unlink(from);
771 return 0;
772 }
773
774 static int setup_hostname(void) {
775
776 if (sethostname(arg_machine, strlen(arg_machine)) < 0)
777 return -errno;
778
779 return 0;
780 }
781
782 static int setup_journal(const char *directory) {
783 sd_id128_t machine_id;
784 _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785 char *id;
786 int r;
787
788 if (arg_link_journal == LINK_NO)
789 return 0;
790
791 p = strappend(directory, "/etc/machine-id");
792 if (!p)
793 return log_oom();
794
795 r = read_one_line_file(p, &b);
796 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
797 return 0;
798 else if (r < 0) {
799 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
800 return r;
801 }
802
803 id = strstrip(b);
804 if (isempty(id) && arg_link_journal == LINK_AUTO)
805 return 0;
806
807 /* Verify validity */
808 r = sd_id128_from_string(id, &machine_id);
809 if (r < 0) {
810 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
811 return r;
812 }
813
814 free(p);
815 p = strappend("/var/log/journal/", id);
816 q = strjoin(directory, "/var/log/journal/", id, NULL);
817 if (!p || !q)
818 return log_oom();
819
820 if (path_is_mount_point(p, false) > 0) {
821 if (arg_link_journal != LINK_AUTO) {
822 log_error("%s: already a mount point, refusing to use for journal", p);
823 return -EEXIST;
824 }
825
826 return 0;
827 }
828
829 if (path_is_mount_point(q, false) > 0) {
830 if (arg_link_journal != LINK_AUTO) {
831 log_error("%s: already a mount point, refusing to use for journal", q);
832 return -EEXIST;
833 }
834
835 return 0;
836 }
837
838 r = readlink_and_make_absolute(p, &d);
839 if (r >= 0) {
840 if ((arg_link_journal == LINK_GUEST ||
841 arg_link_journal == LINK_AUTO) &&
842 path_equal(d, q)) {
843
844 r = mkdir_p(q, 0755);
845 if (r < 0)
846 log_warning("failed to create directory %s: %m", q);
847 return 0;
848 }
849
850 if (unlink(p) < 0) {
851 log_error("Failed to remove symlink %s: %m", p);
852 return -errno;
853 }
854 } else if (r == -EINVAL) {
855
856 if (arg_link_journal == LINK_GUEST &&
857 rmdir(p) < 0) {
858
859 if (errno == ENOTDIR) {
860 log_error("%s already exists and is neither a symlink nor a directory", p);
861 return r;
862 } else {
863 log_error("Failed to remove %s: %m", p);
864 return -errno;
865 }
866 }
867 } else if (r != -ENOENT) {
868 log_error("readlink(%s) failed: %m", p);
869 return r;
870 }
871
872 if (arg_link_journal == LINK_GUEST) {
873
874 if (symlink(q, p) < 0) {
875 log_error("Failed to symlink %s to %s: %m", q, p);
876 return -errno;
877 }
878
879 r = mkdir_p(q, 0755);
880 if (r < 0)
881 log_warning("failed to create directory %s: %m", q);
882 return 0;
883 }
884
885 if (arg_link_journal == LINK_HOST) {
886 r = mkdir_p(p, 0755);
887 if (r < 0) {
888 log_error("Failed to create %s: %m", p);
889 return r;
890 }
891
892 } else if (access(p, F_OK) < 0)
893 return 0;
894
895 if (dir_is_empty(q) == 0) {
896 log_error("%s not empty.", q);
897 return -ENOTEMPTY;
898 }
899
900 r = mkdir_p(q, 0755);
901 if (r < 0) {
902 log_error("Failed to create %s: %m", q);
903 return r;
904 }
905
906 if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
907 log_error("Failed to bind mount journal from host into guest: %m");
908 return -errno;
909 }
910
911 return 0;
912 }
913
914 static int setup_cgroup(const char *path) {
915 char **c;
916 int r;
917
918 r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
919 if (r < 0) {
920 log_error("Failed to create cgroup: %s", strerror(-r));
921 return r;
922 }
923
924 STRV_FOREACH(c, arg_controllers) {
925 r = cg_create_and_attach(*c, path, 1);
926 if (r < 0)
927 log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
928 }
929
930 return 0;
931 }
932
/* Records "pid", "uuid" (optional, may be NULL) and "directory" as the
 * extended attributes trusted.init_pid, trusted.machine_id and
 * trusted.root_directory on the directory of "cgroup" in the systemd
 * hierarchy.
 *
 * All attributes are attempted; failures to set one are logged as warnings.
 * Returns 0 on success, otherwise the negative errno-style code of the
 * first failure. Without HAVE_XATTR this is a no-op returning 0. */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
        int r = 0, k;

        assert(cgroup);
        assert(pid >= 0);
        /* Check the parameter that is actually dereferenced below. */
        assert(directory);

        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* setxattr() only returns -1 and reports the reason in errno, so
         * convert right away to the negative errno convention used by the
         * rest of this file. */
        r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0 ? -errno : 0;
        if (r < 0)
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);

        if (uuid) {
                k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0 ? -errno : 0;
                if (k < 0) {
                        log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
                        if (r == 0)
                                r = k;
                }
        }

        k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0 ? -errno : 0;
        if (k < 0) {
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
                if (r == 0)
                        r = k;
        }
        return r;
#else
        return 0;
#endif
}
975
976 static int drop_capabilities(void) {
977 return capability_bounding_set_drop(~arg_retain, false);
978 }
979
980 static int process_pty(int master, pid_t pid, sigset_t *mask) {
981
982 char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
983 size_t in_buffer_full = 0, out_buffer_full = 0;
984 struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
985 bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
986 int ep = -1, signal_fd = -1, r;
987 bool tried_orderly_shutdown = false;
988
989 assert(master >= 0);
990 assert(pid > 0);
991 assert(mask);
992
993 fd_nonblock(STDIN_FILENO, 1);
994 fd_nonblock(STDOUT_FILENO, 1);
995 fd_nonblock(master, 1);
996
997 signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
998 if (signal_fd < 0) {
999 log_error("signalfd(): %m");
1000 r = -errno;
1001 goto finish;
1002 }
1003
1004 ep = epoll_create1(EPOLL_CLOEXEC);
1005 if (ep < 0) {
1006 log_error("Failed to create epoll: %m");
1007 r = -errno;
1008 goto finish;
1009 }
1010
1011 /* We read from STDIN only if this is actually a TTY,
1012 * otherwise we assume non-interactivity. */
1013 if (isatty(STDIN_FILENO)) {
1014 zero(stdin_ev);
1015 stdin_ev.events = EPOLLIN|EPOLLET;
1016 stdin_ev.data.fd = STDIN_FILENO;
1017
1018 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1019 log_error("Failed to register STDIN in epoll: %m");
1020 r = -errno;
1021 goto finish;
1022 }
1023 }
1024
1025 zero(stdout_ev);
1026 stdout_ev.events = EPOLLOUT|EPOLLET;
1027 stdout_ev.data.fd = STDOUT_FILENO;
1028
1029 zero(master_ev);
1030 master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1031 master_ev.data.fd = master;
1032
1033 zero(signal_ev);
1034 signal_ev.events = EPOLLIN;
1035 signal_ev.data.fd = signal_fd;
1036
1037 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1038 if (errno != EPERM) {
1039 log_error("Failed to register stdout in epoll: %m");
1040 r = -errno;
1041 goto finish;
1042 }
1043 /* stdout without epoll support. Likely redirected to regular file. */
1044 stdout_writable = true;
1045 }
1046
1047 if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1048 epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1049 log_error("Failed to register fds in epoll: %m");
1050 r = -errno;
1051 goto finish;
1052 }
1053
1054 for (;;) {
1055 struct epoll_event ev[16];
1056 ssize_t k;
1057 int i, nfds;
1058
1059 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1060 if (nfds < 0) {
1061
1062 if (errno == EINTR || errno == EAGAIN)
1063 continue;
1064
1065 log_error("epoll_wait(): %m");
1066 r = -errno;
1067 goto finish;
1068 }
1069
1070 assert(nfds >= 1);
1071
1072 for (i = 0; i < nfds; i++) {
1073 if (ev[i].data.fd == STDIN_FILENO) {
1074
1075 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076 stdin_readable = true;
1077
1078 } else if (ev[i].data.fd == STDOUT_FILENO) {
1079
1080 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1081 stdout_writable = true;
1082
1083 } else if (ev[i].data.fd == master) {
1084
1085 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1086 master_readable = true;
1087
1088 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1089 master_writable = true;
1090
1091 } else if (ev[i].data.fd == signal_fd) {
1092 struct signalfd_siginfo sfsi;
1093 ssize_t n;
1094
1095 n = read(signal_fd, &sfsi, sizeof(sfsi));
1096 if (n != sizeof(sfsi)) {
1097
1098 if (n >= 0) {
1099 log_error("Failed to read from signalfd: invalid block size");
1100 r = -EIO;
1101 goto finish;
1102 }
1103
1104 if (errno != EINTR && errno != EAGAIN) {
1105 log_error("Failed to read from signalfd: %m");
1106 r = -errno;
1107 goto finish;
1108 }
1109 } else {
1110
1111 if (sfsi.ssi_signo == SIGWINCH) {
1112 struct winsize ws;
1113
1114 /* The window size changed, let's forward that. */
1115 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1116 ioctl(master, TIOCSWINSZ, &ws);
1117 } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1118
1119 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1120
1121 /* This only works for systemd... */
1122 tried_orderly_shutdown = true;
1123 kill(pid, SIGRTMIN+3);
1124
1125 } else {
1126 r = 0;
1127 goto finish;
1128 }
1129 }
1130 }
1131 }
1132
1133 while ((stdin_readable && in_buffer_full <= 0) ||
1134 (master_writable && in_buffer_full > 0) ||
1135 (master_readable && out_buffer_full <= 0) ||
1136 (stdout_writable && out_buffer_full > 0)) {
1137
1138 if (stdin_readable && in_buffer_full < LINE_MAX) {
1139
1140 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1141 if (k < 0) {
1142
1143 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144 stdin_readable = false;
1145 else {
1146 log_error("read(): %m");
1147 r = -errno;
1148 goto finish;
1149 }
1150 } else
1151 in_buffer_full += (size_t) k;
1152 }
1153
1154 if (master_writable && in_buffer_full > 0) {
1155
1156 k = write(master, in_buffer, in_buffer_full);
1157 if (k < 0) {
1158
1159 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1160 master_writable = false;
1161 else {
1162 log_error("write(): %m");
1163 r = -errno;
1164 goto finish;
1165 }
1166
1167 } else {
1168 assert(in_buffer_full >= (size_t) k);
1169 memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1170 in_buffer_full -= k;
1171 }
1172 }
1173
1174 if (master_readable && out_buffer_full < LINE_MAX) {
1175
1176 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1177 if (k < 0) {
1178
1179 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1180 master_readable = false;
1181 else {
1182 log_error("read(): %m");
1183 r = -errno;
1184 goto finish;
1185 }
1186 } else
1187 out_buffer_full += (size_t) k;
1188 }
1189
1190 if (stdout_writable && out_buffer_full > 0) {
1191
1192 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1193 if (k < 0) {
1194
1195 if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1196 stdout_writable = false;
1197 else {
1198 log_error("write(): %m");
1199 r = -errno;
1200 goto finish;
1201 }
1202
1203 } else {
1204 assert(out_buffer_full >= (size_t) k);
1205 memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1206 out_buffer_full -= k;
1207 }
1208 }
1209 }
1210 }
1211
1212 finish:
1213 if (ep >= 0)
1214 close_nointr_nofail(ep);
1215
1216 if (signal_fd >= 0)
1217 close_nointr_nofail(signal_fd);
1218
1219 return r;
1220 }
1221
1222 int main(int argc, char *argv[]) {
1223 pid_t pid = 0;
1224 int r = EXIT_FAILURE, k;
1225 _cleanup_free_ char *newcg = NULL;
1226 _cleanup_close_ int master = -1;
1227 int n_fd_passed;
1228 const char *console = NULL;
1229 struct termios saved_attr, raw_attr;
1230 sigset_t mask;
1231 bool saved_attr_valid = false;
1232 struct winsize ws;
1233 int kmsg_socket_pair[2] = { -1, -1 };
1234 FDSet *fds = NULL;
1235
1236 log_parse_environment();
1237 log_open();
1238
1239 k = parse_argv(argc, argv);
1240 if (k < 0)
1241 goto finish;
1242 else if (k == 0) {
1243 r = EXIT_SUCCESS;
1244 goto finish;
1245 }
1246
1247 if (arg_directory) {
1248 char *p;
1249
1250 p = path_make_absolute_cwd(arg_directory);
1251 free(arg_directory);
1252 arg_directory = p;
1253 } else
1254 arg_directory = get_current_dir_name();
1255
1256 if (!arg_directory) {
1257 log_error("Failed to determine path, please use -D.");
1258 goto finish;
1259 }
1260
1261 path_kill_slashes(arg_directory);
1262
1263 if (!arg_machine) {
1264 arg_machine = strdup(path_get_file_name(arg_directory));
1265 if (!arg_machine) {
1266 log_oom();
1267 goto finish;
1268 }
1269
1270 hostname_cleanup(arg_machine, false);
1271 if (isempty(arg_machine)) {
1272 log_error("Failed to determine machine name automatically, please use -M.");
1273 goto finish;
1274 }
1275 }
1276
1277 if (geteuid() != 0) {
1278 log_error("Need to be root.");
1279 goto finish;
1280 }
1281
1282 if (sd_booted() <= 0) {
1283 log_error("Not running on a systemd system.");
1284 goto finish;
1285 }
1286
1287 if (path_equal(arg_directory, "/")) {
1288 log_error("Spawning container on root directory not supported.");
1289 goto finish;
1290 }
1291
1292 if (path_is_os_tree(arg_directory) <= 0) {
1293 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1294 goto finish;
1295 }
1296
1297 log_close();
1298 n_fd_passed = sd_listen_fds(false);
1299 if (n_fd_passed > 0) {
1300 k = fdset_new_listen_fds(&fds, false);
1301 if (k < 0) {
1302 log_error("Failed to collect file descriptors: %s", strerror(-k));
1303 goto finish;
1304 }
1305 }
1306 fdset_close_others(fds);
1307 log_open();
1308
1309 k = cg_get_machine_path(arg_machine, &newcg);
1310 if (k < 0) {
1311 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1312 goto finish;
1313 }
1314
1315 k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1316 if (k <= 0 && k != -ENOENT) {
1317 log_error("Container already running.");
1318
1319 free(newcg);
1320 newcg = NULL;
1321
1322 goto finish;
1323 }
1324
1325 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1326 if (master < 0) {
1327 log_error("Failed to acquire pseudo tty: %m");
1328 goto finish;
1329 }
1330
1331 console = ptsname(master);
1332 if (!console) {
1333 log_error("Failed to determine tty name: %m");
1334 goto finish;
1335 }
1336
1337 log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1338
1339 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1340 ioctl(master, TIOCSWINSZ, &ws);
1341
1342 if (unlockpt(master) < 0) {
1343 log_error("Failed to unlock tty: %m");
1344 goto finish;
1345 }
1346
1347 if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1348 saved_attr_valid = true;
1349
1350 raw_attr = saved_attr;
1351 cfmakeraw(&raw_attr);
1352 raw_attr.c_lflag &= ~ECHO;
1353 }
1354
1355 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1356 log_error("Failed to create kmsg socket pair.");
1357 goto finish;
1358 }
1359
1360 sd_notify(0, "READY=1");
1361
1362 assert_se(sigemptyset(&mask) == 0);
1363 sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1364 assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1365
1366 for (;;) {
1367 siginfo_t status;
1368 int pipefd[2], pipefd2[2];
1369
1370 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1371 log_error("pipe2(): %m");
1372 goto finish;
1373 }
1374
1375 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1376 log_error("pipe2(): %m");
1377 close_pipe(pipefd);
1378 goto finish;
1379 }
1380
1381 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1382 if (pid < 0) {
1383 if (errno == EINVAL)
1384 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1385 else
1386 log_error("clone() failed: %m");
1387
1388 goto finish;
1389 }
1390
1391 if (pid == 0) {
1392 /* child */
1393 const char *home = NULL;
1394 uid_t uid = (uid_t) -1;
1395 gid_t gid = (gid_t) -1;
1396 unsigned n_env = 2;
1397 const char *envp[] = {
1398 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1399 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1400 NULL, /* TERM */
1401 NULL, /* HOME */
1402 NULL, /* USER */
1403 NULL, /* LOGNAME */
1404 NULL, /* container_uuid */
1405 NULL, /* LISTEN_FDS */
1406 NULL, /* LISTEN_PID */
1407 NULL
1408 };
1409
1410 envp[n_env] = strv_find_prefix(environ, "TERM=");
1411 if (envp[n_env])
1412 n_env ++;
1413
1414 /* Wait for the parent process to log our PID */
1415 close_nointr_nofail(pipefd[1]);
1416 fd_wait_for_event(pipefd[0], POLLHUP, -1);
1417 close_nointr_nofail(pipefd[0]);
1418
1419 close_nointr_nofail(master);
1420 master = -1;
1421
1422 if (saved_attr_valid) {
1423 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1424 log_error("Failed to set terminal attributes: %m");
1425 goto child_fail;
1426 }
1427 }
1428
1429 close_nointr(STDIN_FILENO);
1430 close_nointr(STDOUT_FILENO);
1431 close_nointr(STDERR_FILENO);
1432
1433 close_nointr_nofail(kmsg_socket_pair[0]);
1434 kmsg_socket_pair[0] = -1;
1435
1436 reset_all_signal_handlers();
1437
1438 assert_se(sigemptyset(&mask) == 0);
1439 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1440
1441 k = open_terminal(console, O_RDWR);
1442 if (k != STDIN_FILENO) {
1443 if (k >= 0) {
1444 close_nointr_nofail(k);
1445 k = -EINVAL;
1446 }
1447
1448 log_error("Failed to open console: %s", strerror(-k));
1449 goto child_fail;
1450 }
1451
1452 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1453 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1454 log_error("Failed to duplicate console: %m");
1455 goto child_fail;
1456 }
1457
1458 if (setsid() < 0) {
1459 log_error("setsid() failed: %m");
1460 goto child_fail;
1461 }
1462
1463 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1464 log_error("PR_SET_PDEATHSIG failed: %m");
1465 goto child_fail;
1466 }
1467
1468 if (setup_cgroup(newcg) < 0)
1469 goto child_fail;
1470
1471 close_pipe(pipefd2);
1472
1473 /* Mark everything as slave, so that we still
1474 * receive mounts from the real root, but don't
1475 * propagate mounts to the real root. */
1476 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1477 log_error("MS_SLAVE|MS_REC failed: %m");
1478 goto child_fail;
1479 }
1480
1481 /* Turn directory into bind mount */
1482 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1483 log_error("Failed to make bind mount.");
1484 goto child_fail;
1485 }
1486
1487 if (arg_read_only)
1488 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1489 log_error("Failed to make read-only.");
1490 goto child_fail;
1491 }
1492
1493 if (mount_all(arg_directory) < 0)
1494 goto child_fail;
1495
1496 if (copy_devnodes(arg_directory) < 0)
1497 goto child_fail;
1498
1499 if (setup_ptmx(arg_directory) < 0)
1500 goto child_fail;
1501
1502 dev_setup(arg_directory);
1503
1504 if (setup_dev_console(arg_directory, console) < 0)
1505 goto child_fail;
1506
1507 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1508 goto child_fail;
1509
1510 close_nointr_nofail(kmsg_socket_pair[1]);
1511 kmsg_socket_pair[1] = -1;
1512
1513 if (setup_boot_id(arg_directory) < 0)
1514 goto child_fail;
1515
1516 if (setup_timezone(arg_directory) < 0)
1517 goto child_fail;
1518
1519 if (setup_resolv_conf(arg_directory) < 0)
1520 goto child_fail;
1521
1522 if (setup_journal(arg_directory) < 0)
1523 goto child_fail;
1524
1525 if (mount_binds(arg_directory, arg_bind, 0) < 0)
1526 goto child_fail;
1527
1528 if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1529 goto child_fail;
1530
1531 if (chdir(arg_directory) < 0) {
1532 log_error("chdir(%s) failed: %m", arg_directory);
1533 goto child_fail;
1534 }
1535
1536 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1537 log_error("mount(MS_MOVE) failed: %m");
1538 goto child_fail;
1539 }
1540
1541 if (chroot(".") < 0) {
1542 log_error("chroot() failed: %m");
1543 goto child_fail;
1544 }
1545
1546 if (chdir("/") < 0) {
1547 log_error("chdir() failed: %m");
1548 goto child_fail;
1549 }
1550
1551 umask(0022);
1552
1553 loopback_setup();
1554
1555 if (drop_capabilities() < 0) {
1556 log_error("drop_capabilities() failed: %m");
1557 goto child_fail;
1558 }
1559
1560 if (arg_user) {
1561
1562 /* Note that this resolves user names
1563 * inside the container, and hence
1564 * accesses the NSS modules from the
1565 * container and not the host. This is
1566 * a bit weird... */
1567
1568 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1569 log_error("get_user_creds() failed: %m");
1570 goto child_fail;
1571 }
1572
1573 if (mkdir_parents_label(home, 0775) < 0) {
1574 log_error("mkdir_parents_label() failed: %m");
1575 goto child_fail;
1576 }
1577
1578 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1579 log_error("mkdir_safe_label() failed: %m");
1580 goto child_fail;
1581 }
1582
1583 if (initgroups((const char*)arg_user, gid) < 0) {
1584 log_error("initgroups() failed: %m");
1585 goto child_fail;
1586 }
1587
1588 if (setresgid(gid, gid, gid) < 0) {
1589 log_error("setregid() failed: %m");
1590 goto child_fail;
1591 }
1592
1593 if (setresuid(uid, uid, uid) < 0) {
1594 log_error("setreuid() failed: %m");
1595 goto child_fail;
1596 }
1597 } else {
1598 /* Reset everything fully to 0, just in case */
1599
1600 if (setgroups(0, NULL) < 0) {
1601 log_error("setgroups() failed: %m");
1602 goto child_fail;
1603 }
1604
1605 if (setresgid(0, 0, 0) < 0) {
1606 log_error("setregid() failed: %m");
1607 goto child_fail;
1608 }
1609
1610 if (setresuid(0, 0, 0) < 0) {
1611 log_error("setreuid() failed: %m");
1612 goto child_fail;
1613 }
1614 }
1615
1616 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1617 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1618 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1619 log_oom();
1620 goto child_fail;
1621 }
1622
1623 if (arg_uuid) {
1624 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1625 log_oom();
1626 goto child_fail;
1627 }
1628 }
1629
1630 if (fdset_size(fds) > 0) {
1631 k = fdset_cloexec(fds, false);
1632 if (k < 0) {
1633 log_error("Failed to unset O_CLOEXEC for file descriptors.");
1634 goto child_fail;
1635 }
1636
1637 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1638 (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1639 log_oom();
1640 goto child_fail;
1641 }
1642 }
1643
1644 setup_hostname();
1645
1646 if (arg_boot) {
1647 char **a;
1648 size_t l;
1649
1650 /* Automatically search for the init system */
1651
1652 l = 1 + argc - optind;
1653 a = newa(char*, l + 1);
1654 memcpy(a + 1, argv + optind, l * sizeof(char*));
1655
1656 a[0] = (char*) "/usr/lib/systemd/systemd";
1657 execve(a[0], a, (char**) envp);
1658
1659 a[0] = (char*) "/lib/systemd/systemd";
1660 execve(a[0], a, (char**) envp);
1661
1662 a[0] = (char*) "/sbin/init";
1663 execve(a[0], a, (char**) envp);
1664 } else if (argc > optind)
1665 execvpe(argv[optind], argv + optind, (char**) envp);
1666 else {
1667 chdir(home ? home : "/root");
1668 execle("/bin/bash", "-bash", NULL, (char**) envp);
1669 }
1670
1671 log_error("execv() failed: %m");
1672
1673 child_fail:
1674 _exit(EXIT_FAILURE);
1675 }
1676
1677 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1678 close_nointr_nofail(pipefd[0]);
1679 close_nointr_nofail(pipefd[1]);
1680
1681 /* Wait for the child process to establish cgroup hierarchy */
1682 close_nointr_nofail(pipefd2[1]);
1683 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1684 close_nointr_nofail(pipefd2[0]);
1685
1686 save_attributes(newcg, pid, arg_uuid, arg_directory);
1687
1688 fdset_free(fds);
1689 fds = NULL;
1690
1691 if (process_pty(master, pid, &mask) < 0)
1692 goto finish;
1693
1694 if (saved_attr_valid)
1695 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1696
1697 k = wait_for_terminate(pid, &status);
1698 if (k < 0) {
1699 r = EXIT_FAILURE;
1700 break;
1701 }
1702
1703 if (status.si_code == CLD_EXITED) {
1704 r = status.si_status;
1705 if (status.si_status != 0) {
1706 log_error("Container failed with error code %i.", status.si_status);
1707 break;
1708 }
1709
1710 log_debug("Container exited successfully.");
1711 break;
1712 } else if (status.si_code == CLD_KILLED &&
1713 status.si_status == SIGINT) {
1714 log_info("Container has been shut down.");
1715 r = 0;
1716 break;
1717 } else if (status.si_code == CLD_KILLED &&
1718 status.si_status == SIGHUP) {
1719 log_info("Container is being rebooted.");
1720 continue;
1721 } else if (status.si_code == CLD_KILLED ||
1722 status.si_code == CLD_DUMPED) {
1723
1724 log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1725 r = EXIT_FAILURE;
1726 break;
1727 } else {
1728 log_error("Container failed due to unknown reason.");
1729 r = EXIT_FAILURE;
1730 break;
1731 }
1732 }
1733
1734 finish:
1735 if (saved_attr_valid)
1736 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1737
1738 close_pipe(kmsg_socket_pair);
1739
1740 if (newcg)
1741 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1742
1743 free(arg_directory);
1744 free(arg_machine);
1745 strv_free(arg_controllers);
1746
1747 fdset_free(fds);
1748
1749 return r;
1750 }