]> git.proxmox.com Git - systemd.git/blame - src/nspawn/nspawn.c
Imported Upstream version 227
[systemd.git] / src / nspawn / nspawn.c
CommitLineData
663996b3
MS
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
d9dfd233
MP
22#ifdef HAVE_BLKID
23#include <blkid/blkid.h>
24#endif
663996b3 25#include <errno.h>
663996b3 26#include <getopt.h>
60f067b4 27#include <linux/loop.h>
d9dfd233 28#include <sched.h>
60f067b4
JS
29#ifdef HAVE_SECCOMP
30#include <seccomp.h>
31#endif
d9dfd233
MP
32#ifdef HAVE_SELINUX
33#include <selinux/selinux.h>
60f067b4 34#endif
d9dfd233
MP
35#include <signal.h>
36#include <stdio.h>
37#include <stdlib.h>
38#include <string.h>
39#include <sys/file.h>
40#include <sys/mount.h>
41#include <sys/personality.h>
42#include <sys/prctl.h>
43#include <sys/types.h>
44#include <unistd.h>
60f067b4
JS
45
46#include "sd-daemon.h"
60f067b4 47#include "sd-id128.h"
d9dfd233
MP
48
49#include "barrier.h"
50#include "base-filesystem.h"
51#include "blkid-util.h"
52#include "btrfs-util.h"
d9dfd233
MP
53#include "cap-list.h"
54#include "capability.h"
663996b3 55#include "cgroup-util.h"
d9dfd233 56#include "copy.h"
663996b3 57#include "dev-setup.h"
d9dfd233
MP
58#include "env-util.h"
59#include "event-util.h"
663996b3 60#include "fdset.h"
663996b3 61#include "fileio.h"
d9dfd233 62#include "formats-util.h"
60f067b4 63#include "gpt.h"
d9dfd233
MP
64#include "hostname-util.h"
65#include "log.h"
66#include "loopback-setup.h"
e735f4d4 67#include "machine-image.h"
d9dfd233
MP
68#include "macro.h"
69#include "missing.h"
70#include "mkdir.h"
71#include "netlink-util.h"
72#include "path-util.h"
e3bff60a 73#include "process-util.h"
d9dfd233
MP
74#include "ptyfwd.h"
75#include "random-util.h"
76#include "rm-rf.h"
60f067b4
JS
77#ifdef HAVE_SECCOMP
78#include "seccomp-util.h"
663996b3 79#endif
d9dfd233
MP
80#include "signal-util.h"
81#include "strv.h"
82#include "terminal-util.h"
83#include "udev-util.h"
84#include "util.h"
663996b3 85
6300502b
MP
86#include "nspawn-cgroup.h"
87#include "nspawn-expose-ports.h"
d9dfd233
MP
88#include "nspawn-mount.h"
89#include "nspawn-network.h"
d9dfd233 90#include "nspawn-register.h"
6300502b 91#include "nspawn-settings.h"
d9dfd233 92#include "nspawn-setuid.h"
e735f4d4 93
60f067b4
JS
/* The two container outcomes distinguished in this file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* first enumerator, value 0 */
        CONTAINER_REBOOTED      /* value 1 */
} ContainerStatus;
98
663996b3
MS
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default of arg_link_journal */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest / -j */
} LinkJournal;
105
106static char *arg_directory = NULL;
e735f4d4 107static char *arg_template = NULL;
663996b3 108static char *arg_user = NULL;
14228c0d 109static sd_id128_t arg_uuid = {};
663996b3 110static char *arg_machine = NULL;
60f067b4
JS
111static const char *arg_selinux_context = NULL;
112static const char *arg_selinux_apifs_context = NULL;
14228c0d 113static const char *arg_slice = NULL;
663996b3
MS
114static bool arg_private_network = false;
115static bool arg_read_only = false;
116static bool arg_boot = false;
e735f4d4 117static bool arg_ephemeral = false;
663996b3 118static LinkJournal arg_link_journal = LINK_AUTO;
f47781d8 119static bool arg_link_journal_try = false;
663996b3
MS
120static uint64_t arg_retain =
121 (1ULL << CAP_CHOWN) |
122 (1ULL << CAP_DAC_OVERRIDE) |
123 (1ULL << CAP_DAC_READ_SEARCH) |
124 (1ULL << CAP_FOWNER) |
125 (1ULL << CAP_FSETID) |
126 (1ULL << CAP_IPC_OWNER) |
127 (1ULL << CAP_KILL) |
128 (1ULL << CAP_LEASE) |
129 (1ULL << CAP_LINUX_IMMUTABLE) |
130 (1ULL << CAP_NET_BIND_SERVICE) |
131 (1ULL << CAP_NET_BROADCAST) |
132 (1ULL << CAP_NET_RAW) |
133 (1ULL << CAP_SETGID) |
134 (1ULL << CAP_SETFCAP) |
135 (1ULL << CAP_SETPCAP) |
136 (1ULL << CAP_SETUID) |
137 (1ULL << CAP_SYS_ADMIN) |
138 (1ULL << CAP_SYS_CHROOT) |
139 (1ULL << CAP_SYS_NICE) |
140 (1ULL << CAP_SYS_PTRACE) |
141 (1ULL << CAP_SYS_TTY_CONFIG) |
142 (1ULL << CAP_SYS_RESOURCE) |
143 (1ULL << CAP_SYS_BOOT) |
144 (1ULL << CAP_AUDIT_WRITE) |
60f067b4
JS
145 (1ULL << CAP_AUDIT_CONTROL) |
146 (1ULL << CAP_MKNOD);
e3bff60a
MP
147static CustomMount *arg_custom_mounts = NULL;
148static unsigned arg_n_custom_mounts = 0;
60f067b4
JS
149static char **arg_setenv = NULL;
150static bool arg_quiet = false;
151static bool arg_share_system = false;
152static bool arg_register = true;
153static bool arg_keep_unit = false;
154static char **arg_network_interfaces = NULL;
155static char **arg_network_macvlan = NULL;
e735f4d4 156static char **arg_network_ipvlan = NULL;
60f067b4 157static bool arg_network_veth = false;
d9dfd233 158static char *arg_network_bridge = NULL;
e3bff60a 159static unsigned long arg_personality = PERSONALITY_INVALID;
e735f4d4 160static char *arg_image = NULL;
d9dfd233 161static VolatileMode arg_volatile_mode = VOLATILE_NO;
e735f4d4 162static ExposePort *arg_expose_ports = NULL;
e3bff60a
MP
163static char **arg_property = NULL;
164static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
165static bool arg_userns = false;
166static int arg_kill_signal = 0;
d9dfd233
MP
167static bool arg_unified_cgroup_hierarchy = false;
168static SettingsMask arg_settings_mask = 0;
169static int arg_settings_trusted = -1;
170static char **arg_parameters = NULL;
663996b3 171
5eef597e 172static void help(void) {
663996b3
MS
173 printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174 "Spawn a minimal namespace container for debugging, testing and building.\n\n"
60f067b4
JS
175 " -h --help Show this help\n"
176 " --version Print version string\n"
177 " -q --quiet Do not show status information\n"
178 " -D --directory=PATH Root directory for the container\n"
e735f4d4
MP
179 " --template=PATH Initialize root directory from template directory,\n"
180 " if missing\n"
181 " -x --ephemeral Run container with snapshot of root directory, and\n"
182 " remove it after exit\n"
183 " -i --image=PATH File system device or disk image for the container\n"
60f067b4
JS
184 " -b --boot Boot up full system (i.e. invoke init)\n"
185 " -u --user=USER Run the command under specified user or uid\n"
186 " -M --machine=NAME Set the machine name for the container\n"
187 " --uuid=UUID Set a specific machine UUID for the container\n"
188 " -S --slice=SLICE Place the container in the specified slice\n"
e3bff60a
MP
189 " --property=NAME=VALUE Set scope unit property\n"
190 " --private-users[=UIDBASE[:NUIDS]]\n"
191 " Run within user namespace\n"
60f067b4
JS
192 " --private-network Disable network in container\n"
193 " --network-interface=INTERFACE\n"
194 " Assign an existing network interface to the\n"
195 " container\n"
196 " --network-macvlan=INTERFACE\n"
197 " Create a macvlan network interface based on an\n"
198 " existing network interface to the container\n"
e735f4d4
MP
199 " --network-ipvlan=INTERFACE\n"
200 " Create a ipvlan network interface based on an\n"
201 " existing network interface to the container\n"
202 " -n --network-veth Add a virtual ethernet connection between host\n"
60f067b4
JS
203 " and container\n"
204 " --network-bridge=INTERFACE\n"
205 " Add a virtual ethernet connection between host\n"
206 " and container and add it to an existing bridge on\n"
207 " the host\n"
e735f4d4
MP
208 " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
209 " Expose a container IP port on the host\n"
60f067b4
JS
210 " -Z --selinux-context=SECLABEL\n"
211 " Set the SELinux security context to be used by\n"
212 " processes in the container\n"
213 " -L --selinux-apifs-context=SECLABEL\n"
214 " Set the SELinux security context to be used by\n"
215 " API/tmpfs file systems in the container\n"
216 " --capability=CAP In addition to the default, retain specified\n"
217 " capability\n"
218 " --drop-capability=CAP Drop the specified capability from the default set\n"
e3bff60a 219 " --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
f47781d8
MP
220 " --link-journal=MODE Link up guest journal, one of no, auto, guest, host,\n"
221 " try-guest, try-host\n"
222 " -j Equivalent to --link-journal=try-guest\n"
60f067b4 223 " --read-only Mount the root directory read-only\n"
d9dfd233
MP
224 " --bind=PATH[:PATH[:OPTIONS]]\n"
225 " Bind mount a file or directory from the host into\n"
60f067b4 226 " the container\n"
d9dfd233
MP
227 " --bind-ro=PATH[:PATH[:OPTIONS]\n"
228 " Similar, but creates a read-only bind mount\n"
60f067b4 229 " --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
e3bff60a
MP
230 " --overlay=PATH[:PATH...]:PATH\n"
231 " Create an overlay mount from the host to \n"
232 " the container\n"
233 " --overlay-ro=PATH[:PATH...]:PATH\n"
234 " Similar, but creates a read-only overlay mount\n"
60f067b4
JS
235 " --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
236 " --share-system Share system namespaces with host\n"
237 " --register=BOOLEAN Register container as machine\n"
238 " --keep-unit Do not register a scope for the machine, reuse\n"
5eef597e 239 " the service unit nspawn is running in\n"
e735f4d4 240 " --volatile[=MODE] Run the system in volatile mode\n"
d9dfd233 241 " --settings=BOOLEAN Load additional settings from .nspawn file\n"
e735f4d4
MP
242 , program_invocation_short_name);
243}
244
e3bff60a
MP
245
246static int custom_mounts_prepare(void) {
247 unsigned i;
248 int r;
249
250 /* Ensure the mounts are applied prefix first. */
251 qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
252
253 /* Allocate working directories for the overlay file systems that need it */
254 for (i = 0; i < arg_n_custom_mounts; i++) {
255 CustomMount *m = &arg_custom_mounts[i];
256
fb183854
MP
257 if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) {
258 log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
259 return -EINVAL;
260 }
261
e3bff60a
MP
262 if (m->type != CUSTOM_MOUNT_OVERLAY)
263 continue;
264
265 if (m->work_dir)
266 continue;
267
268 if (m->read_only)
269 continue;
270
86f210e9 271 r = tempfn_random(m->source, NULL, &m->work_dir);
e3bff60a
MP
272 if (r < 0)
273 return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
274 }
275
276 return 0;
277}
278
e735f4d4
MP
/* Replaces *b with a cleaned-up version of 'path'.
 *
 * The path is canonicalized if possible. If that fails with ENOENT it is instead made absolute
 * relative to the current working directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -errno if canonicalization failed for any reason other than ENOENT, and
 * -ENOMEM if the fallback yields no path. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* The path does not exist, so settle for an absolute version of it */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
299
d9dfd233
MP
300static int detect_unified_cgroup_hierarchy(void) {
301 const char *e;
302 int r;
303
304 /* Allow the user to control whether the unified hierarchy is used */
305 e = getenv("UNIFIED_CGROUP_HIERARCHY");
306 if (e) {
307 r = parse_boolean(e);
308 if (r < 0)
309 return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
310
311 arg_unified_cgroup_hierarchy = r;
312 return 0;
313 }
314
315 /* Otherwise inherit the default from the host system */
316 r = cg_unified();
317 if (r < 0)
318 return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
319
320 arg_unified_cgroup_hierarchy = r;
321 return 0;
322}
323
663996b3
MS
324static int parse_argv(int argc, char *argv[]) {
325
326 enum {
327 ARG_VERSION = 0x100,
328 ARG_PRIVATE_NETWORK,
329 ARG_UUID,
330 ARG_READ_ONLY,
331 ARG_CAPABILITY,
60f067b4 332 ARG_DROP_CAPABILITY,
663996b3
MS
333 ARG_LINK_JOURNAL,
334 ARG_BIND,
60f067b4
JS
335 ARG_BIND_RO,
336 ARG_TMPFS,
e3bff60a
MP
337 ARG_OVERLAY,
338 ARG_OVERLAY_RO,
60f067b4
JS
339 ARG_SETENV,
340 ARG_SHARE_SYSTEM,
341 ARG_REGISTER,
342 ARG_KEEP_UNIT,
343 ARG_NETWORK_INTERFACE,
344 ARG_NETWORK_MACVLAN,
e735f4d4 345 ARG_NETWORK_IPVLAN,
60f067b4
JS
346 ARG_NETWORK_BRIDGE,
347 ARG_PERSONALITY,
5eef597e 348 ARG_VOLATILE,
e735f4d4 349 ARG_TEMPLATE,
e3bff60a
MP
350 ARG_PROPERTY,
351 ARG_PRIVATE_USERS,
352 ARG_KILL_SIGNAL,
d9dfd233 353 ARG_SETTINGS,
663996b3
MS
354 };
355
356 static const struct option options[] = {
60f067b4
JS
357 { "help", no_argument, NULL, 'h' },
358 { "version", no_argument, NULL, ARG_VERSION },
359 { "directory", required_argument, NULL, 'D' },
e735f4d4
MP
360 { "template", required_argument, NULL, ARG_TEMPLATE },
361 { "ephemeral", no_argument, NULL, 'x' },
60f067b4
JS
362 { "user", required_argument, NULL, 'u' },
363 { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK },
364 { "boot", no_argument, NULL, 'b' },
365 { "uuid", required_argument, NULL, ARG_UUID },
366 { "read-only", no_argument, NULL, ARG_READ_ONLY },
367 { "capability", required_argument, NULL, ARG_CAPABILITY },
368 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
369 { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL },
370 { "bind", required_argument, NULL, ARG_BIND },
371 { "bind-ro", required_argument, NULL, ARG_BIND_RO },
372 { "tmpfs", required_argument, NULL, ARG_TMPFS },
e3bff60a
MP
373 { "overlay", required_argument, NULL, ARG_OVERLAY },
374 { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
60f067b4
JS
375 { "machine", required_argument, NULL, 'M' },
376 { "slice", required_argument, NULL, 'S' },
377 { "setenv", required_argument, NULL, ARG_SETENV },
378 { "selinux-context", required_argument, NULL, 'Z' },
379 { "selinux-apifs-context", required_argument, NULL, 'L' },
380 { "quiet", no_argument, NULL, 'q' },
381 { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM },
382 { "register", required_argument, NULL, ARG_REGISTER },
383 { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT },
384 { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE },
385 { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN },
e735f4d4
MP
386 { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN },
387 { "network-veth", no_argument, NULL, 'n' },
60f067b4
JS
388 { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE },
389 { "personality", required_argument, NULL, ARG_PERSONALITY },
390 { "image", required_argument, NULL, 'i' },
5eef597e 391 { "volatile", optional_argument, NULL, ARG_VOLATILE },
e735f4d4 392 { "port", required_argument, NULL, 'p' },
e3bff60a
MP
393 { "property", required_argument, NULL, ARG_PROPERTY },
394 { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
395 { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
d9dfd233 396 { "settings", required_argument, NULL, ARG_SETTINGS },
60f067b4 397 {}
663996b3
MS
398 };
399
14228c0d 400 int c, r;
60f067b4 401 uint64_t plus = 0, minus = 0;
d9dfd233 402 bool mask_all_settings = false, mask_no_settings = false;
663996b3
MS
403
404 assert(argc >= 0);
405 assert(argv);
406
e735f4d4 407 while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
663996b3
MS
408
409 switch (c) {
410
411 case 'h':
5eef597e
MP
412 help();
413 return 0;
663996b3
MS
414
415 case ARG_VERSION:
6300502b 416 return version();
663996b3
MS
417
418 case 'D':
e735f4d4
MP
419 r = set_sanitized_path(&arg_directory, optarg);
420 if (r < 0)
421 return log_error_errno(r, "Invalid root directory: %m");
422
423 break;
424
425 case ARG_TEMPLATE:
426 r = set_sanitized_path(&arg_template, optarg);
427 if (r < 0)
428 return log_error_errno(r, "Invalid template directory: %m");
663996b3
MS
429
430 break;
431
60f067b4 432 case 'i':
e735f4d4
MP
433 r = set_sanitized_path(&arg_image, optarg);
434 if (r < 0)
435 return log_error_errno(r, "Invalid image path: %m");
436
437 break;
438
439 case 'x':
440 arg_ephemeral = true;
60f067b4
JS
441 break;
442
663996b3 443 case 'u':
5fd56512
MP
444 r = free_and_strdup(&arg_user, optarg);
445 if (r < 0)
663996b3
MS
446 return log_oom();
447
d9dfd233 448 arg_settings_mask |= SETTING_USER;
663996b3
MS
449 break;
450
60f067b4 451 case ARG_NETWORK_BRIDGE:
d9dfd233
MP
452 r = free_and_strdup(&arg_network_bridge, optarg);
453 if (r < 0)
454 return log_oom();
60f067b4
JS
455
456 /* fall through */
457
e735f4d4 458 case 'n':
60f067b4
JS
459 arg_network_veth = true;
460 arg_private_network = true;
d9dfd233 461 arg_settings_mask |= SETTING_NETWORK;
60f067b4
JS
462 break;
463
464 case ARG_NETWORK_INTERFACE:
465 if (strv_extend(&arg_network_interfaces, optarg) < 0)
466 return log_oom();
467
468 arg_private_network = true;
d9dfd233 469 arg_settings_mask |= SETTING_NETWORK;
60f067b4
JS
470 break;
471
472 case ARG_NETWORK_MACVLAN:
473 if (strv_extend(&arg_network_macvlan, optarg) < 0)
474 return log_oom();
475
e735f4d4 476 arg_private_network = true;
d9dfd233 477 arg_settings_mask |= SETTING_NETWORK;
e735f4d4
MP
478 break;
479
480 case ARG_NETWORK_IPVLAN:
481 if (strv_extend(&arg_network_ipvlan, optarg) < 0)
482 return log_oom();
483
60f067b4
JS
484 /* fall through */
485
663996b3
MS
486 case ARG_PRIVATE_NETWORK:
487 arg_private_network = true;
d9dfd233 488 arg_settings_mask |= SETTING_NETWORK;
663996b3
MS
489 break;
490
491 case 'b':
492 arg_boot = true;
d9dfd233 493 arg_settings_mask |= SETTING_BOOT;
663996b3
MS
494 break;
495
496 case ARG_UUID:
14228c0d
MB
497 r = sd_id128_from_string(optarg, &arg_uuid);
498 if (r < 0) {
663996b3 499 log_error("Invalid UUID: %s", optarg);
14228c0d 500 return r;
663996b3 501 }
d9dfd233
MP
502
503 arg_settings_mask |= SETTING_MACHINE_ID;
14228c0d 504 break;
663996b3 505
14228c0d 506 case 'S':
60f067b4 507 arg_slice = optarg;
663996b3
MS
508 break;
509
510 case 'M':
13d276d0
MP
511 if (isempty(optarg))
512 arg_machine = mfree(arg_machine);
513 else {
e735f4d4 514 if (!machine_name_is_valid(optarg)) {
60f067b4
JS
515 log_error("Invalid machine name: %s", optarg);
516 return -EINVAL;
517 }
518
e735f4d4
MP
519 r = free_and_strdup(&arg_machine, optarg);
520 if (r < 0)
60f067b4
JS
521 return log_oom();
522
523 break;
663996b3
MS
524 }
525
60f067b4
JS
526 case 'Z':
527 arg_selinux_context = optarg;
528 break;
663996b3 529
60f067b4
JS
530 case 'L':
531 arg_selinux_apifs_context = optarg;
663996b3
MS
532 break;
533
534 case ARG_READ_ONLY:
535 arg_read_only = true;
d9dfd233 536 arg_settings_mask |= SETTING_READ_ONLY;
663996b3
MS
537 break;
538
60f067b4
JS
539 case ARG_CAPABILITY:
540 case ARG_DROP_CAPABILITY: {
5eef597e 541 const char *state, *word;
663996b3
MS
542 size_t length;
543
544 FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
60f067b4 545 _cleanup_free_ char *t;
663996b3
MS
546
547 t = strndup(word, length);
548 if (!t)
549 return log_oom();
550
60f067b4
JS
551 if (streq(t, "all")) {
552 if (c == ARG_CAPABILITY)
553 plus = (uint64_t) -1;
554 else
555 minus = (uint64_t) -1;
556 } else {
f47781d8
MP
557 int cap;
558
559 cap = capability_from_name(t);
560 if (cap < 0) {
60f067b4
JS
561 log_error("Failed to parse capability %s.", t);
562 return -EINVAL;
563 }
663996b3 564
60f067b4
JS
565 if (c == ARG_CAPABILITY)
566 plus |= 1ULL << (uint64_t) cap;
567 else
568 minus |= 1ULL << (uint64_t) cap;
569 }
663996b3
MS
570 }
571
d9dfd233 572 arg_settings_mask |= SETTING_CAPABILITY;
663996b3
MS
573 break;
574 }
575
576 case 'j':
577 arg_link_journal = LINK_GUEST;
f47781d8 578 arg_link_journal_try = true;
663996b3
MS
579 break;
580
581 case ARG_LINK_JOURNAL:
e735f4d4 582 if (streq(optarg, "auto")) {
663996b3 583 arg_link_journal = LINK_AUTO;
e735f4d4
MP
584 arg_link_journal_try = false;
585 } else if (streq(optarg, "no")) {
663996b3 586 arg_link_journal = LINK_NO;
e735f4d4
MP
587 arg_link_journal_try = false;
588 } else if (streq(optarg, "guest")) {
663996b3 589 arg_link_journal = LINK_GUEST;
e735f4d4
MP
590 arg_link_journal_try = false;
591 } else if (streq(optarg, "host")) {
663996b3 592 arg_link_journal = LINK_HOST;
e735f4d4
MP
593 arg_link_journal_try = false;
594 } else if (streq(optarg, "try-guest")) {
f47781d8
MP
595 arg_link_journal = LINK_GUEST;
596 arg_link_journal_try = true;
597 } else if (streq(optarg, "try-host")) {
598 arg_link_journal = LINK_HOST;
599 arg_link_journal_try = true;
600 } else {
663996b3
MS
601 log_error("Failed to parse link journal mode %s", optarg);
602 return -EINVAL;
603 }
604
605 break;
606
607 case ARG_BIND:
d9dfd233
MP
608 case ARG_BIND_RO:
609 r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
610 if (r < 0)
611 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
60f067b4 612
d9dfd233 613 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
60f067b4 614 break;
60f067b4 615
d9dfd233
MP
616 case ARG_TMPFS:
617 r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
618 if (r < 0)
619 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
e3bff60a 620
d9dfd233 621 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
e3bff60a 622 break;
e3bff60a
MP
623
624 case ARG_OVERLAY:
625 case ARG_OVERLAY_RO: {
626 _cleanup_free_ char *upper = NULL, *destination = NULL;
627 _cleanup_strv_free_ char **lower = NULL;
628 CustomMount *m;
629 unsigned n = 0;
630 char **i;
631
13d276d0
MP
632 r = strv_split_extract(&lower, optarg, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
633 if (r == -ENOMEM)
e3bff60a 634 return log_oom();
13d276d0
MP
635 else if (r < 0) {
636 log_error("Invalid overlay specification: %s", optarg);
637 return r;
638 }
e3bff60a
MP
639
640 STRV_FOREACH(i, lower) {
641 if (!path_is_absolute(*i)) {
642 log_error("Overlay path %s is not absolute.", *i);
643 return -EINVAL;
644 }
645
646 n++;
647 }
648
649 if (n < 2) {
650 log_error("--overlay= needs at least two colon-separated directories specified.");
651 return -EINVAL;
652 }
653
654 if (n == 2) {
655 /* If two parameters are specified,
656 * the first one is the lower, the
657 * second one the upper directory. And
fb183854
MP
658 * we'll also define the destination
659 * mount point the same as the upper. */
e3bff60a
MP
660 upper = lower[1];
661 lower[1] = NULL;
662
663 destination = strdup(upper);
664 if (!destination)
665 return log_oom();
666
667 } else {
668 upper = lower[n - 2];
669 destination = lower[n - 1];
670 lower[n - 2] = NULL;
671 }
672
d9dfd233 673 m = custom_mount_add(&arg_custom_mounts, &arg_n_custom_mounts, CUSTOM_MOUNT_OVERLAY);
e3bff60a 674 if (!m)
60f067b4
JS
675 return log_oom();
676
e3bff60a
MP
677 m->destination = destination;
678 m->source = upper;
679 m->lower = lower;
680 m->read_only = c == ARG_OVERLAY_RO;
681
682 upper = destination = NULL;
683 lower = NULL;
60f067b4 684
d9dfd233 685 arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
60f067b4
JS
686 break;
687 }
688
689 case ARG_SETENV: {
690 char **n;
691
692 if (!env_assignment_is_valid(optarg)) {
693 log_error("Environment variable assignment '%s' is not valid.", optarg);
694 return -EINVAL;
695 }
696
697 n = strv_env_set(arg_setenv, optarg);
698 if (!n)
699 return log_oom();
663996b3 700
60f067b4
JS
701 strv_free(arg_setenv);
702 arg_setenv = n;
d9dfd233
MP
703
704 arg_settings_mask |= SETTING_ENVIRONMENT;
663996b3
MS
705 break;
706 }
707
60f067b4
JS
708 case 'q':
709 arg_quiet = true;
710 break;
711
712 case ARG_SHARE_SYSTEM:
713 arg_share_system = true;
714 break;
715
716 case ARG_REGISTER:
717 r = parse_boolean(optarg);
718 if (r < 0) {
719 log_error("Failed to parse --register= argument: %s", optarg);
720 return r;
721 }
722
723 arg_register = r;
724 break;
725
726 case ARG_KEEP_UNIT:
727 arg_keep_unit = true;
728 break;
729
730 case ARG_PERSONALITY:
731
732 arg_personality = personality_from_string(optarg);
e3bff60a 733 if (arg_personality == PERSONALITY_INVALID) {
60f067b4
JS
734 log_error("Unknown or unsupported personality '%s'.", optarg);
735 return -EINVAL;
736 }
737
d9dfd233 738 arg_settings_mask |= SETTING_PERSONALITY;
60f067b4
JS
739 break;
740
5eef597e
MP
741 case ARG_VOLATILE:
742
743 if (!optarg)
d9dfd233 744 arg_volatile_mode = VOLATILE_YES;
e735f4d4 745 else {
d9dfd233 746 VolatileMode m;
e735f4d4 747
d9dfd233
MP
748 m = volatile_mode_from_string(optarg);
749 if (m < 0) {
750 log_error("Failed to parse --volatile= argument: %s", optarg);
e735f4d4 751 return -EINVAL;
d9dfd233
MP
752 } else
753 arg_volatile_mode = m;
e735f4d4
MP
754 }
755
d9dfd233
MP
756 arg_settings_mask |= SETTING_VOLATILE_MODE;
757 break;
e735f4d4 758
d9dfd233
MP
759 case 'p':
760 r = expose_port_parse(&arg_expose_ports, optarg);
761 if (r == -EEXIST)
762 return log_error_errno(r, "Duplicate port specification: %s", optarg);
763 if (r < 0)
764 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
e735f4d4 765
d9dfd233 766 arg_settings_mask |= SETTING_EXPOSE_PORTS;
e735f4d4 767 break;
e735f4d4 768
e3bff60a
MP
769 case ARG_PROPERTY:
770 if (strv_extend(&arg_property, optarg) < 0)
771 return log_oom();
772
773 break;
774
775 case ARG_PRIVATE_USERS:
776 if (optarg) {
777 _cleanup_free_ char *buffer = NULL;
778 const char *range, *shift;
779
780 range = strchr(optarg, ':');
781 if (range) {
782 buffer = strndup(optarg, range - optarg);
783 if (!buffer)
784 return log_oom();
785 shift = buffer;
786
787 range++;
788 if (safe_atou32(range, &arg_uid_range) < 0 || arg_uid_range <= 0) {
789 log_error("Failed to parse UID range: %s", range);
790 return -EINVAL;
791 }
792 } else
793 shift = optarg;
794
795 if (parse_uid(shift, &arg_uid_shift) < 0) {
796 log_error("Failed to parse UID: %s", optarg);
797 return -EINVAL;
798 }
799 }
800
801 arg_userns = true;
802 break;
803
804 case ARG_KILL_SIGNAL:
805 arg_kill_signal = signal_from_string_try_harder(optarg);
806 if (arg_kill_signal < 0) {
807 log_error("Cannot parse signal: %s", optarg);
808 return -EINVAL;
809 }
810
d9dfd233
MP
811 arg_settings_mask |= SETTING_KILL_SIGNAL;
812 break;
813
814 case ARG_SETTINGS:
815
816 /* no → do not read files
817 * yes → read files, do not override cmdline, trust only subset
818 * override → read files, override cmdline, trust only subset
819 * trusted → read files, do not override cmdline, trust all
820 */
821
822 r = parse_boolean(optarg);
823 if (r < 0) {
824 if (streq(optarg, "trusted")) {
825 mask_all_settings = false;
826 mask_no_settings = false;
827 arg_settings_trusted = true;
828
829 } else if (streq(optarg, "override")) {
830 mask_all_settings = false;
831 mask_no_settings = true;
832 arg_settings_trusted = -1;
833 } else
834 return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
835 } else if (r > 0) {
836 /* yes */
837 mask_all_settings = false;
838 mask_no_settings = false;
839 arg_settings_trusted = -1;
840 } else {
841 /* no */
842 mask_all_settings = true;
843 mask_no_settings = false;
844 arg_settings_trusted = false;
845 }
846
e3bff60a
MP
847 break;
848
663996b3
MS
849 case '?':
850 return -EINVAL;
851
852 default:
60f067b4 853 assert_not_reached("Unhandled option");
663996b3 854 }
663996b3 855
60f067b4
JS
856 if (arg_share_system)
857 arg_register = false;
858
859 if (arg_boot && arg_share_system) {
860 log_error("--boot and --share-system may not be combined.");
861 return -EINVAL;
862 }
863
864 if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
865 log_error("--keep-unit may not be used when invoked from a user session.");
866 return -EINVAL;
867 }
868
869 if (arg_directory && arg_image) {
870 log_error("--directory= and --image= may not be combined.");
871 return -EINVAL;
872 }
873
e735f4d4
MP
874 if (arg_template && arg_image) {
875 log_error("--template= and --image= may not be combined.");
876 return -EINVAL;
877 }
878
879 if (arg_template && !(arg_directory || arg_machine)) {
880 log_error("--template= needs --directory= or --machine=.");
881 return -EINVAL;
882 }
883
884 if (arg_ephemeral && arg_template) {
885 log_error("--ephemeral and --template= may not be combined.");
886 return -EINVAL;
887 }
888
889 if (arg_ephemeral && arg_image) {
890 log_error("--ephemeral and --image= may not be combined.");
891 return -EINVAL;
892 }
893
894 if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
895 log_error("--ephemeral and --link-journal= may not be combined.");
896 return -EINVAL;
897 }
898
d9dfd233
MP
899 if (arg_userns && access("/proc/self/uid_map", F_OK) < 0)
900 return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support.");
901
902 if (argc > optind) {
903 arg_parameters = strv_copy(argv + optind);
904 if (!arg_parameters)
905 return log_oom();
906
907 arg_settings_mask |= SETTING_BOOT;
908 }
909
910 /* Load all settings from .nspawn files */
911 if (mask_no_settings)
912 arg_settings_mask = 0;
913
914 /* Don't load any settings from .nspawn files */
915 if (mask_all_settings)
916 arg_settings_mask = _SETTINGS_MASK_ALL;
917
918 arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
919
920 r = detect_unified_cgroup_hierarchy();
921 if (r < 0)
922 return r;
923
924 return 1;
925}
926
927static int verify_arguments(void) {
928
929 if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
5eef597e
MP
930 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
931 return -EINVAL;
932 }
933
e735f4d4
MP
934 if (arg_expose_ports && !arg_private_network) {
935 log_error("Cannot use --port= without private networking.");
936 return -EINVAL;
937 }
938
e3bff60a
MP
939 if (arg_boot && arg_kill_signal <= 0)
940 arg_kill_signal = SIGRTMIN+3;
941
d9dfd233 942 return 0;
663996b3
MS
943}
944
d9dfd233
MP
945static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
946 assert(p);
e3bff60a 947
d9dfd233
MP
948 if (!arg_userns)
949 return 0;
e3bff60a 950
d9dfd233
MP
951 if (uid == UID_INVALID && gid == GID_INVALID)
952 return 0;
e3bff60a 953
d9dfd233
MP
954 if (uid != UID_INVALID) {
955 uid += arg_uid_shift;
e3bff60a 956
d9dfd233
MP
957 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
958 return -EOVERFLOW;
e3bff60a
MP
959 }
960
961 if (gid != GID_INVALID) {
962 gid += (gid_t) arg_uid_shift;
60f067b4 963
e3bff60a
MP
964 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
965 return -EOVERFLOW;
663996b3
MS
966 }
967
e3bff60a
MP
968 if (lchown(p, uid, gid) < 0)
969 return -errno;
970
663996b3
MS
971 return 0;
972}
973
e3bff60a
MP
/* Creates directory 'path' below 'root' with the given mode and hands it to userns_lchown().
 *
 * An already existing directory counts as success and is left untouched (no ownership change).
 * Returns -errno for any other mkdir() failure, otherwise the result of userns_lchown(). */
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;

        full = prefix_roota(root, path);

        if (mkdir(full, mode) >= 0)
                return userns_lchown(full, uid, gid);

        return errno == EEXIST ? 0 : -errno;
}
986
987static int setup_timezone(const char *dest) {
988 _cleanup_free_ char *p = NULL, *q = NULL;
989 const char *where, *check, *what;
990 char *z, *y;
663996b3
MS
991 int r;
992
993 assert(dest);
994
995 /* Fix the timezone, if possible */
996 r = readlink_malloc("/etc/localtime", &p);
997 if (r < 0) {
998 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
999 return 0;
1000 }
1001
1002 z = path_startswith(p, "../usr/share/zoneinfo/");
1003 if (!z)
1004 z = path_startswith(p, "/usr/share/zoneinfo/");
1005 if (!z) {
1006 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1007 return 0;
1008 }
1009
e3bff60a 1010 where = prefix_roota(dest, "/etc/localtime");
663996b3
MS
1011 r = readlink_malloc(where, &q);
1012 if (r >= 0) {
1013 y = path_startswith(q, "../usr/share/zoneinfo/");
1014 if (!y)
1015 y = path_startswith(q, "/usr/share/zoneinfo/");
1016
663996b3
MS
1017 /* Already pointing to the right place? Then do nothing .. */
1018 if (y && streq(y, z))
1019 return 0;
1020 }
1021
e3bff60a
MP
1022 check = strjoina("/usr/share/zoneinfo/", z);
1023 check = prefix_root(dest, check);
1024 if (laccess(check, F_OK) < 0) {
663996b3
MS
1025 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1026 return 0;
1027 }
1028
5eef597e
MP
1029 r = unlink(where);
1030 if (r < 0 && errno != ENOENT) {
f47781d8 1031 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
5eef597e
MP
1032 return 0;
1033 }
1034
e3bff60a 1035 what = strjoina("../usr/share/zoneinfo/", z);
663996b3 1036 if (symlink(what, where) < 0) {
f47781d8 1037 log_error_errno(errno, "Failed to correct timezone of container: %m");
663996b3
MS
1038 return 0;
1039 }
1040
e3bff60a
MP
1041 r = userns_lchown(where, 0, 0);
1042 if (r < 0)
1043 return log_warning_errno(r, "Failed to chown /etc/localtime: %m");
1044
663996b3
MS
1045 return 0;
1046}
1047
1048static int setup_resolv_conf(const char *dest) {
e3bff60a 1049 const char *where = NULL;
5eef597e 1050 int r;
663996b3
MS
1051
1052 assert(dest);
1053
1054 if (arg_private_network)
1055 return 0;
1056
1057 /* Fix resolv.conf, if possible */
e3bff60a 1058 where = prefix_roota(dest, "/etc/resolv.conf");
5eef597e 1059
e735f4d4 1060 r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
5eef597e 1061 if (r < 0) {
86f210e9
MP
1062 /* If the file already exists as symlink, let's
1063 * suppress the warning, under the assumption that
1064 * resolved or something similar runs inside and the
1065 * symlink points there.
1066 *
1067 * If the disk image is read-only, there's also no
1068 * point in complaining.
1069 */
1070 log_full_errno(IN_SET(r, -ELOOP, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
1071 "Failed to copy /etc/resolv.conf to %s: %m", where);
5eef597e
MP
1072 return 0;
1073 }
1074
e3bff60a
MP
1075 r = userns_lchown(where, 0, 0);
1076 if (r < 0)
1077 log_warning_errno(r, "Failed to chown /etc/resolv.conf: %m");
1078
5eef597e
MP
1079 return 0;
1080}
1081
d9dfd233
MP
1082static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1083 assert(s);
e3bff60a 1084
d9dfd233
MP
1085 snprintf(s, 37,
1086 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1087 SD_ID128_FORMAT_VAL(id));
663996b3 1088
d9dfd233 1089 return s;
663996b3
MS
1090}
1091
d9dfd233
MP
1092static int setup_boot_id(const char *dest) {
1093 const char *from, *to;
1094 sd_id128_t rnd = {};
1095 char as_uuid[37];
5eef597e
MP
1096 int r;
1097
d9dfd233 1098 if (arg_share_system)
5eef597e
MP
1099 return 0;
1100
d9dfd233
MP
1101 /* Generate a new randomized boot ID, so that each boot-up of
1102 * the container gets a new one */
5eef597e 1103
d9dfd233
MP
1104 from = prefix_roota(dest, "/run/proc-sys-kernel-random-boot-id");
1105 to = prefix_roota(dest, "/proc/sys/kernel/random/boot_id");
5eef597e 1106
d9dfd233 1107 r = sd_id128_randomize(&rnd);
e3bff60a 1108 if (r < 0)
d9dfd233 1109 return log_error_errno(r, "Failed to generate random boot id: %m");
e3bff60a 1110
d9dfd233 1111 id128_format_as_uuid(rnd, as_uuid);
5eef597e 1112
d9dfd233
MP
1113 r = write_string_file(from, as_uuid, WRITE_STRING_FILE_CREATE);
1114 if (r < 0)
1115 return log_error_errno(r, "Failed to write boot id: %m");
5eef597e 1116
d9dfd233
MP
1117 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1118 r = log_error_errno(errno, "Failed to bind mount boot id: %m");
1119 else if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL) < 0)
1120 log_warning_errno(errno, "Failed to make boot id read-only: %m");
5eef597e 1121
d9dfd233
MP
1122 unlink(from);
1123 return r;
1124}
5eef597e 1125
d9dfd233 1126static int copy_devnodes(const char *dest) {
663996b3
MS
1127
1128 static const char devnodes[] =
1129 "null\0"
1130 "zero\0"
1131 "full\0"
1132 "random\0"
1133 "urandom\0"
5eef597e
MP
1134 "tty\0"
1135 "net/tun\0";
663996b3
MS
1136
1137 const char *d;
1138 int r = 0;
1139 _cleanup_umask_ mode_t u;
1140
1141 assert(dest);
1142
1143 u = umask(0000);
1144
e3bff60a
MP
1145 /* Create /dev/net, so that we can create /dev/net/tun in it */
1146 if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
1147 return log_error_errno(r, "Failed to create /dev/net directory: %m");
1148
663996b3 1149 NULSTR_FOREACH(d, devnodes) {
663996b3 1150 _cleanup_free_ char *from = NULL, *to = NULL;
60f067b4 1151 struct stat st;
663996b3 1152
60f067b4 1153 from = strappend("/dev/", d);
e3bff60a 1154 to = prefix_root(dest, from);
663996b3
MS
1155
1156 if (stat(from, &st) < 0) {
1157
f47781d8
MP
1158 if (errno != ENOENT)
1159 return log_error_errno(errno, "Failed to stat %s: %m", from);
663996b3
MS
1160
1161 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1162
e3bff60a 1163 log_error("%s is not a char or block device, cannot copy.", from);
60f067b4 1164 return -EIO;
663996b3 1165
5eef597e 1166 } else {
e3bff60a
MP
1167 if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1168 if (errno != EPERM)
1169 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1170
1171 /* Some systems abusively restrict mknod but
1172 * allow bind mounts. */
1173 r = touch(to);
1174 if (r < 0)
1175 return log_error_errno(r, "touch (%s) failed: %m", to);
1176 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
1177 return log_error_errno(errno, "Both mknod and bind mount (%s) failed: %m", to);
5eef597e 1178 }
663996b3 1179
e3bff60a
MP
1180 r = userns_lchown(to, 0, 0);
1181 if (r < 0)
1182 return log_error_errno(r, "chown() of device node %s failed: %m", to);
663996b3
MS
1183 }
1184 }
1185
1186 return r;
1187}
1188
e3bff60a
MP
1189static int setup_pts(const char *dest) {
1190 _cleanup_free_ char *options = NULL;
1191 const char *p;
1192
1193#ifdef HAVE_SELINUX
1194 if (arg_selinux_apifs_context)
1195 (void) asprintf(&options,
7035cd9e 1196 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
e3bff60a
MP
1197 arg_uid_shift + TTY_GID,
1198 arg_selinux_apifs_context);
1199 else
1200#endif
1201 (void) asprintf(&options,
7035cd9e 1202 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
e3bff60a 1203 arg_uid_shift + TTY_GID);
663996b3 1204
e3bff60a 1205 if (!options)
663996b3
MS
1206 return log_oom();
1207
e3bff60a 1208 /* Mount /dev/pts itself */
86f210e9 1209 p = prefix_roota(dest, "/dev/pts");
e3bff60a
MP
1210 if (mkdir(p, 0755) < 0)
1211 return log_error_errno(errno, "Failed to create /dev/pts: %m");
1212 if (mount("devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options) < 0)
1213 return log_error_errno(errno, "Failed to mount /dev/pts: %m");
1214 if (userns_lchown(p, 0, 0) < 0)
1215 return log_error_errno(errno, "Failed to chown /dev/pts: %m");
1216
1217 /* Create /dev/ptmx symlink */
1218 p = prefix_roota(dest, "/dev/ptmx");
f47781d8
MP
1219 if (symlink("pts/ptmx", p) < 0)
1220 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
e3bff60a
MP
1221 if (userns_lchown(p, 0, 0) < 0)
1222 return log_error_errno(errno, "Failed to chown /dev/ptmx: %m");
1223
1224 /* And fix /dev/pts/ptmx ownership */
1225 p = prefix_roota(dest, "/dev/pts/ptmx");
1226 if (userns_lchown(p, 0, 0) < 0)
1227 return log_error_errno(errno, "Failed to chown /dev/pts/ptmx: %m");
663996b3
MS
1228
1229 return 0;
1230}
1231
1232static int setup_dev_console(const char *dest, const char *console) {
60f067b4
JS
1233 _cleanup_umask_ mode_t u;
1234 const char *to;
663996b3 1235 int r;
663996b3
MS
1236
1237 assert(dest);
1238 assert(console);
1239
1240 u = umask(0000);
1241
e3bff60a 1242 r = chmod_and_chown(console, 0600, arg_uid_shift, arg_uid_shift);
f47781d8
MP
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
663996b3 1245
663996b3
MS
1246 /* We need to bind mount the right tty to /dev/console since
1247 * ptys can only exist on pts file systems. To have something
e3bff60a 1248 * to bind mount things on we create a empty regular file. */
663996b3 1249
e3bff60a
MP
1250 to = prefix_roota(dest, "/dev/console");
1251 r = touch(to);
1252 if (r < 0)
1253 return log_error_errno(r, "touch() for /dev/console failed: %m");
663996b3 1254
e3bff60a 1255 if (mount(console, to, NULL, MS_BIND, NULL) < 0)
f47781d8 1256 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
663996b3
MS
1257
1258 return 0;
1259}
1260
1261static int setup_kmsg(const char *dest, int kmsg_socket) {
e3bff60a 1262 const char *from, *to;
663996b3 1263 _cleanup_umask_ mode_t u;
6300502b 1264 int fd, r;
663996b3 1265
663996b3
MS
1266 assert(kmsg_socket >= 0);
1267
1268 u = umask(0000);
1269
e3bff60a 1270 /* We create the kmsg FIFO as /run/kmsg, but immediately
663996b3
MS
1271 * delete it after bind mounting it to /proc/kmsg. While FIFOs
1272 * on the reading side behave very similar to /proc/kmsg,
1273 * their writing side behaves differently from /dev/kmsg in
1274 * that writing blocks when nothing is reading. In order to
1275 * avoid any problems with containers deadlocking due to this
1276 * we simply make /dev/kmsg unavailable to the container. */
e3bff60a
MP
1277 from = prefix_roota(dest, "/run/kmsg");
1278 to = prefix_roota(dest, "/proc/kmsg");
663996b3 1279
f47781d8 1280 if (mkfifo(from, 0600) < 0)
e3bff60a
MP
1281 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
1282 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
f47781d8 1283 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
663996b3
MS
1284
1285 fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
f47781d8
MP
1286 if (fd < 0)
1287 return log_error_errno(errno, "Failed to open fifo: %m");
663996b3 1288
663996b3
MS
1289 /* Store away the fd in the socket, so that it stays open as
1290 * long as we run the child */
6300502b 1291 r = send_one_fd(kmsg_socket, fd, 0);
60f067b4 1292 safe_close(fd);
663996b3 1293
6300502b
MP
1294 if (r < 0)
1295 return log_error_errno(r, "Failed to send FIFO fd: %m");
663996b3 1296
e3bff60a
MP
1297 /* And now make the FIFO unavailable as /run/kmsg... */
1298 (void) unlink(from);
1299
663996b3
MS
1300 return 0;
1301}
1302
86f210e9 1303static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
e735f4d4
MP
1304 union in_addr_union *exposed = userdata;
1305
1306 assert(rtnl);
1307 assert(m);
1308 assert(exposed);
1309
d9dfd233 1310 expose_port_execute(rtnl, arg_expose_ports, exposed);
e735f4d4
MP
1311 return 0;
1312}
1313
663996b3
MS
1314static int setup_hostname(void) {
1315
60f067b4
JS
1316 if (arg_share_system)
1317 return 0;
1318
5eef597e 1319 if (sethostname_idempotent(arg_machine) < 0)
663996b3
MS
1320 return -errno;
1321
1322 return 0;
1323}
1324
1325static int setup_journal(const char *directory) {
60f067b4 1326 sd_id128_t machine_id, this_id;
e3bff60a
MP
1327 _cleanup_free_ char *b = NULL, *d = NULL;
1328 const char *etc_machine_id, *p, *q;
663996b3
MS
1329 char *id;
1330 int r;
1331
e735f4d4
MP
1332 /* Don't link journals in ephemeral mode */
1333 if (arg_ephemeral)
1334 return 0;
1335
e3bff60a 1336 etc_machine_id = prefix_roota(directory, "/etc/machine-id");
663996b3 1337
e3bff60a 1338 r = read_one_line_file(etc_machine_id, &b);
663996b3
MS
1339 if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1340 return 0;
f47781d8 1341 else if (r < 0)
e3bff60a 1342 return log_error_errno(r, "Failed to read machine ID from %s: %m", etc_machine_id);
663996b3
MS
1343
1344 id = strstrip(b);
1345 if (isempty(id) && arg_link_journal == LINK_AUTO)
1346 return 0;
1347
1348 /* Verify validity */
1349 r = sd_id128_from_string(id, &machine_id);
f47781d8 1350 if (r < 0)
e3bff60a 1351 return log_error_errno(r, "Failed to parse machine ID from %s: %m", etc_machine_id);
663996b3 1352
60f067b4 1353 r = sd_id128_get_machine(&this_id);
f47781d8
MP
1354 if (r < 0)
1355 return log_error_errno(r, "Failed to retrieve machine ID: %m");
60f067b4
JS
1356
1357 if (sd_id128_equal(machine_id, this_id)) {
1358 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1359 "Host and machine ids are equal (%s): refusing to link journals", id);
1360 if (arg_link_journal == LINK_AUTO)
1361 return 0;
e735f4d4 1362 return -EEXIST;
60f067b4
JS
1363 }
1364
1365 if (arg_link_journal == LINK_NO)
1366 return 0;
1367
e3bff60a
MP
1368 r = userns_mkdir(directory, "/var", 0755, 0, 0);
1369 if (r < 0)
1370 return log_error_errno(r, "Failed to create /var: %m");
1371
1372 r = userns_mkdir(directory, "/var/log", 0755, 0, 0);
1373 if (r < 0)
1374 return log_error_errno(r, "Failed to create /var/log: %m");
1375
1376 r = userns_mkdir(directory, "/var/log/journal", 0755, 0, 0);
1377 if (r < 0)
1378 return log_error_errno(r, "Failed to create /var/log/journal: %m");
1379
1380 p = strjoina("/var/log/journal/", id);
1381 q = prefix_roota(directory, p);
663996b3 1382
86f210e9 1383 if (path_is_mount_point(p, 0) > 0) {
663996b3
MS
1384 if (arg_link_journal != LINK_AUTO) {
1385 log_error("%s: already a mount point, refusing to use for journal", p);
1386 return -EEXIST;
1387 }
1388
1389 return 0;
1390 }
1391
86f210e9 1392 if (path_is_mount_point(q, 0) > 0) {
663996b3
MS
1393 if (arg_link_journal != LINK_AUTO) {
1394 log_error("%s: already a mount point, refusing to use for journal", q);
1395 return -EEXIST;
1396 }
1397
1398 return 0;
1399 }
1400
1401 r = readlink_and_make_absolute(p, &d);
1402 if (r >= 0) {
1403 if ((arg_link_journal == LINK_GUEST ||
1404 arg_link_journal == LINK_AUTO) &&
1405 path_equal(d, q)) {
1406
e3bff60a 1407 r = userns_mkdir(directory, p, 0755, 0, 0);
663996b3 1408 if (r < 0)
f47781d8 1409 log_warning_errno(errno, "Failed to create directory %s: %m", q);
663996b3
MS
1410 return 0;
1411 }
1412
f47781d8
MP
1413 if (unlink(p) < 0)
1414 return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
663996b3
MS
1415 } else if (r == -EINVAL) {
1416
1417 if (arg_link_journal == LINK_GUEST &&
1418 rmdir(p) < 0) {
1419
1420 if (errno == ENOTDIR) {
1421 log_error("%s already exists and is neither a symlink nor a directory", p);
1422 return r;
1423 } else {
f47781d8 1424 log_error_errno(errno, "Failed to remove %s: %m", p);
663996b3
MS
1425 return -errno;
1426 }
1427 }
1428 } else if (r != -ENOENT) {
f47781d8 1429 log_error_errno(errno, "readlink(%s) failed: %m", p);
663996b3
MS
1430 return r;
1431 }
1432
1433 if (arg_link_journal == LINK_GUEST) {
1434
1435 if (symlink(q, p) < 0) {
f47781d8
MP
1436 if (arg_link_journal_try) {
1437 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1438 return 0;
1439 } else {
1440 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1441 return -errno;
1442 }
663996b3
MS
1443 }
1444
e3bff60a 1445 r = userns_mkdir(directory, p, 0755, 0, 0);
663996b3 1446 if (r < 0)
f47781d8 1447 log_warning_errno(errno, "Failed to create directory %s: %m", q);
663996b3
MS
1448 return 0;
1449 }
1450
1451 if (arg_link_journal == LINK_HOST) {
f47781d8
MP
1452 /* don't create parents here -- if the host doesn't have
1453 * permanent journal set up, don't force it here */
1454 r = mkdir(p, 0755);
663996b3 1455 if (r < 0) {
f47781d8
MP
1456 if (arg_link_journal_try) {
1457 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1458 return 0;
1459 } else {
1460 log_error_errno(errno, "Failed to create %s: %m", p);
1461 return r;
1462 }
663996b3
MS
1463 }
1464
1465 } else if (access(p, F_OK) < 0)
1466 return 0;
1467
60f067b4
JS
1468 if (dir_is_empty(q) == 0)
1469 log_warning("%s is not empty, proceeding anyway.", q);
663996b3 1470
e3bff60a 1471 r = userns_mkdir(directory, p, 0755, 0, 0);
663996b3 1472 if (r < 0) {
f47781d8 1473 log_error_errno(errno, "Failed to create %s: %m", q);
663996b3
MS
1474 return r;
1475 }
1476
e3bff60a 1477 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
f47781d8 1478 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
60f067b4
JS
1479
1480 return 0;
1481}
1482
663996b3
MS
1483static int drop_capabilities(void) {
1484 return capability_bounding_set_drop(~arg_retain, false);
1485}
1486
60f067b4
JS
1487static int reset_audit_loginuid(void) {
1488 _cleanup_free_ char *p = NULL;
1489 int r;
1490
1491 if (arg_share_system)
1492 return 0;
1493
1494 r = read_one_line_file("/proc/self/loginuid", &p);
1495 if (r == -ENOENT)
1496 return 0;
f47781d8
MP
1497 if (r < 0)
1498 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
60f067b4
JS
1499
1500 /* Already reset? */
1501 if (streq(p, "4294967295"))
1502 return 0;
1503
7035cd9e 1504 r = write_string_file("/proc/self/loginuid", "4294967295", 0);
60f067b4 1505 if (r < 0) {
d9dfd233
MP
1506 log_error_errno(r,
1507 "Failed to reset audit login UID. This probably means that your kernel is too\n"
1508 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1509 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1510 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1511 "using systemd-nspawn. Sleeping for 5s... (%m)");
e735f4d4 1512
d9dfd233 1513 sleep(5);
e735f4d4
MP
1514 }
1515
1516 return 0;
1517}
1518
/* Installs a seccomp filter that allows everything by default but
 *   - makes the syscalls listed in the table below fail with EPERM, unless the capability
 *     they are paired with is part of arg_retain, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 * Without HAVE_SECCOMP this is a no-op returning 0. Returns 0 on success (also when
 * loading the filter fails with EINVAL), a negative errno otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const struct {
                uint64_t capability;
                int syscall_num;
        } blacklist[] = {
                { CAP_SYS_RAWIO,  SCMP_SYS(iopl)              },
                { CAP_SYS_RAWIO,  SCMP_SYS(ioperm)            },
                { CAP_SYS_BOOT,   SCMP_SYS(kexec_load)        },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapon)            },
                { CAP_SYS_ADMIN,  SCMP_SYS(swapoff)           },
                { CAP_SYS_ADMIN,  SCMP_SYS(open_by_handle_at) },
                { CAP_SYS_MODULE, SCMP_SYS(init_module)       },
                { CAP_SYS_MODULE, SCMP_SYS(finit_module)      },
                { CAP_SYS_MODULE, SCMP_SYS(delete_module)     },
                { CAP_SYSLOG,     SCMP_SYS(syslog)            },
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {

                /* Syscalls whose capability is retained stay available */
                if (arg_retain & (1ULL << blacklist[k].capability))
                        continue;

                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k].syscall_num, 0);
                if (r == -EFAULT)
                        continue; /* unknown syscall */
                if (r < 0) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers, much of the userspace audit hookup will fail if
         * running inside a container. We don't care and just turn off creation of audit
         * sockets.
         *
         * This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT which
         * audit userspace uses as indication that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r == -EINVAL) {
                /* Treated as "seccomp not available", which is not an error for us */
                log_debug_errno(r, "Kernel is probably not configured with CONFIG_SECCOMP. Disabling seccomp audit filter: %m");
                r = 0;
        } else if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1613
e735f4d4
MP
1614static int setup_propagate(const char *root) {
1615 const char *p, *q;
1616
1617 (void) mkdir_p("/run/systemd/nspawn/", 0755);
1618 (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
1619 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
1620 (void) mkdir_p(p, 0600);
1621
e3bff60a
MP
1622 if (userns_mkdir(root, "/run/systemd", 0755, 0, 0) < 0)
1623 return log_error_errno(errno, "Failed to create /run/systemd: %m");
1624
1625 if (userns_mkdir(root, "/run/systemd/nspawn", 0755, 0, 0) < 0)
1626 return log_error_errno(errno, "Failed to create /run/systemd/nspawn: %m");
1627
1628 if (userns_mkdir(root, "/run/systemd/nspawn/incoming", 0600, 0, 0) < 0)
1629 return log_error_errno(errno, "Failed to create /run/systemd/nspawn/incoming: %m");
e735f4d4 1630
e3bff60a 1631 q = prefix_roota(root, "/run/systemd/nspawn/incoming");
e735f4d4
MP
1632 if (mount(p, q, NULL, MS_BIND, NULL) < 0)
1633 return log_error_errno(errno, "Failed to install propagation bind mount.");
1634
1635 if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
1636 return log_error_errno(errno, "Failed to make propagation mount read-only");
1637
1638 return 0;
1639}
1640
60f067b4
JS
1641static int setup_image(char **device_path, int *loop_nr) {
1642 struct loop_info64 info = {
1643 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1644 };
1645 _cleanup_close_ int fd = -1, control = -1, loop = -1;
1646 _cleanup_free_ char* loopdev = NULL;
1647 struct stat st;
1648 int r, nr;
1649
1650 assert(device_path);
1651 assert(loop_nr);
e735f4d4 1652 assert(arg_image);
60f067b4
JS
1653
1654 fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
f47781d8
MP
1655 if (fd < 0)
1656 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
60f067b4 1657
f47781d8
MP
1658 if (fstat(fd, &st) < 0)
1659 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
60f067b4
JS
1660
1661 if (S_ISBLK(st.st_mode)) {
1662 char *p;
1663
1664 p = strdup(arg_image);
1665 if (!p)
1666 return log_oom();
1667
1668 *device_path = p;
1669
1670 *loop_nr = -1;
1671
1672 r = fd;
1673 fd = -1;
1674
1675 return r;
1676 }
1677
1678 if (!S_ISREG(st.st_mode)) {
f47781d8 1679 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
60f067b4
JS
1680 return -EINVAL;
1681 }
1682
1683 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
f47781d8
MP
1684 if (control < 0)
1685 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
60f067b4
JS
1686
1687 nr = ioctl(control, LOOP_CTL_GET_FREE);
f47781d8
MP
1688 if (nr < 0)
1689 return log_error_errno(errno, "Failed to allocate loop device: %m");
60f067b4
JS
1690
1691 if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1692 return log_oom();
1693
1694 loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
f47781d8
MP
1695 if (loop < 0)
1696 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
60f067b4 1697
f47781d8
MP
1698 if (ioctl(loop, LOOP_SET_FD, fd) < 0)
1699 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
60f067b4
JS
1700
1701 if (arg_read_only)
1702 info.lo_flags |= LO_FLAGS_READ_ONLY;
1703
f47781d8
MP
1704 if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
1705 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
60f067b4
JS
1706
1707 *device_path = loopdev;
1708 loopdev = NULL;
1709
1710 *loop_nr = nr;
1711
1712 r = loop;
1713 loop = -1;
1714
1715 return r;
1716}
1717
e735f4d4
MP
/* Explanatory text appended to the error messages that are logged when a disk image has
 * no usable partition table or no identifiable root partition. */
#define PARTITION_TABLE_BLURB                                                                                    \
        "Note that the disk image needs to either contain only a single MBR partition of\n"                     \
        "type 0x83 that is marked bootable, or a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        " http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n"                          \
        "to be bootable with systemd-nspawn."
1724
60f067b4
JS
/* Finds the partitions of the disk image open as block device "fd" that should be mounted
 * as root, /home and /srv.
 *
 * The partition table is probed with blkid; only GPT and MBR ("dos") tables are accepted.
 * The function then waits until the kernel has enumerated as many partition devices as
 * blkid found (re-reading the partition table if necessary) and classifies each partition:
 *   - GPT: by partition type (home, srv, native root, secondary root, generic Linux);
 *     partitions flagged GPT_FLAG_NO_AUTO are ignored. If several partitions have the
 *     same type, the one with the lowest partition number wins.
 *   - MBR: bootable partitions of type 0x83 count as generic Linux partitions.
 * The root device is the native root partition if there is one, otherwise the secondary
 * root (with *secondary set to true), otherwise the generic partition, provided there is
 * exactly one.
 *
 * On success returns 0; *root_device (and *home_device / *srv_device if such partitions
 * exist) receive newly allocated device node paths owned by the caller, and the matching
 * *_rw flags tell whether the partition may be mounted writable. On failure a negative
 * errno is returned. Without HAVE_BLKID this always fails with -EOPNOTSUPP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls below; if it is still 0 after a failure,
         * the failure is reported as out-of-memory (resp. EIO for the probe itself) */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait (for at most 10 rounds) until the kernel knows as many partitions as blkid */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The kernel's count is expected
                 * to be one higher than blkid's (see the "+ 1" below). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Of several candidates keep the one with the lowest number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* This must be stored in "generic", not "root": otherwise
                                 * the check above never triggers and images with several
                                 * bootable Linux partitions are not rejected below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  " %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          " %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2097
/* Probes the file system type of the block device "what" with libblkid and
 * mounts it with MS_NODEV on "where", or on "where" with "directory"
 * appended when "directory" is non-NULL. The mount is read-only when "rw"
 * is false or when arg_read_only is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. LUKS
 * volumes are rejected with -EOPNOTSUPP. Without blkid support the function
 * always fails with -EOPNOTSUPP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence reset it first and
         * treat "still zero" as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1 means nothing was detected, -2 means the result is
                 * ambivalent. Neither is an I/O error, the file system
                 * simply cannot be identified. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Everything else (notably -1) is a genuine probing error */
                if (errno == 0)
                        errno = EIO;

                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Pick the return value before logging, so that it does not
                 * depend on errno surviving the log call. */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -EOPNOTSUPP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -EOPNOTSUPP;
#endif
}
2161
2162static int mount_devices(
2163 const char *where,
2164 const char *root_device, bool root_device_rw,
2165 const char *home_device, bool home_device_rw,
2166 const char *srv_device, bool srv_device_rw) {
2167 int r;
2168
2169 assert(where);
2170
2171 if (root_device) {
2172 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
f47781d8
MP
2173 if (r < 0)
2174 return log_error_errno(r, "Failed to mount root directory: %m");
60f067b4
JS
2175 }
2176
2177 if (home_device) {
2178 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
f47781d8
MP
2179 if (r < 0)
2180 return log_error_errno(r, "Failed to mount home directory: %m");
60f067b4
JS
2181 }
2182
2183 if (srv_device) {
2184 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
f47781d8
MP
2185 if (r < 0)
2186 return log_error_errno(r, "Failed to mount server data directory: %m");
60f067b4
JS
2187 }
2188
2189 return 0;
2190}
2191
2192static void loop_remove(int nr, int *image_fd) {
2193 _cleanup_close_ int control = -1;
5eef597e 2194 int r;
60f067b4
JS
2195
2196 if (nr < 0)
2197 return;
2198
2199 if (image_fd && *image_fd >= 0) {
5eef597e
MP
2200 r = ioctl(*image_fd, LOOP_CLR_FD);
2201 if (r < 0)
e735f4d4 2202 log_debug_errno(errno, "Failed to close loop image: %m");
60f067b4
JS
2203 *image_fd = safe_close(*image_fd);
2204 }
2205
2206 control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
5eef597e 2207 if (control < 0) {
f47781d8 2208 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
60f067b4 2209 return;
5eef597e 2210 }
60f067b4 2211
5eef597e
MP
2212 r = ioctl(control, LOOP_CTL_REMOVE, nr);
2213 if (r < 0)
e735f4d4 2214 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
60f067b4
JS
2215}
2216
60f067b4 2217/*
e842803a
MB
2218 * Return values:
2219 * < 0 : wait_for_terminate() failed to get the state of the
2220 * container, the container was terminated by a signal, or
2221 * failed for an unknown reason. No change is made to the
2222 * container argument.
2223 * > 0 : The program executed in the container terminated with an
2224 * error. The exit code of the program executed in the
f47781d8
MP
2225 * container is returned. The container argument has been set
2226 * to CONTAINER_TERMINATED.
e842803a
MB
2227 * 0 : The container is being rebooted, has been shut down or exited
2228 * successfully. The container argument has been set to either
2229 * CONTAINER_TERMINATED or CONTAINER_REBOOTED.
60f067b4 2230 *
e842803a
MB
2231 * That is, success is indicated by a return value of zero, and an
2232 * error is indicated by a non-zero value.
60f067b4
JS
2233 */
2234static int wait_for_container(pid_t pid, ContainerStatus *container) {
60f067b4 2235 siginfo_t status;
f47781d8 2236 int r;
14228c0d 2237
60f067b4 2238 r = wait_for_terminate(pid, &status);
f47781d8
MP
2239 if (r < 0)
2240 return log_warning_errno(r, "Failed to wait for container: %m");
14228c0d 2241
60f067b4 2242 switch (status.si_code) {
f47781d8 2243
60f067b4 2244 case CLD_EXITED:
f47781d8
MP
2245 if (status.si_status == 0) {
2246 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
14228c0d 2247
f47781d8
MP
2248 } else
2249 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2250
2251 *container = CONTAINER_TERMINATED;
2252 return status.si_status;
60f067b4
JS
2253
2254 case CLD_KILLED:
2255 if (status.si_status == SIGINT) {
14228c0d 2256
f47781d8 2257 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
60f067b4 2258 *container = CONTAINER_TERMINATED;
f47781d8
MP
2259 return 0;
2260
60f067b4 2261 } else if (status.si_status == SIGHUP) {
14228c0d 2262
f47781d8 2263 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
60f067b4 2264 *container = CONTAINER_REBOOTED;
f47781d8 2265 return 0;
60f067b4 2266 }
f47781d8 2267
60f067b4
JS
2268 /* CLD_KILLED fallthrough */
2269
2270 case CLD_DUMPED:
f47781d8
MP
2271 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2272 return -EIO;
60f067b4
JS
2273
2274 default:
f47781d8
MP
2275 log_error("Container %s failed due to unknown reason.", arg_machine);
2276 return -EIO;
14228c0d 2277 }
60f067b4
JS
2278
2279 return r;
14228c0d
MB
2280}
2281
f47781d8
MP
2282static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2283 pid_t pid;
2284
2285 pid = PTR_TO_UINT32(userdata);
2286 if (pid > 0) {
e3bff60a 2287 if (kill(pid, arg_kill_signal) >= 0) {
f47781d8
MP
2288 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2289 sd_event_source_set_userdata(s, NULL);
2290 return 0;
2291 }
2292 }
2293
2294 sd_event_exit(sd_event_source_get_event(s), 0);
2295 return 0;
2296}
2297
e735f4d4
MP
2298static int determine_names(void) {
2299 int r;
663996b3 2300
13d276d0
MP
2301 if (arg_template && !arg_directory && arg_machine) {
2302
2303 /* If --template= was specified then we should not
2304 * search for a machine, but instead create a new one
2305 * in /var/lib/machine. */
2306
2307 arg_directory = strjoin("/var/lib/machines/", arg_machine, NULL);
2308 if (!arg_directory)
2309 return log_oom();
2310 }
2311
e735f4d4
MP
2312 if (!arg_image && !arg_directory) {
2313 if (arg_machine) {
2314 _cleanup_(image_unrefp) Image *i = NULL;
663996b3 2315
e735f4d4
MP
2316 r = image_find(arg_machine, &i);
2317 if (r < 0)
2318 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2319 else if (r == 0) {
2320 log_error("No image for machine '%s': %m", arg_machine);
2321 return -ENOENT;
2322 }
663996b3 2323
e735f4d4
MP
2324 if (i->type == IMAGE_RAW)
2325 r = set_sanitized_path(&arg_image, i->path);
2326 else
2327 r = set_sanitized_path(&arg_directory, i->path);
2328 if (r < 0)
2329 return log_error_errno(r, "Invalid image directory: %m");
663996b3 2330
e3bff60a
MP
2331 if (!arg_ephemeral)
2332 arg_read_only = arg_read_only || i->read_only;
60f067b4
JS
2333 } else
2334 arg_directory = get_current_dir_name();
663996b3 2335
e735f4d4
MP
2336 if (!arg_directory && !arg_machine) {
2337 log_error("Failed to determine path, please use -D or -i.");
2338 return -EINVAL;
60f067b4 2339 }
663996b3
MS
2340 }
2341
663996b3 2342 if (!arg_machine) {
e735f4d4
MP
2343 if (arg_directory && path_equal(arg_directory, "/"))
2344 arg_machine = gethostname_malloc();
2345 else
2346 arg_machine = strdup(basename(arg_image ?: arg_directory));
2347
2348 if (!arg_machine)
2349 return log_oom();
663996b3 2350
13d276d0 2351 hostname_cleanup(arg_machine);
e735f4d4 2352 if (!machine_name_is_valid(arg_machine)) {
663996b3 2353 log_error("Failed to determine machine name automatically, please use -M.");
e735f4d4
MP
2354 return -EINVAL;
2355 }
2356
2357 if (arg_ephemeral) {
2358 char *b;
2359
2360 /* Add a random suffix when this is an
2361 * ephemeral machine, so that we can run many
2362 * instances at once without manually having
2363 * to specify -M each time. */
2364
2365 if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2366 return log_oom();
2367
2368 free(arg_machine);
2369 arg_machine = b;
663996b3
MS
2370 }
2371 }
2372
e735f4d4
MP
2373 return 0;
2374}
2375
e3bff60a
MP
2376static int determine_uid_shift(const char *directory) {
2377 int r;
e735f4d4 2378
e3bff60a
MP
2379 if (!arg_userns) {
2380 arg_uid_shift = 0;
2381 return 0;
2382 }
e735f4d4 2383
e3bff60a
MP
2384 if (arg_uid_shift == UID_INVALID) {
2385 struct stat st;
e735f4d4 2386
e3bff60a
MP
2387 r = stat(directory, &st);
2388 if (r < 0)
2389 return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
e735f4d4 2390
e3bff60a 2391 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
e735f4d4 2392
e3bff60a
MP
2393 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000))) {
2394 log_error("UID and GID base of %s don't match.", directory);
2395 return -EINVAL;
2396 }
2397
2398 arg_uid_range = UINT32_C(0x10000);
663996b3
MS
2399 }
2400
e3bff60a
MP
2401 if (arg_uid_shift > (uid_t) -1 - arg_uid_range) {
2402 log_error("UID base too high for UID range.");
2403 return -EINVAL;
663996b3
MS
2404 }
2405
e3bff60a
MP
2406 log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
2407 return 0;
2408}
2409
2410static int inner_child(
2411 Barrier *barrier,
2412 const char *directory,
2413 bool secondary,
2414 int kmsg_socket,
2415 int rtnl_socket,
d9dfd233 2416 FDSet *fds) {
e3bff60a
MP
2417
2418 _cleanup_free_ char *home = NULL;
2419 unsigned n_env = 2;
2420 const char *envp[] = {
2421 "PATH=" DEFAULT_PATH_SPLIT_USR,
2422 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2423 NULL, /* TERM */
2424 NULL, /* HOME */
2425 NULL, /* USER */
2426 NULL, /* LOGNAME */
2427 NULL, /* container_uuid */
2428 NULL, /* LISTEN_FDS */
2429 NULL, /* LISTEN_PID */
2430 NULL
2431 };
2432
86f210e9 2433 _cleanup_strv_free_ char **env_use = NULL;
e3bff60a
MP
2434 int r;
2435
2436 assert(barrier);
2437 assert(directory);
2438 assert(kmsg_socket >= 0);
2439
d9dfd233
MP
2440 cg_unified_flush();
2441
e3bff60a
MP
2442 if (arg_userns) {
2443 /* Tell the parent, that it now can write the UID map. */
2444 (void) barrier_place(barrier); /* #1 */
2445
2446 /* Wait until the parent wrote the UID map */
2447 if (!barrier_place_and_sync(barrier)) { /* #2 */
2448 log_error("Parent died too early");
2449 return -ESRCH;
663996b3
MS
2450 }
2451 }
663996b3 2452
6300502b
MP
2453 r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
2454 if (r < 0)
2455 return r;
2456
2457 r = mount_sysfs(NULL);
e3bff60a
MP
2458 if (r < 0)
2459 return r;
e735f4d4 2460
e3bff60a
MP
2461 /* Wait until we are cgroup-ified, so that we
2462 * can mount the right cgroup path writable */
2463 if (!barrier_place_and_sync(barrier)) { /* #3 */
2464 log_error("Parent died too early");
2465 return -ESRCH;
2466 }
60f067b4 2467
d9dfd233 2468 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
e3bff60a
MP
2469 if (r < 0)
2470 return r;
e735f4d4 2471
e3bff60a
MP
2472 r = reset_uid_gid();
2473 if (r < 0)
2474 return log_error_errno(r, "Couldn't become new root: %m");
e735f4d4 2475
e3bff60a
MP
2476 r = setup_boot_id(NULL);
2477 if (r < 0)
2478 return r;
e735f4d4 2479
e3bff60a
MP
2480 r = setup_kmsg(NULL, kmsg_socket);
2481 if (r < 0)
2482 return r;
2483 kmsg_socket = safe_close(kmsg_socket);
e735f4d4 2484
e3bff60a 2485 umask(0022);
e735f4d4 2486
e3bff60a
MP
2487 if (setsid() < 0)
2488 return log_error_errno(errno, "setsid() failed: %m");
e735f4d4 2489
e3bff60a
MP
2490 if (arg_private_network)
2491 loopback_setup();
e735f4d4 2492
d9dfd233
MP
2493 if (arg_expose_ports) {
2494 r = expose_port_send_rtnl(rtnl_socket);
2495 if (r < 0)
2496 return r;
2497 rtnl_socket = safe_close(rtnl_socket);
2498 }
e735f4d4 2499
e3bff60a
MP
2500 if (drop_capabilities() < 0)
2501 return log_error_errno(errno, "drop_capabilities() failed: %m");
60f067b4 2502
e3bff60a
MP
2503 setup_hostname();
2504
2505 if (arg_personality != PERSONALITY_INVALID) {
2506 if (personality(arg_personality) < 0)
2507 return log_error_errno(errno, "personality() failed: %m");
2508 } else if (secondary) {
2509 if (personality(PER_LINUX32) < 0)
2510 return log_error_errno(errno, "personality() failed: %m");
2511 }
2512
2513#ifdef HAVE_SELINUX
2514 if (arg_selinux_context)
2515 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2516 return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
2517#endif
2518
d9dfd233 2519 r = change_uid_gid(arg_user, &home);
e3bff60a
MP
2520 if (r < 0)
2521 return r;
2522
2523 envp[n_env] = strv_find_prefix(environ, "TERM=");
2524 if (envp[n_env])
2525 n_env ++;
2526
2527 if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2528 (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2529 (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0))
2530 return log_oom();
2531
2532 if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2533 char as_uuid[37];
2534
2535 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0)
2536 return log_oom();
2537 }
2538
2539 if (fdset_size(fds) > 0) {
2540 r = fdset_cloexec(fds, false);
2541 if (r < 0)
2542 return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
2543
2544 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
2545 (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0))
2546 return log_oom();
2547 }
2548
86f210e9
MP
2549 env_use = strv_env_merge(2, envp, arg_setenv);
2550 if (!env_use)
2551 return log_oom();
e3bff60a
MP
2552
2553 /* Let the parent know that we are ready and
2554 * wait until the parent is ready with the
2555 * setup, too... */
2556 if (!barrier_place_and_sync(barrier)) { /* #4 */
2557 log_error("Parent died too early");
2558 return -ESRCH;
2559 }
2560
2561 /* Now, explicitly close the log, so that we
2562 * then can close all remaining fds. Closing
2563 * the log explicitly first has the benefit
2564 * that the logging subsystem knows about it,
2565 * and is thus ready to be reopened should we
2566 * need it again. Note that the other fds
2567 * closed here are at least the locking and
2568 * barrier fds. */
2569 log_close();
2570 (void) fdset_close_others(fds);
2571
2572 if (arg_boot) {
2573 char **a;
2574 size_t m;
2575
2576 /* Automatically search for the init system */
2577
d9dfd233 2578 m = 1 + strv_length(arg_parameters);
e3bff60a 2579 a = newa(char*, m + 1);
d9dfd233
MP
2580 if (strv_isempty(arg_parameters))
2581 a[1] = NULL;
2582 else
2583 memcpy(a + 1, arg_parameters, m * sizeof(char*));
e3bff60a
MP
2584
2585 a[0] = (char*) "/usr/lib/systemd/systemd";
2586 execve(a[0], a, env_use);
2587
2588 a[0] = (char*) "/lib/systemd/systemd";
2589 execve(a[0], a, env_use);
2590
2591 a[0] = (char*) "/sbin/init";
2592 execve(a[0], a, env_use);
d9dfd233
MP
2593 } else if (!strv_isempty(arg_parameters))
2594 execvpe(arg_parameters[0], arg_parameters, env_use);
e3bff60a 2595 else {
d9dfd233 2596 chdir(home ?: "/root");
e3bff60a
MP
2597 execle("/bin/bash", "-bash", NULL, env_use);
2598 execle("/bin/sh", "-sh", NULL, env_use);
2599 }
2600
2601 (void) log_open();
2602 return log_error_errno(errno, "execv() failed: %m");
2603}
2604
2605static int outer_child(
2606 Barrier *barrier,
2607 const char *directory,
2608 const char *console,
2609 const char *root_device, bool root_device_rw,
2610 const char *home_device, bool home_device_rw,
2611 const char *srv_device, bool srv_device_rw,
2612 bool interactive,
2613 bool secondary,
2614 int pid_socket,
2615 int kmsg_socket,
2616 int rtnl_socket,
fb183854 2617 int uid_shift_socket,
d9dfd233 2618 FDSet *fds) {
e3bff60a
MP
2619
2620 pid_t pid;
2621 ssize_t l;
2622 int r;
2623
2624 assert(barrier);
2625 assert(directory);
2626 assert(console);
2627 assert(pid_socket >= 0);
2628 assert(kmsg_socket >= 0);
2629
d9dfd233
MP
2630 cg_unified_flush();
2631
e3bff60a
MP
2632 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
2633 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
2634
2635 if (interactive) {
2636 close_nointr(STDIN_FILENO);
2637 close_nointr(STDOUT_FILENO);
2638 close_nointr(STDERR_FILENO);
2639
2640 r = open_terminal(console, O_RDWR);
2641 if (r != STDIN_FILENO) {
2642 if (r >= 0) {
2643 safe_close(r);
2644 r = -EINVAL;
2645 }
2646
2647 return log_error_errno(r, "Failed to open console: %m");
2648 }
2649
2650 if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2651 dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
2652 return log_error_errno(errno, "Failed to duplicate console: %m");
2653 }
2654
2655 r = reset_audit_loginuid();
2656 if (r < 0)
2657 return r;
2658
2659 /* Mark everything as slave, so that we still
2660 * receive mounts from the real root, but don't
2661 * propagate mounts to the real root. */
2662 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
2663 return log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
2664
2665 r = mount_devices(directory,
2666 root_device, root_device_rw,
2667 home_device, home_device_rw,
2668 srv_device, srv_device_rw);
2669 if (r < 0)
2670 return r;
2671
2672 r = determine_uid_shift(directory);
2673 if (r < 0)
2674 return r;
2675
fb183854
MP
2676 if (arg_userns) {
2677 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
2678 if (l < 0)
2679 return log_error_errno(errno, "Failed to send UID shift: %m");
2680 if (l != sizeof(arg_uid_shift)) {
2681 log_error("Short write while sending UID shift.");
2682 return -EIO;
2683 }
2684 }
2685
e3bff60a
MP
2686 /* Turn directory into bind mount */
2687 if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
2688 return log_error_errno(errno, "Failed to make bind mount: %m");
2689
d9dfd233 2690 r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
e3bff60a
MP
2691 if (r < 0)
2692 return r;
2693
d9dfd233 2694 r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context);
e3bff60a
MP
2695 if (r < 0)
2696 return r;
2697
e3bff60a
MP
2698 r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
2699 if (r < 0)
2700 return r;
2701
e3bff60a
MP
2702 if (arg_read_only) {
2703 r = bind_remount_recursive(directory, true);
2704 if (r < 0)
2705 return log_error_errno(r, "Failed to make tree read-only: %m");
2706 }
2707
6300502b 2708 r = mount_all(directory, arg_userns, false, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
e3bff60a
MP
2709 if (r < 0)
2710 return r;
2711
d9dfd233
MP
2712 r = copy_devnodes(directory);
2713 if (r < 0)
e3bff60a
MP
2714 return r;
2715
2716 dev_setup(directory, arg_uid_shift, arg_uid_shift);
2717
d9dfd233
MP
2718 r = setup_pts(directory);
2719 if (r < 0)
e3bff60a
MP
2720 return r;
2721
2722 r = setup_propagate(directory);
2723 if (r < 0)
2724 return r;
2725
2726 r = setup_dev_console(directory, console);
2727 if (r < 0)
2728 return r;
2729
2730 r = setup_seccomp();
2731 if (r < 0)
2732 return r;
2733
2734 r = setup_timezone(directory);
2735 if (r < 0)
2736 return r;
2737
2738 r = setup_resolv_conf(directory);
2739 if (r < 0)
2740 return r;
2741
2742 r = setup_journal(directory);
2743 if (r < 0)
2744 return r;
2745
d9dfd233 2746 r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
e3bff60a
MP
2747 if (r < 0)
2748 return r;
2749
d9dfd233 2750 r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context);
e3bff60a
MP
2751 if (r < 0)
2752 return r;
2753
2754 r = mount_move_root(directory);
2755 if (r < 0)
2756 return log_error_errno(r, "Failed to move root directory: %m");
2757
2758 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
2759 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
2760 (arg_private_network ? CLONE_NEWNET : 0) |
2761 (arg_userns ? CLONE_NEWUSER : 0),
2762 NULL);
2763 if (pid < 0)
2764 return log_error_errno(errno, "Failed to fork inner child: %m");
e3bff60a
MP
2765 if (pid == 0) {
2766 pid_socket = safe_close(pid_socket);
fb183854 2767 uid_shift_socket = safe_close(uid_shift_socket);
e3bff60a
MP
2768
2769 /* The inner child has all namespaces that are
2770 * requested, so that we all are owned by the user if
2771 * user namespaces are turned on. */
2772
d9dfd233 2773 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds);
e3bff60a
MP
2774 if (r < 0)
2775 _exit(EXIT_FAILURE);
2776
2777 _exit(EXIT_SUCCESS);
2778 }
2779
2780 l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
2781 if (l < 0)
2782 return log_error_errno(errno, "Failed to send PID: %m");
2783 if (l != sizeof(pid)) {
2784 log_error("Short write while sending PID.");
2785 return -EIO;
2786 }
2787
2788 pid_socket = safe_close(pid_socket);
6300502b
MP
2789 kmsg_socket = safe_close(kmsg_socket);
2790 rtnl_socket = safe_close(rtnl_socket);
e3bff60a
MP
2791
2792 return 0;
2793}
2794
2795static int setup_uid_map(pid_t pid) {
2796 char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
2797 int r;
2798
2799 assert(pid > 1);
2800
2801 xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
2802 xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
7035cd9e 2803 r = write_string_file(uid_map, line, 0);
e3bff60a
MP
2804 if (r < 0)
2805 return log_error_errno(r, "Failed to write UID map: %m");
2806
2807 /* We always assign the same UID and GID ranges */
2808 xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
7035cd9e 2809 r = write_string_file(uid_map, line, 0);
e3bff60a
MP
2810 if (r < 0)
2811 return log_error_errno(r, "Failed to write GID map: %m");
2812
2813 return 0;
2814}
2815
d9dfd233
MP
2816static int load_settings(void) {
2817 _cleanup_(settings_freep) Settings *settings = NULL;
2818 _cleanup_fclose_ FILE *f = NULL;
2819 _cleanup_free_ char *p = NULL;
2820 const char *fn, *i;
e3bff60a
MP
2821 int r;
2822
d9dfd233
MP
2823 /* If all settings are masked, there's no point in looking for
2824 * the settings file */
2825 if ((arg_settings_mask & _SETTINGS_MASK_ALL) == _SETTINGS_MASK_ALL)
2826 return 0;
2827
2828 fn = strjoina(arg_machine, ".nspawn");
2829
2830 /* We first look in the admin's directories in /etc and /run */
2831 FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
2832 _cleanup_free_ char *j = NULL;
2833
2834 j = strjoin(i, "/", fn, NULL);
2835 if (!j)
2836 return log_oom();
2837
2838 f = fopen(j, "re");
2839 if (f) {
2840 p = j;
2841 j = NULL;
2842
2843 /* By default we trust configuration from /etc and /run */
2844 if (arg_settings_trusted < 0)
2845 arg_settings_trusted = true;
2846
2847 break;
2848 }
2849
2850 if (errno != ENOENT)
2851 return log_error_errno(errno, "Failed to open %s: %m", j);
2852 }
2853
2854 if (!f) {
2855 /* After that, let's look for a file next to the
2856 * actual image we shall boot. */
2857
2858 if (arg_image) {
2859 p = file_in_same_dir(arg_image, fn);
2860 if (!p)
2861 return log_oom();
2862 } else if (arg_directory) {
2863 p = file_in_same_dir(arg_directory, fn);
2864 if (!p)
2865 return log_oom();
2866 }
2867
2868 if (p) {
2869 f = fopen(p, "re");
2870 if (!f && errno != ENOENT)
2871 return log_error_errno(errno, "Failed to open %s: %m", p);
2872
2873 /* By default we do not trust configuration from /var/lib/machines */
2874 if (arg_settings_trusted < 0)
2875 arg_settings_trusted = false;
2876 }
2877 }
e3bff60a 2878
d9dfd233
MP
2879 if (!f)
2880 return 0;
2881
2882 log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
2883
2884 r = settings_load(f, p, &settings);
e3bff60a 2885 if (r < 0)
d9dfd233 2886 return r;
e3bff60a 2887
d9dfd233
MP
2888 /* Copy over bits from the settings, unless they have been
2889 * explicitly masked by command line switches. */
2890
2891 if ((arg_settings_mask & SETTING_BOOT) == 0 &&
2892 settings->boot >= 0) {
2893 arg_boot = settings->boot;
2894
2895 strv_free(arg_parameters);
2896 arg_parameters = settings->parameters;
2897 settings->parameters = NULL;
2898 }
2899
2900 if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
2901 settings->environment) {
2902 strv_free(arg_setenv);
2903 arg_setenv = settings->environment;
2904 settings->environment = NULL;
2905 }
2906
2907 if ((arg_settings_mask & SETTING_USER) == 0 &&
2908 settings->user) {
2909 free(arg_user);
2910 arg_user = settings->user;
2911 settings->user = NULL;
2912 }
2913
2914 if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
2915
2916 if (!arg_settings_trusted && settings->capability != 0)
2917 log_warning("Ignoring Capability= setting, file %s is not trusted.", p);
2918 else
2919 arg_retain |= settings->capability;
2920
2921 arg_retain &= ~settings->drop_capability;
2922 }
2923
2924 if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
2925 settings->kill_signal > 0)
2926 arg_kill_signal = settings->kill_signal;
2927
2928 if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
2929 settings->personality != PERSONALITY_INVALID)
2930 arg_personality = settings->personality;
2931
2932 if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
2933 !sd_id128_is_null(settings->machine_id)) {
2934
2935 if (!arg_settings_trusted)
2936 log_warning("Ignoring MachineID= setting, file %s is not trusted.", p);
2937 else
2938 arg_uuid = settings->machine_id;
2939 }
2940
2941 if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
2942 settings->read_only >= 0)
2943 arg_read_only = settings->read_only;
2944
2945 if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
2946 settings->volatile_mode != _VOLATILE_MODE_INVALID)
2947 arg_volatile_mode = settings->volatile_mode;
2948
2949 if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
2950 settings->n_custom_mounts > 0) {
2951
2952 if (!arg_settings_trusted)
2953 log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", p);
2954 else {
2955 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
2956 arg_custom_mounts = settings->custom_mounts;
2957 arg_n_custom_mounts = settings->n_custom_mounts;
2958
2959 settings->custom_mounts = NULL;
2960 settings->n_custom_mounts = 0;
2961 }
2962 }
e3bff60a 2963
d9dfd233
MP
2964 if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
2965 (settings->private_network >= 0 ||
2966 settings->network_veth >= 0 ||
2967 settings->network_bridge ||
2968 settings->network_interfaces ||
2969 settings->network_macvlan ||
2970 settings->network_ipvlan)) {
2971
2972 if (!arg_settings_trusted)
2973 log_warning("Ignoring network settings, file %s is not trusted.", p);
2974 else {
2975 strv_free(arg_network_interfaces);
2976 arg_network_interfaces = settings->network_interfaces;
2977 settings->network_interfaces = NULL;
2978
2979 strv_free(arg_network_macvlan);
2980 arg_network_macvlan = settings->network_macvlan;
2981 settings->network_macvlan = NULL;
2982
2983 strv_free(arg_network_ipvlan);
2984 arg_network_ipvlan = settings->network_ipvlan;
2985 settings->network_ipvlan = NULL;
2986
2987 free(arg_network_bridge);
2988 arg_network_bridge = settings->network_bridge;
2989 settings->network_bridge = NULL;
2990
2991 arg_network_veth = settings->network_veth > 0 || settings->network_bridge;
2992
2993 arg_private_network = true; /* all these settings imply private networking */
2994 }
2995 }
2996
2997 if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
2998 settings->expose_ports) {
2999
3000 if (!arg_settings_trusted)
3001 log_warning("Ignoring Port= setting, file %s is not trusted.", p);
3002 else {
3003 expose_port_free_all(arg_expose_ports);
3004 arg_expose_ports = settings->expose_ports;
3005 settings->expose_ports = NULL;
3006 }
3007 }
e3bff60a
MP
3008
3009 return 0;
3010}
3011
3012int main(int argc, char *argv[]) {
3013
3014 _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3015 bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3016 _cleanup_close_ int master = -1, image_fd = -1;
3017 _cleanup_fdset_free_ FDSet *fds = NULL;
3018 int r, n_fd_passed, loop_nr = -1;
3019 char veth_name[IFNAMSIZ];
3020 bool secondary = false, remove_subvol = false;
86f210e9 3021 sigset_t mask_chld;
e3bff60a
MP
3022 pid_t pid = 0;
3023 int ret = EXIT_SUCCESS;
3024 union in_addr_union exposed = {};
3025 _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3026 bool interactive;
3027
3028 log_parse_environment();
3029 log_open();
3030
3031 r = parse_argv(argc, argv);
3032 if (r <= 0)
3033 goto finish;
3034
e3bff60a
MP
3035 if (geteuid() != 0) {
3036 log_error("Need to be root.");
3037 r = -EPERM;
3038 goto finish;
3039 }
d9dfd233
MP
3040 r = determine_names();
3041 if (r < 0)
3042 goto finish;
3043
3044 r = load_settings();
3045 if (r < 0)
3046 goto finish;
3047
3048 r = verify_arguments();
3049 if (r < 0)
3050 goto finish;
e3bff60a
MP
3051
3052 n_fd_passed = sd_listen_fds(false);
3053 if (n_fd_passed > 0) {
3054 r = fdset_new_listen_fds(&fds, false);
3055 if (r < 0) {
3056 log_error_errno(r, "Failed to collect file descriptors: %m");
3057 goto finish;
3058 }
3059 }
3060
3061 if (arg_directory) {
3062 assert(!arg_image);
3063
3064 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3065 log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3066 r = -EINVAL;
3067 goto finish;
3068 }
3069
3070 if (arg_ephemeral) {
3071 _cleanup_free_ char *np = NULL;
3072
3073 /* If the specified path is a mount point we
3074 * generate the new snapshot immediately
3075 * inside it under a random name. However if
3076 * the specified is not a mount point we
3077 * create the new snapshot in the parent
3078 * directory, just next to it. */
86f210e9 3079 r = path_is_mount_point(arg_directory, 0);
e3bff60a
MP
3080 if (r < 0) {
3081 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3082 goto finish;
3083 }
3084 if (r > 0)
86f210e9 3085 r = tempfn_random_child(arg_directory, "machine.", &np);
e3bff60a 3086 else
86f210e9 3087 r = tempfn_random(arg_directory, "machine.", &np);
e3bff60a
MP
3088 if (r < 0) {
3089 log_error_errno(r, "Failed to generate name for snapshot: %m");
3090 goto finish;
3091 }
3092
3093 r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3094 if (r < 0) {
3095 log_error_errno(r, "Failed to lock %s: %m", np);
3096 goto finish;
3097 }
3098
3099 r = btrfs_subvol_snapshot(arg_directory, np, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3100 if (r < 0) {
3101 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3102 goto finish;
3103 }
3104
3105 free(arg_directory);
3106 arg_directory = np;
3107 np = NULL;
3108
3109 remove_subvol = true;
3110
3111 } else {
3112 r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3113 if (r == -EBUSY) {
3114 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3115 goto finish;
3116 }
3117 if (r < 0) {
3118 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3119 return r;
3120 }
3121
3122 if (arg_template) {
3123 r = btrfs_subvol_snapshot(arg_template, arg_directory, (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) | BTRFS_SNAPSHOT_FALLBACK_COPY | BTRFS_SNAPSHOT_RECURSIVE);
3124 if (r == -EEXIST) {
3125 if (!arg_quiet)
3126 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3127 } else if (r < 0) {
3128 log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3129 goto finish;
3130 } else {
3131 if (!arg_quiet)
3132 log_info("Populated %s from template %s.", arg_directory, arg_template);
3133 }
3134 }
3135 }
3136
3137 if (arg_boot) {
3138 if (path_is_os_tree(arg_directory) <= 0) {
3139 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3140 r = -EINVAL;
3141 goto finish;
3142 }
3143 } else {
3144 const char *p;
3145
3146 p = strjoina(arg_directory,
60f067b4
JS
3147 argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3148 if (access(p, F_OK) < 0) {
3149 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
e735f4d4 3150 r = -EINVAL;
60f067b4 3151 goto finish;
60f067b4
JS
3152 }
3153 }
e735f4d4 3154
60f067b4
JS
3155 } else {
3156 char template[] = "/tmp/nspawn-root-XXXXXX";
3157
e735f4d4
MP
3158 assert(arg_image);
3159 assert(!arg_template);
3160
3161 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3162 if (r == -EBUSY) {
3163 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3164 goto finish;
3165 }
3166 if (r < 0) {
3167 r = log_error_errno(r, "Failed to create image lock: %m");
3168 goto finish;
3169 }
3170
60f067b4 3171 if (!mkdtemp(template)) {
f47781d8 3172 log_error_errno(errno, "Failed to create temporary directory: %m");
60f067b4
JS
3173 r = -errno;
3174 goto finish;
3175 }
3176
3177 arg_directory = strdup(template);
3178 if (!arg_directory) {
3179 r = log_oom();
3180 goto finish;
3181 }
3182
3183 image_fd = setup_image(&device_path, &loop_nr);
3184 if (image_fd < 0) {
3185 r = image_fd;
3186 goto finish;
3187 }
3188
5eef597e
MP
3189 r = dissect_image(image_fd,
3190 &root_device, &root_device_rw,
3191 &home_device, &home_device_rw,
3192 &srv_device, &srv_device_rw,
3193 &secondary);
60f067b4
JS
3194 if (r < 0)
3195 goto finish;
3196 }
3197
e3bff60a
MP
3198 r = custom_mounts_prepare();
3199 if (r < 0)
3200 goto finish;
3201
3202 interactive =
3203 isatty(STDIN_FILENO) > 0 &&
3204 isatty(STDOUT_FILENO) > 0;
3205
663996b3
MS
3206 master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3207 if (master < 0) {
e735f4d4 3208 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
663996b3
MS
3209 goto finish;
3210 }
3211
e735f4d4
MP
3212 r = ptsname_malloc(master, &console);
3213 if (r < 0) {
3214 r = log_error_errno(r, "Failed to determine tty name: %m");
663996b3
MS
3215 goto finish;
3216 }
3217
663996b3 3218 if (unlockpt(master) < 0) {
e735f4d4 3219 r = log_error_errno(errno, "Failed to unlock tty: %m");
663996b3
MS
3220 goto finish;
3221 }
3222
e3bff60a
MP
3223 if (!arg_quiet)
3224 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3225 arg_machine, arg_image ?: arg_directory);
3226
86f210e9 3227 assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
663996b3 3228
f47781d8
MP
3229 assert_se(sigemptyset(&mask_chld) == 0);
3230 assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3231
e3bff60a
MP
3232 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0) {
3233 r = log_error_errno(errno, "Failed to become subreaper: %m");
3234 goto finish;
3235 }
3236
663996b3 3237 for (;;) {
fb183854
MP
3238 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 }, pid_socket_pair[2] = { -1, -1 },
3239 uid_shift_socket_pair[2] = { -1, -1 };
60f067b4 3240 ContainerStatus container_status;
5eef597e 3241 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
e3bff60a 3242 static const struct sigaction sa = {
6300502b 3243 .sa_handler = nop_signal_handler,
60f067b4
JS
3244 .sa_flags = SA_NOCLDSTOP,
3245 };
e3bff60a
MP
3246 int ifi = 0;
3247 ssize_t l;
fb183854
MP
3248 _cleanup_event_unref_ sd_event *event = NULL;
3249 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3250 _cleanup_netlink_unref_ sd_netlink *rtnl = NULL;
3251 char last_char = 0;
60f067b4 3252
5eef597e
MP
3253 r = barrier_create(&barrier);
3254 if (r < 0) {
f47781d8 3255 log_error_errno(r, "Cannot initialize IPC barrier: %m");
5eef597e
MP
3256 goto finish;
3257 }
3258
d9dfd233 3259 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
e735f4d4
MP
3260 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3261 goto finish;
3262 }
3263
d9dfd233 3264 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
e735f4d4
MP
3265 r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3266 goto finish;
3267 }
3268
d9dfd233 3269 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0) {
e3bff60a
MP
3270 r = log_error_errno(errno, "Failed to create pid socket pair: %m");
3271 goto finish;
3272 }
3273
fb183854 3274 if (arg_userns)
d9dfd233 3275 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
fb183854
MP
3276 r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
3277 goto finish;
3278 }
3279
60f067b4
JS
3280 /* Child can be killed before execv(), so handle SIGCHLD
3281 * in order to interrupt parent's blocking calls and
3282 * give it a chance to call wait() and terminate. */
3283 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3284 if (r < 0) {
e735f4d4 3285 r = log_error_errno(errno, "Failed to change the signal mask: %m");
663996b3
MS
3286 goto finish;
3287 }
3288
60f067b4
JS
3289 r = sigaction(SIGCHLD, &sa, NULL);
3290 if (r < 0) {
e735f4d4 3291 r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
663996b3
MS
3292 goto finish;
3293 }
3294
e3bff60a 3295 pid = raw_clone(SIGCHLD|CLONE_NEWNS, NULL);
663996b3
MS
3296 if (pid < 0) {
3297 if (errno == EINVAL)
e735f4d4 3298 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
663996b3 3299 else
e735f4d4 3300 r = log_error_errno(errno, "clone() failed: %m");
663996b3
MS
3301
3302 goto finish;
3303 }
3304
3305 if (pid == 0) {
e3bff60a 3306 /* The outer child only has a file system namespace. */
5eef597e
MP
3307 barrier_set_role(&barrier, BARRIER_CHILD);
3308
60f067b4 3309 master = safe_close(master);
663996b3 3310
60f067b4 3311 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
e735f4d4 3312 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
e3bff60a 3313 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
fb183854 3314 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
663996b3 3315
86f210e9
MP
3316 (void) reset_all_signal_handlers();
3317 (void) reset_signal_mask();
663996b3 3318
e3bff60a
MP
3319 r = outer_child(&barrier,
3320 arg_directory,
3321 console,
3322 root_device, root_device_rw,
3323 home_device, home_device_rw,
3324 srv_device, srv_device_rw,
3325 interactive,
3326 secondary,
3327 pid_socket_pair[1],
3328 kmsg_socket_pair[1],
3329 rtnl_socket_pair[1],
fb183854 3330 uid_shift_socket_pair[1],
d9dfd233 3331 fds);
5eef597e
MP
3332 if (r < 0)
3333 _exit(EXIT_FAILURE);
3334
e3bff60a
MP
3335 _exit(EXIT_SUCCESS);
3336 }
663996b3 3337
e3bff60a 3338 barrier_set_role(&barrier, BARRIER_PARENT);
663996b3 3339
6300502b 3340 fds = fdset_free(fds);
663996b3 3341
e3bff60a
MP
3342 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3343 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3344 pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
d9dfd233 3345 uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
663996b3 3346
e3bff60a
MP
3347 /* Wait for the outer child. */
3348 r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
3349 if (r < 0)
3350 goto finish;
3351 if (r != 0) {
3352 r = -EIO;
3353 goto finish;
3354 }
3355 pid = 0;
663996b3 3356
e3bff60a
MP
3357 /* And now retrieve the PID of the inner child. */
3358 l = recv(pid_socket_pair[0], &pid, sizeof(pid), 0);
3359 if (l < 0) {
3360 r = log_error_errno(errno, "Failed to read inner child PID: %m");
3361 goto finish;
3362 }
3363 if (l != sizeof(pid)) {
d9dfd233 3364 log_error("Short read while reading inner child PID.");
e3bff60a
MP
3365 r = EIO;
3366 goto finish;
3367 }
663996b3 3368
e3bff60a 3369 log_debug("Init process invoked as PID " PID_FMT, pid);
663996b3 3370
e3bff60a
MP
3371 if (arg_userns) {
3372 if (!barrier_place_and_sync(&barrier)) { /* #1 */
3373 log_error("Child died too early.");
3374 r = -ESRCH;
3375 goto finish;
663996b3
MS
3376 }
3377
fb183854
MP
3378 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
3379 if (l < 0) {
3380 r = log_error_errno(errno, "Failed to read UID shift: %m");
3381 goto finish;
3382 }
3383 if (l != sizeof(arg_uid_shift)) {
d9dfd233 3384 log_error("Short read while reading UID shift.");
fb183854
MP
3385 r = EIO;
3386 goto finish;
3387 }
3388
e3bff60a 3389 r = setup_uid_map(pid);
60f067b4 3390 if (r < 0)
e3bff60a 3391 goto finish;
60f067b4 3392
e3bff60a
MP
3393 (void) barrier_place(&barrier); /* #2 */
3394 }
60f067b4 3395
d9dfd233 3396 if (arg_private_network) {
60f067b4 3397
d9dfd233
MP
3398 r = move_network_interfaces(pid, arg_network_interfaces);
3399 if (r < 0)
3400 goto finish;
60f067b4 3401
d9dfd233
MP
3402 if (arg_network_veth) {
3403 r = setup_veth(arg_machine, pid, veth_name, !!arg_network_bridge);
3404 if (r < 0)
3405 goto finish;
3406 else if (r > 0)
3407 ifi = r;
60f067b4 3408
d9dfd233
MP
3409 if (arg_network_bridge) {
3410 r = setup_bridge(veth_name, arg_network_bridge);
3411 if (r < 0)
3412 goto finish;
3413 if (r > 0)
3414 ifi = r;
3415 }
3416 }
663996b3 3417
d9dfd233
MP
3418 r = setup_macvlan(arg_machine, pid, arg_network_macvlan);
3419 if (r < 0)
3420 goto finish;
3421
3422 r = setup_ipvlan(arg_machine, pid, arg_network_ipvlan);
3423 if (r < 0)
3424 goto finish;
3425 }
3426
3427 if (arg_register) {
3428 r = register_machine(
3429 arg_machine,
3430 pid,
3431 arg_directory,
3432 arg_uuid,
3433 ifi,
3434 arg_slice,
3435 arg_custom_mounts, arg_n_custom_mounts,
3436 arg_kill_signal,
3437 arg_property,
3438 arg_keep_unit);
3439 if (r < 0)
3440 goto finish;
3441 }
663996b3 3442
d9dfd233 3443 r = sync_cgroup(pid, arg_unified_cgroup_hierarchy);
e3bff60a
MP
3444 if (r < 0)
3445 goto finish;
663996b3 3446
d9dfd233
MP
3447 if (arg_keep_unit) {
3448 r = create_subcgroup(pid, arg_unified_cgroup_hierarchy);
3449 if (r < 0)
3450 goto finish;
3451 }
3452
3453 r = chown_cgroup(pid, arg_uid_shift);
e3bff60a
MP
3454 if (r < 0)
3455 goto finish;
663996b3 3456
e3bff60a
MP
3457 /* Notify the child that the parent is ready with all
3458 * its setup (including cgroup-ification), and that
3459 * the child can now hand over control to the code to
3460 * run inside the container. */
3461 (void) barrier_place(&barrier); /* #3 */
663996b3 3462
e3bff60a
MP
3463 /* Block SIGCHLD here, before notifying child.
3464 * process_pty() will handle it with the other signals. */
3465 assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
663996b3 3466
e3bff60a
MP
3467 /* Reset signal to default */
3468 r = default_signals(SIGCHLD, -1);
3469 if (r < 0) {
3470 log_error_errno(r, "Failed to reset SIGCHLD: %m");
3471 goto finish;
663996b3
MS
3472 }
3473
e3bff60a 3474 /* Let the child know that we are ready and wait that the child is completely ready now. */
6300502b
MP
3475 if (!barrier_place_and_sync(&barrier)) { /* #4 */
3476 log_error("Child died too early.");
e3bff60a
MP
3477 r = -ESRCH;
3478 goto finish;
3479 }
60f067b4 3480
e3bff60a
MP
3481 sd_notifyf(false,
3482 "READY=1\n"
3483 "STATUS=Container running.\n"
3484 "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
60f067b4 3485
e3bff60a
MP
3486 r = sd_event_new(&event);
3487 if (r < 0) {
3488 log_error_errno(r, "Failed to get default event source: %m");
3489 goto finish;
3490 }
e735f4d4 3491
e3bff60a
MP
3492 if (arg_kill_signal > 0) {
3493 /* Try to kill the init system on SIGINT or SIGTERM */
3494 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3495 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3496 } else {
3497 /* Immediately exit */
3498 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3499 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3500 }
5eef597e 3501
e3bff60a
MP
3502 /* simply exit on sigchld */
3503 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
60f067b4 3504
e3bff60a 3505 if (arg_expose_ports) {
d9dfd233 3506 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, &exposed, &rtnl);
e842803a
MB
3507 if (r < 0)
3508 goto finish;
663996b3 3509
d9dfd233 3510 (void) expose_port_execute(rtnl, arg_expose_ports, &exposed);
e3bff60a 3511 }
f47781d8 3512
e3bff60a 3513 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
663996b3 3514
e3bff60a
MP
3515 r = pty_forward_new(event, master, true, !interactive, &forward);
3516 if (r < 0) {
3517 log_error_errno(r, "Failed to create PTY forwarder: %m");
3518 goto finish;
3519 }
e735f4d4 3520
e3bff60a
MP
3521 r = sd_event_loop(event);
3522 if (r < 0) {
3523 log_error_errno(r, "Failed to run event loop: %m");
3524 goto finish;
3525 }
e735f4d4 3526
e3bff60a 3527 pty_forward_get_last_char(forward, &last_char);
e735f4d4 3528
e3bff60a 3529 forward = pty_forward_free(forward);
e735f4d4 3530
e3bff60a
MP
3531 if (!arg_quiet && last_char != '\n')
3532 putc('\n', stdout);
e735f4d4 3533
e3bff60a 3534 /* Kill if it is not dead yet anyway */
d9dfd233
MP
3535 if (arg_register && !arg_keep_unit)
3536 terminate_machine(pid);
60f067b4 3537
e842803a 3538 /* Normally redundant, but better safe than sorry */
60f067b4 3539 kill(pid, SIGKILL);
663996b3 3540
60f067b4
JS
3541 r = wait_for_container(pid, &container_status);
3542 pid = 0;
3543
e735f4d4 3544 if (r < 0)
e842803a
MB
3545 /* We failed to wait for the container, or the
3546 * container exited abnormally */
e735f4d4
MP
3547 goto finish;
3548 else if (r > 0 || container_status == CONTAINER_TERMINATED){
e842803a
MB
3549 /* The container exited with a non-zero
3550 * status, or with zero status and no reboot
3551 * was requested. */
e735f4d4 3552 ret = r;
663996b3 3553 break;
e735f4d4 3554 }
60f067b4
JS
3555
3556 /* CONTAINER_REBOOTED, loop again */
e842803a
MB
3557
3558 if (arg_keep_unit) {
3559 /* Special handling if we are running as a
3560 * service: instead of simply restarting the
3561 * machine we want to restart the entire
3562 * service, so let's inform systemd about this
3563 * with the special exit code 133. The service
3564 * file uses RestartForceExitStatus=133 so
3565 * that this results in a full nspawn
3566 * restart. This is necessary since we might
3567 * have cgroup parameters set we want to have
3568 * flushed out. */
e735f4d4
MP
3569 ret = 133;
3570 r = 0;
e842803a
MB
3571 break;
3572 }
e735f4d4 3573
d9dfd233 3574 expose_port_flush(arg_expose_ports, &exposed);
663996b3
MS
3575 }
3576
3577finish:
5eef597e
MP
3578 sd_notify(false,
3579 "STOPPING=1\n"
3580 "STATUS=Terminating...");
3581
14228c0d
MB
3582 if (pid > 0)
3583 kill(pid, SIGKILL);
663996b3 3584
86f210e9
MP
3585 /* Try to flush whatever is still queued in the pty */
3586 if (master >= 0)
6300502b 3587 (void) copy_bytes(master, STDOUT_FILENO, (uint64_t) -1, false);
86f210e9 3588
e3bff60a
MP
3589 loop_remove(loop_nr, &image_fd);
3590
e735f4d4
MP
3591 if (remove_subvol && arg_directory) {
3592 int k;
3593
e3bff60a 3594 k = btrfs_subvol_remove(arg_directory, true);
e735f4d4
MP
3595 if (k < 0)
3596 log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3597 }
3598
3599 if (arg_machine) {
3600 const char *p;
3601
3602 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
e3bff60a 3603 (void) rm_rf(p, REMOVE_ROOT);
e735f4d4
MP
3604 }
3605
d9dfd233
MP
3606 expose_port_flush(arg_expose_ports, &exposed);
3607
663996b3 3608 free(arg_directory);
e735f4d4
MP
3609 free(arg_template);
3610 free(arg_image);
663996b3 3611 free(arg_machine);
60f067b4
JS
3612 free(arg_user);
3613 strv_free(arg_setenv);
d9dfd233 3614 free(arg_network_bridge);
60f067b4
JS
3615 strv_free(arg_network_interfaces);
3616 strv_free(arg_network_macvlan);
e735f4d4 3617 strv_free(arg_network_ipvlan);
d9dfd233
MP
3618 strv_free(arg_parameters);
3619 custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
3620 expose_port_free_all(arg_expose_ports);
e735f4d4
MP
3621
3622 return r < 0 ? EXIT_FAILURE : ret;
663996b3 3623}