]> git.proxmox.com Git - systemd.git/blob - src/nspawn/nspawn-mount.c
Enable seccomp support on powerpc, ppc64el, and s390x
[systemd.git] / src / nspawn / nspawn-mount.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2015 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mount.h>
23 #include <linux/magic.h>
24
25 #include "alloc-util.h"
26 #include "cgroup-util.h"
27 #include "escape.h"
28 #include "fs-util.h"
29 #include "label.h"
30 #include "mkdir.h"
31 #include "mount-util.h"
32 #include "nspawn-mount.h"
33 #include "parse-util.h"
34 #include "path-util.h"
35 #include "rm-rf.h"
36 #include "set.h"
37 #include "stat-util.h"
38 #include "string-util.h"
39 #include "strv.h"
40 #include "user-util.h"
41 #include "util.h"
42
43 CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
44 CustomMount *c, *ret;
45
46 assert(l);
47 assert(n);
48 assert(t >= 0);
49 assert(t < _CUSTOM_MOUNT_TYPE_MAX);
50
51 c = realloc(*l, (*n + 1) * sizeof(CustomMount));
52 if (!c)
53 return NULL;
54
55 *l = c;
56 ret = *l + *n;
57 (*n)++;
58
59 *ret = (CustomMount) { .type = t };
60
61 return ret;
62 }
63
64 void custom_mount_free_all(CustomMount *l, unsigned n) {
65 unsigned i;
66
67 for (i = 0; i < n; i++) {
68 CustomMount *m = l + i;
69
70 free(m->source);
71 free(m->destination);
72 free(m->options);
73
74 if (m->work_dir) {
75 (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
76 free(m->work_dir);
77 }
78
79 strv_free(m->lower);
80 }
81
82 free(l);
83 }
84
85 int custom_mount_compare(const void *a, const void *b) {
86 const CustomMount *x = a, *y = b;
87 int r;
88
89 r = path_compare(x->destination, y->destination);
90 if (r != 0)
91 return r;
92
93 if (x->type < y->type)
94 return -1;
95 if (x->type > y->type)
96 return 1;
97
98 return 0;
99 }
100
101 int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
102 _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
103 const char *p = s;
104 CustomMount *m;
105 int r;
106
107 assert(l);
108 assert(n);
109
110 r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
111 if (r < 0)
112 return r;
113 if (r == 0)
114 return -EINVAL;
115
116 if (r == 1) {
117 destination = strdup(source);
118 if (!destination)
119 return -ENOMEM;
120 }
121
122 if (r == 2 && !isempty(p)) {
123 opts = strdup(p);
124 if (!opts)
125 return -ENOMEM;
126 }
127
128 if (!path_is_absolute(source))
129 return -EINVAL;
130
131 if (!path_is_absolute(destination))
132 return -EINVAL;
133
134 m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
135 if (!m)
136 return log_oom();
137
138 m->source = source;
139 m->destination = destination;
140 m->read_only = read_only;
141 m->options = opts;
142
143 source = destination = opts = NULL;
144 return 0;
145 }
146
147 int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
148 _cleanup_free_ char *path = NULL, *opts = NULL;
149 const char *p = s;
150 CustomMount *m;
151 int r;
152
153 assert(l);
154 assert(n);
155 assert(s);
156
157 r = extract_first_word(&p, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
158 if (r < 0)
159 return r;
160 if (r == 0)
161 return -EINVAL;
162
163 if (isempty(p))
164 opts = strdup("mode=0755");
165 else
166 opts = strdup(p);
167 if (!opts)
168 return -ENOMEM;
169
170 if (!path_is_absolute(path))
171 return -EINVAL;
172
173 m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
174 if (!m)
175 return -ENOMEM;
176
177 m->destination = path;
178 m->options = opts;
179
180 path = opts = NULL;
181 return 0;
182 }
183
184 static int tmpfs_patch_options(
185 const char *options,
186 bool userns, uid_t uid_shift, uid_t uid_range,
187 const char *selinux_apifs_context,
188 char **ret) {
189
190 char *buf = NULL;
191
192 if (userns && uid_shift != 0) {
193 assert(uid_shift != UID_INVALID);
194
195 if (options)
196 (void) asprintf(&buf, "%s,uid=" UID_FMT ",gid=" UID_FMT, options, uid_shift, uid_shift);
197 else
198 (void) asprintf(&buf, "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift);
199 if (!buf)
200 return -ENOMEM;
201
202 options = buf;
203 }
204
205 #ifdef HAVE_SELINUX
206 if (selinux_apifs_context) {
207 char *t;
208
209 if (options)
210 t = strjoin(options, ",context=\"", selinux_apifs_context, "\"", NULL);
211 else
212 t = strjoin("context=\"", selinux_apifs_context, "\"", NULL);
213 if (!t) {
214 free(buf);
215 return -ENOMEM;
216 }
217
218 free(buf);
219 buf = t;
220 }
221 #endif
222
223 *ret = buf;
224 return !!buf;
225 }
226
227 int mount_sysfs(const char *dest) {
228 const char *full, *top, *x;
229 int r;
230
231 top = prefix_roota(dest, "/sys");
232 r = path_check_fstype(top, SYSFS_MAGIC);
233 if (r < 0)
234 return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
235 /* /sys might already be mounted as sysfs by the outer child in the
236 * !netns case. In this case, it's all good. Don't touch it because we
237 * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
238 */
239 if (r > 0)
240 return 0;
241
242 full = prefix_roota(top, "/full");
243
244 (void) mkdir(full, 0755);
245
246 if (mount("sysfs", full, "sysfs", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) < 0)
247 return log_error_errno(errno, "Failed to mount sysfs to %s: %m", full);
248
249 FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
250 _cleanup_free_ char *from = NULL, *to = NULL;
251
252 from = prefix_root(full, x);
253 if (!from)
254 return log_oom();
255
256 to = prefix_root(top, x);
257 if (!to)
258 return log_oom();
259
260 (void) mkdir(to, 0755);
261
262 if (mount(from, to, NULL, MS_BIND, NULL) < 0)
263 return log_error_errno(errno, "Failed to mount /sys/%s into place: %m", x);
264
265 if (mount(NULL, to, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
266 return log_error_errno(errno, "Failed to mount /sys/%s read-only: %m", x);
267 }
268
269 if (umount(full) < 0)
270 return log_error_errno(errno, "Failed to unmount %s: %m", full);
271
272 if (rmdir(full) < 0)
273 return log_error_errno(errno, "Failed to remove %s: %m", full);
274
275 x = prefix_roota(top, "/fs/kdbus");
276 (void) mkdir(x, 0755);
277
278 if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
279 return log_error_errno(errno, "Failed to make %s read-only: %m", top);
280
281 return 0;
282 }
283
284 int mount_all(const char *dest,
285 bool use_userns, bool in_userns,
286 bool use_netns,
287 uid_t uid_shift, uid_t uid_range,
288 const char *selinux_apifs_context) {
289
290 typedef struct MountPoint {
291 const char *what;
292 const char *where;
293 const char *type;
294 const char *options;
295 unsigned long flags;
296 bool fatal;
297 bool in_userns;
298 bool use_netns;
299 } MountPoint;
300
301 static const MountPoint mount_table[] = {
302 { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
303 { "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first */
304 { NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* Then, make it r/o */
305 { "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
306 { "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
307 { "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
308 { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
309 { "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
310 { "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, false, false },
311 #ifdef HAVE_SELINUX
312 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
313 { NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
314 #endif
315 };
316
317 unsigned k;
318 int r;
319
320 for (k = 0; k < ELEMENTSOF(mount_table); k++) {
321 _cleanup_free_ char *where = NULL, *options = NULL;
322 const char *o;
323
324 if (in_userns != mount_table[k].in_userns)
325 continue;
326
327 if (!use_netns && mount_table[k].use_netns)
328 continue;
329
330 where = prefix_root(dest, mount_table[k].where);
331 if (!where)
332 return log_oom();
333
334 r = path_is_mount_point(where, AT_SYMLINK_FOLLOW);
335 if (r < 0 && r != -ENOENT)
336 return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
337
338 /* Skip this entry if it is not a remount. */
339 if (mount_table[k].what && r > 0)
340 continue;
341
342 r = mkdir_p(where, 0755);
343 if (r < 0) {
344 if (mount_table[k].fatal)
345 return log_error_errno(r, "Failed to create directory %s: %m", where);
346
347 log_warning_errno(r, "Failed to create directory %s: %m", where);
348 continue;
349 }
350
351 o = mount_table[k].options;
352 if (streq_ptr(mount_table[k].type, "tmpfs")) {
353 r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
354 if (r < 0)
355 return log_oom();
356 if (r > 0)
357 o = options;
358 }
359
360 if (mount(mount_table[k].what,
361 where,
362 mount_table[k].type,
363 mount_table[k].flags,
364 o) < 0) {
365
366 if (mount_table[k].fatal)
367 return log_error_errno(errno, "mount(%s) failed: %m", where);
368
369 log_warning_errno(errno, "mount(%s) failed, ignoring: %m", where);
370 }
371 }
372
373 return 0;
374 }
375
376 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
377 const char *p = options;
378 unsigned long flags = *mount_flags;
379 char *opts = NULL;
380
381 assert(options);
382
383 for (;;) {
384 _cleanup_free_ char *word = NULL;
385 int r = extract_first_word(&p, &word, ",", 0);
386 if (r < 0)
387 return log_error_errno(r, "Failed to extract mount option: %m");
388 if (r == 0)
389 break;
390
391 if (streq(word, "rbind"))
392 flags |= MS_REC;
393 else if (streq(word, "norbind"))
394 flags &= ~MS_REC;
395 else {
396 log_error("Invalid bind mount option: %s", word);
397 return -EINVAL;
398 }
399 }
400
401 *mount_flags = flags;
402 /* in the future mount_opts will hold string options for mount(2) */
403 *mount_opts = opts;
404
405 return 0;
406 }
407
408 static int mount_bind(const char *dest, CustomMount *m) {
409 struct stat source_st, dest_st;
410 const char *where;
411 unsigned long mount_flags = MS_BIND | MS_REC;
412 _cleanup_free_ char *mount_opts = NULL;
413 int r;
414
415 assert(m);
416
417 if (m->options) {
418 r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
419 if (r < 0)
420 return r;
421 }
422
423 if (stat(m->source, &source_st) < 0)
424 return log_error_errno(errno, "Failed to stat %s: %m", m->source);
425
426 where = prefix_roota(dest, m->destination);
427
428 if (stat(where, &dest_st) >= 0) {
429 if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
430 log_error("Cannot bind mount directory %s on file %s.", m->source, where);
431 return -EINVAL;
432 }
433
434 if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
435 log_error("Cannot bind mount file %s on directory %s.", m->source, where);
436 return -EINVAL;
437 }
438
439 } else if (errno == ENOENT) {
440 r = mkdir_parents_label(where, 0755);
441 if (r < 0)
442 return log_error_errno(r, "Failed to make parents of %s: %m", where);
443 } else {
444 return log_error_errno(errno, "Failed to stat %s: %m", where);
445 }
446
447 /* Create the mount point. Any non-directory file can be
448 * mounted on any non-directory file (regular, fifo, socket,
449 * char, block).
450 */
451 if (S_ISDIR(source_st.st_mode))
452 r = mkdir_label(where, 0755);
453 else
454 r = touch(where);
455 if (r < 0 && r != -EEXIST)
456 return log_error_errno(r, "Failed to create mount point %s: %m", where);
457
458 if (mount(m->source, where, NULL, mount_flags, mount_opts) < 0)
459 return log_error_errno(errno, "mount(%s) failed: %m", where);
460
461 if (m->read_only) {
462 r = bind_remount_recursive(where, true);
463 if (r < 0)
464 return log_error_errno(r, "Read-only bind mount failed: %m");
465 }
466
467 return 0;
468 }
469
470 static int mount_tmpfs(
471 const char *dest,
472 CustomMount *m,
473 bool userns, uid_t uid_shift, uid_t uid_range,
474 const char *selinux_apifs_context) {
475
476 const char *where, *options;
477 _cleanup_free_ char *buf = NULL;
478 int r;
479
480 assert(dest);
481 assert(m);
482
483 where = prefix_roota(dest, m->destination);
484
485 r = mkdir_p_label(where, 0755);
486 if (r < 0 && r != -EEXIST)
487 return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
488
489 r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
490 if (r < 0)
491 return log_oom();
492 options = r > 0 ? buf : m->options;
493
494 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options) < 0)
495 return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
496
497 return 0;
498 }
499
500 static char *joined_and_escaped_lower_dirs(char * const *lower) {
501 _cleanup_strv_free_ char **sv = NULL;
502
503 sv = strv_copy(lower);
504 if (!sv)
505 return NULL;
506
507 strv_reverse(sv);
508
509 if (!strv_shell_escape(sv, ",:"))
510 return NULL;
511
512 return strv_join(sv, ":");
513 }
514
515 static int mount_overlay(const char *dest, CustomMount *m) {
516 _cleanup_free_ char *lower = NULL;
517 const char *where, *options;
518 int r;
519
520 assert(dest);
521 assert(m);
522
523 where = prefix_roota(dest, m->destination);
524
525 r = mkdir_label(where, 0755);
526 if (r < 0 && r != -EEXIST)
527 return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
528
529 (void) mkdir_p_label(m->source, 0755);
530
531 lower = joined_and_escaped_lower_dirs(m->lower);
532 if (!lower)
533 return log_oom();
534
535 if (m->read_only) {
536 _cleanup_free_ char *escaped_source = NULL;
537
538 escaped_source = shell_escape(m->source, ",:");
539 if (!escaped_source)
540 return log_oom();
541
542 options = strjoina("lowerdir=", escaped_source, ":", lower);
543 } else {
544 _cleanup_free_ char *escaped_source = NULL, *escaped_work_dir = NULL;
545
546 assert(m->work_dir);
547 (void) mkdir_label(m->work_dir, 0700);
548
549 escaped_source = shell_escape(m->source, ",:");
550 if (!escaped_source)
551 return log_oom();
552 escaped_work_dir = shell_escape(m->work_dir, ",:");
553 if (!escaped_work_dir)
554 return log_oom();
555
556 options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
557 }
558
559 if (mount("overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options) < 0)
560 return log_error_errno(errno, "overlay mount to %s failed: %m", where);
561
562 return 0;
563 }
564
565 int mount_custom(
566 const char *dest,
567 CustomMount *mounts, unsigned n,
568 bool userns, uid_t uid_shift, uid_t uid_range,
569 const char *selinux_apifs_context) {
570
571 unsigned i;
572 int r;
573
574 assert(dest);
575
576 for (i = 0; i < n; i++) {
577 CustomMount *m = mounts + i;
578
579 switch (m->type) {
580
581 case CUSTOM_MOUNT_BIND:
582 r = mount_bind(dest, m);
583 break;
584
585 case CUSTOM_MOUNT_TMPFS:
586 r = mount_tmpfs(dest, m, userns, uid_shift, uid_range, selinux_apifs_context);
587 break;
588
589 case CUSTOM_MOUNT_OVERLAY:
590 r = mount_overlay(dest, m);
591 break;
592
593 default:
594 assert_not_reached("Unknown custom mount type");
595 }
596
597 if (r < 0)
598 return r;
599 }
600
601 return 0;
602 }
603
604 static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
605 char *to;
606 int r;
607
608 to = strjoina(strempty(dest), "/sys/fs/cgroup/", hierarchy);
609
610 r = path_is_mount_point(to, 0);
611 if (r < 0 && r != -ENOENT)
612 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
613 if (r > 0)
614 return 0;
615
616 mkdir_p(to, 0755);
617
618 /* The superblock mount options of the mount point need to be
619 * identical to the hosts', and hence writable... */
620 if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
621 return log_error_errno(errno, "Failed to mount to %s: %m", to);
622
623 /* ... hence let's only make the bind mount read-only, not the
624 * superblock. */
625 if (read_only) {
626 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
627 return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
628 }
629 return 1;
630 }
631
632 static int mount_legacy_cgroups(
633 const char *dest,
634 bool userns, uid_t uid_shift, uid_t uid_range,
635 const char *selinux_apifs_context) {
636
637 _cleanup_set_free_free_ Set *controllers = NULL;
638 const char *cgroup_root;
639 int r;
640
641 cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
642
643 (void) mkdir_p(cgroup_root, 0755);
644
645 /* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
646 r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
647 if (r < 0)
648 return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
649 if (r == 0) {
650 _cleanup_free_ char *options = NULL;
651
652 r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
653 if (r < 0)
654 return log_oom();
655
656 if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
657 return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
658 }
659
660 if (cg_unified() > 0)
661 goto skip_controllers;
662
663 controllers = set_new(&string_hash_ops);
664 if (!controllers)
665 return log_oom();
666
667 r = cg_kernel_controllers(controllers);
668 if (r < 0)
669 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
670
671 for (;;) {
672 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
673
674 controller = set_steal_first(controllers);
675 if (!controller)
676 break;
677
678 origin = prefix_root("/sys/fs/cgroup/", controller);
679 if (!origin)
680 return log_oom();
681
682 r = readlink_malloc(origin, &combined);
683 if (r == -EINVAL) {
684 /* Not a symbolic link, but directly a single cgroup hierarchy */
685
686 r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
687 if (r < 0)
688 return r;
689
690 } else if (r < 0)
691 return log_error_errno(r, "Failed to read link %s: %m", origin);
692 else {
693 _cleanup_free_ char *target = NULL;
694
695 target = prefix_root(dest, origin);
696 if (!target)
697 return log_oom();
698
699 /* A symbolic link, a combination of controllers in one hierarchy */
700
701 if (!filename_is_valid(combined)) {
702 log_warning("Ignoring invalid combined hierarchy %s.", combined);
703 continue;
704 }
705
706 r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
707 if (r < 0)
708 return r;
709
710 r = symlink_idempotent(combined, target);
711 if (r == -EINVAL) {
712 log_error("Invalid existing symlink for combined hierarchy");
713 return r;
714 }
715 if (r < 0)
716 return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
717 }
718 }
719
720 skip_controllers:
721 r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
722 if (r < 0)
723 return r;
724
725 if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
726 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
727
728 return 0;
729 }
730
731 static int mount_unified_cgroups(const char *dest) {
732 const char *p;
733 int r;
734
735 assert(dest);
736
737 p = prefix_roota(dest, "/sys/fs/cgroup");
738
739 (void) mkdir_p(p, 0755);
740
741 r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
742 if (r < 0)
743 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
744 if (r > 0) {
745 p = prefix_roota(dest, "/sys/fs/cgroup/cgroup.procs");
746 if (access(p, F_OK) >= 0)
747 return 0;
748 if (errno != ENOENT)
749 return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
750
751 log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
752 return -EINVAL;
753 }
754
755 if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
756 return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
757
758 return 0;
759 }
760
/* Mounts the cgroup file systems below dest: the unified hierarchy if
 * unified_requested is true, the legacy hierarchies otherwise. The remaining
 * parameters are only passed on to the legacy setup. Returns whatever the selected
 * setup function returns. */
int mount_cgroups(
                const char *dest,
                bool unified_requested,
                bool userns, uid_t uid_shift, uid_t uid_range,
                const char *selinux_apifs_context) {

        if (!unified_requested)
                return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);

        return mount_unified_cgroups(dest);
}
772
773 int mount_systemd_cgroup_writable(
774 const char *dest,
775 bool unified_requested) {
776
777 _cleanup_free_ char *own_cgroup_path = NULL;
778 const char *systemd_root, *systemd_own;
779 int r;
780
781 assert(dest);
782
783 r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
784 if (r < 0)
785 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
786
787 /* If we are living in the top-level, then there's nothing to do... */
788 if (path_equal(own_cgroup_path, "/"))
789 return 0;
790
791 if (unified_requested) {
792 systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
793 systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
794 } else {
795 systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
796 systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
797 }
798
799 /* Make our own cgroup a (writable) bind mount */
800 if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
801 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
802
803 /* And then remount the systemd cgroup root read-only */
804 if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
805 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
806
807 return 0;
808 }
809
810 int setup_volatile_state(
811 const char *directory,
812 VolatileMode mode,
813 bool userns, uid_t uid_shift, uid_t uid_range,
814 const char *selinux_apifs_context) {
815
816 _cleanup_free_ char *buf = NULL;
817 const char *p, *options;
818 int r;
819
820 assert(directory);
821
822 if (mode != VOLATILE_STATE)
823 return 0;
824
825 /* --volatile=state means we simply overmount /var
826 with a tmpfs, and the rest read-only. */
827
828 r = bind_remount_recursive(directory, true);
829 if (r < 0)
830 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
831
832 p = prefix_roota(directory, "/var");
833 r = mkdir(p, 0755);
834 if (r < 0 && errno != EEXIST)
835 return log_error_errno(errno, "Failed to create %s: %m", directory);
836
837 options = "mode=755";
838 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
839 if (r < 0)
840 return log_oom();
841 if (r > 0)
842 options = buf;
843
844 if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, options) < 0)
845 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
846
847 return 0;
848 }
849
850 int setup_volatile(
851 const char *directory,
852 VolatileMode mode,
853 bool userns, uid_t uid_shift, uid_t uid_range,
854 const char *selinux_apifs_context) {
855
856 bool tmpfs_mounted = false, bind_mounted = false;
857 char template[] = "/tmp/nspawn-volatile-XXXXXX";
858 _cleanup_free_ char *buf = NULL;
859 const char *f, *t, *options;
860 int r;
861
862 assert(directory);
863
864 if (mode != VOLATILE_YES)
865 return 0;
866
867 /* --volatile=yes means we mount a tmpfs to the root dir, and
868 the original /usr to use inside it, and that read-only. */
869
870 if (!mkdtemp(template))
871 return log_error_errno(errno, "Failed to create temporary directory: %m");
872
873 options = "mode=755";
874 r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
875 if (r < 0)
876 return log_oom();
877 if (r > 0)
878 options = buf;
879
880 if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, options) < 0) {
881 r = log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
882 goto fail;
883 }
884
885 tmpfs_mounted = true;
886
887 f = prefix_roota(directory, "/usr");
888 t = prefix_roota(template, "/usr");
889
890 r = mkdir(t, 0755);
891 if (r < 0 && errno != EEXIST) {
892 r = log_error_errno(errno, "Failed to create %s: %m", t);
893 goto fail;
894 }
895
896 if (mount(f, t, NULL, MS_BIND|MS_REC, NULL) < 0) {
897 r = log_error_errno(errno, "Failed to create /usr bind mount: %m");
898 goto fail;
899 }
900
901 bind_mounted = true;
902
903 r = bind_remount_recursive(t, true);
904 if (r < 0) {
905 log_error_errno(r, "Failed to remount %s read-only: %m", t);
906 goto fail;
907 }
908
909 if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
910 r = log_error_errno(errno, "Failed to move root mount: %m");
911 goto fail;
912 }
913
914 (void) rmdir(template);
915
916 return 0;
917
918 fail:
919 if (bind_mounted)
920 (void) umount(t);
921
922 if (tmpfs_mounted)
923 (void) umount(template);
924 (void) rmdir(template);
925 return r;
926 }
927
928 VolatileMode volatile_mode_from_string(const char *s) {
929 int b;
930
931 if (isempty(s))
932 return _VOLATILE_MODE_INVALID;
933
934 b = parse_boolean(s);
935 if (b > 0)
936 return VOLATILE_YES;
937 if (b == 0)
938 return VOLATILE_NO;
939
940 if (streq(s, "state"))
941 return VOLATILE_STATE;
942
943 return _VOLATILE_MODE_INVALID;
944 }