1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/mount.h>
31 #include "alloc-util.h"
32 #include "dev-setup.h"
34 #include "loopback-setup.h"
37 #include "mount-util.h"
38 #include "namespace.h"
39 #include "path-util.h"
40 #include "selinux-util.h"
41 #include "socket-util.h"
42 #include "string-table.h"
43 #include "string-util.h"
45 #include "umask-util.h"
46 #include "user-util.h"
49 typedef enum MountMode
{
50 /* This is ordered by priority! */
60 typedef struct BindMount
{
67 static int append_mounts(BindMount
**p
, char **strv
, MountMode mode
) {
72 STRV_FOREACH(i
, strv
) {
77 if ((mode
== INACCESSIBLE
|| mode
== READONLY
|| mode
== READWRITE
) && (*i
)[0] == '-') {
82 if (!path_is_absolute(*i
))
93 static int mount_path_compare(const void *a
, const void *b
) {
94 const BindMount
*p
= a
, *q
= b
;
97 d
= path_compare(p
->path
, q
->path
);
100 /* If the paths are equal, check the mode */
101 if (p
->mode
< q
->mode
)
104 if (p
->mode
> q
->mode
)
110 /* If the paths are not equal, then order prefixes first */
114 static void drop_duplicates(BindMount
*m
, unsigned *n
) {
115 BindMount
*f
, *t
, *previous
;
120 for (f
= m
, t
= m
, previous
= NULL
; f
< m
+*n
; f
++) {
122 /* The first one wins */
123 if (previous
&& path_equal(f
->path
, previous
->path
))
136 static int mount_dev(BindMount
*m
) {
137 static const char devnodes
[] =
145 char temporary_mount
[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d
, *dev
= NULL
, *devpts
= NULL
, *devshm
= NULL
, *devhugepages
= NULL
, *devmqueue
= NULL
, *devlog
= NULL
, *devptmx
= NULL
;
147 _cleanup_umask_ mode_t u
;
154 if (!mkdtemp(temporary_mount
))
157 dev
= strjoina(temporary_mount
, "/dev");
158 (void) mkdir(dev
, 0755);
159 if (mount("tmpfs", dev
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=755") < 0) {
164 devpts
= strjoina(temporary_mount
, "/dev/pts");
165 (void) mkdir(devpts
, 0755);
166 if (mount("/dev/pts", devpts
, NULL
, MS_BIND
, NULL
) < 0) {
171 devptmx
= strjoina(temporary_mount
, "/dev/ptmx");
172 if (symlink("pts/ptmx", devptmx
) < 0) {
177 devshm
= strjoina(temporary_mount
, "/dev/shm");
178 (void) mkdir(devshm
, 01777);
179 r
= mount("/dev/shm", devshm
, NULL
, MS_BIND
, NULL
);
185 devmqueue
= strjoina(temporary_mount
, "/dev/mqueue");
186 (void) mkdir(devmqueue
, 0755);
187 (void) mount("/dev/mqueue", devmqueue
, NULL
, MS_BIND
, NULL
);
189 devhugepages
= strjoina(temporary_mount
, "/dev/hugepages");
190 (void) mkdir(devhugepages
, 0755);
191 (void) mount("/dev/hugepages", devhugepages
, NULL
, MS_BIND
, NULL
);
193 devlog
= strjoina(temporary_mount
, "/dev/log");
194 (void) symlink("/run/systemd/journal/dev-log", devlog
);
196 NULSTR_FOREACH(d
, devnodes
) {
197 _cleanup_free_
char *dn
= NULL
;
210 if (!S_ISBLK(st
.st_mode
) &&
211 !S_ISCHR(st
.st_mode
)) {
219 dn
= strappend(temporary_mount
, d
);
225 mac_selinux_create_file_prepare(d
, st
.st_mode
);
226 r
= mknod(dn
, st
.st_mode
, st
.st_rdev
);
227 mac_selinux_create_file_clear();
235 dev_setup(temporary_mount
, UID_INVALID
, GID_INVALID
);
237 /* Create the /dev directory if missing. It is more likely to be
238 * missing when the service is started with RootDirectory. This is
239 * consistent with mount units creating the mount points when missing.
241 (void) mkdir_p_label(m
->path
, 0755);
243 if (mount(dev
, m
->path
, NULL
, MS_MOVE
, NULL
) < 0) {
249 rmdir(temporary_mount
);
261 umount(devhugepages
);
268 rmdir(temporary_mount
);
273 static int mount_kdbus(BindMount
*m
) {
275 char temporary_mount
[] = "/tmp/kdbus-dev-XXXXXX";
276 _cleanup_free_
char *basepath
= NULL
;
277 _cleanup_umask_ mode_t u
;
278 char *busnode
= NULL
, *root
;
286 if (!mkdtemp(temporary_mount
))
287 return log_error_errno(errno
, "Failed create temp dir: %m");
289 root
= strjoina(temporary_mount
, "/kdbus");
290 (void) mkdir(root
, 0755);
291 if (mount("tmpfs", root
, "tmpfs", MS_NOSUID
|MS_STRICTATIME
, "mode=777") < 0) {
296 /* create a new /dev/null dev node copy so we have some fodder to
297 * bind-mount the custom endpoint over. */
298 if (stat("/dev/null", &st
) < 0) {
299 r
= log_error_errno(errno
, "Failed to stat /dev/null: %m");
303 busnode
= strjoina(root
, "/bus");
304 if (mknod(busnode
, (st
.st_mode
& ~07777) | 0600, st
.st_rdev
) < 0) {
305 r
= log_error_errno(errno
, "mknod() for %s failed: %m",
310 r
= mount(m
->path
, busnode
, NULL
, MS_BIND
, NULL
);
312 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
317 basepath
= dirname_malloc(m
->path
);
323 if (mount(root
, basepath
, NULL
, MS_MOVE
, NULL
) < 0) {
324 r
= log_error_errno(errno
, "bind mount of %s failed: %m",
329 rmdir(temporary_mount
);
340 rmdir(temporary_mount
);
345 static int apply_mount(
348 const char *var_tmp_dir
) {
359 /* First, get rid of everything that is below if there
360 * is anything... Then, overmount it with an
361 * inaccessible directory. */
362 umount_recursive(m
->path
, 0);
364 what
= "/run/systemd/inaccessible";
369 /* Nothing to mount here, we just later toggle the
370 * MS_RDONLY bit for the mount point */
377 case PRIVATE_VAR_TMP
:
384 case PRIVATE_BUS_ENDPOINT
:
385 return mount_kdbus(m
);
388 assert_not_reached("Unknown mode");
393 r
= mount(what
, m
->path
, NULL
, MS_BIND
|MS_REC
, NULL
);
395 log_debug("Successfully mounted %s to %s", what
, m
->path
);
396 else if (m
->ignore
&& errno
== ENOENT
)
402 static int make_read_only(BindMount
*m
) {
407 if (IN_SET(m
->mode
, INACCESSIBLE
, READONLY
))
408 r
= bind_remount_recursive(m
->path
, true);
409 else if (IN_SET(m
->mode
, READWRITE
, PRIVATE_TMP
, PRIVATE_VAR_TMP
, PRIVATE_DEV
))
410 r
= bind_remount_recursive(m
->path
, false);
414 if (m
->ignore
&& r
== -ENOENT
)
421 const char* root_directory
,
422 char** read_write_dirs
,
423 char** read_only_dirs
,
424 char** inaccessible_dirs
,
426 const char* var_tmp_dir
,
427 const char* bus_endpoint_path
,
429 ProtectHome protect_home
,
430 ProtectSystem protect_system
,
431 unsigned long mount_flags
) {
433 BindMount
*m
, *mounts
= NULL
;
437 if (mount_flags
== 0)
438 mount_flags
= MS_SHARED
;
440 if (unshare(CLONE_NEWNS
) < 0)
443 n
= !!tmp_dir
+ !!var_tmp_dir
+ !!bus_endpoint_path
+
444 strv_length(read_write_dirs
) +
445 strv_length(read_only_dirs
) +
446 strv_length(inaccessible_dirs
) +
448 (protect_home
!= PROTECT_HOME_NO
? 3 : 0) +
449 (protect_system
!= PROTECT_SYSTEM_NO
? 2 : 0) +
450 (protect_system
== PROTECT_SYSTEM_FULL
? 1 : 0);
453 m
= mounts
= (BindMount
*) alloca0(n
* sizeof(BindMount
));
454 r
= append_mounts(&m
, read_write_dirs
, READWRITE
);
458 r
= append_mounts(&m
, read_only_dirs
, READONLY
);
462 r
= append_mounts(&m
, inaccessible_dirs
, INACCESSIBLE
);
467 m
->path
= prefix_roota(root_directory
, "/tmp");
468 m
->mode
= PRIVATE_TMP
;
473 m
->path
= prefix_roota(root_directory
, "/var/tmp");
474 m
->mode
= PRIVATE_VAR_TMP
;
479 m
->path
= prefix_roota(root_directory
, "/dev");
480 m
->mode
= PRIVATE_DEV
;
484 if (bus_endpoint_path
) {
485 m
->path
= prefix_roota(root_directory
, bus_endpoint_path
);
486 m
->mode
= PRIVATE_BUS_ENDPOINT
;
490 if (protect_home
!= PROTECT_HOME_NO
) {
491 const char *home_dir
, *run_user_dir
, *root_dir
;
493 home_dir
= prefix_roota(root_directory
, "/home");
494 home_dir
= strjoina("-", home_dir
);
495 run_user_dir
= prefix_roota(root_directory
, "/run/user");
496 run_user_dir
= strjoina("-", run_user_dir
);
497 root_dir
= prefix_roota(root_directory
, "/root");
498 root_dir
= strjoina("-", root_dir
);
500 r
= append_mounts(&m
, STRV_MAKE(home_dir
, run_user_dir
, root_dir
),
501 protect_home
== PROTECT_HOME_READ_ONLY
? READONLY
: INACCESSIBLE
);
506 if (protect_system
!= PROTECT_SYSTEM_NO
) {
507 const char *usr_dir
, *boot_dir
, *etc_dir
;
509 usr_dir
= prefix_roota(root_directory
, "/usr");
510 boot_dir
= prefix_roota(root_directory
, "/boot");
511 boot_dir
= strjoina("-", boot_dir
);
512 etc_dir
= prefix_roota(root_directory
, "/etc");
514 r
= append_mounts(&m
, protect_system
== PROTECT_SYSTEM_FULL
515 ? STRV_MAKE(usr_dir
, boot_dir
, etc_dir
)
516 : STRV_MAKE(usr_dir
, boot_dir
), READONLY
);
521 assert(mounts
+ n
== m
);
523 qsort(mounts
, n
, sizeof(BindMount
), mount_path_compare
);
524 drop_duplicates(mounts
, &n
);
527 if (n
> 0 || root_directory
) {
528 /* Remount / as SLAVE so that nothing now mounted in the namespace
529 shows up in the parent */
530 if (mount(NULL
, "/", NULL
, MS_SLAVE
|MS_REC
, NULL
) < 0)
534 if (root_directory
) {
535 /* Turn directory into bind mount */
536 if (mount(root_directory
, root_directory
, NULL
, MS_BIND
|MS_REC
, NULL
) < 0)
541 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
542 r
= apply_mount(m
, tmp_dir
, var_tmp_dir
);
547 for (m
= mounts
; m
< mounts
+ n
; ++m
) {
548 r
= make_read_only(m
);
554 if (root_directory
) {
555 /* MS_MOVE does not work on MS_SHARED mounts, so the remount to MS_SHARED is done later */
556 r
= mount_move_root(root_directory
);
558 /* at this point, we cannot rollback */
563 /* Remount / as the desired mode. Note that this will not
564 * reestablish propagation from our side to the host, since
565 * what's disconnected is disconnected. */
566 if (mount(NULL
, "/", NULL
, mount_flags
| MS_REC
, NULL
) < 0)
567 /* at this point, we cannot rollback */
574 for (m
= mounts
; m
< mounts
+ n
; ++m
)
576 (void) umount2(m
->path
, MNT_DETACH
);
582 static int setup_one_tmp_dir(const char *id
, const char *prefix
, char **path
) {
583 _cleanup_free_
char *x
= NULL
;
584 char bid
[SD_ID128_STRING_MAX
];
592 /* We include the boot id in the directory so that after a
593 * reboot we can easily identify obsolete directories. */
595 r
= sd_id128_get_boot(&boot_id
);
599 x
= strjoin(prefix
, "/systemd-private-", sd_id128_to_string(boot_id
, bid
), "-", id
, "-XXXXXX", NULL
);
607 RUN_WITH_UMASK(0000) {
610 y
= strjoina(x
, "/tmp");
612 if (mkdir(y
, 0777 | S_ISVTX
) < 0)
622 int setup_tmp_dirs(const char *id
, char **tmp_dir
, char **var_tmp_dir
) {
630 r
= setup_one_tmp_dir(id
, "/tmp", &a
);
634 r
= setup_one_tmp_dir(id
, "/var/tmp", &b
);
638 t
= strjoina(a
, "/tmp");
652 int setup_netns(int netns_storage_socket
[2]) {
653 _cleanup_close_
int netns
= -1;
656 assert(netns_storage_socket
);
657 assert(netns_storage_socket
[0] >= 0);
658 assert(netns_storage_socket
[1] >= 0);
660 /* We use the passed socketpair as a storage buffer for our
661 * namespace reference fd. Whatever process runs this first
662 * shall create a new namespace, all others should just join
663 * it. To serialize that we use a file lock on the socket
666 * It's a bit crazy, but hey, works great! */
668 if (lockf(netns_storage_socket
[0], F_LOCK
, 0) < 0)
671 netns
= receive_one_fd(netns_storage_socket
[0], MSG_DONTWAIT
);
672 if (netns
== -EAGAIN
) {
673 /* Nothing stored yet, so let's create a new namespace */
675 if (unshare(CLONE_NEWNET
) < 0) {
682 netns
= open("/proc/self/ns/net", O_RDONLY
|O_CLOEXEC
|O_NOCTTY
);
690 } else if (netns
< 0) {
695 /* Yay, found something, so let's join the namespace */
696 if (setns(netns
, CLONE_NEWNET
) < 0) {
704 q
= send_one_fd(netns_storage_socket
[1], netns
, MSG_DONTWAIT
);
711 lockf(netns_storage_socket
[0], F_ULOCK
, 0);
715 static const char *const protect_home_table
[_PROTECT_HOME_MAX
] = {
716 [PROTECT_HOME_NO
] = "no",
717 [PROTECT_HOME_YES
] = "yes",
718 [PROTECT_HOME_READ_ONLY
] = "read-only",
721 DEFINE_STRING_TABLE_LOOKUP(protect_home
, ProtectHome
);
723 static const char *const protect_system_table
[_PROTECT_SYSTEM_MAX
] = {
724 [PROTECT_SYSTEM_NO
] = "no",
725 [PROTECT_SYSTEM_YES
] = "yes",
726 [PROTECT_SYSTEM_FULL
] = "full",
729 DEFINE_STRING_TABLE_LOOKUP(protect_system
, ProtectSystem
);