]> git.proxmox.com Git - systemd.git/blob - src/core/namespace.c
Enable seccomp support on powerpc, ppc64el, and s390x
[systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sched.h>
24 #include <stdio.h>
25 #include <string.h>
26 #include <sys/mount.h>
27 #include <sys/stat.h>
28 #include <unistd.h>
29 #include <linux/fs.h>
30
31 #include "alloc-util.h"
32 #include "dev-setup.h"
33 #include "fd-util.h"
34 #include "loopback-setup.h"
35 #include "missing.h"
36 #include "mkdir.h"
37 #include "mount-util.h"
38 #include "namespace.h"
39 #include "path-util.h"
40 #include "selinux-util.h"
41 #include "socket-util.h"
42 #include "string-table.h"
43 #include "string-util.h"
44 #include "strv.h"
45 #include "umask-util.h"
46 #include "user-util.h"
47 #include "util.h"
48
/* What setup_namespace() does with a given path inside the new mount
 * namespace.
 *
 * The declaration order doubles as a priority: mount_path_compare() sorts
 * entries for the same path by ascending mode and drop_duplicates() keeps
 * only the first one, so an earlier enumerator overrides a later one. */
typedef enum MountMode {
        INACCESSIBLE,          /* overmounted with /run/systemd/inaccessible */
        READONLY,              /* remounted read-only */
        PRIVATE_TMP,           /* bind mount of the private /tmp directory */
        PRIVATE_VAR_TMP,       /* bind mount of the private /var/tmp directory */
        PRIVATE_DEV,           /* freshly built private /dev (see mount_dev()) */
        PRIVATE_BUS_ENDPOINT,  /* custom kdbus endpoint (see mount_kdbus()) */
        READWRITE,             /* remounted read-write */
} MountMode;
59
60 typedef struct BindMount {
61 const char *path;
62 MountMode mode;
63 bool done;
64 bool ignore;
65 } BindMount;
66
67 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
68 char **i;
69
70 assert(p);
71
72 STRV_FOREACH(i, strv) {
73
74 (*p)->ignore = false;
75 (*p)->done = false;
76
77 if ((mode == INACCESSIBLE || mode == READONLY || mode == READWRITE) && (*i)[0] == '-') {
78 (*p)->ignore = true;
79 (*i)++;
80 }
81
82 if (!path_is_absolute(*i))
83 return -EINVAL;
84
85 (*p)->path = *i;
86 (*p)->mode = mode;
87 (*p)++;
88 }
89
90 return 0;
91 }
92
93 static int mount_path_compare(const void *a, const void *b) {
94 const BindMount *p = a, *q = b;
95 int d;
96
97 d = path_compare(p->path, q->path);
98
99 if (d == 0) {
100 /* If the paths are equal, check the mode */
101 if (p->mode < q->mode)
102 return -1;
103
104 if (p->mode > q->mode)
105 return 1;
106
107 return 0;
108 }
109
110 /* If the paths are not equal, then order prefixes first */
111 return d;
112 }
113
114 static void drop_duplicates(BindMount *m, unsigned *n) {
115 BindMount *f, *t, *previous;
116
117 assert(m);
118 assert(n);
119
120 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
121
122 /* The first one wins */
123 if (previous && path_equal(f->path, previous->path))
124 continue;
125
126 *t = *f;
127
128 previous = t;
129
130 t++;
131 }
132
133 *n = t - m;
134 }
135
136 static int mount_dev(BindMount *m) {
137 static const char devnodes[] =
138 "/dev/null\0"
139 "/dev/zero\0"
140 "/dev/full\0"
141 "/dev/random\0"
142 "/dev/urandom\0"
143 "/dev/tty\0";
144
145 char temporary_mount[] = "/tmp/namespace-dev-XXXXXX";
146 const char *d, *dev = NULL, *devpts = NULL, *devshm = NULL, *devhugepages = NULL, *devmqueue = NULL, *devlog = NULL, *devptmx = NULL;
147 _cleanup_umask_ mode_t u;
148 int r;
149
150 assert(m);
151
152 u = umask(0000);
153
154 if (!mkdtemp(temporary_mount))
155 return -errno;
156
157 dev = strjoina(temporary_mount, "/dev");
158 (void) mkdir(dev, 0755);
159 if (mount("tmpfs", dev, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=755") < 0) {
160 r = -errno;
161 goto fail;
162 }
163
164 devpts = strjoina(temporary_mount, "/dev/pts");
165 (void) mkdir(devpts, 0755);
166 if (mount("/dev/pts", devpts, NULL, MS_BIND, NULL) < 0) {
167 r = -errno;
168 goto fail;
169 }
170
171 devptmx = strjoina(temporary_mount, "/dev/ptmx");
172 if (symlink("pts/ptmx", devptmx) < 0) {
173 r = -errno;
174 goto fail;
175 }
176
177 devshm = strjoina(temporary_mount, "/dev/shm");
178 (void) mkdir(devshm, 01777);
179 r = mount("/dev/shm", devshm, NULL, MS_BIND, NULL);
180 if (r < 0) {
181 r = -errno;
182 goto fail;
183 }
184
185 devmqueue = strjoina(temporary_mount, "/dev/mqueue");
186 (void) mkdir(devmqueue, 0755);
187 (void) mount("/dev/mqueue", devmqueue, NULL, MS_BIND, NULL);
188
189 devhugepages = strjoina(temporary_mount, "/dev/hugepages");
190 (void) mkdir(devhugepages, 0755);
191 (void) mount("/dev/hugepages", devhugepages, NULL, MS_BIND, NULL);
192
193 devlog = strjoina(temporary_mount, "/dev/log");
194 (void) symlink("/run/systemd/journal/dev-log", devlog);
195
196 NULSTR_FOREACH(d, devnodes) {
197 _cleanup_free_ char *dn = NULL;
198 struct stat st;
199
200 r = stat(d, &st);
201 if (r < 0) {
202
203 if (errno == ENOENT)
204 continue;
205
206 r = -errno;
207 goto fail;
208 }
209
210 if (!S_ISBLK(st.st_mode) &&
211 !S_ISCHR(st.st_mode)) {
212 r = -EINVAL;
213 goto fail;
214 }
215
216 if (st.st_rdev == 0)
217 continue;
218
219 dn = strappend(temporary_mount, d);
220 if (!dn) {
221 r = -ENOMEM;
222 goto fail;
223 }
224
225 mac_selinux_create_file_prepare(d, st.st_mode);
226 r = mknod(dn, st.st_mode, st.st_rdev);
227 mac_selinux_create_file_clear();
228
229 if (r < 0) {
230 r = -errno;
231 goto fail;
232 }
233 }
234
235 dev_setup(temporary_mount, UID_INVALID, GID_INVALID);
236
237 /* Create the /dev directory if missing. It is more likely to be
238 * missing when the service is started with RootDirectory. This is
239 * consistent with mount units creating the mount points when missing.
240 */
241 (void) mkdir_p_label(m->path, 0755);
242
243 if (mount(dev, m->path, NULL, MS_MOVE, NULL) < 0) {
244 r = -errno;
245 goto fail;
246 }
247
248 rmdir(dev);
249 rmdir(temporary_mount);
250
251 return 0;
252
253 fail:
254 if (devpts)
255 umount(devpts);
256
257 if (devshm)
258 umount(devshm);
259
260 if (devhugepages)
261 umount(devhugepages);
262
263 if (devmqueue)
264 umount(devmqueue);
265
266 umount(dev);
267 rmdir(dev);
268 rmdir(temporary_mount);
269
270 return r;
271 }
272
273 static int mount_kdbus(BindMount *m) {
274
275 char temporary_mount[] = "/tmp/kdbus-dev-XXXXXX";
276 _cleanup_free_ char *basepath = NULL;
277 _cleanup_umask_ mode_t u;
278 char *busnode = NULL, *root;
279 struct stat st;
280 int r;
281
282 assert(m);
283
284 u = umask(0000);
285
286 if (!mkdtemp(temporary_mount))
287 return log_error_errno(errno, "Failed create temp dir: %m");
288
289 root = strjoina(temporary_mount, "/kdbus");
290 (void) mkdir(root, 0755);
291 if (mount("tmpfs", root, "tmpfs", MS_NOSUID|MS_STRICTATIME, "mode=777") < 0) {
292 r = -errno;
293 goto fail;
294 }
295
296 /* create a new /dev/null dev node copy so we have some fodder to
297 * bind-mount the custom endpoint over. */
298 if (stat("/dev/null", &st) < 0) {
299 r = log_error_errno(errno, "Failed to stat /dev/null: %m");
300 goto fail;
301 }
302
303 busnode = strjoina(root, "/bus");
304 if (mknod(busnode, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
305 r = log_error_errno(errno, "mknod() for %s failed: %m",
306 busnode);
307 goto fail;
308 }
309
310 r = mount(m->path, busnode, NULL, MS_BIND, NULL);
311 if (r < 0) {
312 r = log_error_errno(errno, "bind mount of %s failed: %m",
313 m->path);
314 goto fail;
315 }
316
317 basepath = dirname_malloc(m->path);
318 if (!basepath) {
319 r = -ENOMEM;
320 goto fail;
321 }
322
323 if (mount(root, basepath, NULL, MS_MOVE, NULL) < 0) {
324 r = log_error_errno(errno, "bind mount of %s failed: %m",
325 basepath);
326 goto fail;
327 }
328
329 rmdir(temporary_mount);
330 return 0;
331
332 fail:
333 if (busnode) {
334 umount(busnode);
335 unlink(busnode);
336 }
337
338 umount(root);
339 rmdir(root);
340 rmdir(temporary_mount);
341
342 return r;
343 }
344
345 static int apply_mount(
346 BindMount *m,
347 const char *tmp_dir,
348 const char *var_tmp_dir) {
349
350 const char *what;
351 int r;
352
353 assert(m);
354
355 switch (m->mode) {
356
357 case INACCESSIBLE:
358
359 /* First, get rid of everything that is below if there
360 * is anything... Then, overmount it with an
361 * inaccessible directory. */
362 umount_recursive(m->path, 0);
363
364 what = "/run/systemd/inaccessible";
365 break;
366
367 case READONLY:
368 case READWRITE:
369 /* Nothing to mount here, we just later toggle the
370 * MS_RDONLY bit for the mount point */
371 return 0;
372
373 case PRIVATE_TMP:
374 what = tmp_dir;
375 break;
376
377 case PRIVATE_VAR_TMP:
378 what = var_tmp_dir;
379 break;
380
381 case PRIVATE_DEV:
382 return mount_dev(m);
383
384 case PRIVATE_BUS_ENDPOINT:
385 return mount_kdbus(m);
386
387 default:
388 assert_not_reached("Unknown mode");
389 }
390
391 assert(what);
392
393 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
394 if (r >= 0)
395 log_debug("Successfully mounted %s to %s", what, m->path);
396 else if (m->ignore && errno == ENOENT)
397 return 0;
398
399 return r;
400 }
401
402 static int make_read_only(BindMount *m) {
403 int r;
404
405 assert(m);
406
407 if (IN_SET(m->mode, INACCESSIBLE, READONLY))
408 r = bind_remount_recursive(m->path, true);
409 else if (IN_SET(m->mode, READWRITE, PRIVATE_TMP, PRIVATE_VAR_TMP, PRIVATE_DEV))
410 r = bind_remount_recursive(m->path, false);
411 else
412 r = 0;
413
414 if (m->ignore && r == -ENOENT)
415 return 0;
416
417 return r;
418 }
419
420 int setup_namespace(
421 const char* root_directory,
422 char** read_write_dirs,
423 char** read_only_dirs,
424 char** inaccessible_dirs,
425 const char* tmp_dir,
426 const char* var_tmp_dir,
427 const char* bus_endpoint_path,
428 bool private_dev,
429 ProtectHome protect_home,
430 ProtectSystem protect_system,
431 unsigned long mount_flags) {
432
433 BindMount *m, *mounts = NULL;
434 unsigned n;
435 int r = 0;
436
437 if (mount_flags == 0)
438 mount_flags = MS_SHARED;
439
440 if (unshare(CLONE_NEWNS) < 0)
441 return -errno;
442
443 n = !!tmp_dir + !!var_tmp_dir + !!bus_endpoint_path +
444 strv_length(read_write_dirs) +
445 strv_length(read_only_dirs) +
446 strv_length(inaccessible_dirs) +
447 private_dev +
448 (protect_home != PROTECT_HOME_NO ? 3 : 0) +
449 (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
450 (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
451
452 if (n > 0) {
453 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
454 r = append_mounts(&m, read_write_dirs, READWRITE);
455 if (r < 0)
456 return r;
457
458 r = append_mounts(&m, read_only_dirs, READONLY);
459 if (r < 0)
460 return r;
461
462 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
463 if (r < 0)
464 return r;
465
466 if (tmp_dir) {
467 m->path = prefix_roota(root_directory, "/tmp");
468 m->mode = PRIVATE_TMP;
469 m++;
470 }
471
472 if (var_tmp_dir) {
473 m->path = prefix_roota(root_directory, "/var/tmp");
474 m->mode = PRIVATE_VAR_TMP;
475 m++;
476 }
477
478 if (private_dev) {
479 m->path = prefix_roota(root_directory, "/dev");
480 m->mode = PRIVATE_DEV;
481 m++;
482 }
483
484 if (bus_endpoint_path) {
485 m->path = prefix_roota(root_directory, bus_endpoint_path);
486 m->mode = PRIVATE_BUS_ENDPOINT;
487 m++;
488 }
489
490 if (protect_home != PROTECT_HOME_NO) {
491 const char *home_dir, *run_user_dir, *root_dir;
492
493 home_dir = prefix_roota(root_directory, "/home");
494 home_dir = strjoina("-", home_dir);
495 run_user_dir = prefix_roota(root_directory, "/run/user");
496 run_user_dir = strjoina("-", run_user_dir);
497 root_dir = prefix_roota(root_directory, "/root");
498 root_dir = strjoina("-", root_dir);
499
500 r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
501 protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
502 if (r < 0)
503 return r;
504 }
505
506 if (protect_system != PROTECT_SYSTEM_NO) {
507 const char *usr_dir, *boot_dir, *etc_dir;
508
509 usr_dir = prefix_roota(root_directory, "/usr");
510 boot_dir = prefix_roota(root_directory, "/boot");
511 boot_dir = strjoina("-", boot_dir);
512 etc_dir = prefix_roota(root_directory, "/etc");
513
514 r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
515 ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
516 : STRV_MAKE(usr_dir, boot_dir), READONLY);
517 if (r < 0)
518 return r;
519 }
520
521 assert(mounts + n == m);
522
523 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
524 drop_duplicates(mounts, &n);
525 }
526
527 if (n > 0 || root_directory) {
528 /* Remount / as SLAVE so that nothing now mounted in the namespace
529 shows up in the parent */
530 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
531 return -errno;
532 }
533
534 if (root_directory) {
535 /* Turn directory into bind mount */
536 if (mount(root_directory, root_directory, NULL, MS_BIND|MS_REC, NULL) < 0)
537 return -errno;
538 }
539
540 if (n > 0) {
541 for (m = mounts; m < mounts + n; ++m) {
542 r = apply_mount(m, tmp_dir, var_tmp_dir);
543 if (r < 0)
544 goto fail;
545 }
546
547 for (m = mounts; m < mounts + n; ++m) {
548 r = make_read_only(m);
549 if (r < 0)
550 goto fail;
551 }
552 }
553
554 if (root_directory) {
555 /* MS_MOVE does not work on MS_SHARED so the remount MS_SHARED will be done later */
556 r = mount_move_root(root_directory);
557
558 /* at this point, we cannot rollback */
559 if (r < 0)
560 return r;
561 }
562
563 /* Remount / as the desired mode. Not that this will not
564 * reestablish propagation from our side to the host, since
565 * what's disconnected is disconnected. */
566 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0)
567 /* at this point, we cannot rollback */
568 return -errno;
569
570 return 0;
571
572 fail:
573 if (n > 0) {
574 for (m = mounts; m < mounts + n; ++m)
575 if (m->done)
576 (void) umount2(m->path, MNT_DETACH);
577 }
578
579 return r;
580 }
581
582 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
583 _cleanup_free_ char *x = NULL;
584 char bid[SD_ID128_STRING_MAX];
585 sd_id128_t boot_id;
586 int r;
587
588 assert(id);
589 assert(prefix);
590 assert(path);
591
592 /* We include the boot id in the directory so that after a
593 * reboot we can easily identify obsolete directories. */
594
595 r = sd_id128_get_boot(&boot_id);
596 if (r < 0)
597 return r;
598
599 x = strjoin(prefix, "/systemd-private-", sd_id128_to_string(boot_id, bid), "-", id, "-XXXXXX", NULL);
600 if (!x)
601 return -ENOMEM;
602
603 RUN_WITH_UMASK(0077)
604 if (!mkdtemp(x))
605 return -errno;
606
607 RUN_WITH_UMASK(0000) {
608 char *y;
609
610 y = strjoina(x, "/tmp");
611
612 if (mkdir(y, 0777 | S_ISVTX) < 0)
613 return -errno;
614 }
615
616 *path = x;
617 x = NULL;
618
619 return 0;
620 }
621
/* Creates the pair of private temporary directories for the given id, one
 * below /tmp and one below /var/tmp (see setup_one_tmp_dir()).
 *
 * On success 0 is returned and *tmp_dir / *var_tmp_dir receive newly
 * allocated path strings. If the second directory cannot be created the
 * first one is removed again and the error is returned; the output
 * parameters are only written on success. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp;
        char *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Second half failed: take the first directory down again, inner
         * "tmp" subdirectory first. */
        inner = strjoina(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
651
652 int setup_netns(int netns_storage_socket[2]) {
653 _cleanup_close_ int netns = -1;
654 int r, q;
655
656 assert(netns_storage_socket);
657 assert(netns_storage_socket[0] >= 0);
658 assert(netns_storage_socket[1] >= 0);
659
660 /* We use the passed socketpair as a storage buffer for our
661 * namespace reference fd. Whatever process runs this first
662 * shall create a new namespace, all others should just join
663 * it. To serialize that we use a file lock on the socket
664 * pair.
665 *
666 * It's a bit crazy, but hey, works great! */
667
668 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
669 return -errno;
670
671 netns = receive_one_fd(netns_storage_socket[0], MSG_DONTWAIT);
672 if (netns == -EAGAIN) {
673 /* Nothing stored yet, so let's create a new namespace */
674
675 if (unshare(CLONE_NEWNET) < 0) {
676 r = -errno;
677 goto fail;
678 }
679
680 loopback_setup();
681
682 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
683 if (netns < 0) {
684 r = -errno;
685 goto fail;
686 }
687
688 r = 1;
689
690 } else if (netns < 0) {
691 r = netns;
692 goto fail;
693
694 } else {
695 /* Yay, found something, so let's join the namespace */
696 if (setns(netns, CLONE_NEWNET) < 0) {
697 r = -errno;
698 goto fail;
699 }
700
701 r = 0;
702 }
703
704 q = send_one_fd(netns_storage_socket[1], netns, MSG_DONTWAIT);
705 if (q < 0) {
706 r = q;
707 goto fail;
708 }
709
710 fail:
711 lockf(netns_storage_socket[0], F_ULOCK, 0);
712 return r;
713 }
714
715 static const char *const protect_home_table[_PROTECT_HOME_MAX] = {
716 [PROTECT_HOME_NO] = "no",
717 [PROTECT_HOME_YES] = "yes",
718 [PROTECT_HOME_READ_ONLY] = "read-only",
719 };
720
721 DEFINE_STRING_TABLE_LOOKUP(protect_home, ProtectHome);
722
723 static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
724 [PROTECT_SYSTEM_NO] = "no",
725 [PROTECT_SYSTEM_YES] = "yes",
726 [PROTECT_SYSTEM_FULL] = "full",
727 };
728
729 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);