]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
5f060fe18c08da2ec09721dffe54a52250fa72ed
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <arpa/inet.h>
28 #include <dirent.h>
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <grp.h>
32 #include <inttypes.h>
33 #include <libgen.h>
34 #include <linux/loop.h>
35 #include <net/if.h>
36 #include <netinet/in.h>
37 #include <pwd.h>
38 #include <stdarg.h>
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <sys/mman.h>
43 #include <sys/mount.h>
44 #include <sys/param.h>
45 #include <sys/prctl.h>
46 #include <sys/sendfile.h>
47 #include <sys/socket.h>
48 #include <sys/stat.h>
49 #include <sys/syscall.h>
50 #include <sys/sysmacros.h>
51 #include <sys/types.h>
52 #include <sys/utsname.h>
53 #include <sys/wait.h>
54 #include <time.h>
55 #include <unistd.h>
56
57 #ifdef MAJOR_IN_MKDEV
58 #include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #if HAVE_LIBCAP
72 #include <sys/capability.h>
73 #endif
74
75 #if HAVE_SYS_PERSONALITY_H
76 #include <sys/personality.h>
77 #endif
78
79 #ifndef HAVE_STRLCAT
80 #include "include/strlcat.h"
81 #endif
82
83 #if IS_BIONIC
84 #include <../include/lxcmntent.h>
85 #else
86 #include <mntent.h>
87 #endif
88
89 #if !defined(HAVE_PRLIMIT) && defined(HAVE_PRLIMIT64)
90 #include <../include/prlimit.h>
91 #endif
92
93 #include "af_unix.h"
94 #include "caps.h"
95 #include "cgroup.h"
96 #include "conf.h"
97 #include "confile_utils.h"
98 #include "error.h"
99 #include "log.h"
100 #include "lsm/lsm.h"
101 #include "lxclock.h"
102 #include "lxcseccomp.h"
103 #include "namespace.h"
104 #include "network.h"
105 #include "parse.h"
106 #include "ringbuf.h"
107 #include "start.h"
108 #include "storage.h"
109 #include "storage/overlay.h"
110 #include "terminal.h"
111 #include "utils.h"
112
113 #ifndef MS_PRIVATE
114 #define MS_PRIVATE (1<<18)
115 #endif
116
117 #ifndef MS_LAZYTIME
118 #define MS_LAZYTIME (1<<25)
119 #endif
120
/* Invoke lxc_log_define() (declared outside this file, presumably in log.h)
 * with the names "conf" and "lxc". NOTE(review): this looks like the
 * per-file log category that the ERROR/WARN/INFO/DEBUG/TRACE macros used
 * below report under -- confirm against log.h.
 */
lxc_log_define(conf, lxc);
122
/* Points at the lxc_conf of the container the running API call is working
 * on; the error calls make use of it. When the build has thread-local
 * storage (HAVE_TLS) every thread gets its own pointer, otherwise a single
 * pointer is shared.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
131
/* pivot_root(): when the build found one in the C library (HAVE_PIVOT_ROOT)
 * only declare it; otherwise supply a local wrapper around the raw system
 * call. If the syscall number __NR_pivot_root is not known either, the
 * wrapper fails with errno set to ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
146
147 char *lxchook_names[NUM_LXC_HOOKS] = {
148 "pre-start",
149 "pre-mount",
150 "mount",
151 "autodev",
152 "start",
153 "stop",
154 "post-stop",
155 "clone",
156 "destroy",
157 "start-host"
158 };
159
/* One row of a mount-option lookup table (see mount_opt[] and
 * propagation_opt[]).
 */
struct mount_opt {
	/* Option keyword, e.g. "ro" or "nosuid". */
	char *name;
	/* Non-zero for the "negative" keywords ("rw", "suid", ...);
	 * presumably tells the parser to clear `flag` instead of setting
	 * it -- the consumer is outside this view, confirm there.
	 */
	int clear;
	/* MS_* bit(s) the keyword refers to. */
	int flag;
};
165
/* One row of the capability lookup table (see caps_opt[]). */
struct caps_opt {
	/* Lower-case capability name without the CAP_ prefix. */
	char *name;
	/* Matching CAP_* constant. */
	int value;
};
170
/* One row of the resource-limit lookup table (see limit_opt[]). */
struct limit_opt {
	/* Lower-case limit name without the RLIMIT_ prefix. */
	char *name;
	/* Matching RLIMIT_* resource number. */
	int value;
};
175
176 static struct mount_opt mount_opt[] = {
177 { "async", 1, MS_SYNCHRONOUS },
178 { "atime", 1, MS_NOATIME },
179 { "bind", 0, MS_BIND },
180 { "defaults", 0, 0 },
181 { "dev", 1, MS_NODEV },
182 { "diratime", 1, MS_NODIRATIME },
183 { "dirsync", 0, MS_DIRSYNC },
184 { "exec", 1, MS_NOEXEC },
185 { "lazytime", 0, MS_LAZYTIME },
186 { "mand", 0, MS_MANDLOCK },
187 { "noatime", 0, MS_NOATIME },
188 { "nodev", 0, MS_NODEV },
189 { "nodiratime", 0, MS_NODIRATIME },
190 { "noexec", 0, MS_NOEXEC },
191 { "nomand", 1, MS_MANDLOCK },
192 { "norelatime", 1, MS_RELATIME },
193 { "nostrictatime", 1, MS_STRICTATIME },
194 { "nosuid", 0, MS_NOSUID },
195 { "rbind", 0, MS_BIND|MS_REC },
196 { "relatime", 0, MS_RELATIME },
197 { "remount", 0, MS_REMOUNT },
198 { "ro", 0, MS_RDONLY },
199 { "rw", 1, MS_RDONLY },
200 { "strictatime", 0, MS_STRICTATIME },
201 { "suid", 1, MS_NOSUID },
202 { "sync", 0, MS_SYNCHRONOUS },
203 { NULL, 0, 0 },
204 };
205
206 static struct mount_opt propagation_opt[] = {
207 { "private", 0, MS_PRIVATE },
208 { "shared", 0, MS_SHARED },
209 { "slave", 0, MS_SLAVE },
210 { "unbindable", 0, MS_UNBINDABLE },
211 { "rprivate", 0, MS_PRIVATE|MS_REC },
212 { "rshared", 0, MS_SHARED|MS_REC },
213 { "rslave", 0, MS_SLAVE|MS_REC },
214 { "runbindable", 0, MS_UNBINDABLE|MS_REC },
215 { NULL, 0, 0 },
216 };
217
218 static struct caps_opt caps_opt[] = {
219 #if HAVE_LIBCAP
220 { "chown", CAP_CHOWN },
221 { "dac_override", CAP_DAC_OVERRIDE },
222 { "dac_read_search", CAP_DAC_READ_SEARCH },
223 { "fowner", CAP_FOWNER },
224 { "fsetid", CAP_FSETID },
225 { "kill", CAP_KILL },
226 { "setgid", CAP_SETGID },
227 { "setuid", CAP_SETUID },
228 { "setpcap", CAP_SETPCAP },
229 { "linux_immutable", CAP_LINUX_IMMUTABLE },
230 { "net_bind_service", CAP_NET_BIND_SERVICE },
231 { "net_broadcast", CAP_NET_BROADCAST },
232 { "net_admin", CAP_NET_ADMIN },
233 { "net_raw", CAP_NET_RAW },
234 { "ipc_lock", CAP_IPC_LOCK },
235 { "ipc_owner", CAP_IPC_OWNER },
236 { "sys_module", CAP_SYS_MODULE },
237 { "sys_rawio", CAP_SYS_RAWIO },
238 { "sys_chroot", CAP_SYS_CHROOT },
239 { "sys_ptrace", CAP_SYS_PTRACE },
240 { "sys_pacct", CAP_SYS_PACCT },
241 { "sys_admin", CAP_SYS_ADMIN },
242 { "sys_boot", CAP_SYS_BOOT },
243 { "sys_nice", CAP_SYS_NICE },
244 { "sys_resource", CAP_SYS_RESOURCE },
245 { "sys_time", CAP_SYS_TIME },
246 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
247 { "mknod", CAP_MKNOD },
248 { "lease", CAP_LEASE },
249 #ifdef CAP_AUDIT_READ
250 { "audit_read", CAP_AUDIT_READ },
251 #endif
252 #ifdef CAP_AUDIT_WRITE
253 { "audit_write", CAP_AUDIT_WRITE },
254 #endif
255 #ifdef CAP_AUDIT_CONTROL
256 { "audit_control", CAP_AUDIT_CONTROL },
257 #endif
258 { "setfcap", CAP_SETFCAP },
259 { "mac_override", CAP_MAC_OVERRIDE },
260 { "mac_admin", CAP_MAC_ADMIN },
261 #ifdef CAP_SYSLOG
262 { "syslog", CAP_SYSLOG },
263 #endif
264 #ifdef CAP_WAKE_ALARM
265 { "wake_alarm", CAP_WAKE_ALARM },
266 #endif
267 #ifdef CAP_BLOCK_SUSPEND
268 { "block_suspend", CAP_BLOCK_SUSPEND },
269 #endif
270 #endif
271 };
272
273 static struct limit_opt limit_opt[] = {
274 #ifdef RLIMIT_AS
275 { "as", RLIMIT_AS },
276 #endif
277 #ifdef RLIMIT_CORE
278 { "core", RLIMIT_CORE },
279 #endif
280 #ifdef RLIMIT_CPU
281 { "cpu", RLIMIT_CPU },
282 #endif
283 #ifdef RLIMIT_DATA
284 { "data", RLIMIT_DATA },
285 #endif
286 #ifdef RLIMIT_FSIZE
287 { "fsize", RLIMIT_FSIZE },
288 #endif
289 #ifdef RLIMIT_LOCKS
290 { "locks", RLIMIT_LOCKS },
291 #endif
292 #ifdef RLIMIT_MEMLOCK
293 { "memlock", RLIMIT_MEMLOCK },
294 #endif
295 #ifdef RLIMIT_MSGQUEUE
296 { "msgqueue", RLIMIT_MSGQUEUE },
297 #endif
298 #ifdef RLIMIT_NICE
299 { "nice", RLIMIT_NICE },
300 #endif
301 #ifdef RLIMIT_NOFILE
302 { "nofile", RLIMIT_NOFILE },
303 #endif
304 #ifdef RLIMIT_NPROC
305 { "nproc", RLIMIT_NPROC },
306 #endif
307 #ifdef RLIMIT_RSS
308 { "rss", RLIMIT_RSS },
309 #endif
310 #ifdef RLIMIT_RTPRIO
311 { "rtprio", RLIMIT_RTPRIO },
312 #endif
313 #ifdef RLIMIT_RTTIME
314 { "rttime", RLIMIT_RTTIME },
315 #endif
316 #ifdef RLIMIT_SIGPENDING
317 { "sigpending", RLIMIT_SIGPENDING },
318 #endif
319 #ifdef RLIMIT_STACK
320 { "stack", RLIMIT_STACK },
321 #endif
322 };
323
324 static int run_buffer(char *buffer)
325 {
326 int ret;
327 char *output;
328 struct lxc_popen_FILE *f;
329
330 f = lxc_popen(buffer);
331 if (!f) {
332 SYSERROR("Failed to popen() %s", buffer);
333 return -1;
334 }
335
336 output = malloc(LXC_LOG_BUFFER_SIZE);
337 if (!output) {
338 ERROR("Failed to allocate memory for %s", buffer);
339 lxc_pclose(f);
340 return -1;
341 }
342
343 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
344 DEBUG("Script %s with output: %s", buffer, output);
345
346 free(output);
347
348 ret = lxc_pclose(f);
349 if (ret == -1) {
350 SYSERROR("Script exited with error");
351 return -1;
352 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
353 ERROR("Script exited with status %d", WEXITSTATUS(ret));
354 return -1;
355 } else if (WIFSIGNALED(ret)) {
356 ERROR("Script terminated by signal %d", WTERMSIG(ret));
357 return -1;
358 }
359
360 return 0;
361 }
362
/* Build the command line for a hook script and run it via run_buffer().
 *
 * @name         container name
 * @hook_version 0: name, section and hook type are passed as positional
 *               arguments ("exec <script> <name> <section> <hookname> ...");
 *               1: only "exec <script> ..." is run and the hook type and
 *               section are exported as LXC_HOOK_TYPE / LXC_HOOK_SECTION.
 * @section      config section the hook belongs to
 * @script       script to execute
 * @hookname     hook type
 * @argv         NULL-terminated list of extra arguments appended to the
 *               command line, may be NULL
 *
 * For hook_version 1 and section "net", argv[0] is exported as LXC_NET_TYPE.
 * For the types "macvlan", "phys" and "veth" argv[1] is exported as
 * LXC_NET_PARENT, and for "veth" argv[2] is exported as LXC_NET_PEER (a
 * missing argument is exported as the empty string).
 *
 * Returns -EFBIG if the command line would exceed INT_MAX bytes, -ENOMEM if
 * the buffer cannot be allocated, -1 on any other failure, and otherwise the
 * result of run_buffer().
 */
int run_script_argv(const char *name, unsigned int hook_version,
		    const char *section, const char *script,
		    const char *hookname, char **argv)
{
	int buf_pos, i, ret;
	char *buffer;
	int fret = -1;
	size_t size = 0;

	if (hook_version == 0)
		INFO("Executing script \"%s\" for container \"%s\", config "
		     "section \"%s\"", script, name, section);
	else
		INFO("Executing script \"%s\" for container \"%s\"", script, name);

	/* Each extra argument is preceded by one space. */
	for (i = 0; argv && argv[i]; i++)
		size += strlen(argv[i]) + 1;

	/* "exec" including its terminating NUL, the script and a separator. */
	size += sizeof("exec");
	size += strlen(script);
	size++;

	if (size > INT_MAX)
		return -EFBIG;

	if (hook_version == 0) {
		size += strlen(hookname);
		size++;

		size += strlen(name);
		size++;

		size += strlen(section);
		size++;

		if (size > INT_MAX)
			return -EFBIG;
	}

	buffer = malloc(size);
	if (!buffer)
		return -ENOMEM;

	if (hook_version == 0)
		buf_pos = snprintf(buffer, size, "exec %s %s %s %s", script, name, section, hookname);
	else
		buf_pos = snprintf(buffer, size, "exec %s", script);
	if (buf_pos < 0 || (size_t)buf_pos >= size) {
		ERROR("Failed to create command line for script \"%s\"", script);
		goto on_error;
	}

	if (hook_version == 1) {
		ret = setenv("LXC_HOOK_TYPE", hookname, 1);
		if (ret < 0) {
			SYSERROR("Failed to set environment variable: "
				 "LXC_HOOK_TYPE=%s", hookname);
			goto on_error;
		}
		TRACE("Set environment variable: LXC_HOOK_TYPE=%s", hookname);

		ret = setenv("LXC_HOOK_SECTION", section, 1);
		if (ret < 0) {
			SYSERROR("Failed to set environment variable: "
				 "LXC_HOOK_SECTION=%s", section);
			goto on_error;
		}
		TRACE("Set environment variable: LXC_HOOK_SECTION=%s", section);

		if (strcmp(section, "net") == 0) {
			int is_veth, has_parent;

			if (!argv || !argv[0])
				goto on_error;

			ret = setenv("LXC_NET_TYPE", argv[0], 1);
			if (ret < 0) {
				SYSERROR("Failed to set environment variable: "
					 "LXC_NET_TYPE=%s", argv[0]);
				goto on_error;
			}
			TRACE("Set environment variable: LXC_NET_TYPE=%s", argv[0]);

			is_veth = strcmp(argv[0], "veth") == 0;
			has_parent = is_veth ||
				     strcmp(argv[0], "macvlan") == 0 ||
				     strcmp(argv[0], "phys") == 0;

			if (is_veth) {
				/* argv[2] may only be looked at when argv[1]
				 * is not already the terminating NULL;
				 * otherwise we would read past the end of the
				 * argument vector.
				 */
				char *peer = (argv[1] && argv[2]) ? argv[2] : "";

				ret = setenv("LXC_NET_PEER", peer, 1);
				if (ret < 0) {
					SYSERROR("Failed to set environment "
						 "variable: LXC_NET_PEER=%s", peer);
					goto on_error;
				}
				TRACE("Set environment variable: LXC_NET_PEER=%s", peer);
			}

			if (has_parent) {
				char *parent = argv[1] ? argv[1] : "";

				ret = setenv("LXC_NET_PARENT", parent, 1);
				if (ret < 0) {
					SYSERROR("Failed to set environment "
						 "variable: LXC_NET_PARENT=%s", parent);
					goto on_error;
				}
				TRACE("Set environment variable: LXC_NET_PARENT=%s", parent);
			}
		}
	}

	/* Append the extra arguments, each separated by a single space. */
	for (i = 0; argv && argv[i]; i++) {
		size_t len = size - buf_pos;

		ret = snprintf(buffer + buf_pos, len, " %s", argv[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Failed to create command line for script \"%s\"", script);
			goto on_error;
		}
		buf_pos += ret;
	}

	fret = run_buffer(buffer);

on_error:
	free(buffer);
	return fret;
}
503
/* Build "exec <script> <name> <section> [args...]" and run it through
 * run_buffer().
 *
 * @name    container name, passed as first argument to the script
 * @section config section, passed as second argument to the script
 * @script  script to execute
 * @...     further arguments as a list of char pointers terminated by NULL
 *
 * The command line lives on the heap: its length depends on caller-supplied
 * strings, so it must not be placed on the stack.
 *
 * Returns -1 if the command line cannot be built (too long, out of memory,
 * formatting error), otherwise the result of run_buffer().
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	va_list ap;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\"",
	     script, name, section);

	/* Each variadic argument is preceded by one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen("exec");
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces plus the terminating NUL byte. */
	size += 4;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer)
		return -1;

	ret = snprintf(buffer, size, "exec %s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size)
		goto out;

	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
548
549 /* pin_rootfs
550 * if rootfs is a directory, then open ${rootfs}/.lxc-keep for writing for
551 * the duration of the container run, to prevent the container from marking
552 * the underlying fs readonly on shutdown. unlink the file immediately so
553 * no name pollution is happens.
554 * don't unlink on NFS to avoid random named stale handles.
555 * return -1 on error.
556 * return -2 if nothing needed to be pinned.
557 * return an open fd (>=0) if we pinned it.
558 */
559 int pin_rootfs(const char *rootfs)
560 {
561 int fd, ret;
562 char absrootfs[MAXPATHLEN], absrootfspin[MAXPATHLEN];
563 struct stat s;
564 struct statfs sfs;
565
566 if (rootfs == NULL || strlen(rootfs) == 0)
567 return -2;
568
569 if (!realpath(rootfs, absrootfs))
570 return -2;
571
572 ret = stat(absrootfs, &s);
573 if (ret < 0)
574 return -1;
575
576 if (!S_ISDIR(s.st_mode))
577 return -2;
578
579 ret = snprintf(absrootfspin, MAXPATHLEN, "%s/.lxc-keep", absrootfs);
580 if (ret >= MAXPATHLEN)
581 return -1;
582
583 fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
584 if (fd < 0)
585 return fd;
586
587 ret = fstatfs (fd, &sfs);
588 if (ret < 0)
589 return fd;
590
591 if (sfs.f_type == NFS_SUPER_MAGIC) {
592 DEBUG("Rootfs on NFS, not unlinking pin file \"%s\"", absrootfspin);
593 return fd;
594 }
595
596 (void)unlink(absrootfspin);
597
598 return fd;
599 }
600
/* When a remount is requested, carry over the nosuid, nodev, read-only and
 * noexec bits that are currently set on the filesystem so the remount does
 * not drop them.
 *
 * @s     path to inspect; if NULL, @d is inspected instead
 * @d     fallback path
 * @flags requested mount flags
 *
 * Returns @flags unchanged when MS_REMOUNT is not set, when both paths are
 * NULL, when statvfs() fails, or when the build lacks statvfs(); otherwise
 * @flags with the bits found on the filesystem added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	struct statvfs vfs;
	const char *target;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	target = s ? s : d;
	if (target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	return flags | (vfs.f_flag & (MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC));
#endif
}
639
640 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
641 {
642 int i, r;
643 static struct {
644 int match_mask;
645 int match_flag;
646 const char *source;
647 const char *destination;
648 const char *fstype;
649 unsigned long flags;
650 const char *options;
651 } default_mounts[] = {
652 /* Read-only bind-mounting... In older kernels, doing that
653 * required to do one MS_BIND mount and then
654 * MS_REMOUNT|MS_RDONLY the same one. According to mount(2)
655 * manpage, MS_BIND honors MS_RDONLY from kernel 2.6.26
656 * onwards. However, this apparently does not work on kernel
657 * 3.8. Unfortunately, on that very same kernel, doing the same
658 * trick as above doesn't seem to work either, there one needs
659 * to ALSO specify MS_BIND for the remount, otherwise the
660 * entire fs is remounted read-only or the mount fails because
661 * it's busy... MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for
662 * kernels as low as 2.6.32...
663 */
664 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
665 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
666 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
667 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
668 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
669 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
670 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
671 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
672 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
673 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
674 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
675 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
676 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
677 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
678 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
679 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
680 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
681 { 0, 0, NULL, NULL, NULL, 0, NULL }
682 };
683
684 for (i = 0; default_mounts[i].match_mask; i++) {
685 int saved_errno;
686 unsigned long mflags;
687 char *destination = NULL;
688 char *source = NULL;
689 if ((flags & default_mounts[i].match_mask) != default_mounts[i].match_flag)
690 continue;
691
692 if (default_mounts[i].source) {
693 /* will act like strdup if %r is not present */
694 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
695 if (!source)
696 return -1;
697 }
698
699 if (!default_mounts[i].destination) {
700 ERROR("BUG: auto mounts destination %d was NULL", i);
701 free(source);
702 return -1;
703 }
704
705 /* will act like strdup if %r is not present */
706 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
707 if (!destination) {
708 saved_errno = errno;
709 free(source);
710 errno = saved_errno;
711 return -1;
712 }
713
714 mflags = add_required_remount_flags(source, destination,
715 default_mounts[i].flags);
716 r = safe_mount(source, destination, default_mounts[i].fstype,
717 mflags, default_mounts[i].options,
718 conf->rootfs.path ? conf->rootfs.mount : NULL);
719 saved_errno = errno;
720 if (r < 0 && errno == ENOENT) {
721 INFO("Mount source or target for \"%s\" on \"%s\" does "
722 "not exist. Skipping", source, destination);
723 r = 0;
724 } else if (r < 0) {
725 SYSERROR("Failed to mount \"%s\" on \"%s\" with flags %lu", source, destination, mflags);
726 }
727
728 free(source);
729 free(destination);
730 if (r < 0) {
731 errno = saved_errno;
732 return -1;
733 }
734 }
735
736 if (flags & LXC_AUTO_CGROUP_MASK) {
737 int cg_flags;
738
739 cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE);
740 /* If the type of cgroup mount was not specified, it depends on
741 * the container's capabilities as to what makes sense: if we
742 * have CAP_SYS_ADMIN, the read-only part can be remounted
743 * read-write anyway, so we may as well default to read-write;
744 * then the admin will not be given a false sense of security.
745 * (And if they really want mixed r/o r/w, then they can
746 * explicitly specify :mixed.) OTOH, if the container lacks
747 * CAP_SYS_ADMIN, do only default to :mixed, because then the
748 * container can't remount it read-write.
749 */
750 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
751 int has_sys_admin = 0;
752
753 if (!lxc_list_empty(&conf->keepcaps))
754 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
755 else
756 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
757
758 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
759 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
760 else
761 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
762 }
763
764 if (flags & LXC_AUTO_CGROUP_FORCE)
765 cg_flags |= LXC_AUTO_CGROUP_FORCE;
766
767 if (!handler->cgroup_ops->mount(handler->cgroup_ops,
768 handler,
769 conf->rootfs.path ? conf->rootfs.mount : "",
770 cg_flags)) {
771 SYSERROR("Failed to mount \"/sys/fs/cgroup\"");
772 return -1;
773 }
774 }
775
776 return 0;
777 }
778
/* Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to do and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) < 0) {
		SYSERROR("Failed to set the hostname to \"%s\"", hostname);
		return -1;
	}

	INFO("Set hostname to \"%s\"", hostname);

	return 0;
}
796
/* Describes one symlink to create below the container's /dev. */
struct dev_symlinks {
	/* Target the link points to. */
	const char *oldpath;
	/* Name of the link inside /dev. */
	const char *name;
};
801
802 static const struct dev_symlinks dev_symlinks[] = {
803 { "/proc/self/fd", "fd" },
804 { "/proc/self/fd/0", "stdin" },
805 { "/proc/self/fd/1", "stdout" },
806 { "/proc/self/fd/2", "stderr" },
807 };
808
809 static int lxc_setup_dev_symlinks(const struct lxc_rootfs *rootfs)
810 {
811 int i, ret;
812 char path[MAXPATHLEN];
813 struct stat s;
814
815 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
816 const struct dev_symlinks *d = &dev_symlinks[i];
817
818 ret = snprintf(path, sizeof(path), "%s/dev/%s",
819 rootfs->path ? rootfs->mount : "", d->name);
820 if (ret < 0 || ret >= MAXPATHLEN)
821 return -1;
822
823 /* Stat the path first. If we don't get an error accept it as
824 * is and don't try to create it
825 */
826 ret = stat(path, &s);
827 if (ret == 0)
828 continue;
829
830 ret = symlink(d->oldpath, path);
831 if (ret && errno != EEXIST) {
832 if (errno == EROFS) {
833 WARN("Failed to create \"%s\". Read-only filesystem", path);
834 } else {
835 SYSERROR("Failed to create \"%s\"", path);
836 return -1;
837 }
838 }
839 }
840
841 return 0;
842 }
843
/* Build a space-separated list of ptys to pass to systemd.
 *
 * @pp   address of the list; when *pp is NULL a new string
 *       "container_ttys=<name>" is allocated, otherwise " <name>" is
 *       appended to the existing string
 * @name tty name to add
 *
 * Returns true on success. On allocation failure false is returned and *pp
 * is left as it was.
 */
static bool append_ttyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(prefix_len + name_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* Copies the terminating NUL byte of name as well. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	cur_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	list = realloc(*pp, cur_len + name_len + 2);
	if (list == NULL)
		return false;

	list[cur_len] = ' ';
	memcpy(list + cur_len + 1, name, name_len + 1);
	*pp = list;

	return true;
}
870
871 static int lxc_setup_ttys(struct lxc_conf *conf)
872 {
873 int i, ret;
874 const struct lxc_tty_info *ttys = &conf->ttys;
875 char *ttydir = ttys->dir;
876 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
877
878 if (!conf->rootfs.path)
879 return 0;
880
881 for (i = 0; i < ttys->max; i++) {
882 struct lxc_terminal_info *tty = &ttys->tty[i];
883
884 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
885 if (ret < 0 || (size_t)ret >= sizeof(path))
886 return -1;
887
888 if (ttydir) {
889 /* create dev/lxc/tty%d" */
890 ret = snprintf(lxcpath, sizeof(lxcpath),
891 "/dev/%s/tty%d", ttydir, i + 1);
892 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
893 return -1;
894
895 ret = creat(lxcpath, 0660);
896 if (ret < 0 && errno != EEXIST) {
897 SYSERROR("Failed to create \"%s\"", lxcpath);
898 return -1;
899 }
900 if (ret >= 0)
901 close(ret);
902
903 ret = unlink(path);
904 if (ret < 0 && errno != ENOENT) {
905 SYSERROR("Failed to unlink \"%s\"", path);
906 return -1;
907 }
908
909 ret = mount(tty->name, lxcpath, "none", MS_BIND, 0);
910 if (ret < 0) {
911 WARN("Failed to bind mount \"%s\" onto \"%s\"",
912 tty->name, path);
913 continue;
914 }
915 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name,
916 path);
917
918 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
919 ttydir, i + 1);
920 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
921 return -1;
922
923 ret = symlink(lxcpath, path);
924 if (ret < 0) {
925 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
926 path, lxcpath);
927 return -1;
928 }
929 } else {
930 /* If we populated /dev, then we need to create
931 * /dev/ttyN
932 */
933 ret = mknod(path, S_IFREG | 0000, 0);
934 if (ret < 0) /* this isn't fatal, continue */
935 SYSERROR("Failed to create \"%s\"", path);
936
937 ret = mount(tty->name, path, "none", MS_BIND, 0);
938 if (ret < 0) {
939 SYSERROR("Failed to mount '%s'->'%s'", tty->name, path);
940 continue;
941 }
942
943 DEBUG("Bind mounted \"%s\" onto \"%s\"", tty->name, path);
944 }
945
946 if (!append_ttyname(&conf->ttys.tty_names, tty->name)) {
947 ERROR("Error setting up container_ttys string");
948 return -1;
949 }
950 }
951
952 INFO("Finished setting up %zu /dev/tty<N> device(s)", ttys->max);
953 return 0;
954 }
955
956 int lxc_allocate_ttys(struct lxc_conf *conf)
957 {
958 int i, ret;
959 struct lxc_tty_info *ttys = &conf->ttys;
960
961 /* no tty in the configuration */
962 if (ttys->max == 0)
963 return 0;
964
965 ttys->tty = malloc(sizeof(*ttys->tty) * ttys->max);
966 if (!ttys->tty)
967 return -ENOMEM;
968
969 for (i = 0; i < ttys->max; i++) {
970 struct lxc_terminal_info *tty = &ttys->tty[i];
971
972 tty->master = -EBADF;
973 tty->slave = -EBADF;
974 ret = openpty(&tty->master, &tty->slave,
975 tty->name, NULL, NULL);
976 if (ret) {
977 SYSERROR("Failed to create tty %d", i);
978 ttys->max = i;
979 lxc_delete_tty(ttys);
980 return -ENOTTY;
981 }
982
983 DEBUG("Created tty \"%s\" with master fd %d and slave fd %d",
984 tty->name, tty->master, tty->slave);
985
986 /* Prevent leaking the file descriptors to the container */
987 ret = fcntl(tty->master, F_SETFD, FD_CLOEXEC);
988 if (ret < 0)
989 SYSWARN("Failed to set FD_CLOEXEC flag on master fd %d of "
990 "tty device \"%s\"", tty->master, tty->name);
991
992 ret = fcntl(tty->slave, F_SETFD, FD_CLOEXEC);
993 if (ret < 0)
994 SYSWARN("Failed to set FD_CLOEXEC flag on slave fd %d of "
995 "tty device \"%s\"", tty->slave, tty->name);
996
997 tty->busy = 0;
998 }
999
1000 INFO("Finished creating %zu tty devices", ttys->max);
1001 return 0;
1002 }
1003
1004 void lxc_delete_tty(struct lxc_tty_info *ttys)
1005 {
1006 int i;
1007
1008 if (!ttys->tty)
1009 return;
1010
1011 for (i = 0; i < ttys->max; i++) {
1012 struct lxc_terminal_info *tty = &ttys->tty[i];
1013
1014 if (tty->master >= 0) {
1015 close(tty->master);
1016 tty->master = -EBADF;
1017 }
1018
1019 if (tty->slave >= 0) {
1020 close(tty->slave);
1021 tty->slave = -EBADF;
1022 }
1023 }
1024
1025 free(ttys->tty);
1026 ttys->tty = NULL;
1027 }
1028
1029 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1030 {
1031 int i;
1032 int ret = -1;
1033 struct lxc_conf *conf = handler->conf;
1034 struct lxc_tty_info *ttys = &conf->ttys;
1035 int sock = handler->data_sock[0];
1036
1037 if (ttys->max == 0)
1038 return 0;
1039
1040 for (i = 0; i < ttys->max; i++) {
1041 int ttyfds[2];
1042 struct lxc_terminal_info *tty = &ttys->tty[i];
1043
1044 ttyfds[0] = tty->master;
1045 ttyfds[1] = tty->slave;
1046
1047 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1048 if (ret < 0)
1049 break;
1050
1051 TRACE("Sent ty \"%s\" with master fd %d and slave fd %d to "
1052 "parent", tty->name, tty->master, tty->slave);
1053 }
1054
1055 if (ret < 0)
1056 SYSERROR("Failed to send %zu ttys to parent", ttys->max);
1057 else
1058 TRACE("Sent %zu ttys to parent", ttys->max);
1059
1060 return ret;
1061 }
1062
1063 static int lxc_create_ttys(struct lxc_handler *handler)
1064 {
1065 int ret = -1;
1066 struct lxc_conf *conf = handler->conf;
1067
1068 ret = lxc_allocate_ttys(conf);
1069 if (ret < 0) {
1070 ERROR("Failed to allocate ttys");
1071 goto on_error;
1072 }
1073
1074 ret = lxc_send_ttys_to_parent(handler);
1075 if (ret < 0) {
1076 ERROR("Failed to send ttys to parent");
1077 goto on_error;
1078 }
1079
1080 if (!conf->is_execute) {
1081 ret = lxc_setup_ttys(conf);
1082 if (ret < 0) {
1083 ERROR("Failed to setup ttys");
1084 goto on_error;
1085 }
1086 }
1087
1088 if (conf->ttys.tty_names) {
1089 ret = setenv("container_ttys", conf->ttys.tty_names, 1);
1090 if (ret < 0)
1091 SYSERROR("Failed to set \"container_ttys=%s\"", conf->ttys.tty_names);
1092 }
1093
1094 ret = 0;
1095
1096 on_error:
1097 lxc_delete_tty(&conf->ttys);
1098
1099 return ret;
1100 }
1101
1102 static int setup_rootfs_pivot_root(const char *rootfs)
1103 {
1104 int ret;
1105 int newroot = -1, oldroot = -1;
1106
1107 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1108 if (oldroot < 0) {
1109 SYSERROR("Failed to open old root directory");
1110 return -1;
1111 }
1112
1113 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1114 if (newroot < 0) {
1115 SYSERROR("Failed to open new root directory");
1116 goto on_error;
1117 }
1118
1119 /* change into new root fs */
1120 ret = fchdir(newroot);
1121 if (ret < 0) {
1122 SYSERROR("Failed to change to new rootfs \"%s\"", rootfs);
1123 goto on_error;
1124 }
1125
1126 /* pivot_root into our new root fs */
1127 ret = pivot_root(".", ".");
1128 if (ret < 0) {
1129 SYSERROR("Failed to pivot_root()");
1130 goto on_error;
1131 }
1132
1133 /* At this point the old-root is mounted on top of our new-root. To
1134 * unmounted it we must not be chdir'd into it, so escape back to
1135 * old-root.
1136 */
1137 ret = fchdir(oldroot);
1138 if (ret < 0) {
1139 SYSERROR("Failed to enter old root directory");
1140 goto on_error;
1141 }
1142
1143 /* Make oldroot rslave to make sure our umounts don't propagate to the
1144 * host.
1145 */
1146 ret = mount("", ".", "", MS_SLAVE | MS_REC, NULL);
1147 if (ret < 0) {
1148 SYSERROR("Failed to make oldroot rslave");
1149 goto on_error;
1150 }
1151
1152 ret = umount2(".", MNT_DETACH);
1153 if (ret < 0) {
1154 SYSERROR("Failed to detach old root directory");
1155 goto on_error;
1156 }
1157
1158 ret = fchdir(newroot);
1159 if (ret < 0) {
1160 SYSERROR("Failed to re-enter new root directory");
1161 goto on_error;
1162 }
1163
1164 close(oldroot);
1165 close(newroot);
1166
1167 DEBUG("pivot_root(\"%s\") successful", rootfs);
1168
1169 return 0;
1170
1171 on_error:
1172 if (oldroot != -1)
1173 close(oldroot);
1174 if (newroot != -1)
1175 close(newroot);
1176
1177 return -1;
1178 }
1179
1180 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1181 * error, log it but don't fail yet.
1182 */
1183 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1184 const char *lxcpath)
1185 {
1186 int ret;
1187 size_t clen;
1188 char *path;
1189
1190 INFO("Preparing \"/dev\"");
1191
1192 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1193 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1194 path = alloca(clen);
1195
1196 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1197 if (ret < 0 || (size_t)ret >= clen)
1198 return -1;
1199
1200 if (!dir_exists(path)) {
1201 WARN("\"/dev\" directory does not exist. Proceeding without "
1202 "autodev being set up");
1203 return 0;
1204 }
1205
1206 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1207 rootfs->path ? rootfs->mount : NULL);
1208 if (ret < 0) {
1209 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1210 return -1;
1211 }
1212 INFO("Mounted tmpfs on \"%s\"", path);
1213
1214 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1215 if (ret < 0 || (size_t)ret >= clen)
1216 return -1;
1217
1218 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1219 * If not, then create it and exit if that fails...
1220 */
1221 if (!dir_exists(path)) {
1222 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1223 if (ret < 0) {
1224 SYSERROR("Failed to create directory \"%s\"", path);
1225 return -1;
1226 }
1227 }
1228
1229 INFO("Prepared \"/dev\"");
1230 return 0;
1231 }
1232
/* Description of one device node that lxc_fill_autodev() creates below the
 * container's "/dev" directory.
 */
struct lxc_device_node {
	/* File name of the node relative to "/dev". */
	const char *name;
	/* File type and permission bits handed to mknod(). */
	const mode_t mode;
	/* Major device number, combined with @min through makedev(). */
	const int maj;
	/* Minor device number. */
	const int min;
};
1239
/* Device nodes that lxc_fill_autodev() provides in the container's "/dev".
 * Each entry is a character device; the columns are name, mode, major number
 * and minor number. The execute bits in the mode are masked out by the umask
 * that lxc_fill_autodev() sets before creating the nodes.
 */
static const struct lxc_device_node lxc_devices[] = {
	{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
};
1248
1249 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1250 {
1251 int i, ret;
1252 char path[MAXPATHLEN];
1253 mode_t cmask;
1254 int can_mknod = 1;
1255
1256 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1257 rootfs->path ? rootfs->mount : "");
1258 if (ret < 0 || ret >= MAXPATHLEN)
1259 return -1;
1260
1261 /* ignore, just don't try to fill in */
1262 if (!dir_exists(path))
1263 return 0;
1264
1265 INFO("Populating \"/dev\"");
1266
1267 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1268 for (i = 0; i < sizeof(lxc_devices) / sizeof(lxc_devices[0]); i++) {
1269 char hostpath[MAXPATHLEN];
1270 const struct lxc_device_node *device = &lxc_devices[i];
1271
1272 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1273 rootfs->path ? rootfs->mount : "", device->name);
1274 if (ret < 0 || ret >= MAXPATHLEN)
1275 return -1;
1276
1277 /* See
1278 * - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=55956b59df336f6738da916dbb520b6e37df9fbd
1279 * - https://lists.linuxfoundation.org/pipermail/containers/2018-June/039176.html
1280 */
1281 if (can_mknod == 2 || (can_mknod == 1 && !am_host_unpriv())) {
1282 ret = mknod(path, device->mode, makedev(device->maj, device->min));
1283 if (ret == 0 || (ret < 0 && errno == EEXIST)) {
1284 DEBUG("Created device node \"%s\"", path);
1285 continue;
1286 }
1287
1288 if (errno != EPERM) {
1289 SYSERROR("Failed to create device node \"%s\"", path);
1290 return -1;
1291 }
1292
1293 /* This can e.g. happen when the container is
1294 * unprivileged or CAP_MKNOD has been dropped.
1295 */
1296 can_mknod = 2;
1297 } else {
1298 can_mknod = 0;
1299 }
1300
1301 ret = mknod(path, S_IFREG, 0);
1302 if (ret < 0 && errno != EEXIST) {
1303 SYSERROR("Failed to create file \"%s\"", path);
1304 return -1;
1305 }
1306
1307 /* Fallback to bind-mounting the device from the host. */
1308 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", device->name);
1309 if (ret < 0 || ret >= MAXPATHLEN)
1310 return -1;
1311
1312 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1313 rootfs->path ? rootfs->mount : NULL);
1314 if (ret < 0) {
1315 SYSERROR("Failed to bind mount host device node \"%s\" "
1316 "onto \"%s\"", hostpath, path);
1317 return -1;
1318 }
1319 DEBUG("Bind mounted host device node \"%s\" onto \"%s\"",
1320 hostpath, path);
1321 }
1322 (void)umask(cmask);
1323
1324 INFO("Populated \"/dev\"");
1325 return 0;
1326 }
1327
1328 static int lxc_setup_rootfs(struct lxc_conf *conf)
1329 {
1330 int ret;
1331 struct lxc_storage *bdev;
1332 const struct lxc_rootfs *rootfs;
1333
1334 rootfs = &conf->rootfs;
1335 if (!rootfs->path) {
1336 ret = mount("", "/", NULL, MS_SLAVE | MS_REC, 0);
1337 if (ret < 0) {
1338 SYSERROR("Failed to make / rslave");
1339 return -1;
1340 }
1341
1342 return 0;
1343 }
1344
1345 ret = access(rootfs->mount, F_OK);
1346 if (ret != 0) {
1347 SYSERROR("Failed to access to \"%s\". Check it is present",
1348 rootfs->mount);
1349 return -1;
1350 }
1351
1352 bdev = storage_init(conf);
1353 if (!bdev) {
1354 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1355 rootfs->path, rootfs->mount,
1356 rootfs->options ? rootfs->options : "(null)");
1357 return -1;
1358 }
1359
1360 ret = bdev->ops->mount(bdev);
1361 storage_put(bdev);
1362 if (ret < 0) {
1363 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\"",
1364 rootfs->path, rootfs->mount,
1365 rootfs->options ? rootfs->options : "(null)");
1366 return -1;
1367 }
1368
1369 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\"",
1370 rootfs->path, rootfs->mount,
1371 rootfs->options ? rootfs->options : "(null)");
1372
1373 return 0;
1374 }
1375
1376 int prepare_ramfs_root(char *root)
1377 {
1378 int i, ret;
1379 char *p, *p2;
1380 char buf[LXC_LINELEN], nroot[PATH_MAX];
1381 FILE *f;
1382
1383 if (!realpath(root, nroot))
1384 return -1;
1385
1386 ret = chdir("/");
1387 if (ret < 0)
1388 return -1;
1389
1390 /* We could use here MS_MOVE, but in userns this mount is locked and
1391 * can't be moved.
1392 */
1393 ret = mount(root, "/", NULL, MS_REC | MS_BIND, NULL);
1394 if (ret < 0) {
1395 SYSERROR("Failed to move \"%s\" into \"/\"", root);
1396 return -1;
1397 }
1398
1399 ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
1400 if (ret < 0) {
1401 SYSERROR("Failed to make \"/\" rprivate");
1402 return -1;
1403 }
1404
1405 /* The following code cleans up inhereted mounts which are not required
1406 * for CT.
1407 *
1408 * The mountinfo file shows not all mounts, if a few points have been
1409 * unmounted between read operations from the mountinfo. So we need to
1410 * read mountinfo a few times.
1411 *
1412 * This loop can be skipped if a container uses unserns, because all
1413 * inherited mounts are locked and we should live with all this trash.
1414 */
1415 for (;;) {
1416 int progress = 0;
1417
1418 f = fopen("./proc/self/mountinfo", "r");
1419 if (!f) {
1420 SYSERROR("Unable to open /proc/self/mountinfo");
1421 return -1;
1422 }
1423
1424 while (fgets(buf, LXC_LINELEN, f)) {
1425 for (p = buf, i=0; p && i < 4; i++)
1426 p = strchr(p+1, ' ');
1427
1428 if (!p)
1429 continue;
1430
1431 p2 = strchr(p+1, ' ');
1432 if (!p2)
1433 continue;
1434
1435 *p2 = '\0';
1436 *p = '.';
1437
1438 if (strcmp(p + 1, "/") == 0)
1439 continue;
1440
1441 if (strcmp(p + 1, "/proc") == 0)
1442 continue;
1443
1444 ret = umount2(p, MNT_DETACH);
1445 if (ret == 0)
1446 progress++;
1447 }
1448
1449 fclose(f);
1450
1451 if (!progress)
1452 break;
1453 }
1454
1455 /* This also can be skipped if a container uses unserns. */
1456 (void)umount2("./proc", MNT_DETACH);
1457
1458 /* It is weird, but chdir("..") moves us in a new root */
1459 ret = chdir("..");
1460 if (ret < 0) {
1461 SYSERROR("Unable to change working directory");
1462 return -1;
1463 }
1464
1465 ret = chroot(".");
1466 if (ret < 0) {
1467 SYSERROR("Unable to chroot");
1468 return -1;
1469 }
1470
1471 return 0;
1472 }
1473
1474 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1475 {
1476 int ret;
1477
1478 if (!rootfs->path) {
1479 DEBUG("Container does not have a rootfs");
1480 return 0;
1481 }
1482
1483 if (detect_ramfs_rootfs()) {
1484 DEBUG("Detected that container is on ramfs");
1485
1486 ret = prepare_ramfs_root(rootfs->mount);
1487 if (ret < 0) {
1488 ERROR("Failed to prepare minimal ramfs root");
1489 return -1;
1490 }
1491
1492 DEBUG("Prepared ramfs root for container");
1493 return 0;
1494 }
1495
1496 ret = setup_rootfs_pivot_root(rootfs->mount);
1497 if (ret < 0) {
1498 ERROR("Failed to pivot_root()");
1499 return -1;
1500 }
1501
1502 DEBUG("Finished pivot_root()");
1503 return 0;
1504 }
1505
1506 static const struct id_map *find_mapped_nsid_entry(struct lxc_conf *conf, unsigned id,
1507 enum idtype idtype)
1508 {
1509 struct lxc_list *it;
1510 struct id_map *map;
1511 struct id_map *retmap = NULL;
1512
1513 /* Shortcut for container's root mappings. */
1514 if (id == 0) {
1515 if (idtype == ID_TYPE_UID)
1516 return conf->root_nsuid_map;
1517
1518 if (idtype == ID_TYPE_GID)
1519 return conf->root_nsgid_map;
1520 }
1521
1522 lxc_list_for_each(it, &conf->id_map) {
1523 map = it->elem;
1524 if (map->idtype != idtype)
1525 continue;
1526
1527 if (id >= map->nsid && id < map->nsid + map->range) {
1528 retmap = map;
1529 break;
1530 }
1531 }
1532
1533 return retmap;
1534 }
1535
1536 static int lxc_setup_devpts(struct lxc_conf *conf)
1537 {
1538 int ret;
1539 const char *default_devpts_mntopts = "gid=5,newinstance,ptmxmode=0666,mode=0620";
1540 char devpts_mntopts[256];
1541
1542 if (conf->pty_max <= 0) {
1543 DEBUG("No new devpts instance will be mounted since no pts "
1544 "devices are requested");
1545 return 0;
1546 }
1547
1548 ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%zu",
1549 default_devpts_mntopts, conf->pty_max);
1550 if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
1551 return -1;
1552
1553 ret = umount2("/dev/pts", MNT_DETACH);
1554 if (ret < 0)
1555 SYSWARN("Failed to unmount old devpts instance");
1556 else
1557 DEBUG("Unmounted old devpts instance");
1558
1559 /* Create mountpoint for devpts instance. */
1560 ret = mkdir("/dev/pts", 0755);
1561 if (ret < 0 && errno != EEXIST) {
1562 SYSERROR("Failed to create \"/dev/pts\" directory");
1563 return -1;
1564 }
1565
1566 /* mount new devpts instance */
1567 ret = mount("devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC, devpts_mntopts);
1568 if (ret < 0) {
1569 /* try mounting without gid=5 */
1570 ret = mount("devpts", "/dev/pts", "devpts",
1571 MS_NOSUID | MS_NOEXEC, devpts_mntopts + sizeof("gid=5"));
1572 if (ret < 0) {
1573 SYSERROR("Failed to mount new devpts instance");
1574 return -1;
1575 }
1576 }
1577 DEBUG("Mount new devpts instance with options \"%s\"", devpts_mntopts);
1578
1579 /* Remove any pre-existing /dev/ptmx file. */
1580 ret = remove("/dev/ptmx");
1581 if (ret < 0) {
1582 if (errno != ENOENT) {
1583 SYSERROR("Failed to remove existing \"/dev/ptmx\" file");
1584 return -1;
1585 }
1586 } else {
1587 DEBUG("Removed existing \"/dev/ptmx\" file");
1588 }
1589
1590 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
1591 ret = open("/dev/ptmx", O_CREAT, 0666);
1592 if (ret < 0) {
1593 SYSERROR("Failed to create dummy \"/dev/ptmx\" file as bind mount target");
1594 return -1;
1595 }
1596 close(ret);
1597 DEBUG("Created dummy \"/dev/ptmx\" file as bind mount target");
1598
1599 /* Fallback option: create symlink /dev/ptmx -> /dev/pts/ptmx */
1600 ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
1601 if (!ret) {
1602 DEBUG("Bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1603 return 0;
1604 } else {
1605 /* Fallthrough and try to create a symlink. */
1606 ERROR("Failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1607 }
1608
1609 /* Remove the dummy /dev/ptmx file we created above. */
1610 ret = remove("/dev/ptmx");
1611 if (ret < 0) {
1612 SYSERROR("Failed to remove existing \"/dev/ptmx\"");
1613 return -1;
1614 }
1615
1616 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1617 ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
1618 if (ret < 0) {
1619 SYSERROR("Failed to create symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
1620 return -1;
1621 }
1622 DEBUG("Created symlink from \"/dev/ptmx\" to \"/dev/pts/ptmx\"");
1623
1624 return 0;
1625 }
1626
/* Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality unchanged". When the build has
 * no <sys/personality.h> support the function does nothing.
 *
 * Returns 0 on success (or when nothing had to be done) and -1 when
 * personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	int ret;

	if (persona == -1)
		return 0;

	ret = personality(persona);
	if (ret < 0) {
		SYSERROR("Failed to set personality to \"0x%x\"", persona);
		return -1;
	}

	INFO("Set personality to \"0x%x\"", persona);
#endif

	return 0;
}
1646
1647 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1648 const struct lxc_terminal *console)
1649 {
1650 int ret;
1651 char path[MAXPATHLEN];
1652 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1653
1654 if (console->path && !strcmp(console->path, "none"))
1655 return 0;
1656
1657 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1658 if (ret < 0 || (size_t)ret >= sizeof(path))
1659 return -1;
1660
1661 /* When we are asked to setup a console we remove any previous
1662 * /dev/console bind-mounts.
1663 */
1664 if (file_exists(path)) {
1665 ret = lxc_unstack_mountpoint(path, false);
1666 if (ret < 0) {
1667 SYSERROR("Failed to unmount \"%s\"", path);
1668 return -ret;
1669 } else {
1670 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1671 }
1672 }
1673
1674 /* For unprivileged containers autodev or automounts will already have
1675 * taken care of creating /dev/console.
1676 */
1677 ret = mknod(path, S_IFREG | 0000, 0);
1678 if (ret < 0) {
1679 if (errno != EEXIST) {
1680 SYSERROR("Failed to create console");
1681 return -errno;
1682 }
1683 }
1684
1685 ret = fchmod(console->slave, S_IXUSR | S_IXGRP | S_IXOTH);
1686 if (ret < 0) {
1687 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1688 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1689 return -errno;
1690 }
1691
1692 ret = safe_mount(console->name, path, "none", MS_BIND, 0, rootfs_path);
1693 if (ret < 0) {
1694 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, path);
1695 return -1;
1696 }
1697
1698 DEBUG("Mounted pts device \"%s\" onto \"%s\"", console->name, path);
1699 return 0;
1700 }
1701
1702 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1703 const struct lxc_terminal *console,
1704 char *ttydir)
1705 {
1706 int ret, fd;
1707 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1708 char *rootfs_path = rootfs->path ? rootfs->mount : "";
1709
1710 if (console->path && !strcmp(console->path, "none"))
1711 return 0;
1712
1713 /* create rootfs/dev/<ttydir> directory */
1714 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs_path, ttydir);
1715 if (ret < 0 || (size_t)ret >= sizeof(path))
1716 return -1;
1717
1718 ret = mkdir(path, 0755);
1719 if (ret && errno != EEXIST) {
1720 SYSERROR("Failed to create \"%s\"", path);
1721 return -errno;
1722 }
1723 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1724
1725 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs_path, ttydir);
1726 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1727 return -1;
1728
1729 ret = creat(lxcpath, 0660);
1730 if (ret == -1 && errno != EEXIST) {
1731 SYSERROR("Failed to create \"%s\"", lxcpath);
1732 return -errno;
1733 }
1734 if (ret >= 0)
1735 close(ret);
1736
1737 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs_path);
1738 if (ret < 0 || (size_t)ret >= sizeof(path))
1739 return -1;
1740
1741 if (file_exists(path)) {
1742 ret = lxc_unstack_mountpoint(path, false);
1743 if (ret < 0) {
1744 SYSERROR("Failed to unmount \"%s\"", path);
1745 return -ret;
1746 } else {
1747 DEBUG("Cleared all (%d) mounts from \"%s\"", ret, path);
1748 }
1749 }
1750
1751 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1752 if (fd < 0) {
1753 if (errno != EEXIST) {
1754 SYSERROR("Failed to create console");
1755 return -errno;
1756 }
1757 } else {
1758 close(fd);
1759 }
1760
1761 ret = chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH);
1762 if (ret < 0) {
1763 SYSERROR("Failed to set mode \"0%o\" to \"%s\"",
1764 S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1765 return -errno;
1766 }
1767
1768 /* bind mount console->name to '/dev/<ttydir>/console' */
1769 ret = safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs_path);
1770 if (ret < 0) {
1771 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
1772 return -1;
1773 }
1774 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1775
1776 /* bind mount '/dev/<ttydir>/console' to '/dev/console' */
1777 ret = safe_mount(lxcpath, path, "none", MS_BIND, 0, rootfs_path);
1778 if (ret < 0) {
1779 ERROR("Failed to mount \"%s\" on \"%s\"", console->name, lxcpath);
1780 return -1;
1781 }
1782 DEBUG("Mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1783
1784 DEBUG("Console has been setup under \"%s\" and mounted to \"%s\"", lxcpath, path);
1785 return 0;
1786 }
1787
/* Set up the container console: directly on "/dev/console" when no @ttydir is
 * given, otherwise below "/dev/<ttydir>".
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_terminal *console, char *ttydir)
{
	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1797
1798 static void parse_mntopt(char *opt, unsigned long *flags, char **data, size_t size)
1799 {
1800 struct mount_opt *mo;
1801
1802 /* If opt is found in mount_opt, set or clear flags.
1803 * Otherwise append it to data. */
1804
1805 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1806 if (strncmp(opt, mo->name, strlen(mo->name)) == 0) {
1807 if (mo->clear)
1808 *flags &= ~mo->flag;
1809 else
1810 *flags |= mo->flag;
1811 return;
1812 }
1813 }
1814
1815 if (strlen(*data))
1816 (void)strlcat(*data, ",", size);
1817
1818 (void)strlcat(*data, opt, size);
1819 }
1820
/* Split the comma-separated option string @mntopts into mount flags and
 * filesystem specific data.
 *
 * @mntflags receives the flags collected by parse_mntopt(). @mntdata receives
 * a newly allocated string holding the remaining options, or NULL when there
 * are none; the caller owns that string and must free() it.
 *
 * Returns 0 on success (including when @mntopts is NULL) and -1 when memory
 * allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata)
{
	char *copy, *data, *token;
	char *saveptr = NULL;
	size_t size;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy)
		return -1;

	/* The data string can never be longer than the input. */
	size = strlen(copy) + 1;
	data = malloc(size);
	if (!data) {
		free(copy);
		return -1;
	}
	*data = 0;

	/* Iterate with a separate cursor so that @copy keeps pointing at the
	 * allocation and can be released afterwards.
	 */
	for (token = strtok_r(copy, ",", &saveptr); token;
	     token = strtok_r(NULL, ",", &saveptr))
		parse_mntopt(token, mntflags, &data, size);

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1856
1857 static void parse_propagationopt(char *opt, unsigned long *flags)
1858 {
1859 struct mount_opt *mo;
1860
1861 /* If opt is found in propagation_opt, set or clear flags. */
1862 for (mo = &propagation_opt[0]; mo->name != NULL; mo++) {
1863 if (strncmp(opt, mo->name, strlen(mo->name)) != 0)
1864 continue;
1865
1866 if (mo->clear)
1867 *flags &= ~mo->flag;
1868 else
1869 *flags |= mo->flag;
1870
1871 return;
1872 }
1873 }
1874
/* Collect the mount propagation flags named in the comma-separated option
 * string @mntopts into @pflags.
 *
 * @pflags is left untouched when @mntopts is NULL or when copying the string
 * fails; otherwise it is reset to 0 before the options are evaluated.
 *
 * Returns 0 on success and -ENOMEM when the option string cannot be copied.
 */
static int parse_propagationopts(const char *mntopts, unsigned long *pflags)
{
	char *copy, *token;
	char *saveptr = NULL;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("Failed to allocate memory");
		return -ENOMEM;
	}

	*pflags = 0L;

	/* Iterate with a separate cursor so that @copy keeps pointing at the
	 * allocation and can be released afterwards.
	 */
	for (token = strtok_r(copy, ",", &saveptr); token;
	     token = strtok_r(NULL, ",", &saveptr))
		parse_propagationopt(token, pflags);
	free(copy);

	return 0;
}
1896
/* Terminate @word in place at its first space or tab character. A string
 * without either character is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1903
/* Skip @nfields space- or tab-terminated fields in @src.
 *
 * Returns a pointer to the character following the separator of the last
 * skipped field, or a pointer to the terminating '\0' when @src runs out of
 * fields first. Each separator character ends exactly one field, so
 * consecutive separators delimit empty fields.
 */
static char *get_field(char *src, int nfields)
{
	char *cursor = src;
	int skipped;

	for (skipped = 0; skipped < nfields; skipped++) {
		/* Move to the separator that ends the current field. */
		cursor += strcspn(cursor, " \t");
		if (*cursor == '\0')
			return cursor;

		/* Step over the separator itself. */
		cursor++;
	}

	return cursor;
}
1922
/* Perform a single mount and apply the follow-up steps it may require.
 *
 * @fsname      mount source; prefixed with @rootfs when @relative is set
 * @target      mount point
 * @fstype      filesystem type
 * @mountflags  mount flags; MS_REMOUNT is masked out of the initial mount
 * @pflags      propagation flags applied to @target after mounting (0: none)
 * @data        filesystem specific mount data
 * @optional    when true, mount failures are logged and reported as success
 * @dev         when true, MS_NODEV found on the source filesystem is not
 *              carried over to the remount
 * @relative    interpret @fsname relative to @rootfs
 * @rootfs      rootfs mount point handed to safe_mount(); may be NULL
 *
 * For bind mounts and remount requests a second mount() with MS_REMOUNT is
 * issued so that flags such as MS_RDONLY take effect. With statvfs() support
 * the flags of the source filesystem are merged in, and the remount is skipped
 * for bind mounts that would not gain any flag.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       unsigned long pflags, const char *data, bool optional,
		       bool dev, bool relative, const char *rootfs)
{
	int ret;
	char srcbuf[MAXPATHLEN];
	const char *srcpath = fsname;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (relative) {
		ret = snprintf(srcbuf, MAXPATHLEN, "%s/%s", rootfs ? rootfs : "/", fsname ? fsname : "");
		if (ret < 0 || ret >= MAXPATHLEN) {
			ERROR("source path is too long");
			return -1;
		}
		srcpath = srcbuf;
	}

	ret = safe_mount(srcpath, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
				srcpath ? srcpath : "(null)", target);
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"",
			 srcpath ? srcpath : "(null)", target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options", srcpath ? srcpath : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		if (srcpath && statvfs(srcpath, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", srcpath, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(srcpath, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to mount \"%s\" on \"%s\" (optional)",
					srcpath ? srcpath : "(null)", target);
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"",
				 srcpath ? srcpath : "(null)", target);
			return -1;
		}
	}

	/* Skipping the remount must not skip the propagation change, so the
	 * label sits in front of it.
	 */
#ifdef HAVE_STATVFS
skipremount:
#endif
	if (pflags) {
		ret = mount(NULL, target, NULL, pflags, NULL);
		if (ret < 0) {
			if (optional) {
				SYSINFO("Failed to change mount propagation "
					"for \"%s\" (optional)", target);
				return 0;
			} else {
				SYSERROR("Failed to change mount propagation "
					 "for \"%s\"", target);
				return -1;
			}
		}
		DEBUG("Changed mount propagation for \"%s\"", target);
	}

	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"",
	      srcpath ? srcpath : "(null)", target, fstype);

	return 0;
}
2041
/* Remove "create=dir", "create=file", "optional" and "relative" from
 * mntent->mnt_opts, editing the string in place.
 *
 * Only the first occurrence of each name is removed. When the name is followed
 * by a ',', everything up to and including that comma is cut out; otherwise
 * the string is truncated where the name starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
		"relative",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Shift the rest of the string, including its
			 * terminator, over the culled option.
			 */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
2071
/* Create the mount target @path when the mount entry asks for it.
 *
 * - For filesystem types starting with "overlay", ovl_mkdir() is called first.
 * - With the "create=dir" option, @path is created as a directory.
 * - With the "create=file" option, @path is created as an empty file (after
 *   creating its parent directory) unless it already exists.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int fd, ret;
	char *p1, *p2;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
		if (ret < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (!hasmntopt(mntent, "create=file"))
		return 0;

	ret = access(path, F_OK);
	if (ret == 0)
		return 0;

	/* dirname() may modify its argument, so work on a copy. */
	p1 = strdup(path);
	if (!p1)
		return -1;

	p2 = dirname(p1);

	ret = mkdir_p(p2, 0755);
	if (ret < 0 && errno != EEXIST) {
		/* Evaluate errno and log the parent directory before the copy
		 * is released: free() may change errno and @p2 may point into
		 * @p1.
		 */
		SYSERROR("Failed to create directory \"%s\"", p2);
		free(p1);
		return -1;
	}
	free(p1);

	fd = open(path, O_CREAT, 0644);
	if (fd < 0) {
		SYSERROR("Failed to create file \"%s\"", path);
		return -1;
	}
	close(fd);

	return 0;
}
2121
2122 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2123 * without a rootfs. */
2124 static inline int mount_entry_on_generic(struct mntent *mntent,
2125 const char *path,
2126 const struct lxc_rootfs *rootfs,
2127 const char *lxc_name,
2128 const char *lxc_path)
2129 {
2130 int ret;
2131 unsigned long mntflags;
2132 char *mntdata;
2133 bool dev, optional, relative;
2134 unsigned long pflags = 0;
2135 char *rootfs_path = NULL;
2136
2137 optional = hasmntopt(mntent, "optional") != NULL;
2138 dev = hasmntopt(mntent, "dev") != NULL;
2139 relative = hasmntopt(mntent, "relative") != NULL;
2140
2141 if (rootfs && rootfs->path)
2142 rootfs_path = rootfs->mount;
2143
2144 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2145 lxc_path);
2146 if (ret < 0) {
2147 if (optional)
2148 return 0;
2149
2150 return -1;
2151 }
2152 cull_mntent_opt(mntent);
2153
2154 ret = parse_propagationopts(mntent->mnt_opts, &pflags);
2155 if (ret < 0)
2156 return -1;
2157
2158 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2159 if (ret < 0)
2160 return -1;
2161
2162 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2163 pflags, mntdata, optional, dev, relative, rootfs_path);
2164
2165 free(mntdata);
2166 return ret;
2167 }
2168
/* Mount @mntent for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as absolute
 * paths starting at / on the host, so a leading '/' is added to the mount
 * directory when it is missing.
 *
 * Returns the result of mount_entry_on_generic(), or -1 when the target path
 * does not fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	int len;
	char target[MAXPATHLEN];

	if (mntent->mnt_dir[0] == '/')
		len = snprintf(target, sizeof(target), "%s", mntent->mnt_dir);
	else
		len = snprintf(target, sizeof(target), "/%s", mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target))
		return -1;

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
2186
2187 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2188 const struct lxc_rootfs *rootfs,
2189 const char *lxc_name,
2190 const char *lxc_path)
2191 {
2192 int offset;
2193 char *aux;
2194 const char *lxcpath;
2195 char path[MAXPATHLEN];
2196 int ret = 0;
2197
2198 lxcpath = lxc_global_config_value("lxc.lxcpath");
2199 if (!lxcpath)
2200 return -1;
2201
2202 /* If rootfs->path is a blockdev path, allow container fstab to use
2203 * <lxcpath>/<name>/rootfs" as the target prefix.
2204 */
2205 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2206 if (ret < 0 || ret >= MAXPATHLEN)
2207 goto skipvarlib;
2208
2209 aux = strstr(mntent->mnt_dir, path);
2210 if (aux) {
2211 offset = strlen(path);
2212 goto skipabs;
2213 }
2214
2215 skipvarlib:
2216 aux = strstr(mntent->mnt_dir, rootfs->path);
2217 if (!aux) {
2218 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
2219 return ret;
2220 }
2221 offset = strlen(rootfs->path);
2222
2223 skipabs:
2224 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2225 if (ret < 0 || ret >= MAXPATHLEN)
2226 return -1;
2227
2228 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2229 }
2230
2231 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2232 const struct lxc_rootfs *rootfs,
2233 const char *lxc_name,
2234 const char *lxc_path)
2235 {
2236 int ret;
2237 char path[MAXPATHLEN];
2238
2239 /* relative to root mount point */
2240 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2241 if (ret < 0 || (size_t)ret >= sizeof(path))
2242 return -1;
2243
2244 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2245 }
2246
2247 static int mount_file_entries(const struct lxc_conf *conf,
2248 const struct lxc_rootfs *rootfs, FILE *file,
2249 const char *lxc_name, const char *lxc_path)
2250 {
2251 char buf[4096];
2252 struct mntent mntent;
2253 int ret = -1;
2254
2255 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2256 if (!rootfs->path)
2257 ret = mount_entry_on_systemfs(&mntent);
2258 else if (mntent.mnt_dir[0] != '/')
2259 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2260 lxc_name, lxc_path);
2261 else
2262 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2263 lxc_name, lxc_path);
2264 if (ret < 0)
2265 return -1;
2266 }
2267 ret = 0;
2268
2269 INFO("Finished setting up mounts");
2270 return ret;
2271 }
2272
/* Mount all entries of the fstab file @fstab. A NULL @fstab means there is
 * nothing to do.
 *
 * Returns 0 on success and -1 when the file cannot be opened or an entry
 * fails to mount.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *stream;
	int status;

	if (!fstab)
		return 0;

	stream = setmntent(fstab, "r");
	if (!stream) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	status = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	if (status < 0)
		ERROR("Failed to set up mount entries");

	endmntent(stream);
	return status;
}
2296
2297 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2298 {
2299 int ret;
2300 char *mount_entry;
2301 struct lxc_list *iterator;
2302 int fd = -1;
2303
2304 fd = memfd_create(".lxc_mount_file", MFD_CLOEXEC);
2305 if (fd < 0) {
2306 char template[] = P_tmpdir "/.lxc_mount_file_XXXXXX";
2307
2308 if (errno != ENOSYS)
2309 return NULL;
2310
2311 fd = lxc_make_tmpfile(template, true);
2312 if (fd < 0) {
2313 SYSERROR("Could not create temporary mount file");
2314 return NULL;
2315 }
2316
2317 TRACE("Created temporary mount file");
2318 }
2319 if (fd < 0) {
2320 SYSERROR("Could not create temporary mount file");
2321 return NULL;
2322 }
2323
2324 lxc_list_for_each (iterator, mount) {
2325 size_t len;
2326
2327 mount_entry = iterator->elem;
2328 len = strlen(mount_entry);
2329
2330 ret = lxc_write_nointr(fd, mount_entry, len);
2331 if (ret != len)
2332 goto on_error;
2333
2334 ret = lxc_write_nointr(fd, "\n", 1);
2335 if (ret != 1)
2336 goto on_error;
2337 }
2338
2339 ret = lseek(fd, 0, SEEK_SET);
2340 if (ret < 0)
2341 goto on_error;
2342
2343 return fdopen(fd, "r+");
2344
2345 on_error:
2346 SYSERROR("Failed to write mount entry to temporary mount file");
2347 close(fd);
2348 return NULL;
2349 }
2350
/* Mount the entries in the list @mount by writing them to an anonymous file
 * and processing that file like an fstab.
 *
 * Returns 0 on success and -1 on failure.
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int status;
	FILE *stream = make_anonymous_mount_file(mount);

	if (!stream)
		return -1;

	status = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	fclose(stream);

	return status;
}
2368
2369 static int parse_cap(const char *cap)
2370 {
2371 size_t i;
2372 int capid = -1;
2373 size_t end = sizeof(caps_opt) / sizeof(caps_opt[0]);
2374 char *ptr = NULL;
2375
2376 if (strcmp(cap, "none") == 0)
2377 return -2;
2378
2379 for (i = 0; i < end; i++) {
2380 if (strcmp(cap, caps_opt[i].name))
2381 continue;
2382
2383 capid = caps_opt[i].value;
2384 break;
2385 }
2386
2387 if (capid < 0) {
2388 /* Try to see if it's numeric, so the user may specify
2389 * capabilities that the running kernel knows about but we
2390 * don't
2391 */
2392 errno = 0;
2393 capid = strtol(cap, &ptr, 10);
2394 if (!ptr || *ptr != '\0' || errno != 0)
2395 /* not a valid number */
2396 capid = -1;
2397 else if (capid > lxc_caps_last_cap())
2398 /* we have a number but it's not a valid
2399 * capability */
2400 capid = -1;
2401 }
2402
2403 return capid;
2404 }
2405
2406 int in_caplist(int cap, struct lxc_list *caps)
2407 {
2408 int capid;
2409 struct lxc_list *iterator;
2410
2411 lxc_list_for_each (iterator, caps) {
2412 capid = parse_cap(iterator->elem);
2413 if (capid == cap)
2414 return 1;
2415 }
2416
2417 return 0;
2418 }
2419
2420 static int setup_caps(struct lxc_list *caps)
2421 {
2422 int capid;
2423 char *drop_entry;
2424 struct lxc_list *iterator;
2425
2426 lxc_list_for_each (iterator, caps) {
2427 int ret;
2428
2429 drop_entry = iterator->elem;
2430
2431 capid = parse_cap(drop_entry);
2432 if (capid < 0) {
2433 ERROR("unknown capability %s", drop_entry);
2434 return -1;
2435 }
2436
2437 ret = prctl(PR_CAPBSET_DROP, capid, 0, 0, 0);
2438 if (ret < 0) {
2439 SYSERROR("Failed to remove %s capability", drop_entry);
2440 return -1;
2441 }
2442 DEBUG("Dropped %s (%d) capability", drop_entry, capid);
2443 }
2444
2445 DEBUG("Capabilities have been setup");
2446 return 0;
2447 }
2448
2449 static int dropcaps_except(struct lxc_list *caps)
2450 {
2451 int i, capid, numcaps;
2452 char *keep_entry;
2453 struct lxc_list *iterator;
2454
2455 numcaps = lxc_caps_last_cap() + 1;
2456 if (numcaps <= 0 || numcaps > 200)
2457 return -1;
2458 TRACE("Found %d capabilities", numcaps);
2459
2460 /* caplist[i] is 1 if we keep capability i */
2461 int *caplist = alloca(numcaps * sizeof(int));
2462 memset(caplist, 0, numcaps * sizeof(int));
2463
2464 lxc_list_for_each (iterator, caps) {
2465 keep_entry = iterator->elem;
2466
2467 capid = parse_cap(keep_entry);
2468 if (capid == -2)
2469 continue;
2470
2471 if (capid < 0) {
2472 ERROR("Unknown capability %s", keep_entry);
2473 return -1;
2474 }
2475
2476 DEBUG("Keep capability %s (%d)", keep_entry, capid);
2477 caplist[capid] = 1;
2478 }
2479
2480 for (i = 0; i < numcaps; i++) {
2481 int ret;
2482
2483 if (caplist[i])
2484 continue;
2485
2486 ret = prctl(PR_CAPBSET_DROP, i, 0, 0, 0);
2487 if (ret < 0) {
2488 SYSERROR("Failed to remove capability %d", i);
2489 return -1;
2490 }
2491 }
2492
2493 DEBUG("Capabilities have been setup");
2494 return 0;
2495 }
2496
2497 static int parse_resource(const char *res)
2498 {
2499 int ret;
2500 size_t i;
2501 int resid = -1;
2502
2503 for (i = 0; i < sizeof(limit_opt) / sizeof(limit_opt[0]); ++i)
2504 if (strcmp(res, limit_opt[i].name) == 0)
2505 return limit_opt[i].value;
2506
2507 /* Try to see if it's numeric, so the user may specify
2508 * resources that the running kernel knows about but
2509 * we don't.
2510 */
2511 ret = lxc_safe_int(res, &resid);
2512 if (ret < 0)
2513 return -1;
2514
2515 return resid;
2516 }
2517
2518 int setup_resource_limits(struct lxc_list *limits, pid_t pid)
2519 {
2520 int resid;
2521 struct lxc_list *it;
2522 struct lxc_limit *lim;
2523
2524 lxc_list_for_each (it, limits) {
2525 lim = it->elem;
2526
2527 resid = parse_resource(lim->resource);
2528 if (resid < 0) {
2529 ERROR("Unknown resource %s", lim->resource);
2530 return -1;
2531 }
2532
2533 #if HAVE_PRLIMIT || HAVE_PRLIMIT64
2534 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2535 SYSERROR("Failed to set limit %s", lim->resource);
2536 return -1;
2537 }
2538 #else
2539 ERROR("Cannot set limit %s as prlimit is missing", lim->resource);
2540 return -1;
2541 #endif
2542 }
2543
2544 return 0;
2545 }
2546
2547 int setup_sysctl_parameters(struct lxc_list *sysctls)
2548 {
2549 struct lxc_list *it;
2550 struct lxc_sysctl *elem;
2551 int ret = 0;
2552 char *tmp = NULL;
2553 char filename[MAXPATHLEN] = {0};
2554
2555 lxc_list_for_each (it, sysctls) {
2556 elem = it->elem;
2557 tmp = lxc_string_replace(".", "/", elem->key);
2558 if (!tmp) {
2559 ERROR("Failed to replace key %s", elem->key);
2560 return -1;
2561 }
2562
2563 ret = snprintf(filename, sizeof(filename), "/proc/sys/%s", tmp);
2564 free(tmp);
2565 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2566 ERROR("Error setting up sysctl parameters path");
2567 return -1;
2568 }
2569
2570 ret = lxc_write_to_file(filename, elem->value,
2571 strlen(elem->value), false, 0666);
2572 if (ret < 0) {
2573 ERROR("Failed to setup sysctl parameters %s to %s",
2574 elem->key, elem->value);
2575 return -1;
2576 }
2577 }
2578
2579 return 0;
2580 }
2581
2582 int setup_proc_filesystem(struct lxc_list *procs, pid_t pid)
2583 {
2584 struct lxc_list *it;
2585 struct lxc_proc *elem;
2586 int ret = 0;
2587 char *tmp = NULL;
2588 char filename[MAXPATHLEN] = {0};
2589
2590 lxc_list_for_each (it, procs) {
2591 elem = it->elem;
2592 tmp = lxc_string_replace(".", "/", elem->filename);
2593 if (!tmp) {
2594 ERROR("Failed to replace key %s", elem->filename);
2595 return -1;
2596 }
2597
2598 ret = snprintf(filename, sizeof(filename), "/proc/%d/%s", pid, tmp);
2599 free(tmp);
2600 if (ret < 0 || (size_t)ret >= sizeof(filename)) {
2601 ERROR("Error setting up proc filesystem path");
2602 return -1;
2603 }
2604
2605 ret = lxc_write_to_file(filename, elem->value,
2606 strlen(elem->value), false, 0666);
2607 if (ret < 0) {
2608 ERROR("Failed to setup proc filesystem %s to %s",
2609 elem->filename, elem->value);
2610 return -1;
2611 }
2612 }
2613
2614 return 0;
2615 }
2616
2617 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2618
2619 struct lxc_conf *lxc_conf_init(void)
2620 {
2621 int i;
2622 struct lxc_conf *new;
2623
2624 new = malloc(sizeof(*new));
2625 if (!new)
2626 return NULL;
2627 memset(new, 0, sizeof(*new));
2628
2629 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2630 new->personality = -1;
2631 new->autodev = 1;
2632 new->console.buffer_size = 0;
2633 new->console.log_path = NULL;
2634 new->console.log_fd = -1;
2635 new->console.log_size = 0;
2636 new->console.path = NULL;
2637 new->console.peer = -1;
2638 new->console.proxy.busy = -1;
2639 new->console.proxy.master = -1;
2640 new->console.proxy.slave = -1;
2641 new->console.master = -1;
2642 new->console.slave = -1;
2643 new->console.name[0] = '\0';
2644 memset(&new->console.ringbuf, 0, sizeof(struct lxc_ringbuf));
2645 new->maincmd_fd = -1;
2646 new->nbd_idx = -1;
2647 new->rootfs.mount = strdup(default_rootfs_mount);
2648 if (!new->rootfs.mount) {
2649 free(new);
2650 return NULL;
2651 }
2652 new->logfd = -1;
2653 lxc_list_init(&new->cgroup);
2654 lxc_list_init(&new->cgroup2);
2655 lxc_list_init(&new->network);
2656 lxc_list_init(&new->mount_list);
2657 lxc_list_init(&new->caps);
2658 lxc_list_init(&new->keepcaps);
2659 lxc_list_init(&new->id_map);
2660 new->root_nsuid_map = NULL;
2661 new->root_nsgid_map = NULL;
2662 lxc_list_init(&new->includes);
2663 lxc_list_init(&new->aliens);
2664 lxc_list_init(&new->environment);
2665 lxc_list_init(&new->limits);
2666 lxc_list_init(&new->sysctls);
2667 lxc_list_init(&new->procs);
2668 new->hooks_version = 0;
2669 for (i = 0; i < NUM_LXC_HOOKS; i++)
2670 lxc_list_init(&new->hooks[i]);
2671 lxc_list_init(&new->groups);
2672 lxc_list_init(&new->state_clients);
2673 new->lsm_aa_profile = NULL;
2674 new->lsm_se_context = NULL;
2675 new->tmp_umount_proc = false;
2676
2677 /* if running in a new user namespace, init and COMMAND
2678 * default to running as UID/GID 0 when using lxc-execute */
2679 new->init_uid = 0;
2680 new->init_gid = 0;
2681 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2682 memset(&new->ns_share, 0, sizeof(char *) * LXC_NS_MAX);
2683
2684 return new;
2685 }
2686
2687 int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2688 size_t buf_size)
2689 {
2690 int fd, ret;
2691 char path[MAXPATHLEN];
2692
2693 if (geteuid() != 0 && idtype == ID_TYPE_GID) {
2694 size_t buflen;
2695
2696 ret = snprintf(path, MAXPATHLEN, "/proc/%d/setgroups", pid);
2697 if (ret < 0 || ret >= MAXPATHLEN)
2698 return -E2BIG;
2699
2700 fd = open(path, O_WRONLY);
2701 if (fd < 0 && errno != ENOENT) {
2702 SYSERROR("Failed to open \"%s\"", path);
2703 return -1;
2704 }
2705
2706 if (fd >= 0) {
2707 buflen = sizeof("deny\n") - 1;
2708 errno = 0;
2709 ret = lxc_write_nointr(fd, "deny\n", buflen);
2710 close(fd);
2711 if (ret != buflen) {
2712 SYSERROR("Failed to write \"deny\" to "
2713 "\"/proc/%d/setgroups\"", pid);
2714 return -1;
2715 }
2716 TRACE("Wrote \"deny\" to \"/proc/%d/setgroups\"", pid);
2717 }
2718 }
2719
2720 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2721 idtype == ID_TYPE_UID ? 'u' : 'g');
2722 if (ret < 0 || ret >= MAXPATHLEN)
2723 return -E2BIG;
2724
2725 fd = open(path, O_WRONLY);
2726 if (fd < 0) {
2727 SYSERROR("Failed to open \"%s\"", path);
2728 return -1;
2729 }
2730
2731 errno = 0;
2732 ret = lxc_write_nointr(fd, buf, buf_size);
2733 close(fd);
2734 if (ret != buf_size) {
2735 SYSERROR("Failed to write %cid mapping to \"%s\"",
2736 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2737 return -1;
2738 }
2739
2740 return 0;
2741 }
2742
2743 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2744 *
2745 * @return 1 if functional binary was found
2746 * @return 0 if binary exists but is lacking privilege
2747 * @return -ENOENT if binary does not exist
2748 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2749 */
2750 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2751 {
2752 char *path;
2753 int ret;
2754 struct stat st;
2755 int fret = 0;
2756
2757 if (cap != CAP_SETUID && cap != CAP_SETGID)
2758 return -EINVAL;
2759
2760 path = on_path(binary, NULL);
2761 if (!path)
2762 return -ENOENT;
2763
2764 ret = stat(path, &st);
2765 if (ret < 0) {
2766 fret = -errno;
2767 goto cleanup;
2768 }
2769
2770 /* Check if the binary is setuid. */
2771 if (st.st_mode & S_ISUID) {
2772 DEBUG("The binary \"%s\" does have the setuid bit set", path);
2773 fret = 1;
2774 goto cleanup;
2775 }
2776
2777 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2778 /* Check if it has the CAP_SETUID capability. */
2779 if ((cap & CAP_SETUID) &&
2780 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2781 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2782 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2783 "and CAP_PERMITTED sets", path);
2784 fret = 1;
2785 goto cleanup;
2786 }
2787
2788 /* Check if it has the CAP_SETGID capability. */
2789 if ((cap & CAP_SETGID) &&
2790 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2791 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2792 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2793 "and CAP_PERMITTED sets", path);
2794 fret = 1;
2795 goto cleanup;
2796 }
2797 #else
2798 /* If we cannot check for file capabilities we need to give the benefit
2799 * of the doubt. Otherwise we might fail even though all the necessary
2800 * file capabilities are set.
2801 */
2802 DEBUG("Cannot check for file capabilites as full capability support is "
2803 "missing. Manual intervention needed");
2804 fret = 1;
2805 #endif
2806
2807 cleanup:
2808 free(path);
2809 return fret;
2810 }
2811
/* Callback for run_command(): execute the command line passed in @args (a
 * NUL-terminated string) through "/bin/sh -c".
 *
 * Only returns, with -1, when execl() itself failed.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);

	return -1;
}
2817
2818 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2819 {
2820 int fill, left;
2821 char u_or_g;
2822 char *pos;
2823 char cmd_output[MAXPATHLEN];
2824 struct id_map *map;
2825 struct lxc_list *iterator;
2826 enum idtype type;
2827 /* strlen("new@idmap") = 9
2828 * +
2829 * strlen(" ") = 1
2830 * +
2831 * LXC_NUMSTRLEN64
2832 * +
2833 * strlen(" ") = 1
2834 *
2835 * We add some additional space to make sure that we really have
2836 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2837 */
2838 int ret = 0, gidmap = 0, uidmap = 0;
2839 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2840 bool had_entry = false, use_shadow = false;
2841 int hostuid, hostgid;
2842
2843 hostuid = geteuid();
2844 hostgid = getegid();
2845
2846 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2847 * ranges, then insist that root also reserve ranges in subuid. This
2848 * will protected it by preventing another user from being handed the
2849 * range by shadow.
2850 */
2851 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2852 if (uidmap == -ENOENT)
2853 WARN("newuidmap binary is missing");
2854 else if (!uidmap)
2855 WARN("newuidmap is lacking necessary privileges");
2856
2857 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2858 if (gidmap == -ENOENT)
2859 WARN("newgidmap binary is missing");
2860 else if (!gidmap)
2861 WARN("newgidmap is lacking necessary privileges");
2862
2863 if (uidmap > 0 && gidmap > 0) {
2864 DEBUG("Functional newuidmap and newgidmap binary found");
2865 use_shadow = true;
2866 } else {
2867 /* In case unprivileged users run application containers via
2868 * execute() or a start*() there are valid cases where they may
2869 * only want to map their own {g,u}id. Let's not block them from
2870 * doing so by requiring geteuid() == 0.
2871 */
2872 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2873 "write directly with euid %d", hostuid);
2874 }
2875
2876 /* Check if we really need to use newuidmap and newgidmap.
2877 * If the user is only remapping his own {g,u}id, we don't need it.
2878 */
2879 if (use_shadow && lxc_list_len(idmap) == 2) {
2880 use_shadow = false;
2881 lxc_list_for_each(iterator, idmap) {
2882 map = iterator->elem;
2883 if (map->idtype == ID_TYPE_UID && map->range == 1 &&
2884 map->nsid == hostuid && map->hostid == hostuid)
2885 continue;
2886 if (map->idtype == ID_TYPE_GID && map->range == 1 &&
2887 map->nsid == hostgid && map->hostid == hostgid)
2888 continue;
2889 use_shadow = true;
2890 break;
2891 }
2892 }
2893
2894 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2895 type++, u_or_g = 'g') {
2896 pos = mapbuf;
2897
2898 if (use_shadow)
2899 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
2900
2901 lxc_list_for_each(iterator, idmap) {
2902 map = iterator->elem;
2903 if (map->idtype != type)
2904 continue;
2905
2906 had_entry = true;
2907
2908 left = LXC_IDMAPLEN - (pos - mapbuf);
2909 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
2910 use_shadow ? " " : "", map->nsid,
2911 map->hostid, map->range,
2912 use_shadow ? "" : "\n");
2913 if (fill <= 0 || fill >= left) {
2914 /* The kernel only takes <= 4k for writes to
2915 * /proc/<pid>/{g,u}id_map
2916 */
2917 SYSERROR("Too many %cid mappings defined", u_or_g);
2918 return -1;
2919 }
2920
2921 pos += fill;
2922 }
2923 if (!had_entry)
2924 continue;
2925
2926 /* Try to catch the ouput of new{g,u}idmap to make debugging
2927 * easier.
2928 */
2929 if (use_shadow) {
2930 ret = run_command(cmd_output, sizeof(cmd_output),
2931 lxc_map_ids_exec_wrapper,
2932 (void *)mapbuf);
2933 if (ret < 0) {
2934 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2935 u_or_g, cmd_output, mapbuf);
2936 return -1;
2937 }
2938 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
2939 } else {
2940 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2941 if (ret < 0) {
2942 ERROR("Failed to write mapping: %s", mapbuf);
2943 return -1;
2944 }
2945 TRACE("Wrote mapping \"%s\"", mapbuf);
2946 }
2947
2948 memset(mapbuf, 0, sizeof(mapbuf));
2949 }
2950
2951 return 0;
2952 }
2953
2954 /* Return the host uid/gid to which the container root is mapped in val.
2955 * Return true if id was found, false otherwise.
2956 */
2957 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
2958 unsigned long *val)
2959 {
2960 unsigned nsid;
2961 struct id_map *map;
2962 struct lxc_list *it;
2963
2964 if (idtype == ID_TYPE_UID)
2965 nsid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
2966 else
2967 nsid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
2968
2969 lxc_list_for_each (it, &conf->id_map) {
2970 map = it->elem;
2971 if (map->idtype != idtype)
2972 continue;
2973 if (map->nsid != nsid)
2974 continue;
2975 *val = map->hostid;
2976 return true;
2977 }
2978
2979 return false;
2980 }
2981
2982 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
2983 {
2984 struct id_map *map;
2985 struct lxc_list *it;
2986
2987 lxc_list_for_each (it, &conf->id_map) {
2988 map = it->elem;
2989 if (map->idtype != idtype)
2990 continue;
2991
2992 if (id >= map->hostid && id < map->hostid + map->range)
2993 return (id - map->hostid) + map->nsid;
2994 }
2995
2996 return -1;
2997 }
2998
2999 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3000 {
3001 struct id_map *map;
3002 struct lxc_list *it;
3003 unsigned int freeid = 0;
3004
3005 again:
3006 lxc_list_for_each (it, &conf->id_map) {
3007 map = it->elem;
3008 if (map->idtype != idtype)
3009 continue;
3010
3011 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3012 freeid = map->nsid + map->range;
3013 goto again;
3014 }
3015 }
3016
3017 return freeid;
3018 }
3019
/* Callback for run_command(): execute "lxc-usernsexec" with the
 * NULL-terminated argument vector passed in @args.
 *
 * Only returns, with -1, when execvp() itself failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
3025
3026 /* chown_mapped_root: for an unprivileged user with uid/gid X to
3027 * chown a dir to subuid/subgid Y, he needs to run chown as root
3028 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3029 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3030 * root is privileged with respect to hostuid/hostgid X, allowing
3031 * him to do the chown.
3032 */
3033 int chown_mapped_root(const char *path, struct lxc_conf *conf)
3034 {
3035 uid_t rootuid, rootgid;
3036 unsigned long val;
3037 int hostuid, hostgid, ret;
3038 struct stat sb;
3039 char map1[100], map2[100], map3[100], map4[100], map5[100];
3040 char ugid[100];
3041 const char *args1[] = {"lxc-usernsexec",
3042 "-m", map1,
3043 "-m", map2,
3044 "-m", map3,
3045 "-m", map5,
3046 "--", "chown", ugid, path,
3047 NULL};
3048 const char *args2[] = {"lxc-usernsexec",
3049 "-m", map1,
3050 "-m", map2,
3051 "-m", map3,
3052 "-m", map4,
3053 "-m", map5,
3054 "--", "chown", ugid, path,
3055 NULL};
3056 char cmd_output[MAXPATHLEN];
3057
3058 hostuid = geteuid();
3059 hostgid = getegid();
3060
3061 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3062 ERROR("No uid mapping for container root");
3063 return -1;
3064 }
3065 rootuid = (uid_t)val;
3066
3067 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3068 ERROR("No gid mapping for container root");
3069 return -1;
3070 }
3071 rootgid = (gid_t)val;
3072
3073 if (hostuid == 0) {
3074 if (chown(path, rootuid, rootgid) < 0) {
3075 ERROR("Error chowning %s", path);
3076 return -1;
3077 }
3078
3079 return 0;
3080 }
3081
3082 if (rootuid == hostuid) {
3083 /* nothing to do */
3084 INFO("Container root is our uid; no need to chown");
3085 return 0;
3086 }
3087
3088 /* save the current gid of "path" */
3089 if (stat(path, &sb) < 0) {
3090 ERROR("Error stat %s", path);
3091 return -1;
3092 }
3093
3094 /* Update the path argument in case this was overlayfs. */
3095 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3096 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3097
3098 /*
3099 * A file has to be group-owned by a gid mapped into the
3100 * container, or the container won't be privileged over it.
3101 */
3102 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3103 if (sb.st_uid == hostuid &&
3104 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3105 chown(path, -1, hostgid) < 0) {
3106 ERROR("Failed chgrping %s", path);
3107 return -1;
3108 }
3109
3110 /* "u:0:rootuid:1" */
3111 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3112 if (ret < 0 || ret >= 100) {
3113 ERROR("Error uid printing map string");
3114 return -1;
3115 }
3116
3117 /* "u:hostuid:hostuid:1" */
3118 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3119 if (ret < 0 || ret >= 100) {
3120 ERROR("Error uid printing map string");
3121 return -1;
3122 }
3123
3124 /* "g:0:rootgid:1" */
3125 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3126 if (ret < 0 || ret >= 100) {
3127 ERROR("Error gid printing map string");
3128 return -1;
3129 }
3130
3131 /* "g:pathgid:rootgid+pathgid:1" */
3132 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3133 rootgid + (gid_t)sb.st_gid);
3134 if (ret < 0 || ret >= 100) {
3135 ERROR("Error gid printing map string");
3136 return -1;
3137 }
3138
3139 /* "g:hostgid:hostgid:1" */
3140 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3141 if (ret < 0 || ret >= 100) {
3142 ERROR("Error gid printing map string");
3143 return -1;
3144 }
3145
3146 /* "0:pathgid" (chown) */
3147 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3148 if (ret < 0 || ret >= 100) {
3149 ERROR("Error owner printing format string for chown");
3150 return -1;
3151 }
3152
3153 if (hostgid == sb.st_gid)
3154 ret = run_command(cmd_output, sizeof(cmd_output),
3155 chown_mapped_root_exec_wrapper,
3156 (void *)args1);
3157 else
3158 ret = run_command(cmd_output, sizeof(cmd_output),
3159 chown_mapped_root_exec_wrapper,
3160 (void *)args2);
3161 if (ret < 0)
3162 ERROR("lxc-usernsexec failed: %s", cmd_output);
3163
3164 return ret;
3165 }
3166
3167 /* NOTE: Must not be called from inside the container namespace! */
3168 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3169 {
3170 int mounted;
3171
3172 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3173 if (mounted == -1) {
3174 SYSERROR("Failed to mount proc in the container");
3175 /* continue only if there is no rootfs */
3176 if (conf->rootfs.path)
3177 return -1;
3178 } else if (mounted == 1) {
3179 conf->tmp_umount_proc = true;
3180 }
3181
3182 return 0;
3183 }
3184
3185 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3186 {
3187 if (!lxc_conf->tmp_umount_proc)
3188 return;
3189
3190 (void)umount2("/proc", MNT_DETACH);
3191 lxc_conf->tmp_umount_proc = false;
3192 }
3193
3194 /* Walk /proc/mounts and change any shared entries to slave. */
3195 void remount_all_slave(void)
3196 {
3197 int memfd, mntinfo_fd, ret;
3198 ssize_t copied;
3199 FILE *f;
3200 size_t len = 0;
3201 char *line = NULL;
3202
3203 mntinfo_fd = open("/proc/self/mountinfo", O_RDONLY | O_CLOEXEC);
3204 if (mntinfo_fd < 0) {
3205 SYSERROR("Failed to open \"/proc/self/mountinfo\"");
3206 return;
3207 }
3208
3209 memfd = memfd_create(".lxc_mountinfo", MFD_CLOEXEC);
3210 if (memfd < 0) {
3211 char template[] = P_tmpdir "/.lxc_mountinfo_XXXXXX";
3212
3213 if (errno != ENOSYS) {
3214 SYSERROR("Failed to create temporary in-memory file");
3215 close(mntinfo_fd);
3216 return;
3217 }
3218
3219 memfd = lxc_make_tmpfile(template, true);
3220 if (memfd < 0) {
3221 close(mntinfo_fd);
3222 WARN("Failed to create temporary file");
3223 return;
3224 }
3225 }
3226
3227 #define __LXC_SENDFILE_MAX 0x7ffff000 /* maximum number of bytes sendfile can handle */
3228 again:
3229 copied = sendfile(memfd, mntinfo_fd, NULL, __LXC_SENDFILE_MAX);
3230 if (copied < 0) {
3231 if (errno == EINTR)
3232 goto again;
3233
3234 SYSERROR("Failed to copy \"/proc/self/mountinfo\"");
3235 close(mntinfo_fd);
3236 close(memfd);
3237 return;
3238 }
3239 close(mntinfo_fd);
3240
3241 /* After a successful fdopen() memfd will be closed when calling
3242 * fclose(f). Calling close(memfd) afterwards is undefined.
3243 */
3244 ret = lseek(memfd, 0, SEEK_SET);
3245 if (ret < 0) {
3246 SYSERROR("Failed to reset file descriptor offset");
3247 close(memfd);
3248 return;
3249 }
3250
3251 f = fdopen(memfd, "r");
3252 if (!f) {
3253 SYSERROR("Failed to open copy of \"/proc/self/mountinfo\" to mark "
3254 "all shared. Continuing");
3255 close(memfd);
3256 return;
3257 }
3258
3259 while (getline(&line, &len, f) != -1) {
3260 int ret;
3261 char *opts, *target;
3262
3263 target = get_field(line, 4);
3264 if (!target)
3265 continue;
3266
3267 opts = get_field(target, 2);
3268 if (!opts)
3269 continue;
3270
3271 null_endofword(opts);
3272 if (!strstr(opts, "shared"))
3273 continue;
3274
3275 null_endofword(target);
3276 ret = mount(NULL, target, NULL, MS_SLAVE, NULL);
3277 if (ret < 0) {
3278 SYSERROR("Failed to make \"%s\" MS_SLAVE", target);
3279 ERROR("Continuing...");
3280 continue;
3281 }
3282 TRACE("Remounted \"%s\" as MS_SLAVE", target);
3283 }
3284 fclose(f);
3285 free(line);
3286 TRACE("Remounted all mount table entries as MS_SLAVE");
3287 }
3288
3289 static int lxc_execute_bind_init(struct lxc_handler *handler)
3290 {
3291 int ret;
3292 char *p;
3293 char path[PATH_MAX], destpath[PATH_MAX];
3294 struct lxc_conf *conf = handler->conf;
3295
3296 /* If init exists in the container, don't bind mount a static one */
3297 p = choose_init(conf->rootfs.mount);
3298 if (p) {
3299 char *old = p;
3300
3301 p = strdup(old + strlen(conf->rootfs.mount));
3302 free(old);
3303 if (!p)
3304 return -ENOMEM;
3305
3306 INFO("Found existing init at \"%s\"", p);
3307 goto out;
3308 }
3309
3310 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3311 if (ret < 0 || ret >= PATH_MAX)
3312 return -1;
3313
3314 if (!file_exists(path)) {
3315 ERROR("The file \"%s\" does not exist on host", path);
3316 return -1;
3317 }
3318
3319 ret = snprintf(destpath, PATH_MAX, "%s" P_tmpdir "%s", conf->rootfs.mount, "/.lxc-init");
3320 if (ret < 0 || ret >= PATH_MAX)
3321 return -1;
3322
3323 if (!file_exists(destpath)) {
3324 ret = mknod(destpath, S_IFREG | 0000, 0);
3325 if (ret < 0 && errno != EEXIST) {
3326 SYSERROR("Failed to create dummy \"%s\" file as bind mount target", destpath);
3327 return -1;
3328 }
3329 }
3330
3331 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3332 if (ret < 0) {
3333 SYSERROR("Failed to bind mount lxc.init.static into container");
3334 return -1;
3335 }
3336
3337 p = strdup(destpath + strlen(conf->rootfs.mount));
3338 if (!p)
3339 return -ENOMEM;
3340
3341 INFO("Bind mounted lxc.init.static into container at \"%s\"", path);
3342 out:
3343 ((struct execute_args *)handler->data)->init_fd = -1;
3344 ((struct execute_args *)handler->data)->init_path = p;
3345 return 0;
3346 }
3347
3348 /* This does the work of remounting / if it is shared, calling the container
3349 * pre-mount hooks, and mounting the rootfs.
3350 */
3351 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3352 {
3353 int ret;
3354
3355 if (conf->rootfs_setup) {
3356 const char *path = conf->rootfs.mount;
3357
3358 /* The rootfs was set up in another namespace. bind-mount it to
3359 * give us a mount in our own ns so we can pivot_root to it
3360 */
3361 ret = mount(path, path, "rootfs", MS_BIND, NULL);
3362 if (ret < 0) {
3363 ERROR("Failed to bind mount container / onto itself");
3364 return -1;
3365 }
3366
3367 TRACE("Bind mounted container / onto itself");
3368 return 0;
3369 }
3370
3371 remount_all_slave();
3372
3373 ret = run_lxc_hooks(name, "pre-mount", conf, NULL);
3374 if (ret < 0) {
3375 ERROR("Failed to run pre-mount hooks");
3376 return -1;
3377 }
3378
3379 ret = lxc_setup_rootfs(conf);
3380 if (ret < 0) {
3381 ERROR("Failed to setup rootfs for");
3382 return -1;
3383 }
3384
3385 conf->rootfs_setup = true;
3386 return 0;
3387 }
3388
3389 static bool verify_start_hooks(struct lxc_conf *conf)
3390 {
3391 char path[MAXPATHLEN];
3392 struct lxc_list *it;
3393
3394 lxc_list_for_each (it, &conf->hooks[LXCHOOK_START]) {
3395 int ret;
3396 struct stat st;
3397 char *hookname = it->elem;
3398
3399 ret = snprintf(path, MAXPATHLEN, "%s%s",
3400 conf->rootfs.path ? conf->rootfs.mount : "",
3401 hookname);
3402 if (ret < 0 || ret >= MAXPATHLEN)
3403 return false;
3404
3405 ret = stat(path, &st);
3406 if (ret < 0) {
3407 SYSERROR("Start hook %s not found in container",
3408 hookname);
3409 return false;
3410 }
3411
3412 return true;
3413 }
3414
3415 return true;
3416 }
3417
3418 static bool execveat_supported(void)
3419 {
3420 #ifdef __NR_execveat
3421 /*
3422 * We use the syscall here, because it was introduced in kernel 3.19,
3423 * while glibc got support for using the syscall much later, in 2.27.
3424 * We don't want to use glibc because it falls back to /proc, and the
3425 * container may not have /proc mounted depending on its configuration.
3426 */
3427 syscall(__NR_execveat, -1, "", NULL, NULL, AT_EMPTY_PATH);
3428 if (errno == ENOSYS)
3429 return false;
3430
3431 return true;
3432 #else
3433 return false;
3434 #endif
3435 }
3436
3437 int lxc_setup(struct lxc_handler *handler)
3438 {
3439 int ret;
3440 const char *lxcpath = handler->lxcpath, *name = handler->name;
3441 struct lxc_conf *lxc_conf = handler->conf;
3442
3443 ret = do_rootfs_setup(lxc_conf, name, lxcpath);
3444 if (ret < 0) {
3445 ERROR("Failed to setup rootfs");
3446 return -1;
3447 }
3448
3449 if (handler->nsfd[LXC_NS_UTS] == -1) {
3450 ret = setup_utsname(lxc_conf->utsname);
3451 if (ret < 0) {
3452 ERROR("Failed to setup the utsname %s", name);
3453 return -1;
3454 }
3455 }
3456
3457 ret = lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network);
3458 if (ret < 0) {
3459 ERROR("Failed to setup network");
3460 return -1;
3461 }
3462
3463 ret = lxc_network_send_name_and_ifindex_to_parent(handler);
3464 if (ret < 0) {
3465 ERROR("Failed to send network device names and ifindices to parent");
3466 return -1;
3467 }
3468
3469 if (lxc_conf->autodev > 0) {
3470 ret = mount_autodev(name, &lxc_conf->rootfs, lxcpath);
3471 if (ret < 0) {
3472 ERROR("Failed to mount \"/dev\"");
3473 return -1;
3474 }
3475 }
3476
3477 /* Do automatic mounts (mainly /proc and /sys), but exclude those that
3478 * need to wait until other stuff has finished.
3479 */
3480 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler);
3481 if (ret < 0) {
3482 ERROR("Failed to setup first automatic mounts");
3483 return -1;
3484 }
3485
3486 ret = setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath);
3487 if (ret < 0) {
3488 ERROR("Failed to setup mounts");
3489 return -1;
3490 }
3491
3492 /* Make sure any start hooks are in the container */
3493 if (!verify_start_hooks(lxc_conf))
3494 return -1;
3495
3496 if (lxc_conf->is_execute) {
3497 if (execveat_supported()) {
3498 int fd;
3499 char path[PATH_MAX];
3500
3501 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3502 if (ret < 0 || ret >= PATH_MAX) {
3503 ERROR("Path to init.lxc.static too long");
3504 return -1;
3505 }
3506
3507 fd = open(path, O_PATH | O_CLOEXEC);
3508 if (fd < 0) {
3509 SYSERROR("Unable to open lxc.init.static");
3510 return -1;
3511 }
3512
3513 ((struct execute_args *)handler->data)->init_fd = fd;
3514 ((struct execute_args *)handler->data)->init_path = NULL;
3515 } else {
3516 ret = lxc_execute_bind_init(handler);
3517 if (ret < 0) {
3518 ERROR("Failed to bind-mount the lxc init system");
3519 return -1;
3520 }
3521 }
3522 }
3523
3524 /* Now mount only cgroups, if wanted. Before, /sys could not have been
3525 * mounted. It is guaranteed to be mounted now either through
3526 * automatically or via fstab entries.
3527 */
3528 ret = lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler);
3529 if (ret < 0) {
3530 ERROR("Failed to setup remaining automatic mounts");
3531 return -1;
3532 }
3533
3534 ret = run_lxc_hooks(name, "mount", lxc_conf, NULL);
3535 if (ret < 0) {
3536 ERROR("Failed to run mount hooks");
3537 return -1;
3538 }
3539
3540 if (lxc_conf->autodev > 0) {
3541 ret = run_lxc_hooks(name, "autodev", lxc_conf, NULL);
3542 if (ret < 0) {
3543 ERROR("Failed to run autodev hooks");
3544 return -1;
3545 }
3546
3547 ret = lxc_fill_autodev(&lxc_conf->rootfs);
3548 if (ret < 0) {
3549 ERROR("Failed to populate \"/dev\"");
3550 return -1;
3551 }
3552 }
3553
3554 if (!lxc_list_empty(&lxc_conf->mount_list)) {
3555 ret = setup_mount_entries(lxc_conf, &lxc_conf->rootfs,
3556 &lxc_conf->mount_list, name, lxcpath);
3557 if (ret < 0) {
3558 ERROR("Failed to setup mount entries");
3559 return -1;
3560 }
3561 }
3562
3563 ret = lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console,
3564 lxc_conf->ttys.dir);
3565 if (ret < 0) {
3566 ERROR("Failed to setup console");
3567 return -1;
3568 }
3569
3570 ret = lxc_setup_dev_symlinks(&lxc_conf->rootfs);
3571 if (ret < 0) {
3572 ERROR("Failed to setup \"/dev\" symlinks");
3573 return -1;
3574 }
3575
3576 ret = lxc_create_tmp_proc_mount(lxc_conf);
3577 if (ret < 0) {
3578 ERROR("Failed to \"/proc\" LSMs");
3579 return -1;
3580 }
3581
3582 ret = setup_pivot_root(&lxc_conf->rootfs);
3583 if (ret < 0) {
3584 ERROR("Failed to pivot root into rootfs");
3585 return -1;
3586 }
3587
3588 ret = lxc_setup_devpts(lxc_conf);
3589 if (ret < 0) {
3590 ERROR("Failed to setup new devpts instance");
3591 return -1;
3592 }
3593
3594 ret = lxc_create_ttys(handler);
3595 if (ret < 0)
3596 return -1;
3597
3598 ret = setup_personality(lxc_conf->personality);
3599 if (ret < 0) {
3600 ERROR("Failed to set personality");
3601 return -1;
3602 }
3603
3604 /* Set sysctl value to a path under /proc/sys as determined from the
3605 * key. For e.g. net.ipv4.ip_forward translated to
3606 * /proc/sys/net/ipv4/ip_forward.
3607 */
3608 if (!lxc_list_empty(&lxc_conf->sysctls)) {
3609 ret = setup_sysctl_parameters(&lxc_conf->sysctls);
3610 if (ret < 0) {
3611 ERROR("Failed to setup sysctl parameters");
3612 return -1;
3613 }
3614 }
3615
3616 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3617 if (!lxc_list_empty(&lxc_conf->caps)) {
3618 ERROR("Container requests lxc.cap.drop and "
3619 "lxc.cap.keep: either use lxc.cap.drop or "
3620 "lxc.cap.keep, not both");
3621 return -1;
3622 }
3623
3624 if (dropcaps_except(&lxc_conf->keepcaps)) {
3625 ERROR("Failed to keep capabilities");
3626 return -1;
3627 }
3628 } else if (setup_caps(&lxc_conf->caps)) {
3629 ERROR("Failed to drop capabilities");
3630 return -1;
3631 }
3632
3633 NOTICE("The container \"%s\" is set up", name);
3634
3635 return 0;
3636 }
3637
3638 int run_lxc_hooks(const char *name, char *hookname, struct lxc_conf *conf,
3639 char *argv[])
3640 {
3641 struct lxc_list *it;
3642 int which = -1;
3643
3644 if (strcmp(hookname, "pre-start") == 0)
3645 which = LXCHOOK_PRESTART;
3646 else if (strcmp(hookname, "start-host") == 0)
3647 which = LXCHOOK_START_HOST;
3648 else if (strcmp(hookname, "pre-mount") == 0)
3649 which = LXCHOOK_PREMOUNT;
3650 else if (strcmp(hookname, "mount") == 0)
3651 which = LXCHOOK_MOUNT;
3652 else if (strcmp(hookname, "autodev") == 0)
3653 which = LXCHOOK_AUTODEV;
3654 else if (strcmp(hookname, "start") == 0)
3655 which = LXCHOOK_START;
3656 else if (strcmp(hookname, "stop") == 0)
3657 which = LXCHOOK_STOP;
3658 else if (strcmp(hookname, "post-stop") == 0)
3659 which = LXCHOOK_POSTSTOP;
3660 else if (strcmp(hookname, "clone") == 0)
3661 which = LXCHOOK_CLONE;
3662 else if (strcmp(hookname, "destroy") == 0)
3663 which = LXCHOOK_DESTROY;
3664 else
3665 return -1;
3666
3667 lxc_list_for_each (it, &conf->hooks[which]) {
3668 int ret;
3669 char *hook = it->elem;
3670
3671 ret = run_script_argv(name, conf->hooks_version, "lxc", hook,
3672 hookname, argv);
3673 if (ret < 0)
3674 return -1;
3675 }
3676
3677 return 0;
3678 }
3679
3680 int lxc_clear_config_caps(struct lxc_conf *c)
3681 {
3682 struct lxc_list *it, *next;
3683
3684 lxc_list_for_each_safe (it, &c->caps, next) {
3685 lxc_list_del(it);
3686 free(it->elem);
3687 free(it);
3688 }
3689
3690 return 0;
3691 }
3692
3693 static int lxc_free_idmap(struct lxc_list *id_map)
3694 {
3695 struct lxc_list *it, *next;
3696
3697 lxc_list_for_each_safe (it, id_map, next) {
3698 lxc_list_del(it);
3699 free(it->elem);
3700 free(it);
3701 }
3702
3703 return 0;
3704 }
3705
3706 int lxc_clear_idmaps(struct lxc_conf *c)
3707 {
3708 return lxc_free_idmap(&c->id_map);
3709 }
3710
3711 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3712 {
3713 struct lxc_list *it, *next;
3714
3715 lxc_list_for_each_safe (it, &c->keepcaps, next) {
3716 lxc_list_del(it);
3717 free(it->elem);
3718 free(it);
3719 }
3720
3721 return 0;
3722 }
3723
3724 int lxc_clear_cgroups(struct lxc_conf *c, const char *key, int version)
3725 {
3726 char *global_token, *namespaced_token;
3727 size_t namespaced_token_len;
3728 struct lxc_list *it, *next, *list;
3729 const char *k = key;
3730 bool all = false;
3731
3732 if (version == CGROUP2_SUPER_MAGIC) {
3733 global_token = "lxc.cgroup2";
3734 namespaced_token = "lxc.cgroup2.";
3735 namespaced_token_len = sizeof("lxc.cgroup2.") - 1;
3736 list = &c->cgroup2;
3737 } else if (version == CGROUP_SUPER_MAGIC) {
3738 global_token = "lxc.cgroup";
3739 namespaced_token = "lxc.cgroup.";
3740 namespaced_token_len = sizeof("lxc.cgroup.") - 1;
3741 list = &c->cgroup;
3742 } else {
3743 return -EINVAL;
3744 }
3745
3746 if (strcmp(key, global_token) == 0)
3747 all = true;
3748 else if (strncmp(key, namespaced_token, sizeof(namespaced_token) - 1) == 0)
3749 k += namespaced_token_len;
3750 else
3751 return -EINVAL;
3752
3753 lxc_list_for_each_safe (it, list, next) {
3754 struct lxc_cgroup *cg = it->elem;
3755
3756 if (!all && strcmp(cg->subsystem, k) != 0)
3757 continue;
3758
3759 lxc_list_del(it);
3760 free(cg->subsystem);
3761 free(cg->value);
3762 free(cg);
3763 free(it);
3764 }
3765
3766 return 0;
3767 }
3768
3769 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3770 {
3771 struct lxc_list *it, *next;
3772 const char *k = NULL;
3773 bool all = false;
3774
3775 if (strcmp(key, "lxc.limit") == 0 || strcmp(key, "lxc.prlimit") == 0)
3776 all = true;
3777 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.") - 1) == 0)
3778 k = key + sizeof("lxc.limit.") - 1;
3779 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.") - 1) == 0)
3780 k = key + sizeof("lxc.prlimit.") - 1;
3781 else
3782 return -1;
3783
3784 lxc_list_for_each_safe (it, &c->limits, next) {
3785 struct lxc_limit *lim = it->elem;
3786
3787 if (!all && strcmp(lim->resource, k) != 0)
3788 continue;
3789
3790 lxc_list_del(it);
3791 free(lim->resource);
3792 free(lim);
3793 free(it);
3794 }
3795
3796 return 0;
3797 }
3798
3799 int lxc_clear_sysctls(struct lxc_conf *c, const char *key)
3800 {
3801 struct lxc_list *it, *next;
3802 const char *k = NULL;
3803 bool all = false;
3804
3805 if (strcmp(key, "lxc.sysctl") == 0)
3806 all = true;
3807 else if (strncmp(key, "lxc.sysctl.", sizeof("lxc.sysctl.") - 1) == 0)
3808 k = key + sizeof("lxc.sysctl.") - 1;
3809 else
3810 return -1;
3811
3812 lxc_list_for_each_safe (it, &c->sysctls, next) {
3813 struct lxc_sysctl *elem = it->elem;
3814
3815 if (!all && strcmp(elem->key, k) != 0)
3816 continue;
3817
3818 lxc_list_del(it);
3819 free(elem->key);
3820 free(elem->value);
3821 free(elem);
3822 free(it);
3823 }
3824
3825 return 0;
3826 }
3827
3828 int lxc_clear_procs(struct lxc_conf *c, const char *key)
3829 {
3830 struct lxc_list *it, *next;
3831 const char *k = NULL;
3832 bool all = false;
3833
3834 if (strcmp(key, "lxc.proc") == 0)
3835 all = true;
3836 else if (strncmp(key, "lxc.proc.", sizeof("lxc.proc.") - 1) == 0)
3837 k = key + sizeof("lxc.proc.") - 1;
3838 else
3839 return -1;
3840
3841 lxc_list_for_each_safe (it, &c->procs, next) {
3842 struct lxc_proc *proc = it->elem;
3843
3844 if (!all && strcmp(proc->filename, k) != 0)
3845 continue;
3846
3847 lxc_list_del(it);
3848 free(proc->filename);
3849 free(proc->value);
3850 free(proc);
3851 free(it);
3852 }
3853
3854 return 0;
3855 }
3856
3857 int lxc_clear_groups(struct lxc_conf *c)
3858 {
3859 struct lxc_list *it, *next;
3860
3861 lxc_list_for_each_safe (it, &c->groups, next) {
3862 lxc_list_del(it);
3863 free(it->elem);
3864 free(it);
3865 }
3866
3867 return 0;
3868 }
3869
3870 int lxc_clear_environment(struct lxc_conf *c)
3871 {
3872 struct lxc_list *it, *next;
3873
3874 lxc_list_for_each_safe (it, &c->environment, next) {
3875 lxc_list_del(it);
3876 free(it->elem);
3877 free(it);
3878 }
3879
3880 return 0;
3881 }
3882
3883 int lxc_clear_mount_entries(struct lxc_conf *c)
3884 {
3885 struct lxc_list *it, *next;
3886
3887 lxc_list_for_each_safe (it, &c->mount_list, next) {
3888 lxc_list_del(it);
3889 free(it->elem);
3890 free(it);
3891 }
3892
3893 return 0;
3894 }
3895
3896 int lxc_clear_automounts(struct lxc_conf *c)
3897 {
3898 c->auto_mounts = 0;
3899 return 0;
3900 }
3901
3902 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
3903 {
3904 int i;
3905 struct lxc_list *it, *next;
3906 const char *k = NULL;
3907 bool all = false, done = false;
3908
3909 if (strcmp(key, "lxc.hook") == 0)
3910 all = true;
3911 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.") - 1) == 0)
3912 k = key + sizeof("lxc.hook.") - 1;
3913 else
3914 return -1;
3915
3916 for (i = 0; i < NUM_LXC_HOOKS; i++) {
3917 if (all || strcmp(k, lxchook_names[i]) == 0) {
3918 lxc_list_for_each_safe (it, &c->hooks[i], next) {
3919 lxc_list_del(it);
3920 free(it->elem);
3921 free(it);
3922 }
3923
3924 done = true;
3925 }
3926 }
3927
3928 if (!done) {
3929 ERROR("Invalid hook key: %s", key);
3930 return -1;
3931 }
3932
3933 return 0;
3934 }
3935
3936 static inline void lxc_clear_aliens(struct lxc_conf *conf)
3937 {
3938 struct lxc_list *it, *next;
3939
3940 lxc_list_for_each_safe (it, &conf->aliens, next) {
3941 lxc_list_del(it);
3942 free(it->elem);
3943 free(it);
3944 }
3945 }
3946
3947 void lxc_clear_includes(struct lxc_conf *conf)
3948 {
3949 struct lxc_list *it, *next;
3950
3951 lxc_list_for_each_safe (it, &conf->includes, next) {
3952 lxc_list_del(it);
3953 free(it->elem);
3954 free(it);
3955 }
3956 }
3957
3958 void lxc_conf_free(struct lxc_conf *conf)
3959 {
3960 if (!conf)
3961 return;
3962
3963 if (current_config == conf)
3964 current_config = NULL;
3965 lxc_terminal_conf_free(&conf->console);
3966 free(conf->rootfs.mount);
3967 free(conf->rootfs.bdev_type);
3968 free(conf->rootfs.options);
3969 free(conf->rootfs.path);
3970 free(conf->logfile);
3971 if (conf->logfd != -1)
3972 close(conf->logfd);
3973 free(conf->utsname);
3974 free(conf->ttys.dir);
3975 free(conf->ttys.tty_names);
3976 free(conf->fstab);
3977 free(conf->rcfile);
3978 free(conf->execute_cmd);
3979 free(conf->init_cmd);
3980 free(conf->init_cwd);
3981 free(conf->unexpanded_config);
3982 free(conf->syslog);
3983 lxc_free_networks(&conf->network);
3984 free(conf->lsm_aa_profile);
3985 free(conf->lsm_se_context);
3986 lxc_seccomp_free(conf);
3987 lxc_clear_config_caps(conf);
3988 lxc_clear_config_keepcaps(conf);
3989 lxc_clear_cgroups(conf, "lxc.cgroup", CGROUP_SUPER_MAGIC);
3990 lxc_clear_cgroups(conf, "lxc.cgroup2", CGROUP2_SUPER_MAGIC);
3991 lxc_clear_hooks(conf, "lxc.hook");
3992 lxc_clear_mount_entries(conf);
3993 lxc_clear_idmaps(conf);
3994 lxc_clear_groups(conf);
3995 lxc_clear_includes(conf);
3996 lxc_clear_aliens(conf);
3997 lxc_clear_environment(conf);
3998 lxc_clear_limits(conf, "lxc.prlimit");
3999 lxc_clear_sysctls(conf, "lxc.sysctl");
4000 lxc_clear_procs(conf, "lxc.proc");
4001 free(conf->cgroup_meta.dir);
4002 free(conf->cgroup_meta.controllers);
4003 free(conf);
4004 }
4005
/* Arguments handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs after the parent has signalled it. */
	int (*fn)(void *);

	/* Name of @fn; when non-NULL it is only used for trace logging. */
	const char *fn_name;

	/* Argument passed through to @fn. */
	void *arg;

	/* Pipe fds: the child reads one byte from p[0] and closes p[1]. */
	int p[2];
};
4012
4013 static int run_userns_fn(void *data)
4014 {
4015 char c;
4016 struct userns_fn_data *d = data;
4017
4018 /* Close write end of the pipe. */
4019 close(d->p[1]);
4020
4021 /* Wait for parent to finish establishing a new mapping in the user
4022 * namespace we are executing in.
4023 */
4024 if (lxc_read_nointr(d->p[0], &c, 1) != 1)
4025 return -1;
4026
4027 /* Close read end of the pipe. */
4028 close(d->p[0]);
4029
4030 if (d->fn_name)
4031 TRACE("calling function \"%s\"", d->fn_name);
4032
4033 /* Call function to run. */
4034 return d->fn(d->arg);
4035 }
4036
4037 static struct id_map *mapped_nsid_add(struct lxc_conf *conf, unsigned id,
4038 enum idtype idtype)
4039 {
4040 const struct id_map *map;
4041 struct id_map *retmap;
4042
4043 map = find_mapped_nsid_entry(conf, id, idtype);
4044 if (!map)
4045 return NULL;
4046
4047 retmap = malloc(sizeof(*retmap));
4048 if (!retmap)
4049 return NULL;
4050
4051 memcpy(retmap, map, sizeof(*retmap));
4052 return retmap;
4053 }
4054
4055 static struct id_map *find_mapped_hostid_entry(struct lxc_conf *conf,
4056 unsigned id, enum idtype idtype)
4057 {
4058 struct id_map *map;
4059 struct lxc_list *it;
4060 struct id_map *retmap = NULL;
4061
4062 lxc_list_for_each (it, &conf->id_map) {
4063 map = it->elem;
4064 if (map->idtype != idtype)
4065 continue;
4066
4067 if (id >= map->hostid && id < map->hostid + map->range) {
4068 retmap = map;
4069 break;
4070 }
4071 }
4072
4073 return retmap;
4074 }
4075
4076 /* Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4077 * existing one or establish a new one.
4078 */
4079 static struct id_map *mapped_hostid_add(struct lxc_conf *conf, uid_t id,
4080 enum idtype type)
4081 {
4082 int hostid_mapped;
4083 struct id_map *entry = NULL, *tmp = NULL;
4084
4085 entry = malloc(sizeof(*entry));
4086 if (!entry)
4087 return NULL;
4088
4089 /* Reuse existing mapping. */
4090 tmp = find_mapped_hostid_entry(conf, id, type);
4091 if (tmp)
4092 return memcpy(entry, tmp, sizeof(*entry));
4093
4094 /* Find new mapping. */
4095 hostid_mapped = find_unmapped_nsid(conf, type);
4096 if (hostid_mapped < 0) {
4097 DEBUG("Failed to find free mapping for id %d", id);
4098 free(entry);
4099 return NULL;
4100 }
4101
4102 entry->idtype = type;
4103 entry->nsid = hostid_mapped;
4104 entry->hostid = (unsigned long)id;
4105 entry->range = 1;
4106
4107 return entry;
4108 }
4109
4110 struct lxc_list *get_minimal_idmap(struct lxc_conf *conf)
4111 {
4112 uid_t euid, egid;
4113 uid_t nsuid = (conf->root_nsuid_map != NULL) ? 0 : conf->init_uid;
4114 gid_t nsgid = (conf->root_nsgid_map != NULL) ? 0 : conf->init_gid;
4115 struct lxc_list *idmap = NULL, *tmplist = NULL;
4116 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4117 *host_uid_map = NULL, *host_gid_map = NULL;
4118
4119 /* Find container root mappings. */
4120 container_root_uid = mapped_nsid_add(conf, nsuid, ID_TYPE_UID);
4121 if (!container_root_uid) {
4122 DEBUG("Failed to find mapping for namespace uid %d", 0);
4123 goto on_error;
4124 }
4125 euid = geteuid();
4126 if (euid >= container_root_uid->hostid &&
4127 euid < (container_root_uid->hostid + container_root_uid->range))
4128 host_uid_map = container_root_uid;
4129
4130 container_root_gid = mapped_nsid_add(conf, nsgid, ID_TYPE_GID);
4131 if (!container_root_gid) {
4132 DEBUG("Failed to find mapping for namespace gid %d", 0);
4133 goto on_error;
4134 }
4135 egid = getegid();
4136 if (egid >= container_root_gid->hostid &&
4137 egid < (container_root_gid->hostid + container_root_gid->range))
4138 host_gid_map = container_root_gid;
4139
4140 /* Check whether the {g,u}id of the user has a mapping. */
4141 if (!host_uid_map)
4142 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4143 if (!host_uid_map) {
4144 DEBUG("Failed to find mapping for uid %d", euid);
4145 goto on_error;
4146 }
4147
4148 if (!host_gid_map)
4149 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4150 if (!host_gid_map) {
4151 DEBUG("Failed to find mapping for gid %d", egid);
4152 goto on_error;
4153 }
4154
4155 /* Allocate new {g,u}id map list. */
4156 idmap = malloc(sizeof(*idmap));
4157 if (!idmap)
4158 goto on_error;
4159 lxc_list_init(idmap);
4160
4161 /* Add container root to the map. */
4162 tmplist = malloc(sizeof(*tmplist));
4163 if (!tmplist)
4164 goto on_error;
4165 lxc_list_add_elem(tmplist, container_root_uid);
4166 lxc_list_add_tail(idmap, tmplist);
4167
4168 if (host_uid_map && (host_uid_map != container_root_uid)) {
4169 /* idmap will now keep track of that memory. */
4170 container_root_uid = NULL;
4171
4172 /* Add container root to the map. */
4173 tmplist = malloc(sizeof(*tmplist));
4174 if (!tmplist)
4175 goto on_error;
4176 lxc_list_add_elem(tmplist, host_uid_map);
4177 lxc_list_add_tail(idmap, tmplist);
4178 }
4179 /* idmap will now keep track of that memory. */
4180 container_root_uid = NULL;
4181 /* idmap will now keep track of that memory. */
4182 host_uid_map = NULL;
4183
4184 tmplist = malloc(sizeof(*tmplist));
4185 if (!tmplist)
4186 goto on_error;
4187 lxc_list_add_elem(tmplist, container_root_gid);
4188 lxc_list_add_tail(idmap, tmplist);
4189
4190 if (host_gid_map && (host_gid_map != container_root_gid)) {
4191 /* idmap will now keep track of that memory. */
4192 container_root_gid = NULL;
4193
4194 tmplist = malloc(sizeof(*tmplist));
4195 if (!tmplist)
4196 goto on_error;
4197 lxc_list_add_elem(tmplist, host_gid_map);
4198 lxc_list_add_tail(idmap, tmplist);
4199 }
4200 /* idmap will now keep track of that memory. */
4201 container_root_gid = NULL;
4202 /* idmap will now keep track of that memory. */
4203 host_gid_map = NULL;
4204
4205 TRACE("Allocated minimal idmapping");
4206 return idmap;
4207
4208 on_error:
4209 if (idmap) {
4210 lxc_free_idmap(idmap);
4211 free(idmap);
4212 }
4213 if (container_root_uid)
4214 free(container_root_uid);
4215 if (container_root_gid)
4216 free(container_root_gid);
4217 if (host_uid_map && (host_uid_map != container_root_uid))
4218 free(host_uid_map);
4219 if (host_gid_map && (host_gid_map != container_root_gid))
4220 free(host_gid_map);
4221
4222 return NULL;
4223 }
4224
4225 /* Run a function in a new user namespace.
4226 * The caller's euid/egid will be mapped if it is not already.
4227 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4228 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4229 * This means we require only to establish a mapping from:
4230 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4231 * - the container root -> some sub{g,u}id
4232 * The former we add, if the user did not specifiy a mapping. The latter we
4233 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4234 * there to start the container in the first place.
4235 */
4236 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4237 const char *fn_name)
4238 {
4239 pid_t pid;
4240 int p[2];
4241 struct userns_fn_data d;
4242 struct lxc_list *idmap;
4243 int ret = -1, status = -1;
4244 char c = '1';
4245
4246 if (!conf)
4247 return -EINVAL;
4248
4249 idmap = get_minimal_idmap(conf);
4250 if (!idmap)
4251 return -1;
4252
4253 ret = pipe(p);
4254 if (ret < 0) {
4255 SYSERROR("Failed to create pipe");
4256 return -1;
4257 }
4258 d.fn = fn;
4259 d.fn_name = fn_name;
4260 d.arg = data;
4261 d.p[0] = p[0];
4262 d.p[1] = p[1];
4263
4264 /* Clone child in new user namespace. */
4265 pid = lxc_raw_clone_cb(run_userns_fn, &d, CLONE_NEWUSER);
4266 if (pid < 0) {
4267 ERROR("Failed to clone process in new user namespace");
4268 goto on_error;
4269 }
4270
4271 close(p[0]);
4272 p[0] = -1;
4273
4274 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4275 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4276 struct id_map *map;
4277 struct lxc_list *it;
4278
4279 lxc_list_for_each (it, idmap) {
4280 map = it->elem;
4281 TRACE("Establishing %cid mapping for \"%d\" in new "
4282 "user namespace: nsuid %lu - hostid %lu - range "
4283 "%lu",
4284 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4285 map->nsid, map->hostid, map->range);
4286 }
4287 }
4288
4289 /* Set up {g,u}id mapping for user namespace of child process. */
4290 ret = lxc_map_ids(idmap, pid);
4291 if (ret < 0) {
4292 ERROR("Error setting up {g,u}id mappings for child process \"%d\"", pid);
4293 goto on_error;
4294 }
4295
4296 /* Tell child to proceed. */
4297 if (lxc_write_nointr(p[1], &c, 1) != 1) {
4298 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4299 goto on_error;
4300 }
4301
4302 on_error:
4303 if (p[0] != -1)
4304 close(p[0]);
4305 close(p[1]);
4306
4307 /* Wait for child to finish. */
4308 if (pid > 0)
4309 status = wait_for_pid(pid);
4310
4311 if (status < 0)
4312 ret = -1;
4313
4314 return ret;
4315 }
4316
4317 int userns_exec_full(struct lxc_conf *conf, int (*fn)(void *), void *data,
4318 const char *fn_name)
4319 {
4320 pid_t pid;
4321 uid_t euid, egid;
4322 int p[2];
4323 struct id_map *map;
4324 struct lxc_list *cur;
4325 struct userns_fn_data d;
4326 int ret = -1;
4327 char c = '1';
4328 struct lxc_list *idmap = NULL, *tmplist = NULL;
4329 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4330 *host_uid_map = NULL, *host_gid_map = NULL;
4331
4332 if (!conf)
4333 return -EINVAL;
4334
4335 ret = pipe(p);
4336 if (ret < 0) {
4337 SYSERROR("opening pipe");
4338 return -1;
4339 }
4340 d.fn = fn;
4341 d.fn_name = fn_name;
4342 d.arg = data;
4343 d.p[0] = p[0];
4344 d.p[1] = p[1];
4345
4346 /* Clone child in new user namespace. */
4347 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4348 if (pid < 0) {
4349 ERROR("Failed to clone process in new user namespace");
4350 goto on_error;
4351 }
4352
4353 close(p[0]);
4354 p[0] = -1;
4355
4356 euid = geteuid();
4357 egid = getegid();
4358
4359 /* Allocate new {g,u}id map list. */
4360 idmap = malloc(sizeof(*idmap));
4361 if (!idmap)
4362 goto on_error;
4363 lxc_list_init(idmap);
4364
4365 /* Find container root. */
4366 lxc_list_for_each (cur, &conf->id_map) {
4367 struct id_map *tmpmap;
4368
4369 tmplist = malloc(sizeof(*tmplist));
4370 if (!tmplist)
4371 goto on_error;
4372
4373 tmpmap = malloc(sizeof(*tmpmap));
4374 if (!tmpmap) {
4375 free(tmplist);
4376 goto on_error;
4377 }
4378
4379 memset(tmpmap, 0, sizeof(*tmpmap));
4380 memcpy(tmpmap, cur->elem, sizeof(*tmpmap));
4381 tmplist->elem = tmpmap;
4382
4383 lxc_list_add_tail(idmap, tmplist);
4384
4385 map = cur->elem;
4386
4387 if (map->idtype == ID_TYPE_UID)
4388 if (euid >= map->hostid && euid < map->hostid + map->range)
4389 host_uid_map = map;
4390
4391 if (map->idtype == ID_TYPE_GID)
4392 if (egid >= map->hostid && egid < map->hostid + map->range)
4393 host_gid_map = map;
4394
4395 if (map->nsid != 0)
4396 continue;
4397
4398 if (map->idtype == ID_TYPE_UID)
4399 if (container_root_uid == NULL)
4400 container_root_uid = map;
4401
4402 if (map->idtype == ID_TYPE_GID)
4403 if (container_root_gid == NULL)
4404 container_root_gid = map;
4405 }
4406
4407 if (!container_root_uid || !container_root_gid) {
4408 ERROR("No mapping for container root found");
4409 goto on_error;
4410 }
4411
4412 /* Check whether the {g,u}id of the user has a mapping. */
4413 if (!host_uid_map)
4414 host_uid_map = mapped_hostid_add(conf, euid, ID_TYPE_UID);
4415 else
4416 host_uid_map = container_root_uid;
4417
4418 if (!host_gid_map)
4419 host_gid_map = mapped_hostid_add(conf, egid, ID_TYPE_GID);
4420 else
4421 host_gid_map = container_root_gid;
4422
4423 if (!host_uid_map) {
4424 DEBUG("Failed to find mapping for uid %d", euid);
4425 goto on_error;
4426 }
4427
4428 if (!host_gid_map) {
4429 DEBUG("Failed to find mapping for gid %d", egid);
4430 goto on_error;
4431 }
4432
4433 if (host_uid_map && (host_uid_map != container_root_uid)) {
4434 /* Add container root to the map. */
4435 tmplist = malloc(sizeof(*tmplist));
4436 if (!tmplist)
4437 goto on_error;
4438 lxc_list_add_elem(tmplist, host_uid_map);
4439 lxc_list_add_tail(idmap, tmplist);
4440 }
4441 /* idmap will now keep track of that memory. */
4442 host_uid_map = NULL;
4443
4444 if (host_gid_map && (host_gid_map != container_root_gid)) {
4445 tmplist = malloc(sizeof(*tmplist));
4446 if (!tmplist)
4447 goto on_error;
4448 lxc_list_add_elem(tmplist, host_gid_map);
4449 lxc_list_add_tail(idmap, tmplist);
4450 }
4451 /* idmap will now keep track of that memory. */
4452 host_gid_map = NULL;
4453
4454 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4455 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4456 lxc_list_for_each (cur, idmap) {
4457 map = cur->elem;
4458 TRACE("establishing %cid mapping for \"%d\" in new "
4459 "user namespace: nsuid %lu - hostid %lu - range "
4460 "%lu",
4461 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4462 map->nsid, map->hostid, map->range);
4463 }
4464 }
4465
4466 /* Set up {g,u}id mapping for user namespace of child process. */
4467 ret = lxc_map_ids(idmap, pid);
4468 if (ret < 0) {
4469 ERROR("error setting up {g,u}id mappings for child process \"%d\"", pid);
4470 goto on_error;
4471 }
4472
4473 /* Tell child to proceed. */
4474 if (lxc_write_nointr(p[1], &c, 1) != 1) {
4475 SYSERROR("Failed telling child process \"%d\" to proceed", pid);
4476 goto on_error;
4477 }
4478
4479 on_error:
4480 if (p[0] != -1)
4481 close(p[0]);
4482 close(p[1]);
4483
4484 /* Wait for child to finish. */
4485 if (pid > 0)
4486 ret = wait_for_pid(pid);
4487
4488 if (idmap) {
4489 lxc_free_idmap(idmap);
4490 free(idmap);
4491 }
4492
4493 if (host_uid_map && (host_uid_map != container_root_uid))
4494 free(host_uid_map);
4495 if (host_gid_map && (host_gid_map != container_root_gid))
4496 free(host_gid_map);
4497
4498 return ret;
4499 }
4500
/* not thread-safe, do not use from api without first forking
 *
 * Look up the password record of the effective uid and return a
 * heap-allocated copy of its user name. Returns NULL if the scratch buffer
 * cannot be allocated or no record is found; the caller frees the result.
 */
static char *getuname(void)
{
	struct passwd record;
	struct passwd *match = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int rc;

	/* Fall back to a fixed size when the system gives no suggestion. */
	size_hint = sysconf(_SC_GETPW_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (scratch == NULL)
		return NULL;

	rc = getpwuid_r(geteuid(), &record, scratch, scratch_len, &match);
	if (match == NULL) {
		if (rc == 0)
			WARN("Could not find matched password record.");

		ERROR("Failed to get password record - %u", geteuid());
		free(scratch);
		return NULL;
	}

	/* The name lives inside scratch: duplicate it before releasing. */
	name = strdup(record.pw_name);
	free(scratch);

	return name;
}
4534
/* not thread-safe, do not use from api without first forking
 *
 * Look up the group record of the effective gid and return a heap-allocated
 * copy of its group name. Returns NULL if the scratch buffer cannot be
 * allocated or no record is found; the caller frees the result.
 */
static char *getgname(void)
{
	struct group record;
	struct group *match = NULL;
	char *scratch, *name;
	size_t scratch_len;
	long size_hint;
	int rc;

	/* Fall back to a fixed size when the system gives no suggestion. */
	size_hint = sysconf(_SC_GETGR_R_SIZE_MAX);
	scratch_len = (size_hint == -1) ? 1024 : (size_t)size_hint;

	scratch = malloc(scratch_len);
	if (scratch == NULL)
		return NULL;

	rc = getgrgid_r(getegid(), &record, scratch, scratch_len, &match);
	if (match == NULL) {
		if (rc == 0)
			WARN("Could not find matched group record");

		ERROR("Failed to get group record - %u", getegid());
		free(scratch);
		return NULL;
	}

	/* The name lives inside scratch: duplicate it before releasing. */
	name = strdup(record.gr_name);
	free(scratch);

	return name;
}
4568
4569 /* not thread-safe, do not use from api without first forking */
4570 void suggest_default_idmap(void)
4571 {
4572 char *uname, *gname;
4573 FILE *f;
4574 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4575 size_t len = 0;
4576 char *line = NULL;
4577
4578 uname = getuname();
4579 if (!uname)
4580 return;
4581
4582 gname = getgname();
4583 if (!gname) {
4584 free(uname);
4585 return;
4586 }
4587
4588 f = fopen(subuidfile, "r");
4589 if (!f) {
4590 ERROR("Your system is not configured with subuids");
4591 free(gname);
4592 free(uname);
4593 return;
4594 }
4595
4596 while (getline(&line, &len, f) != -1) {
4597 char *p, *p2;
4598 size_t no_newline = 0;
4599
4600 p = strchr(line, ':');
4601 if (*line == '#')
4602 continue;
4603 if (!p)
4604 continue;
4605 *p = '\0';
4606 p++;
4607
4608 if (strcmp(line, uname))
4609 continue;
4610
4611 p2 = strchr(p, ':');
4612 if (!p2)
4613 continue;
4614 *p2 = '\0';
4615 p2++;
4616 if (!*p2)
4617 continue;
4618 no_newline = strcspn(p2, "\n");
4619 p2[no_newline] = '\0';
4620
4621 if (lxc_safe_uint(p, &uid) < 0)
4622 WARN("Could not parse UID");
4623 if (lxc_safe_uint(p2, &urange) < 0)
4624 WARN("Could not parse UID range");
4625 }
4626 fclose(f);
4627
4628 f = fopen(subgidfile, "r");
4629 if (!f) {
4630 ERROR("Your system is not configured with subgids");
4631 free(gname);
4632 free(uname);
4633 return;
4634 }
4635
4636 while (getline(&line, &len, f) != -1) {
4637 char *p, *p2;
4638 size_t no_newline = 0;
4639
4640 p = strchr(line, ':');
4641 if (*line == '#')
4642 continue;
4643 if (!p)
4644 continue;
4645 *p = '\0';
4646 p++;
4647
4648 if (strcmp(line, uname))
4649 continue;
4650
4651 p2 = strchr(p, ':');
4652 if (!p2)
4653 continue;
4654 *p2 = '\0';
4655 p2++;
4656 if (!*p2)
4657 continue;
4658 no_newline = strcspn(p2, "\n");
4659 p2[no_newline] = '\0';
4660
4661 if (lxc_safe_uint(p, &gid) < 0)
4662 WARN("Could not parse GID");
4663 if (lxc_safe_uint(p2, &grange) < 0)
4664 WARN("Could not parse GID range");
4665 }
4666 fclose(f);
4667
4668 free(line);
4669
4670 if (!urange || !grange) {
4671 ERROR("You do not have subuids or subgids allocated");
4672 ERROR("Unprivileged containers require subuids and subgids");
4673 free(uname);
4674 free(gname);
4675 return;
4676 }
4677
4678 ERROR("You must either run as root, or define uid mappings");
4679 ERROR("To pass uid mappings to lxc-create, you could create");
4680 ERROR("~/.config/lxc/default.conf:");
4681 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4682 ERROR("lxc.idmap = u 0 %u %u", uid, urange);
4683 ERROR("lxc.idmap = g 0 %u %u", gid, grange);
4684
4685 free(gname);
4686 free(uname);
4687 }
4688
4689 static void free_cgroup_settings(struct lxc_list *result)
4690 {
4691 struct lxc_list *iterator, *next;
4692
4693 lxc_list_for_each_safe (iterator, result, next) {
4694 lxc_list_del(iterator);
4695 free(iterator);
4696 }
4697 free(result);
4698 }
4699
4700 /* Return the list of cgroup_settings sorted according to the following rules
4701 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4702 */
4703 struct lxc_list *sort_cgroup_settings(struct lxc_list *cgroup_settings)
4704 {
4705 struct lxc_list *result;
4706 struct lxc_cgroup *cg = NULL;
4707 struct lxc_list *it = NULL, *item = NULL, *memsw_limit = NULL;
4708
4709 result = malloc(sizeof(*result));
4710 if (!result)
4711 return NULL;
4712 lxc_list_init(result);
4713
4714 /* Iterate over the cgroup settings and copy them to the output list. */
4715 lxc_list_for_each (it, cgroup_settings) {
4716 item = malloc(sizeof(*item));
4717 if (!item) {
4718 free_cgroup_settings(result);
4719 return NULL;
4720 }
4721
4722 item->elem = it->elem;
4723 cg = it->elem;
4724 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4725 /* Store the memsw_limit location */
4726 memsw_limit = item;
4727 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 &&
4728 memsw_limit != NULL) {
4729 /* lxc.cgroup.memory.memsw.limit_in_bytes is found
4730 * before lxc.cgroup.memory.limit_in_bytes, swap these
4731 * two items */
4732 item->elem = memsw_limit->elem;
4733 memsw_limit->elem = it->elem;
4734 }
4735 lxc_list_add_tail(result, item);
4736 }
4737
4738 return result;
4739 }