]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
network: user send()/recv()
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
/*
 * Fallback values for constants that older libcap / kernel headers may not
 * provide. Each one is only defined when the headers did not define it.
 */
#if HAVE_LIBCAP
#if !defined(CAP_SETFCAP)
#define CAP_SETFCAP 31
#endif

#if !defined(CAP_MAC_OVERRIDE)
#define CAP_MAC_OVERRIDE 32
#endif

#if !defined(CAP_MAC_ADMIN)
#define CAP_MAC_ADMIN 33
#endif
#endif /* HAVE_LIBCAP */

#if !defined(PR_CAPBSET_DROP)
#define PR_CAPBSET_DROP 24
#endif

#if !defined(LO_FLAGS_AUTOCLEAR)
#define LO_FLAGS_AUTOCLEAR 4
#endif

#if !defined(CAP_SETUID)
#define CAP_SETUID 7
#endif

#if !defined(CAP_SETGID)
#define CAP_SETGID 6
#endif

/* CAP_SYS_ADMIN is needed for the cgroup automount checks whether or not
 * linux/capability.h has been included. */
#if !defined(CAP_SYS_ADMIN)
#define CAP_SYS_ADMIN 21
#endif
147
/*
 * pivot_root(): when the build did not detect a C library wrapper
 * (HAVE_PIVOT_ROOT unset) supply a private one that issues the raw system
 * call, or fails with ENOSYS if the syscall number is unknown. Otherwise only
 * declare the library symbol.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/*
 * sethostname(): private replacement used when the C library lacks one
 * (HAVE_SETHOSTNAME unset). Issues the raw system call, or fails with ENOSYS
 * if the syscall number is unknown.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
/* Mount flags that older headers may not define. */
#if !defined(MS_PRIVATE)
#define MS_PRIVATE (1<<18)
#endif

#if !defined(MS_LAZYTIME)
#define MS_LAZYTIME (1<<25)
#endif

/* memfd_create() flags that older headers may not define. */
#if !defined(MFD_CLOEXEC)
#define MFD_CLOEXEC 0x0001U
#endif

#if !defined(MFD_ALLOW_SEALING)
#define MFD_ALLOW_SEALING 0x0002U
#endif
192
/*
 * memfd_create(): when the C library has no wrapper (HAVE_MEMFD_CREATE unset)
 * provide a private one on top of syscall(). If the kernel headers do not
 * know the syscall number either, fall back to a per-architecture table;
 * architectures missing from the table get a stub that fails with ENOSYS.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

#ifndef __NR_memfd_create
#if defined(__i386__)
#define __NR_memfd_create 356
#elif defined(__x86_64__)
#define __NR_memfd_create 319
#elif defined(__arm__)
#define __NR_memfd_create 385
#elif defined(__aarch64__)
#define __NR_memfd_create 279
#elif defined(__s390__)
#define __NR_memfd_create 350
#elif defined(__powerpc__)
#define __NR_memfd_create 360
#elif defined(__sparc__)
#define __NR_memfd_create 348
#elif defined(__blackfin__)
#define __NR_memfd_create 390
#elif defined(__ia64__)
#define __NR_memfd_create 1340
#elif defined(_MIPS_SIM)
/* The MIPS number depends on the ABI in use. */
#if _MIPS_SIM == _MIPS_SIM_ABI32
#define __NR_memfd_create 4354
#endif
#if _MIPS_SIM == _MIPS_SIM_NABI32
#define __NR_memfd_create 6318
#endif
#if _MIPS_SIM == _MIPS_SIM_ABI64
#define __NR_memfd_create 5314
#endif
#endif
#endif /* !__NR_memfd_create */

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}

#endif /* HAVE_MEMFD_CREATE */
236
237 char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
240
/* One recognised mount option keyword. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* NOTE(review): presumably non-zero means the flag is cleared rather than set - confirm against the option parser */
	int flag;	/* MS_* bit(s) the keyword refers to */
};

/* One capability name and its CAP_* number. */
struct caps_opt {
	char *name;
	int value;
};

/* One resource limit name and its RLIMIT_* number. */
struct limit_opt {
	char *name;
	int value;
};
256
/*
 * The lxc_conf of the container an API call is currently working on.
 * It is used by the error calls. When thread-local storage is available
 * (HAVE_TLS) every thread gets its own pointer.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
267
/* Forward declaration: in_caplist() is used before its definition, and
 * declaring it here avoids reshuffling the whole file. */
static int in_caplist(int cap, struct lxc_list *caps);
270
271 static struct mount_opt mount_opt[] = {
272 { "async", 1, MS_SYNCHRONOUS },
273 { "atime", 1, MS_NOATIME },
274 { "bind", 0, MS_BIND },
275 { "defaults", 0, 0 },
276 { "dev", 1, MS_NODEV },
277 { "diratime", 1, MS_NODIRATIME },
278 { "dirsync", 0, MS_DIRSYNC },
279 { "exec", 1, MS_NOEXEC },
280 { "lazytime", 0, MS_LAZYTIME },
281 { "mand", 0, MS_MANDLOCK },
282 { "noatime", 0, MS_NOATIME },
283 { "nodev", 0, MS_NODEV },
284 { "nodiratime", 0, MS_NODIRATIME },
285 { "noexec", 0, MS_NOEXEC },
286 { "nomand", 1, MS_MANDLOCK },
287 { "norelatime", 1, MS_RELATIME },
288 { "nostrictatime", 1, MS_STRICTATIME },
289 { "nosuid", 0, MS_NOSUID },
290 { "rbind", 0, MS_BIND|MS_REC },
291 { "relatime", 0, MS_RELATIME },
292 { "remount", 0, MS_REMOUNT },
293 { "ro", 0, MS_RDONLY },
294 { "rw", 1, MS_RDONLY },
295 { "strictatime", 0, MS_STRICTATIME },
296 { "suid", 1, MS_NOSUID },
297 { "sync", 0, MS_SYNCHRONOUS },
298 { NULL, 0, 0 },
299 };
300
301 #if HAVE_LIBCAP
302 static struct caps_opt caps_opt[] = {
303 { "chown", CAP_CHOWN },
304 { "dac_override", CAP_DAC_OVERRIDE },
305 { "dac_read_search", CAP_DAC_READ_SEARCH },
306 { "fowner", CAP_FOWNER },
307 { "fsetid", CAP_FSETID },
308 { "kill", CAP_KILL },
309 { "setgid", CAP_SETGID },
310 { "setuid", CAP_SETUID },
311 { "setpcap", CAP_SETPCAP },
312 { "linux_immutable", CAP_LINUX_IMMUTABLE },
313 { "net_bind_service", CAP_NET_BIND_SERVICE },
314 { "net_broadcast", CAP_NET_BROADCAST },
315 { "net_admin", CAP_NET_ADMIN },
316 { "net_raw", CAP_NET_RAW },
317 { "ipc_lock", CAP_IPC_LOCK },
318 { "ipc_owner", CAP_IPC_OWNER },
319 { "sys_module", CAP_SYS_MODULE },
320 { "sys_rawio", CAP_SYS_RAWIO },
321 { "sys_chroot", CAP_SYS_CHROOT },
322 { "sys_ptrace", CAP_SYS_PTRACE },
323 { "sys_pacct", CAP_SYS_PACCT },
324 { "sys_admin", CAP_SYS_ADMIN },
325 { "sys_boot", CAP_SYS_BOOT },
326 { "sys_nice", CAP_SYS_NICE },
327 { "sys_resource", CAP_SYS_RESOURCE },
328 { "sys_time", CAP_SYS_TIME },
329 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
330 { "mknod", CAP_MKNOD },
331 { "lease", CAP_LEASE },
332 #ifdef CAP_AUDIT_READ
333 { "audit_read", CAP_AUDIT_READ },
334 #endif
335 #ifdef CAP_AUDIT_WRITE
336 { "audit_write", CAP_AUDIT_WRITE },
337 #endif
338 #ifdef CAP_AUDIT_CONTROL
339 { "audit_control", CAP_AUDIT_CONTROL },
340 #endif
341 { "setfcap", CAP_SETFCAP },
342 { "mac_override", CAP_MAC_OVERRIDE },
343 { "mac_admin", CAP_MAC_ADMIN },
344 #ifdef CAP_SYSLOG
345 { "syslog", CAP_SYSLOG },
346 #endif
347 #ifdef CAP_WAKE_ALARM
348 { "wake_alarm", CAP_WAKE_ALARM },
349 #endif
350 #ifdef CAP_BLOCK_SUSPEND
351 { "block_suspend", CAP_BLOCK_SUSPEND },
352 #endif
353 };
354 #else
355 static struct caps_opt caps_opt[] = {};
356 #endif
357
358 static struct limit_opt limit_opt[] = {
359 #ifdef RLIMIT_AS
360 { "as", RLIMIT_AS },
361 #endif
362 #ifdef RLIMIT_CORE
363 { "core", RLIMIT_CORE },
364 #endif
365 #ifdef RLIMIT_CPU
366 { "cpu", RLIMIT_CPU },
367 #endif
368 #ifdef RLIMIT_DATA
369 { "data", RLIMIT_DATA },
370 #endif
371 #ifdef RLIMIT_FSIZE
372 { "fsize", RLIMIT_FSIZE },
373 #endif
374 #ifdef RLIMIT_LOCKS
375 { "locks", RLIMIT_LOCKS },
376 #endif
377 #ifdef RLIMIT_MEMLOCK
378 { "memlock", RLIMIT_MEMLOCK },
379 #endif
380 #ifdef RLIMIT_MSGQUEUE
381 { "msgqueue", RLIMIT_MSGQUEUE },
382 #endif
383 #ifdef RLIMIT_NICE
384 { "nice", RLIMIT_NICE },
385 #endif
386 #ifdef RLIMIT_NOFILE
387 { "nofile", RLIMIT_NOFILE },
388 #endif
389 #ifdef RLIMIT_NPROC
390 { "nproc", RLIMIT_NPROC },
391 #endif
392 #ifdef RLIMIT_RSS
393 { "rss", RLIMIT_RSS },
394 #endif
395 #ifdef RLIMIT_RTPRIO
396 { "rtprio", RLIMIT_RTPRIO },
397 #endif
398 #ifdef RLIMIT_RTTIME
399 { "rttime", RLIMIT_RTTIME },
400 #endif
401 #ifdef RLIMIT_SIGPENDING
402 { "sigpending", RLIMIT_SIGPENDING },
403 #endif
404 #ifdef RLIMIT_STACK
405 { "stack", RLIMIT_STACK },
406 #endif
407 };
408
409 static int run_buffer(char *buffer)
410 {
411 struct lxc_popen_FILE *f;
412 char *output;
413 int ret;
414
415 f = lxc_popen(buffer);
416 if (!f) {
417 SYSERROR("Failed to popen() %s.", buffer);
418 return -1;
419 }
420
421 output = malloc(LXC_LOG_BUFFER_SIZE);
422 if (!output) {
423 ERROR("Failed to allocate memory for %s.", buffer);
424 lxc_pclose(f);
425 return -1;
426 }
427
428 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
429 DEBUG("Script %s with output: %s.", buffer, output);
430
431 free(output);
432
433 ret = lxc_pclose(f);
434 if (ret == -1) {
435 SYSERROR("Script exited with error.");
436 return -1;
437 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
438 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
439 return -1;
440 } else if (WIFSIGNALED(ret)) {
441 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
442 return -1;
443 }
444
445 return 0;
446 }
447
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" on the
 * stack and execute it through run_buffer().
 *
 * @argsin may be NULL; otherwise it is a NULL-terminated array of additional
 * arguments. @lxcpath is not used by this function.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	char **arg;
	char *cmd;
	int used;
	size_t cmdlen;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* script, name, section and hook, three separating blanks and the NUL */
	cmdlen = strlen(script) + strlen(name) + strlen(section) + 3;
	cmdlen += strlen(hook) + 1;

	/* every extra argument is preceded by one blank */
	for (arg = argsin; arg && *arg; arg++)
		cmdlen += strlen(*arg) + 1;

	if (cmdlen > INT_MAX)
		return -1;

	cmd = alloca(cmdlen);
	if (cmd == NULL) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	used = snprintf(cmd, cmdlen, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= cmdlen) {
		ERROR("Script name too long.");
		return -1;
	}

	for (arg = argsin; arg && *arg; arg++) {
		int room = cmdlen - used;
		int written = snprintf(cmd + used, room, " %s", *arg);

		if (written < 0 || written >= room) {
			ERROR("Script args too long.");
			return -1;
		}
		used += written;
	}

	return run_buffer(cmd);
}
498
/*
 * Build the command line "<script> <name> <section> [args...]" on the stack
 * and execute it through run_buffer().
 *
 * The variadic arguments are the additional script arguments; they are read
 * as "char *" values and the list MUST be terminated by a NULL pointer.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not be
 * assembled.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: every extra argument needs its length plus one blank. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* script, name and section, two separating blanks and the NUL */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	if (size > INT_MAX)
		return -1;

	/* alloca() has no failure return value, so there is nothing to check. */
	buffer = alloca(size);

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* every va_start() must be paired with va_end() before returning */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
549
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs read-only on shutdown. The file is unlinked immediately so
 * that no name is left behind in the rootfs.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it; the caller owns the descriptor.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* snprintf() reports an output error as a negative value */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return -1;

	/* The open descriptor keeps the file alive; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
592
/*
 * When a remount is requested, make sure the restrictive flags that are
 * already in effect (nosuid, nodev, ro, noexec) are carried over.
 *
 * The file system is looked up through @s, or through @d when @s is NULL.
 * @flags is returned unchanged when MS_REMOUNT is not set, when neither path
 * is given, when statvfs() fails, or when the build has no statvfs() support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC
	};
	const char *target;
	struct statvfs info;
	size_t i;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	target = s ? s : d;
	if (target == NULL)
		return flags;

	if (statvfs(target, &info) < 0)
		return flags;

	for (i = 0; i < sizeof(sticky) / sizeof(sticky[0]); i++) {
		if (info.f_flag & sticky[i])
			flags |= sticky[i];
	}

	return flags;
#else
	return flags;
#endif
}
629
630 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
631 {
632 int r;
633 int i;
634 static struct {
635 int match_mask;
636 int match_flag;
637 const char *source;
638 const char *destination;
639 const char *fstype;
640 unsigned long flags;
641 const char *options;
642 } default_mounts[] = {
643 /* Read-only bind-mounting... In older kernels, doing that required
644 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
645 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
646 * kernel 2.6.26 onwards. However, this apparently does not work on
647 * kernel 3.8. Unfortunately, on that very same kernel, doing the
648 * same trick as above doesn't seem to work either, there one needs
649 * to ALSO specify MS_BIND for the remount, otherwise the entire
650 * fs is remounted read-only or the mount fails because it's busy...
651 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
652 * 2.6.32...
653 */
654 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
655 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
656 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
663 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
671 { 0, 0, NULL, NULL, NULL, 0, NULL }
672 };
673
674 for (i = 0; default_mounts[i].match_mask; i++) {
675 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
676 char *source = NULL;
677 char *destination = NULL;
678 int saved_errno;
679 unsigned long mflags;
680
681 if (default_mounts[i].source) {
682 /* will act like strdup if %r is not present */
683 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
684 if (!source) {
685 SYSERROR("memory allocation error");
686 return -1;
687 }
688 }
689 if (!default_mounts[i].destination) {
690 ERROR("BUG: auto mounts destination %d was NULL", i);
691 free(source);
692 return -1;
693 }
694 /* will act like strdup if %r is not present */
695 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
696 if (!destination) {
697 saved_errno = errno;
698 SYSERROR("memory allocation error");
699 free(source);
700 errno = saved_errno;
701 return -1;
702 }
703 mflags = add_required_remount_flags(source, destination,
704 default_mounts[i].flags);
705 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
706 saved_errno = errno;
707 if (r < 0 && errno == ENOENT) {
708 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
709 r = 0;
710 }
711 else if (r < 0)
712 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
713
714 free(source);
715 free(destination);
716 if (r < 0) {
717 errno = saved_errno;
718 return -1;
719 }
720 }
721 }
722
723 if (flags & LXC_AUTO_CGROUP_MASK) {
724 int cg_flags;
725
726 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
727 /* If the type of cgroup mount was not specified, it depends on the
728 * container's capabilities as to what makes sense: if we have
729 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
730 * anyway, so we may as well default to read-write; then the admin
731 * will not be given a false sense of security. (And if they really
732 * want mixed r/o r/w, then they can explicitly specify :mixed.)
733 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
734 * :mixed, because then the container can't remount it read-write. */
735 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
736 int has_sys_admin = 0;
737
738 if (!lxc_list_empty(&conf->keepcaps))
739 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
740 else
741 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
742
743 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
744 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
745 else
746 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
747 }
748
749 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
750 SYSERROR("error mounting /sys/fs/cgroup");
751 return -1;
752 }
753 }
754
755 return 0;
756 }
757
/*
 * Set the hostname to utsname->nodename.
 * A NULL @utsname means nothing was configured and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);
	return 0;
}
772
/* A symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath;	/* what the link points to */
	const char *name;	/* name of the link inside /dev */
};

/* The standard fd/stdin/stdout/stderr links into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
784
785 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
786 {
787 char path[MAXPATHLEN];
788 int ret,i;
789 struct stat s;
790
791
792 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
793 const struct dev_symlinks *d = &dev_symlinks[i];
794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
795 if (ret < 0 || ret >= MAXPATHLEN)
796 return -1;
797
798 /*
799 * Stat the path first. If we don't get an error
800 * accept it as is and don't try to create it
801 */
802 if (!stat(path, &s)) {
803 continue;
804 }
805
806 ret = symlink(d->oldpath, path);
807
808 if (ret && errno != EEXIST) {
809 if ( errno == EROFS ) {
810 WARN("Warning: Read Only file system while creating %s", path);
811 } else {
812 SYSERROR("Error creating %s", path);
813 return -1;
814 }
815 }
816 }
817 return 0;
818 }
819
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) a new string "container_ttys=<name>" is
 * allocated; later calls grow the string and append " <name>".
 * Returns false, leaving *pp untouched, if memory cannot be allocated.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	size_t namelen = strlen(name);
	size_t oldlen;
	char *list;

	if (*pp == NULL) {
		list = malloc(namelen + prefixlen + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefixlen);
		memcpy(list + prefixlen, name, namelen + 1);
		*pp = list;
		return true;
	}

	oldlen = strlen(*pp);
	/* room for the separating blank and the terminating NUL */
	list = realloc(*pp, oldlen + namelen + 2);
	if (list == NULL)
		return false;

	list[oldlen] = ' ';
	memcpy(list + oldlen + 1, name, namelen + 1);
	*pp = list;
	return true;
}
842
843 static int lxc_setup_tty(struct lxc_conf *conf)
844 {
845 int i, ret;
846 const struct lxc_tty_info *tty_info = &conf->tty_info;
847 char *ttydir = conf->ttydir;
848 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
849
850 if (!conf->rootfs.path)
851 return 0;
852
853 for (i = 0; i < tty_info->nbtty; i++) {
854 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
855
856 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
857 if (ret < 0 || (size_t)ret >= sizeof(path)) {
858 ERROR("pathname too long for ttys");
859 return -1;
860 }
861
862 if (ttydir) {
863 /* create dev/lxc/tty%d" */
864 ret = snprintf(lxcpath, sizeof(lxcpath),
865 "/dev/%s/tty%d", ttydir, i + 1);
866 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
867 ERROR("pathname too long for ttys");
868 return -1;
869 }
870
871 ret = creat(lxcpath, 0660);
872 if (ret < 0 && errno != EEXIST) {
873 SYSERROR("failed to create \"%s\"", lxcpath);
874 return -1;
875 }
876 if (ret >= 0)
877 close(ret);
878
879 ret = unlink(path);
880 if (ret < 0 && errno != ENOENT) {
881 SYSERROR("failed to unlink \"%s\"", path);
882 return -1;
883 }
884
885 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
886 if (ret < 0) {
887 WARN("failed to bind mount \"%s\" onto \"%s\"",
888 pty_info->name, path);
889 continue;
890 }
891 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
892 path);
893
894 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
895 ttydir, i + 1);
896 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
897 ERROR("tty pathname too long");
898 return -1;
899 }
900
901 ret = symlink(lxcpath, path);
902 if (ret < 0) {
903 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
904 path, lxcpath);
905 return -1;
906 }
907 } else {
908 /* If we populated /dev, then we need to create
909 * /dev/ttyN
910 */
911 ret = access(path, F_OK);
912 if (ret < 0) {
913 ret = creat(path, 0660);
914 if (ret < 0) {
915 SYSERROR("failed to create \"%s\"", path);
916 /* this isn't fatal, continue */
917 } else {
918 close(ret);
919 }
920 }
921
922 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
923 if (ret < 0) {
924 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
925 continue;
926 }
927
928 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
929 path);
930 }
931
932 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
933 ERROR("Error setting up container_ttys string");
934 return -1;
935 }
936 }
937
938 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
939 return 0;
940 }
941
942 static int setup_rootfs_pivot_root(const char *rootfs)
943 {
944 int oldroot = -1, newroot = -1;
945
946 oldroot = open("/", O_DIRECTORY | O_RDONLY);
947 if (oldroot < 0) {
948 SYSERROR("Error opening old-/ for fchdir");
949 return -1;
950 }
951 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
952 if (newroot < 0) {
953 SYSERROR("Error opening new-/ for fchdir");
954 goto fail;
955 }
956
957 /* change into new root fs */
958 if (fchdir(newroot)) {
959 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
960 goto fail;
961 }
962
963 /* pivot_root into our new root fs */
964 if (pivot_root(".", ".")) {
965 SYSERROR("pivot_root syscall failed");
966 goto fail;
967 }
968
969 /*
970 * at this point the old-root is mounted on top of our new-root
971 * To unmounted it we must not be chdir'd into it, so escape back
972 * to old-root
973 */
974 if (fchdir(oldroot) < 0) {
975 SYSERROR("Error entering oldroot");
976 goto fail;
977 }
978 if (umount2(".", MNT_DETACH) < 0) {
979 SYSERROR("Error detaching old root");
980 goto fail;
981 }
982
983 if (fchdir(newroot) < 0) {
984 SYSERROR("Error re-entering newroot");
985 goto fail;
986 }
987
988 close(oldroot);
989 close(newroot);
990
991 DEBUG("pivot_root syscall to '%s' successful", rootfs);
992
993 return 0;
994
995 fail:
996 if (oldroot != -1)
997 close(oldroot);
998 if (newroot != -1)
999 close(newroot);
1000 return -1;
1001 }
1002
1003 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1004 * error, log it but don't fail yet.
1005 */
1006 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1007 const char *lxcpath)
1008 {
1009 int ret;
1010 size_t clen;
1011 char *path;
1012
1013 INFO("Preparing \"/dev\"");
1014
1015 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1016 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1017 path = alloca(clen);
1018
1019 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1020 if (ret < 0 || (size_t)ret >= clen)
1021 return -1;
1022
1023 if (!dir_exists(path)) {
1024 WARN("\"/dev\" directory does not exist. Proceeding without "
1025 "autodev being set up");
1026 return 0;
1027 }
1028
1029 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1030 rootfs->path ? rootfs->mount : NULL);
1031 if (ret < 0) {
1032 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1033 return -1;
1034 }
1035 INFO("Mounted tmpfs on \"%s\"", path);
1036
1037 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1038 if (ret < 0 || (size_t)ret >= clen)
1039 return -1;
1040
1041 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1042 * If not, then create it and exit if that fails...
1043 */
1044 if (!dir_exists(path)) {
1045 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1046 if (ret < 0) {
1047 SYSERROR("Failed to create directory \"%s\"", path);
1048 return -1;
1049 }
1050 }
1051
1052 INFO("Prepared \"/dev\"");
1053 return 0;
1054 }
1055
/* A device node to provide in an automatically populated /dev. */
struct lxc_devs {
	const char *name;	/* node name below /dev */
	mode_t mode;		/* file type and permission bits for mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* The basic character devices: all are world read/write/execute in the
 * table, the creating code applies its own umask. */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1071
1072 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1073 {
1074 int ret;
1075 char path[MAXPATHLEN];
1076 int i;
1077 mode_t cmask;
1078
1079 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1080 rootfs->path ? rootfs->mount : "");
1081 if (ret < 0 || ret >= MAXPATHLEN)
1082 return -1;
1083
1084 /* ignore, just don't try to fill in */
1085 if (!dir_exists(path))
1086 return 0;
1087
1088 INFO("Populating \"/dev\"");
1089
1090 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1091 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1092 const struct lxc_devs *d = &lxc_devs[i];
1093
1094 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1095 rootfs->path ? rootfs->mount : "", d->name);
1096 if (ret < 0 || ret >= MAXPATHLEN)
1097 return -1;
1098
1099 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1100 if (ret < 0) {
1101 FILE *pathfile;
1102 char hostpath[MAXPATHLEN];
1103
1104 if (errno == EEXIST) {
1105 DEBUG("\"%s\" device already existed", path);
1106 continue;
1107 }
1108
1109 /* Unprivileged containers cannot create devices, so
1110 * bind mount the device from the host.
1111 */
1112 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1113 if (ret < 0 || ret >= MAXPATHLEN)
1114 return -1;
1115
1116 pathfile = fopen(path, "wb");
1117 if (!pathfile) {
1118 SYSERROR("Failed to create file \"%s\"", path);
1119 return -1;
1120 }
1121 fclose(pathfile);
1122
1123 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1124 rootfs->path ? rootfs->mount : NULL);
1125 if (ret < 0) {
1126 SYSERROR("Failed to bind mount \"%s\" from "
1127 "host into container",
1128 d->name);
1129 return -1;
1130 }
1131 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1132 path);
1133 } else {
1134 DEBUG("Created device node \"%s\"", path);
1135 }
1136 }
1137 umask(cmask);
1138
1139 INFO("Populated \"/dev\"");
1140 return 0;
1141 }
1142
1143 static int lxc_setup_rootfs(struct lxc_conf *conf)
1144 {
1145 int ret;
1146 struct lxc_storage *bdev;
1147 const struct lxc_rootfs *rootfs;
1148
1149 rootfs = &conf->rootfs;
1150 if (!rootfs->path) {
1151 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1152 SYSERROR("Failed to make / rslave.");
1153 return -1;
1154 }
1155 return 0;
1156 }
1157
1158 if (access(rootfs->mount, F_OK)) {
1159 SYSERROR("Failed to access to \"%s\". Check it is present.",
1160 rootfs->mount);
1161 return -1;
1162 }
1163
1164 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1165 if (!bdev) {
1166 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1167 rootfs->path, rootfs->mount,
1168 rootfs->options ? rootfs->options : "(null)");
1169 return -1;
1170 }
1171
1172 ret = bdev->ops->mount(bdev);
1173 storage_put(bdev);
1174 if (ret < 0) {
1175 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1176 rootfs->path, rootfs->mount,
1177 rootfs->options ? rootfs->options : "(null)");
1178 return -1;
1179 }
1180
1181 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1182 rootfs->path, rootfs->mount,
1183 rootfs->options ? rootfs->options : "(null)");
1184
1185 return 0;
1186 }
1187
/* Switch into @root when pivot_root() cannot be used (see the caller, which
 * takes this path when the container was detected to live on a ramfs).
 *
 * The function bind mounts @root over "/", makes the result recursively
 * private, lazily detaches every other mount listed in mountinfo and finally
 * chroot()s into the new root.
 *
 * Returns 0 on success; on failure either -1 or a negated errno value.
 */
int prepare_ramfs_root(char *root)
{
	char buf[LXC_LINELEN], *p;
	char nroot[PATH_MAX];
	FILE *f;
	int i;
	char *p2;

	/* nroot is not used afterwards; the call only verifies that @root can
	 * be resolved.
	 */
	if (realpath(root, nroot) == NULL)
		return -errno;

	if (chdir("/") == -1)
		return -errno;

	/*
	 * We could use MS_MOVE here, but in a userns this mount is
	 * locked and can't be moved, so a recursive bind mount is used.
	 */
	if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
		SYSERROR("Failed to move %s into /", root);
		return -errno;
	}

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
		SYSERROR("Failed to make . rprivate");
		return -errno;
	}

	/*
	 * The following code cleans up inherited mounts which are not
	 * required for the container.
	 *
	 * The mountinfo file does not show all mounts if some of them have
	 * been unmounted between read operations on mountinfo. So we need to
	 * read mountinfo several times, until a full pass detaches nothing.
	 *
	 * This loop can be skipped if a container uses a userns, because all
	 * inherited mounts are locked and we have to live with them.
	 */
	while (1) {
		int progress = 0;

		/* Relative path: the working directory is still the old "/". */
		f = fopen("./proc/self/mountinfo", "r");
		if (!f) {
			SYSERROR("Unable to open /proc/self/mountinfo");
			return -1;
		}
		while (fgets(buf, LXC_LINELEN, f)) {
			/* Advance p to the 4th space on the line, i.e. right
			 * in front of the fifth field (the mount point column,
			 * see proc(5)).
			 */
			for (p = buf, i=0; p && i < 4; i++)
				p = strchr(p+1, ' ');
			if (!p)
				continue;
			p2 = strchr(p+1, ' ');
			if (!p2)
				continue;

			/* Terminate the field and turn the leading space into
			 * '.', so that p reads "./<mountpoint>" and is
			 * resolved relative to the working directory.
			 */
			*p2 = '\0';
			*p = '.';

			/* Keep the root itself and /proc (still being read). */
			if (strcmp(p + 1, "/") == 0)
				continue;
			if (strcmp(p + 1, "/proc") == 0)
				continue;

			if (umount2(p, MNT_DETACH) == 0)
				progress++;
		}
		fclose(f);
		if (!progress)
			break;
	}

	/* This also can be skipped if a container uses a userns */
	umount2("./proc", MNT_DETACH);

	/* It is weird, but chdir("..") moves us in a new root */
	if (chdir("..") == -1) {
		SYSERROR("Unable to change working directory");
		return -1;
	}

	if (chroot(".") == -1) {
		SYSERROR("Unable to chroot");
		return -1;
	}

	return 0;
}
1276
1277 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1278 {
1279 if (!rootfs->path) {
1280 DEBUG("container does not have a rootfs, so not doing pivot root");
1281 return 0;
1282 }
1283
1284 if (detect_ramfs_rootfs()) {
1285 DEBUG("detected that container is on ramfs");
1286 if (prepare_ramfs_root(rootfs->mount)) {
1287 ERROR("failed to prepare minimal ramfs root");
1288 return -1;
1289 }
1290
1291 DEBUG("prepared ramfs root for container");
1292 return 0;
1293 }
1294
1295 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1296 ERROR("failed to pivot root");
1297 return -1;
1298 }
1299
1300 DEBUG("finished pivot root");
1301 return 0;
1302 }
1303
/* Mount a fresh devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node.
 *
 * @num_pts: value passed as the "max=" devpts mount option; 0 means no
 *           devpts instance is set up at all.
 *
 * /dev/ptmx is preferably a bind mount of /dev/pts/ptmx; if the bind mount
 * fails, a symlink is created instead.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	int ret;
	const char *default_devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char devpts_mntopts[256];

	if (!num_pts) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Append the pts limit to the default mount options. */
	ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       default_devpts_mntopts, num_pts);
	if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
		return -1;

	/* Unmount old devpts instance. */
	ret = access("/dev/pts/ptmx", F_OK);
	if (!ret) {
		ret = umount("/dev/pts");
		if (ret < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	ret = mkdir("/dev/pts", 0755);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	ret = mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts);
	if (ret < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	ret = access("/dev/ptmx", F_OK);
	if (!ret) {
		ret = remove("/dev/ptmx");
		if (ret < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (!ret) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	} else {
		/* Fallthrough and try to create a symlink. */
		ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
	}

	/* Remove the dummy /dev/ptmx file we created above. */
	ret = remove("/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1394
/* Apply the execution domain @persona via personality().
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) the function is a no-op.
 *
 * Returns 0 on success or when nothing was done, -1 if personality() failed.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1411
1412 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1413 const struct lxc_console *console)
1414 {
1415 char path[MAXPATHLEN];
1416 int ret, fd;
1417
1418 if (console->path && !strcmp(console->path, "none"))
1419 return 0;
1420
1421 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1422 if (ret < 0 || (size_t)ret >= sizeof(path))
1423 return -1;
1424
1425 /* When we are asked to setup a console we remove any previous
1426 * /dev/console bind-mounts.
1427 */
1428 if (file_exists(path)) {
1429 ret = lxc_unstack_mountpoint(path, false);
1430 if (ret < 0) {
1431 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1432 return -ret;
1433 } else {
1434 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1435 }
1436
1437 ret = unlink(path);
1438 if (ret < 0) {
1439 SYSERROR("error unlinking %s", path);
1440 return -errno;
1441 }
1442 }
1443
1444 /* For unprivileged containers autodev or automounts will already have
1445 * taken care of creating /dev/console.
1446 */
1447 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1448 if (fd < 0) {
1449 if (errno != EEXIST) {
1450 SYSERROR("failed to create console");
1451 return -errno;
1452 }
1453 } else {
1454 close(fd);
1455 }
1456
1457 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1458 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1459 return -errno;
1460 }
1461
1462 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1463 ERROR("failed to mount '%s' on '%s'", console->name, path);
1464 return -1;
1465 }
1466
1467 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1468 return 0;
1469 }
1470
1471 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1472 const struct lxc_console *console,
1473 char *ttydir)
1474 {
1475 int ret;
1476 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1477
1478 /* create rootfs/dev/<ttydir> directory */
1479 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1480 if (ret < 0 || (size_t)ret >= sizeof(path))
1481 return -1;
1482
1483 ret = mkdir(path, 0755);
1484 if (ret && errno != EEXIST) {
1485 SYSERROR("failed with errno %d to create %s", errno, path);
1486 return -errno;
1487 }
1488 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1489
1490 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1491 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1492 return -1;
1493
1494 ret = creat(lxcpath, 0660);
1495 if (ret == -1 && errno != EEXIST) {
1496 SYSERROR("error %d creating %s", errno, lxcpath);
1497 return -errno;
1498 }
1499 if (ret >= 0)
1500 close(ret);
1501
1502 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1503 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1504 return -1;
1505
1506 /* When we are asked to setup a console we remove any previous
1507 * /dev/console bind-mounts.
1508 */
1509 if (console->path && !strcmp(console->path, "none")) {
1510 struct stat st;
1511 ret = stat(path, &st);
1512 if (ret < 0) {
1513 if (errno == ENOENT)
1514 return 0;
1515 SYSERROR("failed stat() \"%s\"", path);
1516 return -errno;
1517 }
1518
1519 /* /dev/console must be character device with major number 5 and
1520 * minor number 1. If not, give benefit of the doubt and assume
1521 * the user has mounted something else right there on purpose.
1522 */
1523 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1524 return 0;
1525
1526 /* In case the user requested a bind-mount for /dev/console and
1527 * requests a ttydir we move the mount to the
1528 * /dev/<ttydir/console.
1529 * Note, we only move the uppermost mount and clear all other
1530 * mounts underneath for safety.
1531 * If it is a character device created via mknod() we simply
1532 * rename it.
1533 */
1534 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1535 if (ret < 0) {
1536 if (errno != EINVAL) {
1537 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1538 return -errno;
1539 }
1540 /* path was not a mountpoint */
1541 ret = rename(path, lxcpath);
1542 if (ret < 0) {
1543 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1544 return -errno;
1545 }
1546 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1547 } else {
1548 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1549 }
1550
1551 /* Clear all remaining bind-mounts. */
1552 ret = lxc_unstack_mountpoint(path, false);
1553 if (ret < 0) {
1554 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1555 return -ret;
1556 } else {
1557 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1558 }
1559 } else {
1560 if (file_exists(path)) {
1561 ret = lxc_unstack_mountpoint(path, false);
1562 if (ret < 0) {
1563 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1564 return -ret;
1565 } else {
1566 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1567 }
1568 }
1569
1570 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1571 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1572 return -1;
1573 }
1574 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1575 }
1576
1577 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1578 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1579 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1580 return -1;
1581
1582 ret = unlink(path);
1583 if (ret && errno != ENOENT) {
1584 SYSERROR("error unlinking %s", path);
1585 return -errno;
1586 }
1587
1588 ret = symlink(lxcpath, path);
1589 if (ret < 0) {
1590 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1591 return -1;
1592 }
1593
1594 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1595 return 0;
1596 }
1597
1598 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1599 const struct lxc_console *console, char *ttydir)
1600 {
1601 /* We don't have a rootfs, /dev/console will be shared. */
1602 if (!rootfs->path) {
1603 DEBUG("/dev/console will be shared with the host");
1604 return 0;
1605 }
1606
1607 if (!ttydir)
1608 return lxc_setup_dev_console(rootfs, console);
1609
1610 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1611 }
1612
1613 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1614 {
1615 struct mount_opt *mo;
1616
1617 /* If opt is found in mount_opt, set or clear flags.
1618 * Otherwise append it to data. */
1619
1620 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1621 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1622 if (mo->clear)
1623 *flags &= ~mo->flag;
1624 else
1625 *flags |= mo->flag;
1626 return;
1627 }
1628 }
1629
1630 if (strlen(*data))
1631 strcat(*data, ",");
1632 strcat(*data, opt);
1633 }
1634
/* Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags, *@mntdata either a newly allocated string
 * holding the remaining options (to be freed by the caller) or NULL when no
 * such options exist. A NULL @mntopts yields flags 0 and data NULL.
 *
 * Returns 0 on success and -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *collected, *token;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the input itself. */
	collected = malloc(strlen(copy) + 1);
	if (!collected) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	collected[0] = '\0';

	token = strtok_r(copy, ",", &state);
	while (token != NULL) {
		parse_mntopt(token, mntflags, &collected);
		token = strtok_r(NULL, ",", &state);
	}
	free(copy);

	if (collected[0] == '\0')
		free(collected);
	else
		*mntdata = collected;

	return 0;
}
1673
/* Cut @word off at its first space or tab (no-op if it contains neither). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1680
/*
 * Skip @nfields whitespace-delimited fields in @src.
 *
 * Exactly one delimiter (space or tab) is consumed after each field. The
 * returned pointer refers to the start of the following field, or to the
 * terminating NUL byte if @src has fewer fields.
 */
static char *get_field(char *src, int nfields)
{
	char *cursor = src;
	int remaining;

	for (remaining = nfields; remaining > 0; remaining--) {
		cursor += strcspn(cursor, " \t");
		if (*cursor == '\0')
			break;
		cursor++;
	}

	return cursor;
}
1698
/* Perform a single mount and, for bind or remount requests, a follow-up
 * remount that applies the requested flags.
 *
 * @fsname:     mount source
 * @target:     mount target
 * @fstype:     filesystem type
 * @mountflags: mount flags; MS_REMOUNT is masked out for the first mount
 * @data:       filesystem specific option string
 * @optional:   if non-zero, mount failures are logged and reported as success
 * @dev:        if non-zero, MS_NODEV found on the source is not carried over
 *              to the remount
 * @rootfs:     passed through to safe_mount()
 *
 * Returns 0 on success (or on an ignored optional failure) and -1 on error.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	int ret;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		/* Flags the caller explicitly asked for that need a remount. */
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		/* Carry restrictive flags of the source over to the remount.
		 * NOTE(review): f_flag is tested against MS_* constants;
		 * this presumes they share values with the ST_* flags that
		 * statvfs() reports - confirm.
		 */
		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				INFO("Failed to mount \"%s\" on \"%s\" "
				     "(optional): %s", fsname, target,
				     strerror(errno));
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}
	}

	/* The label is only jumped to from the HAVE_STATVFS section above. */
#ifdef HAVE_STATVFS
skipremount:
#endif
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1789
/* Strip the lxc-only options "create=dir", "create=file" and "optional"
 * from mntent->mnt_opts in place.
 *
 * For each keyword only its first occurrence is removed, together with
 * everything up to and including the next comma. If the keyword is the last
 * option, the string is simply cut off where the keyword starts, which leaves
 * a preceding comma in place.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {"create=dir", "create=file", "optional"};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (hit == NULL)
			continue;

		comma = strchr(hit, ',');
		if (comma != NULL)
			/* Pull the remaining options (and the NUL) forward. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1813
/* Create the mount target @path if the entry asks for it.
 *
 * For "overlay" and "aufs" entries the matching *_mkdir() helper runs first.
 * With the "create=dir" option @path is created as a directory; with
 * "create=file" (and @path not yet accessible) its parent directory and an
 * empty file are created.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *p1, *p2;

		/* dirname() may modify its argument, so hand it a copy. */
		p1 = strdup(path);
		if (!p1)
			return -1;

		p2 = dirname(p1);

		/* Inspect errno and log the parent directory before the copy
		 * is released: free() is not guaranteed to preserve errno on
		 * every libc, and p2 may point into p1.
		 */
		ret = mkdir_p(p2, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", p2);
			free(p1);
			return -1;
		}
		free(p1);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0) {
			SYSERROR("Failed to create file \"%s\"", path);
			return -1;
		}
		close(fd);
	}

	return 0;
}
1862
1863 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1864 * without a rootfs. */
1865 static inline int mount_entry_on_generic(struct mntent *mntent,
1866 const char *path,
1867 const struct lxc_rootfs *rootfs,
1868 const char *lxc_name,
1869 const char *lxc_path)
1870 {
1871 int ret;
1872 unsigned long mntflags;
1873 char *mntdata;
1874 bool dev, optional;
1875 char *rootfs_path = NULL;
1876
1877 optional = hasmntopt(mntent, "optional") != NULL;
1878 dev = hasmntopt(mntent, "dev") != NULL;
1879
1880 if (rootfs && rootfs->path)
1881 rootfs_path = rootfs->mount;
1882
1883 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1884 lxc_path);
1885 if (ret < 0) {
1886 if (optional)
1887 return 0;
1888
1889 return -1;
1890 }
1891 cull_mntent_opt(mntent);
1892
1893 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1894 if (ret < 0)
1895 return -1;
1896
1897 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1898 mntdata, optional, dev, rootfs_path);
1899
1900 free(mntdata);
1901 return ret;
1902 }
1903
1904 static inline int mount_entry_on_systemfs(struct mntent *mntent)
1905 {
1906 int ret;
1907 char path[MAXPATHLEN];
1908
1909 /* For containers created without a rootfs all mounts are treated as
1910 * absolute paths starting at / on the host.
1911 */
1912 if (mntent->mnt_dir[0] != '/')
1913 ret = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);
1914 else
1915 ret = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
1916 if (ret < 0 || ret >= sizeof(path))
1917 return -1;
1918
1919 return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
1920 }
1921
1922 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1923 const struct lxc_rootfs *rootfs,
1924 const char *lxc_name,
1925 const char *lxc_path)
1926 {
1927 int offset;
1928 char *aux;
1929 const char *lxcpath;
1930 char path[MAXPATHLEN];
1931 int ret = 0;
1932
1933 lxcpath = lxc_global_config_value("lxc.lxcpath");
1934 if (!lxcpath)
1935 return -1;
1936
1937 /* If rootfs->path is a blockdev path, allow container fstab to use
1938 * <lxcpath>/<name>/rootfs" as the target prefix.
1939 */
1940 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1941 if (ret < 0 || ret >= MAXPATHLEN)
1942 goto skipvarlib;
1943
1944 aux = strstr(mntent->mnt_dir, path);
1945 if (aux) {
1946 offset = strlen(path);
1947 goto skipabs;
1948 }
1949
1950 skipvarlib:
1951 aux = strstr(mntent->mnt_dir, rootfs->path);
1952 if (!aux) {
1953 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
1954 return ret;
1955 }
1956 offset = strlen(rootfs->path);
1957
1958 skipabs:
1959 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1960 if (ret < 0 || ret >= MAXPATHLEN)
1961 return -1;
1962
1963 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1964 }
1965
1966 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
1967 const struct lxc_rootfs *rootfs,
1968 const char *lxc_name,
1969 const char *lxc_path)
1970 {
1971 char path[MAXPATHLEN];
1972 int ret;
1973
1974 /* relative to root mount point */
1975 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1976 if (ret < 0 || ret >= sizeof(path)) {
1977 ERROR("path name too long");
1978 return -1;
1979 }
1980
1981 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1982 }
1983
1984 /* This logs a NOTICE() when a user specifies mounts that would conflict with
1985 * devices liblxc sets up automatically.
1986 */
1987 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
1988 const char *dest)
1989 {
1990 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
1991 bool needs_warning = false;
1992
1993 clean_mnt_fsname = lxc_deslashify(src);
1994 if (!clean_mnt_fsname)
1995 return;
1996
1997 clean_mnt_dir = lxc_deslashify(dest);
1998 if (!clean_mnt_dir) {
1999 free(clean_mnt_fsname);
2000 return;
2001 }
2002
2003 tmp = clean_mnt_dir;
2004 if (*tmp == '/')
2005 tmp++;
2006
2007 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2008 free(clean_mnt_dir);
2009 free(clean_mnt_fsname);
2010 return;
2011 }
2012
2013 if (!conf->autodev && !conf->pts && !conf->tty &&
2014 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2015 free(clean_mnt_dir);
2016 free(clean_mnt_fsname);
2017 return;
2018 }
2019
2020 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2021 needs_warning = true;
2022 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2023 needs_warning = true;
2024 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2025 needs_warning = true;
2026 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2027 needs_warning = true;
2028 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2029 needs_warning = true;
2030 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2031 needs_warning = true;
2032 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2033 needs_warning = true;
2034 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2035 needs_warning = true;
2036 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2037 needs_warning = true;
2038 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2039 needs_warning = true;
2040 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2041 needs_warning = true;
2042
2043 if (needs_warning)
2044 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2045 "automatic device setup under \"/dev\"",
2046 clean_mnt_fsname, clean_mnt_dir);
2047
2048 free(clean_mnt_dir);
2049 free(clean_mnt_fsname);
2050 }
2051
2052 static int mount_file_entries(const struct lxc_conf *conf,
2053 const struct lxc_rootfs *rootfs, FILE *file,
2054 const char *lxc_name, const char *lxc_path)
2055 {
2056 struct mntent mntent;
2057 char buf[4096];
2058 int ret = -1;
2059
2060 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2061 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2062
2063 if (!rootfs->path)
2064 ret = mount_entry_on_systemfs(&mntent);
2065 else if (mntent.mnt_dir[0] != '/')
2066 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2067 lxc_name, lxc_path);
2068 else
2069 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2070 lxc_name, lxc_path);
2071 if (ret < 0)
2072 return -1;
2073 }
2074 ret = 0;
2075
2076 INFO("Set up mount entries");
2077 return ret;
2078 }
2079
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do.
 *
 * Returns 0 on success, -1 if the file cannot be opened, otherwise the
 * result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int rc;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(entries);
	return rc;
}
2103
2104 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2105 {
2106 int ret;
2107 char *mount_entry;
2108 struct lxc_list *iterator;
2109 FILE *f;
2110 int fd = -1;
2111
2112 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2113 if (fd < 0) {
2114 if (errno != ENOSYS)
2115 return NULL;
2116 f = tmpfile();
2117 TRACE("Created temporary mount file");
2118 } else {
2119 f = fdopen(fd, "r+");
2120 TRACE("Created anonymous mount file");
2121 }
2122
2123 if (!f) {
2124 SYSERROR("Could not create mount file");
2125 if (fd != -1)
2126 close(fd);
2127 return NULL;
2128 }
2129
2130 lxc_list_for_each(iterator, mount) {
2131 mount_entry = iterator->elem;
2132 ret = fprintf(f, "%s\n", mount_entry);
2133 if (ret < strlen(mount_entry))
2134 WARN("Could not write mount entry to mount file");
2135 }
2136
2137 ret = fseek(f, 0, SEEK_SET);
2138 if (ret < 0) {
2139 SYSERROR("Failed to seek mount file");
2140 fclose(f);
2141 return NULL;
2142 }
2143
2144 return f;
2145 }
2146
/* Mount the entries held in the list @mount by first writing them to an
 * anonymous fstab-style file and then processing that file.
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2164
2165 static int parse_cap(const char *cap)
2166 {
2167 char *ptr = NULL;
2168 size_t i;
2169 int capid = -1;
2170
2171 if (!strcmp(cap, "none"))
2172 return -2;
2173
2174 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2175
2176 if (strcmp(cap, caps_opt[i].name))
2177 continue;
2178
2179 capid = caps_opt[i].value;
2180 break;
2181 }
2182
2183 if (capid < 0) {
2184 /* try to see if it's numeric, so the user may specify
2185 * capabilities that the running kernel knows about but
2186 * we don't */
2187 errno = 0;
2188 capid = strtol(cap, &ptr, 10);
2189 if (!ptr || *ptr != '\0' || errno != 0)
2190 /* not a valid number */
2191 capid = -1;
2192 else if (capid > lxc_caps_last_cap())
2193 /* we have a number but it's not a valid
2194 * capability */
2195 capid = -1;
2196 }
2197
2198 return capid;
2199 }
2200
2201 int in_caplist(int cap, struct lxc_list *caps)
2202 {
2203 struct lxc_list *iterator;
2204 int capid;
2205
2206 lxc_list_for_each(iterator, caps) {
2207 capid = parse_cap(iterator->elem);
2208 if (capid == cap)
2209 return 1;
2210 }
2211
2212 return 0;
2213 }
2214
2215 static int setup_caps(struct lxc_list *caps)
2216 {
2217 struct lxc_list *iterator;
2218 char *drop_entry;
2219 int capid;
2220
2221 lxc_list_for_each(iterator, caps) {
2222
2223 drop_entry = iterator->elem;
2224
2225 capid = parse_cap(drop_entry);
2226
2227 if (capid < 0) {
2228 ERROR("unknown capability %s", drop_entry);
2229 return -1;
2230 }
2231
2232 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2233
2234 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2235 SYSERROR("failed to remove %s capability", drop_entry);
2236 return -1;
2237 }
2238
2239 }
2240
2241 DEBUG("capabilities have been setup");
2242
2243 return 0;
2244 }
2245
2246 static int dropcaps_except(struct lxc_list *caps)
2247 {
2248 struct lxc_list *iterator;
2249 char *keep_entry;
2250 int i, capid;
2251 int numcaps = lxc_caps_last_cap() + 1;
2252 INFO("found %d capabilities", numcaps);
2253
2254 if (numcaps <= 0 || numcaps > 200)
2255 return -1;
2256
2257 /* caplist[i] is 1 if we keep capability i */
2258 int *caplist = alloca(numcaps * sizeof(int));
2259 memset(caplist, 0, numcaps * sizeof(int));
2260
2261 lxc_list_for_each(iterator, caps) {
2262
2263 keep_entry = iterator->elem;
2264
2265 capid = parse_cap(keep_entry);
2266
2267 if (capid == -2)
2268 continue;
2269
2270 if (capid < 0) {
2271 ERROR("unknown capability %s", keep_entry);
2272 return -1;
2273 }
2274
2275 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2276
2277 caplist[capid] = 1;
2278 }
2279 for (i=0; i<numcaps; i++) {
2280 if (caplist[i])
2281 continue;
2282 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2283 SYSERROR("failed to remove capability %d", i);
2284 return -1;
2285 }
2286 }
2287
2288 DEBUG("capabilities have been setup");
2289
2290 return 0;
2291 }
2292
2293 static int parse_resource(const char *res) {
2294 size_t i;
2295 int resid = -1;
2296
2297 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2298 if (strcmp(res, limit_opt[i].name) == 0)
2299 return limit_opt[i].value;
2300 }
2301
2302 /* try to see if it's numeric, so the user may specify
2303 * resources that the running kernel knows about but
2304 * we don't */
2305 if (lxc_safe_int(res, &resid) == 0)
2306 return resid;
2307 return -1;
2308 }
2309
2310 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2311 struct lxc_list *it;
2312 struct lxc_limit *lim;
2313 int resid;
2314
2315 lxc_list_for_each(it, limits) {
2316 lim = it->elem;
2317
2318 resid = parse_resource(lim->resource);
2319 if (resid < 0) {
2320 ERROR("unknown resource %s", lim->resource);
2321 return -1;
2322 }
2323
2324 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2325 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2326 return -1;
2327 }
2328 }
2329 return 0;
2330 }
2331
2332 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2333
2334 struct lxc_conf *lxc_conf_init(void)
2335 {
2336 struct lxc_conf *new;
2337 int i;
2338
2339 new = malloc(sizeof(*new));
2340 if (!new) {
2341 ERROR("lxc_conf_init : %s", strerror(errno));
2342 return NULL;
2343 }
2344 memset(new, 0, sizeof(*new));
2345
2346 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2347 new->personality = -1;
2348 new->autodev = 1;
2349 new->console.log_path = NULL;
2350 new->console.log_fd = -1;
2351 new->console.path = NULL;
2352 new->console.peer = -1;
2353 new->console.peerpty.busy = -1;
2354 new->console.peerpty.master = -1;
2355 new->console.peerpty.slave = -1;
2356 new->console.master = -1;
2357 new->console.slave = -1;
2358 new->console.name[0] = '\0';
2359 new->maincmd_fd = -1;
2360 new->nbd_idx = -1;
2361 new->rootfs.mount = strdup(default_rootfs_mount);
2362 if (!new->rootfs.mount) {
2363 ERROR("lxc_conf_init : %s", strerror(errno));
2364 free(new);
2365 return NULL;
2366 }
2367 new->logfd = -1;
2368 lxc_list_init(&new->cgroup);
2369 lxc_list_init(&new->network);
2370 lxc_list_init(&new->mount_list);
2371 lxc_list_init(&new->caps);
2372 lxc_list_init(&new->keepcaps);
2373 lxc_list_init(&new->id_map);
2374 lxc_list_init(&new->includes);
2375 lxc_list_init(&new->aliens);
2376 lxc_list_init(&new->environment);
2377 lxc_list_init(&new->limits);
2378 for (i=0; i<NUM_LXC_HOOKS; i++)
2379 lxc_list_init(&new->hooks[i]);
2380 lxc_list_init(&new->groups);
2381 new->lsm_aa_profile = NULL;
2382 new->lsm_se_context = NULL;
2383 new->tmp_umount_proc = 0;
2384
2385 for (i = 0; i < LXC_NS_MAX; i++)
2386 new->inherit_ns_fd[i] = -1;
2387
2388 /* if running in a new user namespace, init and COMMAND
2389 * default to running as UID/GID 0 when using lxc-execute */
2390 new->init_uid = 0;
2391 new->init_gid = 0;
2392 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2393
2394 return new;
2395 }
2396
2397 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2398 size_t buf_size)
2399 {
2400 char path[MAXPATHLEN];
2401 int fd, ret;
2402
2403 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2404 idtype == ID_TYPE_UID ? 'u' : 'g');
2405 if (ret < 0 || ret >= MAXPATHLEN) {
2406 ERROR("failed to create path \"%s\"", path);
2407 return -E2BIG;
2408 }
2409
2410 fd = open(path, O_WRONLY);
2411 if (fd < 0) {
2412 SYSERROR("failed to open \"%s\"", path);
2413 return -1;
2414 }
2415
2416 errno = 0;
2417 ret = lxc_write_nointr(fd, buf, buf_size);
2418 if (ret != buf_size) {
2419 SYSERROR("failed to write %cid mapping to \"%s\"",
2420 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2421 close(fd);
2422 return -1;
2423 }
2424 close(fd);
2425
2426 return 0;
2427 }
2428
2429 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2430 *
2431 * @return 1 if functional binary was found
2432 * @return 0 if binary exists but is lacking privilege
2433 * @return -ENOENT if binary does not exist
2434 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2435 *
2436 */
2437 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2438 {
2439 char *path;
2440 int ret;
2441 struct stat st;
2442 int fret = 0;
2443
2444 if (cap != CAP_SETUID && cap != CAP_SETGID)
2445 return -EINVAL;
2446
2447 path = on_path(binary, NULL);
2448 if (!path)
2449 return -ENOENT;
2450
2451 ret = stat(path, &st);
2452 if (ret < 0) {
2453 fret = -errno;
2454 goto cleanup;
2455 }
2456
2457 /* Check if the binary is setuid. */
2458 if (st.st_mode & S_ISUID) {
2459 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2460 fret = 1;
2461 goto cleanup;
2462 }
2463
2464 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2465 /* Check if it has the CAP_SETUID capability. */
2466 if ((cap & CAP_SETUID) &&
2467 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2468 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2469 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2470 "and CAP_PERMITTED sets.", path);
2471 fret = 1;
2472 goto cleanup;
2473 }
2474
2475 /* Check if it has the CAP_SETGID capability. */
2476 if ((cap & CAP_SETGID) &&
2477 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2478 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2479 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2480 "and CAP_PERMITTED sets.", path);
2481 fret = 1;
2482 goto cleanup;
2483 }
2484 #else
2485 /* If we cannot check for file capabilities we need to give the benefit
2486 * of the doubt. Otherwise we might fail even though all the necessary
2487 * file capabilities are set.
2488 */
2489 DEBUG("Cannot check for file capabilites as full capability support is "
2490 "missing. Manual intervention needed.");
2491 fret = 1;
2492 #endif
2493
2494 cleanup:
2495 free(path);
2496 return fret;
2497 }
2498
/*
 * Replace the current process image with "/bin/sh -c <args>", where @args is
 * the command line string.
 *
 * Returns -1; this is only reached when execl() failed.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *command = args;

	execl("/bin/sh", "sh", "-c", command, (char *)NULL);

	return -1;
}
2504
2505 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2506 {
2507 struct id_map *map;
2508 struct lxc_list *iterator;
2509 enum idtype type;
2510 char u_or_g;
2511 char *pos;
2512 int fill, left;
2513 char cmd_output[MAXPATHLEN];
2514 /* strlen("new@idmap") = 9
2515 * +
2516 * strlen(" ") = 1
2517 * +
2518 * LXC_NUMSTRLEN64
2519 * +
2520 * strlen(" ") = 1
2521 *
2522 * We add some additional space to make sure that we really have
2523 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2524 */
2525 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2526 int ret = 0, uidmap = 0, gidmap = 0;
2527 bool use_shadow = false, had_entry = false;
2528
2529 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2530 * ranges, then insist that root also reserve ranges in subuid. This
2531 * will protected it by preventing another user from being handed the
2532 * range by shadow.
2533 */
2534 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2535 if (uidmap == -ENOENT)
2536 WARN("newuidmap binary is missing");
2537 else if (!uidmap)
2538 WARN("newuidmap is lacking necessary privileges");
2539
2540 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2541 if (gidmap == -ENOENT)
2542 WARN("newgidmap binary is missing");
2543 else if (!gidmap)
2544 WARN("newgidmap is lacking necessary privileges");
2545
2546 if (uidmap > 0 && gidmap > 0) {
2547 DEBUG("Functional newuidmap and newgidmap binary found.");
2548 use_shadow = true;
2549 } else {
2550 /* In case unprivileged users run application containers via
2551 * execute() or a start*() there are valid cases where they may
2552 * only want to map their own {g,u}id. Let's not block them from
2553 * doing so by requiring geteuid() == 0.
2554 */
2555 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2556 "write directly with euid %d.", geteuid());
2557 }
2558
2559 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2560 type++, u_or_g = 'g') {
2561 pos = mapbuf;
2562
2563 if (use_shadow)
2564 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
2565
2566 lxc_list_for_each(iterator, idmap) {
2567 /* The kernel only takes <= 4k for writes to
2568 * /proc/<nr>/[ug]id_map
2569 */
2570 map = iterator->elem;
2571 if (map->idtype != type)
2572 continue;
2573
2574 had_entry = true;
2575
2576 left = LXC_IDMAPLEN - (pos - mapbuf);
2577 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
2578 use_shadow ? " " : "", map->nsid,
2579 map->hostid, map->range,
2580 use_shadow ? "" : "\n");
2581 if (fill <= 0 || fill >= left)
2582 SYSERROR("Too many {g,u}id mappings defined.");
2583
2584 pos += fill;
2585 }
2586 if (!had_entry)
2587 continue;
2588
2589 /* Try to catch the ouput of new{g,u}idmap to make debugging
2590 * easier.
2591 */
2592 if (use_shadow) {
2593 ret = run_command(cmd_output, sizeof(cmd_output),
2594 lxc_map_ids_exec_wrapper,
2595 (void *)mapbuf);
2596 if (ret < 0) {
2597 ERROR("new%cidmap failed to write mapping: %s",
2598 u_or_g, cmd_output);
2599 return -1;
2600 }
2601 } else {
2602 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2603 if (ret < 0)
2604 return -1;
2605 }
2606
2607 memset(mapbuf, 0, sizeof(mapbuf));
2608 }
2609
2610 return 0;
2611 }
2612
2613 /*
2614 * return the host uid/gid to which the container root is mapped in
2615 * *val.
2616 * Return true if id was found, false otherwise.
2617 */
2618 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
2619 unsigned long *val)
2620 {
2621 struct lxc_list *it;
2622 struct id_map *map;
2623
2624 lxc_list_for_each(it, &conf->id_map) {
2625 map = it->elem;
2626 if (map->idtype != idtype)
2627 continue;
2628 if (map->nsid != 0)
2629 continue;
2630 *val = map->hostid;
2631 return true;
2632 }
2633 return false;
2634 }
2635
2636 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
2637 {
2638 struct lxc_list *it;
2639 struct id_map *map;
2640 lxc_list_for_each(it, &conf->id_map) {
2641 map = it->elem;
2642 if (map->idtype != idtype)
2643 continue;
2644 if (id >= map->hostid && id < map->hostid + map->range)
2645 return (id - map->hostid) + map->nsid;
2646 }
2647 return -1;
2648 }
2649
2650 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
2651 {
2652 struct lxc_list *it;
2653 struct id_map *map;
2654 unsigned int freeid = 0;
2655 again:
2656 lxc_list_for_each(it, &conf->id_map) {
2657 map = it->elem;
2658 if (map->idtype != idtype)
2659 continue;
2660 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2661 freeid = map->nsid + map->range;
2662 goto again;
2663 }
2664 }
2665 return freeid;
2666 }
2667
2668 int lxc_create_tty(const char *name, struct lxc_conf *conf)
2669 {
2670 struct lxc_tty_info *tty_info = &conf->tty_info;
2671 int i, ret;
2672
2673 /* no tty in the configuration */
2674 if (!conf->tty)
2675 return 0;
2676
2677 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
2678 if (!tty_info->pty_info) {
2679 SYSERROR("failed to allocate struct *pty_info");
2680 return -ENOMEM;
2681 }
2682
2683 for (i = 0; i < conf->tty; i++) {
2684 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
2685
2686 process_lock();
2687 ret = openpty(&pty_info->master, &pty_info->slave,
2688 pty_info->name, NULL, NULL);
2689 process_unlock();
2690 if (ret) {
2691 SYSERROR("failed to create pty device number %d", i);
2692 tty_info->nbtty = i;
2693 lxc_delete_tty(tty_info);
2694 return -ENOTTY;
2695 }
2696
2697 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
2698 pty_info->name, pty_info->master, pty_info->slave);
2699
2700 /* Prevent leaking the file descriptors to the container */
2701 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
2702 if (ret < 0)
2703 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
2704 "pty device \"%s\": %s",
2705 pty_info->master, pty_info->name, strerror(errno));
2706
2707 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
2708 if (ret < 0)
2709 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
2710 "pty device \"%s\": %s",
2711 pty_info->slave, pty_info->name, strerror(errno));
2712
2713 pty_info->busy = 0;
2714 }
2715
2716 tty_info->nbtty = conf->tty;
2717
2718 INFO("finished allocating %d pts devices", conf->tty);
2719 return 0;
2720 }
2721
2722 void lxc_delete_tty(struct lxc_tty_info *tty_info)
2723 {
2724 int i;
2725
2726 for (i = 0; i < tty_info->nbtty; i++) {
2727 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
2728
2729 close(pty_info->master);
2730 close(pty_info->slave);
2731 }
2732
2733 free(tty_info->pty_info);
2734 tty_info->pty_info = NULL;
2735 tty_info->nbtty = 0;
2736 }
2737
2738
/*
 * Execute "lxc-usernsexec" (looked up via PATH) with @args as its argument
 * vector.
 *
 * Returns -1; this is only reached when execvp() failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	return -1;
}
2744
2745 /*
2746 * chown_mapped_root: for an unprivileged user with uid/gid X to
2747 * chown a dir to subuid/subgid Y, he needs to run chown as root
2748 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2749 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2750 * root is privileged with respect to hostuid/hostgid X, allowing
2751 * him to do the chown.
2752 */
2753 int chown_mapped_root(char *path, struct lxc_conf *conf)
2754 {
2755 uid_t rootuid, rootgid;
2756 unsigned long val;
2757 int hostuid, hostgid, ret;
2758 struct stat sb;
2759 char map1[100], map2[100], map3[100], map4[100], map5[100];
2760 char ugid[100];
2761 char *args1[] = {"lxc-usernsexec",
2762 "-m", map1,
2763 "-m", map2,
2764 "-m", map3,
2765 "-m", map5,
2766 "--", "chown", ugid, path,
2767 NULL};
2768 char *args2[] = {"lxc-usernsexec",
2769 "-m", map1,
2770 "-m", map2,
2771 "-m", map3,
2772 "-m", map4,
2773 "-m", map5,
2774 "--", "chown", ugid, path,
2775 NULL};
2776 char cmd_output[MAXPATHLEN];
2777
2778 hostuid = geteuid();
2779 hostgid = getegid();
2780
2781 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
2782 ERROR("No uid mapping for container root");
2783 return -1;
2784 }
2785 rootuid = (uid_t)val;
2786 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
2787 ERROR("No gid mapping for container root");
2788 return -1;
2789 }
2790 rootgid = (gid_t)val;
2791
2792 if (hostuid == 0) {
2793 if (chown(path, rootuid, rootgid) < 0) {
2794 ERROR("Error chowning %s", path);
2795 return -1;
2796 }
2797 return 0;
2798 }
2799
2800 if (rootuid == hostuid) {
2801 /* nothing to do */
2802 INFO("Container root is our uid; no need to chown");
2803 return 0;
2804 }
2805
2806 /* save the current gid of "path" */
2807 if (stat(path, &sb) < 0) {
2808 ERROR("Error stat %s", path);
2809 return -1;
2810 }
2811
2812 /* Update the path argument in case this was overlayfs. */
2813 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2814 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2815
2816 /*
2817 * A file has to be group-owned by a gid mapped into the
2818 * container, or the container won't be privileged over it.
2819 */
2820 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2821 if (sb.st_uid == hostuid &&
2822 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2823 chown(path, -1, hostgid) < 0) {
2824 ERROR("Failed chgrping %s", path);
2825 return -1;
2826 }
2827
2828 /* "u:0:rootuid:1" */
2829 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2830 if (ret < 0 || ret >= 100) {
2831 ERROR("Error uid printing map string");
2832 return -1;
2833 }
2834
2835 /* "u:hostuid:hostuid:1" */
2836 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2837 if (ret < 0 || ret >= 100) {
2838 ERROR("Error uid printing map string");
2839 return -1;
2840 }
2841
2842 /* "g:0:rootgid:1" */
2843 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2844 if (ret < 0 || ret >= 100) {
2845 ERROR("Error gid printing map string");
2846 return -1;
2847 }
2848
2849 /* "g:pathgid:rootgid+pathgid:1" */
2850 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2851 rootgid + (gid_t)sb.st_gid);
2852 if (ret < 0 || ret >= 100) {
2853 ERROR("Error gid printing map string");
2854 return -1;
2855 }
2856
2857 /* "g:hostgid:hostgid:1" */
2858 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2859 if (ret < 0 || ret >= 100) {
2860 ERROR("Error gid printing map string");
2861 return -1;
2862 }
2863
2864 /* "0:pathgid" (chown) */
2865 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2866 if (ret < 0 || ret >= 100) {
2867 ERROR("Error owner printing format string for chown");
2868 return -1;
2869 }
2870
2871 if (hostgid == sb.st_gid)
2872 ret = run_command(cmd_output, sizeof(cmd_output),
2873 chown_mapped_root_exec_wrapper,
2874 (void *)args1);
2875 else
2876 ret = run_command(cmd_output, sizeof(cmd_output),
2877 chown_mapped_root_exec_wrapper,
2878 (void *)args2);
2879 if (ret < 0)
2880 ERROR("lxc-usernsexec failed: %s", cmd_output);
2881
2882 return ret;
2883 }
2884
2885 int lxc_ttys_shift_ids(struct lxc_conf *c)
2886 {
2887 if (lxc_list_empty(&c->id_map))
2888 return 0;
2889
2890 if (!strcmp(c->console.name, ""))
2891 return 0;
2892
2893 if (chown_mapped_root(c->console.name, c) < 0) {
2894 ERROR("failed to chown console \"%s\"", c->console.name);
2895 return -1;
2896 }
2897
2898 TRACE("chowned console \"%s\"", c->console.name);
2899
2900 return 0;
2901 }
2902
2903 /* NOTE: Must not be called from inside the container namespace! */
2904 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
2905 {
2906 int mounted;
2907
2908 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
2909 if (mounted == -1) {
2910 SYSERROR("failed to mount /proc in the container");
2911 /* continue only if there is no rootfs */
2912 if (conf->rootfs.path)
2913 return -1;
2914 } else if (mounted == 1) {
2915 conf->tmp_umount_proc = 1;
2916 }
2917
2918 return 0;
2919 }
2920
2921 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2922 {
2923 if (lxc_conf->tmp_umount_proc == 1) {
2924 umount("/proc");
2925 lxc_conf->tmp_umount_proc = 0;
2926 }
2927 }
2928
/*
 * Walk /proc/self/mountinfo and change any shared entries to slave.
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *target, *opts;

		target = get_field(line, 4);
		opts = target ? get_field(target, 2) : NULL;
		if (!opts)
			continue;

		null_endofword(opts);
		if (strstr(opts, "shared") == NULL)
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(f);
	free(line);
}
2962
2963 void lxc_execute_bind_init(struct lxc_conf *conf)
2964 {
2965 int ret;
2966 char path[PATH_MAX], destpath[PATH_MAX], *p;
2967
2968 /* If init exists in the container, don't bind mount a static one */
2969 p = choose_init(conf->rootfs.mount);
2970 if (p) {
2971 free(p);
2972 return;
2973 }
2974
2975 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
2976 if (ret < 0 || ret >= PATH_MAX) {
2977 WARN("Path name too long searching for lxc.init.static");
2978 return;
2979 }
2980
2981 if (!file_exists(path)) {
2982 INFO("%s does not exist on host", path);
2983 return;
2984 }
2985
2986 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
2987 if (ret < 0 || ret >= PATH_MAX) {
2988 WARN("Path name too long for container's lxc.init.static");
2989 return;
2990 }
2991
2992 if (!file_exists(destpath)) {
2993 FILE * pathfile = fopen(destpath, "wb");
2994 if (!pathfile) {
2995 SYSERROR("Failed to create mount target '%s'", destpath);
2996 return;
2997 }
2998 fclose(pathfile);
2999 }
3000
3001 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3002 if (ret < 0)
3003 SYSERROR("Failed to bind lxc.init.static into container");
3004 INFO("lxc.init.static bound into container at %s", path);
3005 }
3006
3007 /*
3008 * This does the work of remounting / if it is shared, calling the
3009 * container pre-mount hooks, and mounting the rootfs.
3010 */
3011 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3012 {
3013 if (conf->rootfs_setup) {
3014 /*
3015 * rootfs was set up in another namespace. bind-mount it
3016 * to give us a mount in our own ns so we can pivot_root to it
3017 */
3018 const char *path = conf->rootfs.mount;
3019 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3020 ERROR("Failed to bind-mount container / onto itself");
3021 return -1;
3022 }
3023 return 0;
3024 }
3025
3026 remount_all_slave();
3027
3028 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3029 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3030 return -1;
3031 }
3032
3033 if (lxc_setup_rootfs(conf)) {
3034 ERROR("failed to setup rootfs for '%s'", name);
3035 return -1;
3036 }
3037
3038 conf->rootfs_setup = true;
3039 return 0;
3040 }
3041
3042 static bool verify_start_hooks(struct lxc_conf *conf)
3043 {
3044 struct lxc_list *it;
3045 char path[MAXPATHLEN];
3046 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3047 char *hookname = it->elem;
3048 struct stat st;
3049 int ret;
3050
3051 ret = snprintf(path, MAXPATHLEN, "%s%s",
3052 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
3053 if (ret < 0 || ret >= MAXPATHLEN)
3054 return false;
3055 ret = stat(path, &st);
3056 if (ret) {
3057 SYSERROR("Start hook %s not found in container",
3058 hookname);
3059 return false;
3060 }
3061 return true;
3062 }
3063
3064 return true;
3065 }
3066
3067 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
3068 {
3069 int i;
3070 int *ttyfds;
3071 struct lxc_pty_info *pty_info;
3072 struct lxc_conf *conf = handler->conf;
3073 const struct lxc_tty_info *tty_info = &conf->tty_info;
3074 int sock = handler->data_sock[0];
3075 int ret = -1;
3076 size_t num_ttyfds = (2 * conf->tty);
3077
3078 ttyfds = malloc(num_ttyfds * sizeof(int));
3079 if (!ttyfds)
3080 return -1;
3081
3082 for (i = 0; i < num_ttyfds; i++) {
3083 pty_info = &tty_info->pty_info[i / 2];
3084 ttyfds[i++] = pty_info->slave;
3085 ttyfds[i] = pty_info->master;
3086 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
3087 "parent",
3088 pty_info->name, pty_info->master, pty_info->slave);
3089 }
3090
3091 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
3092 if (ret < 0)
3093 ERROR("failed to send %d ttys to parent: %s", conf->tty,
3094 strerror(errno));
3095 else
3096 TRACE("sent %d ttys to parent", conf->tty);
3097
3098 close(handler->data_sock[0]);
3099 close(handler->data_sock[1]);
3100
3101 for (i = 0; i < num_ttyfds; i++)
3102 close(ttyfds[i]);
3103
3104 free(ttyfds);
3105
3106 return ret;
3107 }
3108
3109 int lxc_setup(struct lxc_handler *handler)
3110 {
3111 const char *name = handler->name;
3112 struct lxc_conf *lxc_conf = handler->conf;
3113 const char *lxcpath = handler->lxcpath;
3114
3115 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3116 ERROR("Error setting up rootfs mount after spawn");
3117 return -1;
3118 }
3119
3120 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3121 if (setup_utsname(lxc_conf->utsname)) {
3122 ERROR("failed to setup the utsname for '%s'", name);
3123 return -1;
3124 }
3125 }
3126
3127 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
3128 ERROR("failed to setup the network for '%s'", name);
3129 return -1;
3130 }
3131
3132 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3133 ERROR("Failed to network device names and ifindices to parent");
3134 return -1;
3135 }
3136
3137 if (lxc_conf->autodev > 0) {
3138 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
3139 ERROR("failed to mount /dev in the container");
3140 return -1;
3141 }
3142 }
3143
3144 /* do automatic mounts (mainly /proc and /sys), but exclude
3145 * those that need to wait until other stuff has finished
3146 */
3147 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
3148 ERROR("failed to setup the automatic mounts for '%s'", name);
3149 return -1;
3150 }
3151
3152 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
3153 ERROR("failed to setup the mounts for '%s'", name);
3154 return -1;
3155 }
3156
3157 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
3158 ERROR("failed to setup the mount entries for '%s'", name);
3159 return -1;
3160 }
3161
3162 /* Make sure any start hooks are in the container */
3163 if (!verify_start_hooks(lxc_conf))
3164 return -1;
3165
3166 if (lxc_conf->is_execute)
3167 lxc_execute_bind_init(lxc_conf);
3168
3169 /* now mount only cgroup, if wanted;
3170 * before, /sys could not have been mounted
3171 * (is either mounted automatically or via fstab entries)
3172 */
3173 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
3174 ERROR("failed to setup the automatic mounts for '%s'", name);
3175 return -1;
3176 }
3177
3178 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
3179 ERROR("failed to run mount hooks for container '%s'.", name);
3180 return -1;
3181 }
3182
3183 if (lxc_conf->autodev > 0) {
3184 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
3185 ERROR("failed to run autodev hooks for container '%s'.", name);
3186 return -1;
3187 }
3188
3189 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
3190 ERROR("failed to populate /dev in the container");
3191 return -1;
3192 }
3193 }
3194
3195 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
3196 ERROR("failed to setup the console for '%s'", name);
3197 return -1;
3198 }
3199
3200 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3201 ERROR("failed to setup /dev symlinks for '%s'", name);
3202 return -1;
3203 }
3204
3205 /* mount /proc if it's not already there */
3206 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
3207 ERROR("failed to LSM mount proc for '%s'", name);
3208 return -1;
3209 }
3210
3211 if (setup_pivot_root(&lxc_conf->rootfs)) {
3212 ERROR("failed to set rootfs for '%s'", name);
3213 return -1;
3214 }
3215
3216 if (lxc_setup_devpts(lxc_conf->pts)) {
3217 ERROR("failed to setup the new pts instance");
3218 return -1;
3219 }
3220
3221 if (lxc_create_tty(name, lxc_conf)) {
3222 ERROR("failed to create the ttys");
3223 return -1;
3224 }
3225
3226 if (lxc_send_ttys_to_parent(handler) < 0) {
3227 ERROR("failure sending console info to parent");
3228 return -1;
3229 }
3230
3231 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
3232 ERROR("failed to setup the ttys for '%s'", name);
3233 return -1;
3234 }
3235
3236 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
3237 SYSERROR("failed to set environment variable for container ptys");
3238
3239
3240 if (setup_personality(lxc_conf->personality)) {
3241 ERROR("failed to setup personality");
3242 return -1;
3243 }
3244
3245 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3246 if (!lxc_list_empty(&lxc_conf->caps)) {
3247 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
3248 return -1;
3249 }
3250 if (dropcaps_except(&lxc_conf->keepcaps)) {
3251 ERROR("failed to keep requested caps");
3252 return -1;
3253 }
3254 } else if (setup_caps(&lxc_conf->caps)) {
3255 ERROR("failed to drop capabilities");
3256 return -1;
3257 }
3258
3259 NOTICE("Container \"%s\" is set up", name);
3260
3261 return 0;
3262 }
3263
3264 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3265 const char *lxcpath, char *argv[])
3266 {
3267 int which = -1;
3268 struct lxc_list *it;
3269
3270 if (strcmp(hook, "pre-start") == 0)
3271 which = LXCHOOK_PRESTART;
3272 else if (strcmp(hook, "pre-mount") == 0)
3273 which = LXCHOOK_PREMOUNT;
3274 else if (strcmp(hook, "mount") == 0)
3275 which = LXCHOOK_MOUNT;
3276 else if (strcmp(hook, "autodev") == 0)
3277 which = LXCHOOK_AUTODEV;
3278 else if (strcmp(hook, "start") == 0)
3279 which = LXCHOOK_START;
3280 else if (strcmp(hook, "stop") == 0)
3281 which = LXCHOOK_STOP;
3282 else if (strcmp(hook, "post-stop") == 0)
3283 which = LXCHOOK_POSTSTOP;
3284 else if (strcmp(hook, "clone") == 0)
3285 which = LXCHOOK_CLONE;
3286 else if (strcmp(hook, "destroy") == 0)
3287 which = LXCHOOK_DESTROY;
3288 else
3289 return -1;
3290 lxc_list_for_each(it, &conf->hooks[which]) {
3291 int ret;
3292 char *hookname = it->elem;
3293 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
3294 if (ret)
3295 return ret;
3296 }
3297 return 0;
3298 }
3299
3300 int lxc_clear_config_caps(struct lxc_conf *c)
3301 {
3302 struct lxc_list *it, *next;
3303
3304 lxc_list_for_each_safe(it, &c->caps, next) {
3305 lxc_list_del(it);
3306 free(it->elem);
3307 free(it);
3308 }
3309 return 0;
3310 }
3311
3312 static int lxc_free_idmap(struct lxc_list *id_map) {
3313 struct lxc_list *it, *next;
3314
3315 lxc_list_for_each_safe(it, id_map, next) {
3316 lxc_list_del(it);
3317 free(it->elem);
3318 free(it);
3319 }
3320 return 0;
3321 }
3322
3323 int lxc_clear_idmaps(struct lxc_conf *c)
3324 {
3325 return lxc_free_idmap(&c->id_map);
3326 }
3327
3328 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3329 {
3330 struct lxc_list *it,*next;
3331
3332 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3333 lxc_list_del(it);
3334 free(it->elem);
3335 free(it);
3336 }
3337 return 0;
3338 }
3339
3340 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
3341 {
3342 struct lxc_list *it,*next;
3343 bool all = false;
3344 const char *k = NULL;
3345
3346 if (strcmp(key, "lxc.cgroup") == 0)
3347 all = true;
3348 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3349 k = key + sizeof("lxc.cgroup.")-1;
3350 else
3351 return -1;
3352
3353 lxc_list_for_each_safe(it, &c->cgroup, next) {
3354 struct lxc_cgroup *cg = it->elem;
3355 if (!all && strcmp(cg->subsystem, k) != 0)
3356 continue;
3357 lxc_list_del(it);
3358 free(cg->subsystem);
3359 free(cg->value);
3360 free(cg);
3361 free(it);
3362 }
3363 return 0;
3364 }
3365
3366 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3367 {
3368 struct lxc_list *it, *next;
3369 bool all = false;
3370 const char *k = NULL;
3371
3372 if (strcmp(key, "lxc.limit") == 0
3373 || strcmp(key, "lxc.prlimit"))
3374 all = true;
3375 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3376 k = key + sizeof("lxc.limit.")-1;
3377 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3378 k = key + sizeof("lxc.prlimit.")-1;
3379 else
3380 return -1;
3381
3382 lxc_list_for_each_safe(it, &c->limits, next) {
3383 struct lxc_limit *lim = it->elem;
3384 if (!all && strcmp(lim->resource, k) != 0)
3385 continue;
3386 lxc_list_del(it);
3387 free(lim->resource);
3388 free(lim);
3389 free(it);
3390 }
3391 return 0;
3392 }
3393
3394 int lxc_clear_groups(struct lxc_conf *c)
3395 {
3396 struct lxc_list *it,*next;
3397
3398 lxc_list_for_each_safe(it, &c->groups, next) {
3399 lxc_list_del(it);
3400 free(it->elem);
3401 free(it);
3402 }
3403 return 0;
3404 }
3405
3406 int lxc_clear_environment(struct lxc_conf *c)
3407 {
3408 struct lxc_list *it,*next;
3409
3410 lxc_list_for_each_safe(it, &c->environment, next) {
3411 lxc_list_del(it);
3412 free(it->elem);
3413 free(it);
3414 }
3415 return 0;
3416 }
3417
3418 int lxc_clear_mount_entries(struct lxc_conf *c)
3419 {
3420 struct lxc_list *it,*next;
3421
3422 lxc_list_for_each_safe(it, &c->mount_list, next) {
3423 lxc_list_del(it);
3424 free(it->elem);
3425 free(it);
3426 }
3427 return 0;
3428 }
3429
3430 int lxc_clear_automounts(struct lxc_conf *c)
3431 {
3432 c->auto_mounts = 0;
3433 return 0;
3434 }
3435
3436 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
3437 {
3438 struct lxc_list *it,*next;
3439 bool all = false, done = false;
3440 const char *k = NULL;
3441 int i;
3442
3443 if (strcmp(key, "lxc.hook") == 0)
3444 all = true;
3445 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3446 k = key + sizeof("lxc.hook.")-1;
3447 else
3448 return -1;
3449
3450 for (i=0; i<NUM_LXC_HOOKS; i++) {
3451 if (all || strcmp(k, lxchook_names[i]) == 0) {
3452 lxc_list_for_each_safe(it, &c->hooks[i], next) {
3453 lxc_list_del(it);
3454 free(it->elem);
3455 free(it);
3456 }
3457 done = true;
3458 }
3459 }
3460
3461 if (!done) {
3462 ERROR("Invalid hook key: %s", key);
3463 return -1;
3464 }
3465 return 0;
3466 }
3467
3468 static inline void lxc_clear_aliens(struct lxc_conf *conf)
3469 {
3470 struct lxc_list *it,*next;
3471
3472 lxc_list_for_each_safe(it, &conf->aliens, next) {
3473 lxc_list_del(it);
3474 free(it->elem);
3475 free(it);
3476 }
3477 }
3478
3479 void lxc_clear_includes(struct lxc_conf *conf)
3480 {
3481 struct lxc_list *it,*next;
3482
3483 lxc_list_for_each_safe(it, &conf->includes, next) {
3484 lxc_list_del(it);
3485 free(it->elem);
3486 free(it);
3487 }
3488 }
3489
3490 void lxc_conf_free(struct lxc_conf *conf)
3491 {
3492 if (!conf)
3493 return;
3494 if (current_config == conf)
3495 current_config = NULL;
3496 free(conf->console.log_path);
3497 free(conf->console.path);
3498 free(conf->rootfs.mount);
3499 free(conf->rootfs.bdev_type);
3500 free(conf->rootfs.options);
3501 free(conf->rootfs.path);
3502 free(conf->logfile);
3503 if (conf->logfd != -1)
3504 close(conf->logfd);
3505 free(conf->utsname);
3506 free(conf->ttydir);
3507 free(conf->fstab);
3508 free(conf->rcfile);
3509 free(conf->init_cmd);
3510 free(conf->unexpanded_config);
3511 free(conf->pty_names);
3512 free(conf->syslog);
3513 lxc_free_networks(&conf->network);
3514 free(conf->lsm_aa_profile);
3515 free(conf->lsm_se_context);
3516 lxc_seccomp_free(conf);
3517 lxc_clear_config_caps(conf);
3518 lxc_clear_config_keepcaps(conf);
3519 lxc_clear_cgroups(conf, "lxc.cgroup");
3520 lxc_clear_hooks(conf, "lxc.hook");
3521 lxc_clear_mount_entries(conf);
3522 lxc_clear_idmaps(conf);
3523 lxc_clear_groups(conf);
3524 lxc_clear_includes(conf);
3525 lxc_clear_aliens(conf);
3526 lxc_clear_environment(conf);
3527 lxc_clear_limits(conf, "lxc.prlimit");
3528 free(conf->cgroup_meta.dir);
3529 free(conf->cgroup_meta.controllers);
3530 free(conf);
3531 }
3532
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs once it has been told to proceed. */
	int (*fn)(void *);
	/* Optional name of @fn, used only for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Synchronisation pipe: the child reads p[0] and closes p[1]. */
	int p[2];
};
3539
3540 static int run_userns_fn(void *data)
3541 {
3542 struct userns_fn_data *d = data;
3543 char c;
3544
3545 /* Close write end of the pipe. */
3546 close(d->p[1]);
3547
3548 /* Wait for parent to finish establishing a new mapping in the user
3549 * namespace we are executing in.
3550 */
3551 if (read(d->p[0], &c, 1) != 1)
3552 return -1;
3553
3554 /* Close read end of the pipe. */
3555 close(d->p[0]);
3556
3557 if (d->fn_name)
3558 TRACE("calling function \"%s\"", d->fn_name);
3559 /* Call function to run. */
3560 return d->fn(d->arg);
3561 }
3562
3563 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
3564 enum idtype idtype)
3565 {
3566 struct lxc_list *it;
3567 struct id_map *map;
3568 struct id_map *retmap = NULL;
3569
3570 lxc_list_for_each(it, &conf->id_map) {
3571 map = it->elem;
3572 if (map->idtype != idtype)
3573 continue;
3574
3575 if (id >= map->hostid && id < map->hostid + map->range) {
3576 retmap = map;
3577 break;
3578 }
3579 }
3580
3581 if (!retmap)
3582 return NULL;
3583
3584 retmap = malloc(sizeof(*retmap));
3585 if (!retmap)
3586 return NULL;
3587
3588 memcpy(retmap, map, sizeof(*retmap));
3589 return retmap;
3590 }
3591
3592 /*
3593 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3594 * existing one or establish a new one.
3595 */
3596 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
3597 {
3598 int hostid_mapped;
3599 struct id_map *entry = NULL;
3600
3601 /* Reuse existing mapping. */
3602 entry = mapped_hostid_entry(conf, id, type);
3603 if (entry)
3604 return entry;
3605
3606 /* Find new mapping. */
3607 hostid_mapped = find_unmapped_nsid(conf, type);
3608 if (hostid_mapped < 0) {
3609 DEBUG("failed to find free mapping for id %d", id);
3610 return NULL;
3611 }
3612
3613 entry = malloc(sizeof(*entry));
3614 if (!entry)
3615 return NULL;
3616
3617 entry->idtype = type;
3618 entry->nsid = hostid_mapped;
3619 entry->hostid = (unsigned long)id;
3620 entry->range = 1;
3621
3622 return entry;
3623 }
3624
3625 /* Run a function in a new user namespace.
3626 * The caller's euid/egid will be mapped if it is not already.
3627 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3628 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3629 * This means we require only to establish a mapping from:
3630 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3631 * - the container root -> some sub{g,u}id
3632 * The former we add, if the user did not specifiy a mapping. The latter we
3633 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3634 * there to start the container in the first place.
3635 */
3636 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3637 const char *fn_name)
3638 {
3639 pid_t pid;
3640 uid_t euid, egid;
3641 struct userns_fn_data d;
3642 int p[2];
3643 struct lxc_list *it;
3644 struct id_map *map;
3645 char c = '1';
3646 int ret = -1;
3647 struct lxc_list *idmap = NULL, *tmplist = NULL;
3648 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3649 *host_uid_map = NULL, *host_gid_map = NULL;
3650
3651 ret = pipe(p);
3652 if (ret < 0) {
3653 SYSERROR("opening pipe");
3654 return -1;
3655 }
3656 d.fn = fn;
3657 d.fn_name = fn_name;
3658 d.arg = data;
3659 d.p[0] = p[0];
3660 d.p[1] = p[1];
3661
3662 /* Clone child in new user namespace. */
3663 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
3664 if (pid < 0) {
3665 ERROR("failed to clone child process in new user namespace");
3666 goto on_error;
3667 }
3668
3669 close(p[0]);
3670 p[0] = -1;
3671
3672 /* Find container root. */
3673 lxc_list_for_each(it, &conf->id_map) {
3674 map = it->elem;
3675
3676 if (map->nsid != 0)
3677 continue;
3678
3679 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3680 container_root_uid = malloc(sizeof(*container_root_uid));
3681 if (!container_root_uid)
3682 goto on_error;
3683 container_root_uid->idtype = map->idtype;
3684 container_root_uid->hostid = map->hostid;
3685 container_root_uid->nsid = 0;
3686 container_root_uid->range = map->range;
3687 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3688 container_root_gid = malloc(sizeof(*container_root_gid));
3689 if (!container_root_gid)
3690 goto on_error;
3691 container_root_gid->idtype = map->idtype;
3692 container_root_gid->hostid = map->hostid;
3693 container_root_gid->nsid = 0;
3694 container_root_gid->range = map->range;
3695 }
3696
3697 /* Found container root. */
3698 if (container_root_uid && container_root_gid)
3699 break;
3700 }
3701
3702 /* This is actually checked earlier but it can't hurt. */
3703 if (!container_root_uid || !container_root_gid) {
3704 ERROR("no mapping for container root found");
3705 goto on_error;
3706 }
3707
3708 host_uid_map = container_root_uid;
3709 host_gid_map = container_root_gid;
3710
3711 /* Check whether the {g,u}id of the user has a mapping. */
3712 euid = geteuid();
3713 egid = getegid();
3714 if (euid != container_root_uid->hostid)
3715 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3716
3717 if (egid != container_root_gid->hostid)
3718 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3719
3720 if (!host_uid_map) {
3721 DEBUG("failed to find mapping for uid %d", euid);
3722 goto on_error;
3723 }
3724
3725 if (!host_gid_map) {
3726 DEBUG("failed to find mapping for gid %d", egid);
3727 goto on_error;
3728 }
3729
3730 /* Allocate new {g,u}id map list. */
3731 idmap = malloc(sizeof(*idmap));
3732 if (!idmap)
3733 goto on_error;
3734 lxc_list_init(idmap);
3735
3736 /* Add container root to the map. */
3737 tmplist = malloc(sizeof(*tmplist));
3738 if (!tmplist)
3739 goto on_error;
3740 lxc_list_add_elem(tmplist, container_root_uid);
3741 lxc_list_add_tail(idmap, tmplist);
3742
3743 if (host_uid_map && (host_uid_map != container_root_uid)) {
3744 /* idmap will now keep track of that memory. */
3745 container_root_uid = NULL;
3746
3747 /* Add container root to the map. */
3748 tmplist = malloc(sizeof(*tmplist));
3749 if (!tmplist)
3750 goto on_error;
3751 lxc_list_add_elem(tmplist, host_uid_map);
3752 lxc_list_add_tail(idmap, tmplist);
3753 }
3754 /* idmap will now keep track of that memory. */
3755 container_root_uid = NULL;
3756 /* idmap will now keep track of that memory. */
3757 host_uid_map = NULL;
3758
3759 tmplist = malloc(sizeof(*tmplist));
3760 if (!tmplist)
3761 goto on_error;
3762 lxc_list_add_elem(tmplist, container_root_gid);
3763 lxc_list_add_tail(idmap, tmplist);
3764
3765 if (host_gid_map && (host_gid_map != container_root_gid)) {
3766 /* idmap will now keep track of that memory. */
3767 container_root_gid = NULL;
3768
3769 tmplist = malloc(sizeof(*tmplist));
3770 if (!tmplist)
3771 goto on_error;
3772 lxc_list_add_elem(tmplist, host_gid_map);
3773 lxc_list_add_tail(idmap, tmplist);
3774 }
3775 /* idmap will now keep track of that memory. */
3776 container_root_gid = NULL;
3777 /* idmap will now keep track of that memory. */
3778 host_gid_map = NULL;
3779
3780 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3781 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
3782 lxc_list_for_each(it, idmap) {
3783 map = it->elem;
3784 TRACE("establishing %cid mapping for \"%d\" in new "
3785 "user namespace: nsuid %lu - hostid %lu - range "
3786 "%lu",
3787 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3788 map->nsid, map->hostid, map->range);
3789 }
3790 }
3791
3792 /* Set up {g,u}id mapping for user namespace of child process. */
3793 ret = lxc_map_ids(idmap, pid);
3794 if (ret < 0) {
3795 ERROR("error setting up {g,u}id mappings for child process "
3796 "\"%d\"",
3797 pid);
3798 goto on_error;
3799 }
3800
3801 /* Tell child to proceed. */
3802 if (write(p[1], &c, 1) != 1) {
3803 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3804 goto on_error;
3805 }
3806
3807 /* Wait for child to finish. */
3808 ret = wait_for_pid(pid);
3809
3810 on_error:
3811 if (idmap)
3812 lxc_free_idmap(idmap);
3813 if (container_root_uid)
3814 free(container_root_uid);
3815 if (container_root_gid)
3816 free(container_root_gid);
3817 if (host_uid_map && (host_uid_map != container_root_uid))
3818 free(host_uid_map);
3819 if (host_gid_map && (host_gid_map != container_root_gid))
3820 free(host_gid_map);
3821
3822 if (p[0] != -1)
3823 close(p[0]);
3824 close(p[1]);
3825
3826 return ret;
3827 }
3828
/*
 * Return a malloc()ed copy of the login name belonging to the effective uid,
 * or NULL if the passwd lookup fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
3840
/*
 * Return a malloc()ed copy of the group name belonging to the effective gid,
 * or NULL if the group lookup fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
3852
3853 /* not thread-safe, do not use from api without first forking */
3854 void suggest_default_idmap(void)
3855 {
3856 FILE *f;
3857 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
3858 char *line = NULL;
3859 char *uname, *gname;
3860 size_t len = 0;
3861
3862 if (!(uname = getuname()))
3863 return;
3864
3865 if (!(gname = getgname())) {
3866 free(uname);
3867 return;
3868 }
3869
3870 f = fopen(subuidfile, "r");
3871 if (!f) {
3872 ERROR("Your system is not configured with subuids");
3873 free(gname);
3874 free(uname);
3875 return;
3876 }
3877 while (getline(&line, &len, f) != -1) {
3878 size_t no_newline = 0;
3879 char *p = strchr(line, ':'), *p2;
3880 if (*line == '#')
3881 continue;
3882 if (!p)
3883 continue;
3884 *p = '\0';
3885 p++;
3886 if (strcmp(line, uname))
3887 continue;
3888 p2 = strchr(p, ':');
3889 if (!p2)
3890 continue;
3891 *p2 = '\0';
3892 p2++;
3893 if (!*p2)
3894 continue;
3895 no_newline = strcspn(p2, "\n");
3896 p2[no_newline] = '\0';
3897
3898 if (lxc_safe_uint(p, &uid) < 0)
3899 WARN("Could not parse UID.");
3900 if (lxc_safe_uint(p2, &urange) < 0)
3901 WARN("Could not parse UID range.");
3902 }
3903 fclose(f);
3904
3905 f = fopen(subgidfile, "r");
3906 if (!f) {
3907 ERROR("Your system is not configured with subgids");
3908 free(gname);
3909 free(uname);
3910 return;
3911 }
3912 while (getline(&line, &len, f) != -1) {
3913 size_t no_newline = 0;
3914 char *p = strchr(line, ':'), *p2;
3915 if (*line == '#')
3916 continue;
3917 if (!p)
3918 continue;
3919 *p = '\0';
3920 p++;
3921 if (strcmp(line, uname))
3922 continue;
3923 p2 = strchr(p, ':');
3924 if (!p2)
3925 continue;
3926 *p2 = '\0';
3927 p2++;
3928 if (!*p2)
3929 continue;
3930 no_newline = strcspn(p2, "\n");
3931 p2[no_newline] = '\0';
3932
3933 if (lxc_safe_uint(p, &gid) < 0)
3934 WARN("Could not parse GID.");
3935 if (lxc_safe_uint(p2, &grange) < 0)
3936 WARN("Could not parse GID range.");
3937 }
3938 fclose(f);
3939
3940 free(line);
3941
3942 if (!urange || !grange) {
3943 ERROR("You do not have subuids or subgids allocated");
3944 ERROR("Unprivileged containers require subuids and subgids");
3945 return;
3946 }
3947
3948 ERROR("You must either run as root, or define uid mappings");
3949 ERROR("To pass uid mappings to lxc-create, you could create");
3950 ERROR("~/.config/lxc/default.conf:");
3951 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
3952 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
3953 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
3954
3955 free(gname);
3956 free(uname);
3957 }
3958
3959 static void free_cgroup_settings(struct lxc_list *result)
3960 {
3961 struct lxc_list *iterator, *next;
3962
3963 lxc_list_for_each_safe(iterator, result, next) {
3964 lxc_list_del(iterator);
3965 free(iterator);
3966 }
3967 free(result);
3968 }
3969
3970 /*
3971 * Return the list of cgroup_settings sorted according to the following rules
3972 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
3973 */
3974 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
3975 {
3976 struct lxc_list *result;
3977 struct lxc_list *memsw_limit = NULL;
3978 struct lxc_list *it = NULL;
3979 struct lxc_cgroup *cg = NULL;
3980 struct lxc_list *item = NULL;
3981
3982 result = malloc(sizeof(*result));
3983 if (!result) {
3984 ERROR("failed to allocate memory to sort cgroup settings");
3985 return NULL;
3986 }
3987 lxc_list_init(result);
3988
3989 /*Iterate over the cgroup settings and copy them to the output list*/
3990 lxc_list_for_each(it, cgroup_settings) {
3991 item = malloc(sizeof(*item));
3992 if (!item) {
3993 ERROR("failed to allocate memory to sort cgroup settings");
3994 free_cgroup_settings(result);
3995 return NULL;
3996 }
3997 item->elem = it->elem;
3998 cg = it->elem;
3999 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4000 /* Store the memsw_limit location */
4001 memsw_limit = item;
4002 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4003 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
4004 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4005 item->elem = memsw_limit->elem;
4006 memsw_limit->elem = it->elem;
4007 }
4008 lxc_list_add_tail(result, item);
4009 }
4010
4011 return result;
4012 }