]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
bede03574f69b98b9711facdf988038b02084074
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
148 /* Define pivot_root() if missing from the C library */
149 #ifndef HAVE_PIVOT_ROOT
/*
 * Fallback used when the C library does not provide pivot_root().
 *
 * Issues the raw pivot_root system call when its number is known at build
 * time. Otherwise fails with errno set to ENOSYS and returns -1.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
159 #else
160 extern int pivot_root(const char * new_root, const char * put_old);
161 #endif
162
163 /* Define sethostname() if missing from the C library */
164 #ifndef HAVE_SETHOSTNAME
/*
 * Fallback used when the C library does not provide sethostname().
 *
 * Issues the raw sethostname system call with the given buffer and length
 * when its number is known at build time. Otherwise fails with errno set to
 * ENOSYS and returns -1.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
174 #endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
193 #ifndef HAVE_MEMFD_CREATE
/*
 * When the kernel headers do not supply __NR_memfd_create, fall back to the
 * per-architecture syscall numbers listed below. Architectures that are not
 * listed leave the macro undefined.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/*
 * Fallback used when the C library does not provide memfd_create().
 *
 * Issues the raw memfd_create system call when a syscall number is known.
 * Otherwise fails with errno set to ENOSYS and returns -1.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
233 #else
234 extern int memfd_create(const char *name, unsigned int flags);
235 #endif
236
237 char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
240
/* One row of the mount option table: option keyword and the flag it maps to. */
struct mount_opt {
	char *name; /* option keyword, e.g. "ro" or "nosuid" */
	int clear;  /* non-zero on the "negative" keywords; presumably means the
		     * flag is cleared rather than set - confirm in the parser */
	int flag;   /* MS_* mount flag bit(s) associated with the keyword */
};
246
/* One row of the capability table: capability keyword and its CAP_* number. */
struct caps_opt {
	char *name; /* capability keyword, e.g. "sys_admin" */
	int value;  /* matching CAP_* constant */
};
251
/* One row of the resource limit table: limit keyword and its RLIMIT_* id. */
struct limit_opt {
	char *name; /* limit keyword, e.g. "nofile" */
	int value;  /* matching RLIMIT_* constant */
};
256
/*
 * The lxc_conf of the container that the current API call is working on.
 * It is used by the error calls. The variable is thread-local when the
 * build has TLS support (HAVE_TLS) and a plain global otherwise.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
267
268 /* Declare this here, since we don't want to reshuffle the whole file. */
269 static int in_caplist(int cap, struct lxc_list *caps);
270
271 static struct mount_opt mount_opt[] = {
272 { "async", 1, MS_SYNCHRONOUS },
273 { "atime", 1, MS_NOATIME },
274 { "bind", 0, MS_BIND },
275 { "defaults", 0, 0 },
276 { "dev", 1, MS_NODEV },
277 { "diratime", 1, MS_NODIRATIME },
278 { "dirsync", 0, MS_DIRSYNC },
279 { "exec", 1, MS_NOEXEC },
280 { "lazytime", 0, MS_LAZYTIME },
281 { "mand", 0, MS_MANDLOCK },
282 { "noatime", 0, MS_NOATIME },
283 { "nodev", 0, MS_NODEV },
284 { "nodiratime", 0, MS_NODIRATIME },
285 { "noexec", 0, MS_NOEXEC },
286 { "nomand", 1, MS_MANDLOCK },
287 { "norelatime", 1, MS_RELATIME },
288 { "nostrictatime", 1, MS_STRICTATIME },
289 { "nosuid", 0, MS_NOSUID },
290 { "rbind", 0, MS_BIND|MS_REC },
291 { "relatime", 0, MS_RELATIME },
292 { "remount", 0, MS_REMOUNT },
293 { "ro", 0, MS_RDONLY },
294 { "rw", 1, MS_RDONLY },
295 { "strictatime", 0, MS_STRICTATIME },
296 { "suid", 1, MS_NOSUID },
297 { "sync", 0, MS_SYNCHRONOUS },
298 { NULL, 0, 0 },
299 };
300
301 #if HAVE_LIBCAP
302 static struct caps_opt caps_opt[] = {
303 { "chown", CAP_CHOWN },
304 { "dac_override", CAP_DAC_OVERRIDE },
305 { "dac_read_search", CAP_DAC_READ_SEARCH },
306 { "fowner", CAP_FOWNER },
307 { "fsetid", CAP_FSETID },
308 { "kill", CAP_KILL },
309 { "setgid", CAP_SETGID },
310 { "setuid", CAP_SETUID },
311 { "setpcap", CAP_SETPCAP },
312 { "linux_immutable", CAP_LINUX_IMMUTABLE },
313 { "net_bind_service", CAP_NET_BIND_SERVICE },
314 { "net_broadcast", CAP_NET_BROADCAST },
315 { "net_admin", CAP_NET_ADMIN },
316 { "net_raw", CAP_NET_RAW },
317 { "ipc_lock", CAP_IPC_LOCK },
318 { "ipc_owner", CAP_IPC_OWNER },
319 { "sys_module", CAP_SYS_MODULE },
320 { "sys_rawio", CAP_SYS_RAWIO },
321 { "sys_chroot", CAP_SYS_CHROOT },
322 { "sys_ptrace", CAP_SYS_PTRACE },
323 { "sys_pacct", CAP_SYS_PACCT },
324 { "sys_admin", CAP_SYS_ADMIN },
325 { "sys_boot", CAP_SYS_BOOT },
326 { "sys_nice", CAP_SYS_NICE },
327 { "sys_resource", CAP_SYS_RESOURCE },
328 { "sys_time", CAP_SYS_TIME },
329 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
330 { "mknod", CAP_MKNOD },
331 { "lease", CAP_LEASE },
332 #ifdef CAP_AUDIT_READ
333 { "audit_read", CAP_AUDIT_READ },
334 #endif
335 #ifdef CAP_AUDIT_WRITE
336 { "audit_write", CAP_AUDIT_WRITE },
337 #endif
338 #ifdef CAP_AUDIT_CONTROL
339 { "audit_control", CAP_AUDIT_CONTROL },
340 #endif
341 { "setfcap", CAP_SETFCAP },
342 { "mac_override", CAP_MAC_OVERRIDE },
343 { "mac_admin", CAP_MAC_ADMIN },
344 #ifdef CAP_SYSLOG
345 { "syslog", CAP_SYSLOG },
346 #endif
347 #ifdef CAP_WAKE_ALARM
348 { "wake_alarm", CAP_WAKE_ALARM },
349 #endif
350 #ifdef CAP_BLOCK_SUSPEND
351 { "block_suspend", CAP_BLOCK_SUSPEND },
352 #endif
353 };
354 #else
355 static struct caps_opt caps_opt[] = {};
356 #endif
357
358 static struct limit_opt limit_opt[] = {
359 #ifdef RLIMIT_AS
360 { "as", RLIMIT_AS },
361 #endif
362 #ifdef RLIMIT_CORE
363 { "core", RLIMIT_CORE },
364 #endif
365 #ifdef RLIMIT_CPU
366 { "cpu", RLIMIT_CPU },
367 #endif
368 #ifdef RLIMIT_DATA
369 { "data", RLIMIT_DATA },
370 #endif
371 #ifdef RLIMIT_FSIZE
372 { "fsize", RLIMIT_FSIZE },
373 #endif
374 #ifdef RLIMIT_LOCKS
375 { "locks", RLIMIT_LOCKS },
376 #endif
377 #ifdef RLIMIT_MEMLOCK
378 { "memlock", RLIMIT_MEMLOCK },
379 #endif
380 #ifdef RLIMIT_MSGQUEUE
381 { "msgqueue", RLIMIT_MSGQUEUE },
382 #endif
383 #ifdef RLIMIT_NICE
384 { "nice", RLIMIT_NICE },
385 #endif
386 #ifdef RLIMIT_NOFILE
387 { "nofile", RLIMIT_NOFILE },
388 #endif
389 #ifdef RLIMIT_NPROC
390 { "nproc", RLIMIT_NPROC },
391 #endif
392 #ifdef RLIMIT_RSS
393 { "rss", RLIMIT_RSS },
394 #endif
395 #ifdef RLIMIT_RTPRIO
396 { "rtprio", RLIMIT_RTPRIO },
397 #endif
398 #ifdef RLIMIT_RTTIME
399 { "rttime", RLIMIT_RTTIME },
400 #endif
401 #ifdef RLIMIT_SIGPENDING
402 { "sigpending", RLIMIT_SIGPENDING },
403 #endif
404 #ifdef RLIMIT_STACK
405 { "stack", RLIMIT_STACK },
406 #endif
407 };
408
409 static int run_buffer(char *buffer)
410 {
411 struct lxc_popen_FILE *f;
412 char *output;
413 int ret;
414
415 f = lxc_popen(buffer);
416 if (!f) {
417 SYSERROR("Failed to popen() %s.", buffer);
418 return -1;
419 }
420
421 output = malloc(LXC_LOG_BUFFER_SIZE);
422 if (!output) {
423 ERROR("Failed to allocate memory for %s.", buffer);
424 lxc_pclose(f);
425 return -1;
426 }
427
428 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
429 DEBUG("Script %s with output: %s.", buffer, output);
430
431 free(output);
432
433 ret = lxc_pclose(f);
434 if (ret == -1) {
435 SYSERROR("Script exited with error.");
436 return -1;
437 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
438 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
439 return -1;
440 } else if (WIFSIGNALED(ret)) {
441 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
442 return -1;
443 }
444
445 return 0;
446 }
447
/*
 * run_script_argv - build and run a hook command line from an argument array.
 *
 * @name:    container name, passed as the first script argument.
 * @section: configuration section, passed as the second script argument.
 * @script:  the script to execute.
 * @hook:    hook name, passed as the third script argument.
 * @lxcpath: not used by this function.
 * @argsin:  optional NULL-terminated array of extra arguments; may be NULL.
 *
 * The command line is "<script> <name> <section> <hook> [args...]" and is
 * handed to run_buffer(). The buffer is heap allocated: its size is derived
 * from caller-supplied strings and may be large, so it must not be placed on
 * the stack.
 *
 * Returns the result of run_buffer(), or -1 if the command line is too long
 * or memory cannot be allocated.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	int fret = -1;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Each extra argument needs its own length plus a leading space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two more separators and the terminating NUL byte. */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto out;
		}
		ret += rc;
	}

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
498
/*
 * run_script - build and run a hook command line from a variadic list.
 *
 * @name:    container name, passed as the first script argument.
 * @section: configuration section, passed as the second script argument.
 * @script:  the script to execute.
 * @...:     extra "char *" arguments; the list MUST be terminated by NULL.
 *
 * The command line is "<script> <name> <section> [args...]" and is handed to
 * run_buffer(). The buffer is heap allocated: its size is derived from
 * caller-supplied strings and may be large, so it must not be placed on the
 * stack.
 *
 * Returns the result of run_buffer(), or -1 if the command line is too long
 * or memory cannot be allocated.
 */
int run_script(const char *name, const char *section, const char *script, ...)
{
	int ret;
	int fret = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: each extra argument needs its length plus a space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separators and the terminating NUL byte. */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto out;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() must be paired with va_end(). */
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	fret = run_buffer(buffer);

out:
	free(buffer);
	return fret;
}
549
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * @rootfs: path of the root filesystem; may be NULL or empty.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, the path cannot
 * be resolved, or it is not a directory).
 * Returns an open fd (>= 0) if the rootfs was pinned; the caller owns the fd.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both an output error (negative) and a truncated path. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the fs busy; the name itself is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
592
/*
 * When asked to remount something, make sure that the nosuid, nodev,
 * read-only and noexec restrictions already in effect are carried over.
 *
 * @s:     mount source; when NULL, @d is inspected instead.
 * @d:     mount destination.
 * @flags: requested mount flags.
 *
 * Returns @flags unchanged unless MS_REMOUNT is requested and statvfs()
 * succeeds on the inspected path; in that case the MS_NOSUID, MS_NODEV,
 * MS_RDONLY and MS_NOEXEC bits reported by statvfs() are added. Without
 * statvfs support the flags are always returned unchanged.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	const unsigned long sticky = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *path = s ? s : d;
	struct statvfs sb;

	if (!(flags & MS_REMOUNT) || !path)
		return flags;

	if (statvfs(path, &sb) < 0)
		return flags;

	return flags | (sb.f_flag & sticky);
#endif
}
629
630 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
631 {
632 int r;
633 int i;
634 static struct {
635 int match_mask;
636 int match_flag;
637 const char *source;
638 const char *destination;
639 const char *fstype;
640 unsigned long flags;
641 const char *options;
642 } default_mounts[] = {
643 /* Read-only bind-mounting... In older kernels, doing that required
644 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
645 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
646 * kernel 2.6.26 onwards. However, this apparently does not work on
647 * kernel 3.8. Unfortunately, on that very same kernel, doing the
648 * same trick as above doesn't seem to work either, there one needs
649 * to ALSO specify MS_BIND for the remount, otherwise the entire
650 * fs is remounted read-only or the mount fails because it's busy...
651 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
652 * 2.6.32...
653 */
654 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
655 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
656 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
657 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
658 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
659 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
660 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
661 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
662 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
663 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
664 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
665 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
666 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
667 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
668 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
669 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
670 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
671 { 0, 0, NULL, NULL, NULL, 0, NULL }
672 };
673
674 for (i = 0; default_mounts[i].match_mask; i++) {
675 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
676 char *source = NULL;
677 char *destination = NULL;
678 int saved_errno;
679 unsigned long mflags;
680
681 if (default_mounts[i].source) {
682 /* will act like strdup if %r is not present */
683 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
684 if (!source) {
685 SYSERROR("memory allocation error");
686 return -1;
687 }
688 }
689 if (!default_mounts[i].destination) {
690 ERROR("BUG: auto mounts destination %d was NULL", i);
691 free(source);
692 return -1;
693 }
694 /* will act like strdup if %r is not present */
695 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
696 if (!destination) {
697 saved_errno = errno;
698 SYSERROR("memory allocation error");
699 free(source);
700 errno = saved_errno;
701 return -1;
702 }
703 mflags = add_required_remount_flags(source, destination,
704 default_mounts[i].flags);
705 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
706 saved_errno = errno;
707 if (r < 0 && errno == ENOENT) {
708 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
709 r = 0;
710 }
711 else if (r < 0)
712 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
713
714 free(source);
715 free(destination);
716 if (r < 0) {
717 errno = saved_errno;
718 return -1;
719 }
720 }
721 }
722
723 if (flags & LXC_AUTO_CGROUP_MASK) {
724 int cg_flags;
725
726 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
727 /* If the type of cgroup mount was not specified, it depends on the
728 * container's capabilities as to what makes sense: if we have
729 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
730 * anyway, so we may as well default to read-write; then the admin
731 * will not be given a false sense of security. (And if they really
732 * want mixed r/o r/w, then they can explicitly specify :mixed.)
733 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
734 * :mixed, because then the container can't remount it read-write. */
735 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
736 int has_sys_admin = 0;
737
738 if (!lxc_list_empty(&conf->keepcaps))
739 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
740 else
741 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
742
743 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
744 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
745 else
746 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
747 }
748
749 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
750 SYSERROR("error mounting /sys/fs/cgroup");
751 return -1;
752 }
753 }
754
755 return 0;
756 }
757
/*
 * setup_utsname - set the hostname from the nodename field of @utsname.
 *
 * A NULL @utsname means there is nothing to do and counts as success.
 * Returns 0 on success and -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);
	return 0;
}
772
/* A symlink to create below /dev: link name and the target it points to. */
struct dev_symlinks {
	const char *oldpath; /* symlink target */
	const char *name;    /* link name, relative to the /dev directory */
};

/* Standard /dev entries that are symlinks to the process's fd directory. */
static const struct dev_symlinks dev_symlinks[] = {
	{ "/proc/self/fd",   "fd"     },
	{ "/proc/self/fd/0", "stdin"  },
	{ "/proc/self/fd/1", "stdout" },
	{ "/proc/self/fd/2", "stderr" },
};
784
785 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
786 {
787 char path[MAXPATHLEN];
788 int ret,i;
789 struct stat s;
790
791
792 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
793 const struct dev_symlinks *d = &dev_symlinks[i];
794 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
795 if (ret < 0 || ret >= MAXPATHLEN)
796 return -1;
797
798 /*
799 * Stat the path first. If we don't get an error
800 * accept it as is and don't try to create it
801 */
802 if (!stat(path, &s)) {
803 continue;
804 }
805
806 ret = symlink(d->oldpath, path);
807
808 if (ret && errno != EEXIST) {
809 if ( errno == EROFS ) {
810 WARN("Warning: Read Only file system while creating %s", path);
811 } else {
812 SYSERROR("Error creating %s", path);
813 return -1;
814 }
815 }
816 }
817 return 0;
818 }
819
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * @pp:   address of the list string. When *pp is NULL a new string of the
 *        form "container_ttys=<name>" is allocated; otherwise " <name>" is
 *        appended to the existing string, which is reallocated.
 * @name: pty name to add.
 *
 * Returns true on success and false if memory cannot be allocated, in which
 * case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	size_t name_len = strlen(name);
	size_t old_len;
	char *grown;

	if (*pp == NULL) {
		size_t total = name_len + strlen("container_ttys=") + 1;
		char *fresh = malloc(total);

		if (fresh == NULL)
			return false;

		snprintf(fresh, total, "container_ttys=%s", name);
		*pp = fresh;
		return true;
	}

	old_len = strlen(*pp);
	grown = realloc(*pp, old_len + name_len + 2);
	if (grown == NULL)
		return false;

	/* Overwrite the old terminator with the separator, then copy the
	 * name including its NUL byte.
	 */
	grown[old_len] = ' ';
	memcpy(grown + old_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
840
841 static int lxc_setup_ttys(struct lxc_conf *conf)
842 {
843 int i, ret;
844 const struct lxc_tty_info *tty_info = &conf->tty_info;
845 char *ttydir = conf->ttydir;
846 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
847
848 if (!conf->rootfs.path)
849 return 0;
850
851 for (i = 0; i < tty_info->nbtty; i++) {
852 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
853
854 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
855 if (ret < 0 || (size_t)ret >= sizeof(path))
856 return -1;
857
858 if (ttydir) {
859 /* create dev/lxc/tty%d" */
860 ret = snprintf(lxcpath, sizeof(lxcpath),
861 "/dev/%s/tty%d", ttydir, i + 1);
862 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
863 return -1;
864
865 ret = creat(lxcpath, 0660);
866 if (ret < 0 && errno != EEXIST) {
867 SYSERROR("Failed to create \"%s\"", lxcpath);
868 return -1;
869 }
870 if (ret >= 0)
871 close(ret);
872
873 ret = unlink(path);
874 if (ret < 0 && errno != ENOENT) {
875 SYSERROR("Failed to unlink \"%s\"", path);
876 return -1;
877 }
878
879 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
880 if (ret < 0) {
881 WARN("Failed to bind mount \"%s\" onto \"%s\"",
882 pty_info->name, path);
883 continue;
884 }
885 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
886 path);
887
888 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
889 ttydir, i + 1);
890 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
891 return -1;
892
893 ret = symlink(lxcpath, path);
894 if (ret < 0) {
895 SYSERROR("Failed to create symlink \"%s\" -> \"%s\"",
896 path, lxcpath);
897 return -1;
898 }
899 } else {
900 /* If we populated /dev, then we need to create
901 * /dev/ttyN
902 */
903 ret = access(path, F_OK);
904 if (ret < 0) {
905 ret = creat(path, 0660);
906 if (ret < 0) {
907 SYSERROR("Failed to create \"%s\"", path);
908 /* this isn't fatal, continue */
909 } else {
910 close(ret);
911 }
912 }
913
914 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
915 if (ret < 0) {
916 SYSERROR("Failed to mount '%s'->'%s'", pty_info->name, path);
917 continue;
918 }
919
920 DEBUG("Bind mounted \"%s\" onto \"%s\"", pty_info->name,
921 path);
922 }
923
924 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
925 ERROR("Error setting up container_ttys string");
926 return -1;
927 }
928 }
929
930 INFO("Finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
931 return 0;
932 }
933
934 int lxc_allocate_ttys(const char *name, struct lxc_conf *conf)
935 {
936 struct lxc_tty_info *tty_info = &conf->tty_info;
937 int i, ret;
938
939 /* no tty in the configuration */
940 if (!conf->tty)
941 return 0;
942
943 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
944 if (!tty_info->pty_info) {
945 SYSERROR("failed to allocate struct *pty_info");
946 return -ENOMEM;
947 }
948
949 for (i = 0; i < conf->tty; i++) {
950 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
951
952 process_lock();
953 ret = openpty(&pty_info->master, &pty_info->slave,
954 pty_info->name, NULL, NULL);
955 process_unlock();
956 if (ret) {
957 SYSERROR("failed to create pty device number %d", i);
958 tty_info->nbtty = i;
959 lxc_delete_tty(tty_info);
960 return -ENOTTY;
961 }
962
963 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
964 pty_info->name, pty_info->master, pty_info->slave);
965
966 /* Prevent leaking the file descriptors to the container */
967 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
968 if (ret < 0)
969 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
970 "pty device \"%s\": %s",
971 pty_info->master, pty_info->name, strerror(errno));
972
973 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
974 if (ret < 0)
975 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
976 "pty device \"%s\": %s",
977 pty_info->slave, pty_info->name, strerror(errno));
978
979 pty_info->busy = 0;
980 }
981
982 tty_info->nbtty = conf->tty;
983
984 INFO("finished allocating %d pts devices", conf->tty);
985 return 0;
986 }
987
988 void lxc_delete_tty(struct lxc_tty_info *tty_info)
989 {
990 int i;
991
992 for (i = 0; i < tty_info->nbtty; i++) {
993 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
994
995 close(pty_info->master);
996 close(pty_info->slave);
997 }
998
999 free(tty_info->pty_info);
1000 tty_info->pty_info = NULL;
1001 tty_info->nbtty = 0;
1002 }
1003
1004 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
1005 {
1006 int i;
1007 struct lxc_conf *conf = handler->conf;
1008 struct lxc_tty_info *tty_info = &conf->tty_info;
1009 int sock = handler->data_sock[0];
1010 int ret = -1;
1011
1012 if (!conf->tty)
1013 return 0;
1014
1015 for (i = 0; i < conf->tty; i++) {
1016 int ttyfds[2];
1017 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
1018
1019 ttyfds[0] = pty_info->master;
1020 ttyfds[1] = pty_info->slave;
1021
1022 ret = lxc_abstract_unix_send_fds(sock, ttyfds, 2, NULL, 0);
1023 if (ret < 0)
1024 break;
1025
1026 TRACE("Send pty \"%s\" with master fd %d and slave fd %d to "
1027 "parent", pty_info->name, pty_info->master, pty_info->slave);
1028 }
1029
1030 if (ret < 0)
1031 ERROR("Failed to send %d ttys to parent: %s", conf->tty,
1032 strerror(errno));
1033 else
1034 TRACE("Sent %d ttys to parent", conf->tty);
1035
1036 return ret;
1037 }
1038
1039 static int lxc_create_ttys(struct lxc_handler *handler)
1040 {
1041 int ret = -1;
1042 struct lxc_conf *conf = handler->conf;
1043
1044 ret = lxc_allocate_ttys(handler->name, conf);
1045 if (ret < 0) {
1046 ERROR("Failed to allocate ttys");
1047 goto on_error;
1048 }
1049
1050 ret = lxc_send_ttys_to_parent(handler);
1051 if (ret < 0) {
1052 ERROR("Failed to send ttys to parent");
1053 goto on_error;
1054 }
1055
1056 if (!conf->is_execute) {
1057 ret = lxc_setup_ttys(conf);
1058 if (ret < 0) {
1059 ERROR("Failed to setup ttys");
1060 goto on_error;
1061 }
1062 }
1063
1064 if (conf->pty_names) {
1065 ret = setenv("container_ttys", conf->pty_names, 1);
1066 if (ret < 0)
1067 SYSERROR("Failed to set \"container_ttys=%s\"", conf->pty_names);
1068 }
1069
1070 ret = 0;
1071
1072 on_error:
1073 lxc_delete_tty(&conf->tty_info);
1074
1075 return ret;
1076 }
1077
1078 static int setup_rootfs_pivot_root(const char *rootfs)
1079 {
1080 int oldroot = -1, newroot = -1;
1081
1082 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1083 if (oldroot < 0) {
1084 SYSERROR("Error opening old-/ for fchdir");
1085 return -1;
1086 }
1087 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
1088 if (newroot < 0) {
1089 SYSERROR("Error opening new-/ for fchdir");
1090 goto fail;
1091 }
1092
1093 /* change into new root fs */
1094 if (fchdir(newroot)) {
1095 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
1096 goto fail;
1097 }
1098
1099 /* pivot_root into our new root fs */
1100 if (pivot_root(".", ".")) {
1101 SYSERROR("pivot_root syscall failed");
1102 goto fail;
1103 }
1104
1105 /*
1106 * at this point the old-root is mounted on top of our new-root
1107 * To unmounted it we must not be chdir'd into it, so escape back
1108 * to old-root
1109 */
1110 if (fchdir(oldroot) < 0) {
1111 SYSERROR("Error entering oldroot");
1112 goto fail;
1113 }
1114 if (umount2(".", MNT_DETACH) < 0) {
1115 SYSERROR("Error detaching old root");
1116 goto fail;
1117 }
1118
1119 if (fchdir(newroot) < 0) {
1120 SYSERROR("Error re-entering newroot");
1121 goto fail;
1122 }
1123
1124 close(oldroot);
1125 close(newroot);
1126
1127 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1128
1129 return 0;
1130
1131 fail:
1132 if (oldroot != -1)
1133 close(oldroot);
1134 if (newroot != -1)
1135 close(newroot);
1136 return -1;
1137 }
1138
1139 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1140 * error, log it but don't fail yet.
1141 */
1142 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1143 const char *lxcpath)
1144 {
1145 int ret;
1146 size_t clen;
1147 char *path;
1148
1149 INFO("Preparing \"/dev\"");
1150
1151 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1152 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1153 path = alloca(clen);
1154
1155 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1156 if (ret < 0 || (size_t)ret >= clen)
1157 return -1;
1158
1159 if (!dir_exists(path)) {
1160 WARN("\"/dev\" directory does not exist. Proceeding without "
1161 "autodev being set up");
1162 return 0;
1163 }
1164
1165 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1166 rootfs->path ? rootfs->mount : NULL);
1167 if (ret < 0) {
1168 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1169 return -1;
1170 }
1171 INFO("Mounted tmpfs on \"%s\"", path);
1172
1173 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1174 if (ret < 0 || (size_t)ret >= clen)
1175 return -1;
1176
1177 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1178 * If not, then create it and exit if that fails...
1179 */
1180 if (!dir_exists(path)) {
1181 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1182 if (ret < 0) {
1183 SYSERROR("Failed to create directory \"%s\"", path);
1184 return -1;
1185 }
1186 }
1187
1188 INFO("Prepared \"/dev\"");
1189 return 0;
1190 }
1191
1192 struct lxc_devs {
1193 const char *name;
1194 mode_t mode;
1195 int maj;
1196 int min;
1197 };
1198
1199 static const struct lxc_devs lxc_devs[] = {
1200 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1201 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1202 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1203 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1204 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1205 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
1206 };
1207
1208 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1209 {
1210 int ret;
1211 char path[MAXPATHLEN];
1212 int i;
1213 mode_t cmask;
1214
1215 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1216 rootfs->path ? rootfs->mount : "");
1217 if (ret < 0 || ret >= MAXPATHLEN)
1218 return -1;
1219
1220 /* ignore, just don't try to fill in */
1221 if (!dir_exists(path))
1222 return 0;
1223
1224 INFO("Populating \"/dev\"");
1225
1226 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1227 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1228 const struct lxc_devs *d = &lxc_devs[i];
1229
1230 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1231 rootfs->path ? rootfs->mount : "", d->name);
1232 if (ret < 0 || ret >= MAXPATHLEN)
1233 return -1;
1234
1235 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1236 if (ret < 0) {
1237 FILE *pathfile;
1238 char hostpath[MAXPATHLEN];
1239
1240 if (errno == EEXIST) {
1241 DEBUG("\"%s\" device already existed", path);
1242 continue;
1243 }
1244
1245 /* Unprivileged containers cannot create devices, so
1246 * bind mount the device from the host.
1247 */
1248 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1249 if (ret < 0 || ret >= MAXPATHLEN)
1250 return -1;
1251
1252 pathfile = fopen(path, "wb");
1253 if (!pathfile) {
1254 SYSERROR("Failed to create file \"%s\"", path);
1255 return -1;
1256 }
1257 fclose(pathfile);
1258
1259 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1260 rootfs->path ? rootfs->mount : NULL);
1261 if (ret < 0) {
1262 SYSERROR("Failed to bind mount \"%s\" from "
1263 "host into container",
1264 d->name);
1265 return -1;
1266 }
1267 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1268 path);
1269 } else {
1270 DEBUG("Created device node \"%s\"", path);
1271 }
1272 }
1273 umask(cmask);
1274
1275 INFO("Populated \"/dev\"");
1276 return 0;
1277 }
1278
1279 static int lxc_setup_rootfs(struct lxc_conf *conf)
1280 {
1281 int ret;
1282 struct lxc_storage *bdev;
1283 const struct lxc_rootfs *rootfs;
1284
1285 rootfs = &conf->rootfs;
1286 if (!rootfs->path) {
1287 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1288 SYSERROR("Failed to make / rslave.");
1289 return -1;
1290 }
1291 return 0;
1292 }
1293
1294 if (access(rootfs->mount, F_OK)) {
1295 SYSERROR("Failed to access to \"%s\". Check it is present.",
1296 rootfs->mount);
1297 return -1;
1298 }
1299
1300 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1301 if (!bdev) {
1302 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1303 rootfs->path, rootfs->mount,
1304 rootfs->options ? rootfs->options : "(null)");
1305 return -1;
1306 }
1307
1308 ret = bdev->ops->mount(bdev);
1309 storage_put(bdev);
1310 if (ret < 0) {
1311 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1312 rootfs->path, rootfs->mount,
1313 rootfs->options ? rootfs->options : "(null)");
1314 return -1;
1315 }
1316
1317 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1318 rootfs->path, rootfs->mount,
1319 rootfs->options ? rootfs->options : "(null)");
1320
1321 return 0;
1322 }
1323
1324 int prepare_ramfs_root(char *root)
1325 {
1326 char buf[LXC_LINELEN], *p;
1327 char nroot[PATH_MAX];
1328 FILE *f;
1329 int i;
1330 char *p2;
1331
1332 if (realpath(root, nroot) == NULL)
1333 return -errno;
1334
1335 if (chdir("/") == -1)
1336 return -errno;
1337
1338 /*
1339 * We could use here MS_MOVE, but in userns this mount is
1340 * locked and can't be moved.
1341 */
1342 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1343 SYSERROR("Failed to move %s into /", root);
1344 return -errno;
1345 }
1346
1347 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1348 SYSERROR("Failed to make . rprivate");
1349 return -errno;
1350 }
1351
1352 /*
1353 * The following code cleans up inhereted mounts which are not
1354 * required for CT.
1355 *
1356 * The mountinfo file shows not all mounts, if a few points have been
1357 * unmounted between read operations from the mountinfo. So we need to
1358 * read mountinfo a few times.
1359 *
1360 * This loop can be skipped if a container uses unserns, because all
1361 * inherited mounts are locked and we should live with all this trash.
1362 */
1363 while (1) {
1364 int progress = 0;
1365
1366 f = fopen("./proc/self/mountinfo", "r");
1367 if (!f) {
1368 SYSERROR("Unable to open /proc/self/mountinfo");
1369 return -1;
1370 }
1371 while (fgets(buf, LXC_LINELEN, f)) {
1372 for (p = buf, i=0; p && i < 4; i++)
1373 p = strchr(p+1, ' ');
1374 if (!p)
1375 continue;
1376 p2 = strchr(p+1, ' ');
1377 if (!p2)
1378 continue;
1379
1380 *p2 = '\0';
1381 *p = '.';
1382
1383 if (strcmp(p + 1, "/") == 0)
1384 continue;
1385 if (strcmp(p + 1, "/proc") == 0)
1386 continue;
1387
1388 if (umount2(p, MNT_DETACH) == 0)
1389 progress++;
1390 }
1391 fclose(f);
1392 if (!progress)
1393 break;
1394 }
1395
1396 /* This also can be skipped if a container uses unserns */
1397 umount2("./proc", MNT_DETACH);
1398
1399 /* It is weird, but chdir("..") moves us in a new root */
1400 if (chdir("..") == -1) {
1401 SYSERROR("Unable to change working directory");
1402 return -1;
1403 }
1404
1405 if (chroot(".") == -1) {
1406 SYSERROR("Unable to chroot");
1407 return -1;
1408 }
1409
1410 return 0;
1411 }
1412
1413 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1414 {
1415 if (!rootfs->path) {
1416 DEBUG("container does not have a rootfs, so not doing pivot root");
1417 return 0;
1418 }
1419
1420 if (detect_ramfs_rootfs()) {
1421 DEBUG("detected that container is on ramfs");
1422 if (prepare_ramfs_root(rootfs->mount)) {
1423 ERROR("failed to prepare minimal ramfs root");
1424 return -1;
1425 }
1426
1427 DEBUG("prepared ramfs root for container");
1428 return 0;
1429 }
1430
1431 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1432 ERROR("failed to pivot root");
1433 return -1;
1434 }
1435
1436 DEBUG("finished pivot root");
1437 return 0;
1438 }
1439
/* Mount a new devpts instance on "/dev/pts" limited to @num_pts ptys and make
 * "/dev/ptmx" refer to the instance's "ptmx".
 *
 * "/dev/ptmx" is preferably a bind mount of "/dev/pts/ptmx"; if the bind mount
 * fails a symlink is created instead.
 *
 * Nothing is done when @num_pts is 0.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	static const char base_opts[] = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char opts[256];
	int len, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	len = snprintf(opts, sizeof(opts), "%s,max=%d", base_opts, num_pts);
	if (len < 0 || (size_t)len >= sizeof(opts))
		return -1;

	/* An existing "ptmx" below "/dev/pts" indicates an old devpts
	 * instance; unmount it first.
	 */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* The mountpoint may already exist; that is not an error. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, opts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", opts);

	/* Get rid of whatever currently sits at "/dev/ptmx". */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty regular file serves as the bind mount target. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount "/dev/pts/ptmx" onto "/dev/ptmx". */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* The bind mount did not work; fall back to a symlink below. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to go before the symlink can take its place. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1530
/* Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave unchanged". When the build does not have
 * HAVE_SYS_PERSONALITY_H set, the function does nothing.
 *
 * Returns 0 on success (or when nothing was done) and -1 if personality()
 * fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1547
1548 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1549 const struct lxc_console *console)
1550 {
1551 char path[MAXPATHLEN];
1552 int ret, fd;
1553
1554 if (console->path && !strcmp(console->path, "none"))
1555 return 0;
1556
1557 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1558 if (ret < 0 || (size_t)ret >= sizeof(path))
1559 return -1;
1560
1561 /* When we are asked to setup a console we remove any previous
1562 * /dev/console bind-mounts.
1563 */
1564 if (file_exists(path)) {
1565 ret = lxc_unstack_mountpoint(path, false);
1566 if (ret < 0) {
1567 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1568 return -ret;
1569 } else {
1570 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1571 }
1572
1573 ret = unlink(path);
1574 if (ret < 0) {
1575 SYSERROR("error unlinking %s", path);
1576 return -errno;
1577 }
1578 }
1579
1580 /* For unprivileged containers autodev or automounts will already have
1581 * taken care of creating /dev/console.
1582 */
1583 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1584 if (fd < 0) {
1585 if (errno != EEXIST) {
1586 SYSERROR("failed to create console");
1587 return -errno;
1588 }
1589 } else {
1590 close(fd);
1591 }
1592
1593 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1594 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1595 return -errno;
1596 }
1597
1598 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1599 ERROR("failed to mount '%s' on '%s'", console->name, path);
1600 return -1;
1601 }
1602
1603 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1604 return 0;
1605 }
1606
1607 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1608 const struct lxc_console *console,
1609 char *ttydir)
1610 {
1611 int ret;
1612 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1613
1614 /* create rootfs/dev/<ttydir> directory */
1615 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1616 if (ret < 0 || (size_t)ret >= sizeof(path))
1617 return -1;
1618
1619 ret = mkdir(path, 0755);
1620 if (ret && errno != EEXIST) {
1621 SYSERROR("failed with errno %d to create %s", errno, path);
1622 return -errno;
1623 }
1624 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1625
1626 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1627 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1628 return -1;
1629
1630 ret = creat(lxcpath, 0660);
1631 if (ret == -1 && errno != EEXIST) {
1632 SYSERROR("error %d creating %s", errno, lxcpath);
1633 return -errno;
1634 }
1635 if (ret >= 0)
1636 close(ret);
1637
1638 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1639 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1640 return -1;
1641
1642 /* When we are asked to setup a console we remove any previous
1643 * /dev/console bind-mounts.
1644 */
1645 if (console->path && !strcmp(console->path, "none")) {
1646 struct stat st;
1647 ret = stat(path, &st);
1648 if (ret < 0) {
1649 if (errno == ENOENT)
1650 return 0;
1651 SYSERROR("failed stat() \"%s\"", path);
1652 return -errno;
1653 }
1654
1655 /* /dev/console must be character device with major number 5 and
1656 * minor number 1. If not, give benefit of the doubt and assume
1657 * the user has mounted something else right there on purpose.
1658 */
1659 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1660 return 0;
1661
1662 /* In case the user requested a bind-mount for /dev/console and
1663 * requests a ttydir we move the mount to the
1664 * /dev/<ttydir/console.
1665 * Note, we only move the uppermost mount and clear all other
1666 * mounts underneath for safety.
1667 * If it is a character device created via mknod() we simply
1668 * rename it.
1669 */
1670 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1671 if (ret < 0) {
1672 if (errno != EINVAL) {
1673 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1674 return -errno;
1675 }
1676 /* path was not a mountpoint */
1677 ret = rename(path, lxcpath);
1678 if (ret < 0) {
1679 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1680 return -errno;
1681 }
1682 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1683 } else {
1684 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1685 }
1686
1687 /* Clear all remaining bind-mounts. */
1688 ret = lxc_unstack_mountpoint(path, false);
1689 if (ret < 0) {
1690 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1691 return -ret;
1692 } else {
1693 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1694 }
1695 } else {
1696 if (file_exists(path)) {
1697 ret = lxc_unstack_mountpoint(path, false);
1698 if (ret < 0) {
1699 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1700 return -ret;
1701 } else {
1702 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1703 }
1704 }
1705
1706 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1707 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1708 return -1;
1709 }
1710 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1711 }
1712
1713 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1714 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1715 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1716 return -1;
1717
1718 ret = unlink(path);
1719 if (ret && errno != ENOENT) {
1720 SYSERROR("error unlinking %s", path);
1721 return -errno;
1722 }
1723
1724 ret = symlink(lxcpath, path);
1725 if (ret < 0) {
1726 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1727 return -1;
1728 }
1729
1730 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1731 return 0;
1732 }
1733
1734 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1735 const struct lxc_console *console, char *ttydir)
1736 {
1737 /* We don't have a rootfs, /dev/console will be shared. */
1738 if (!rootfs->path) {
1739 DEBUG("/dev/console will be shared with the host");
1740 return 0;
1741 }
1742
1743 if (!ttydir)
1744 return lxc_setup_dev_console(rootfs, console);
1745
1746 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1747 }
1748
1749 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1750 {
1751 struct mount_opt *mo;
1752
1753 /* If opt is found in mount_opt, set or clear flags.
1754 * Otherwise append it to data. */
1755
1756 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1757 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1758 if (mo->clear)
1759 *flags &= ~mo->flag;
1760 else
1761 *flags |= mo->flag;
1762 return;
1763 }
1764 }
1765
1766 if (strlen(*data))
1767 strcat(*data, ",");
1768 strcat(*data, opt);
1769 }
1770
/* Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * @mntflags receives the flags collected by parse_mntopt(). @mntdata receives
 * a newly allocated string with the remaining options, or NULL when there are
 * none; the caller owns that string and must free() it.
 *
 * A NULL @mntopts yields flags 0 and NULL data.
 *
 * Returns 0 on success and -1 if memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntflags = 0L;
	*mntdata = NULL;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}
	free(copy);

	/* Hand out the data string only if something ended up in it. */
	if (extra[0] == '\0') {
		free(extra);
		extra = NULL;
	}
	*mntdata = extra;

	return 0;
}
1809
/* Terminate @word in place at its first space or tab. A string without either
 * is left unchanged.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1816
/*
 * Skip @nfields space or tab separated fields in @src.
 *
 * Every single space or tab counts as one separator. Returns a pointer to the
 * character following the last skipped separator, or to the terminating NUL
 * byte if @src has fewer separators than requested.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Jump to the end of the current field. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;

		/* Step over the separator itself. */
		cur++;
	}

	return cur;
}
1834
/* Mount @fsname on @target and, for bind or remount requests, remount it so
 * that the requested flags take effect.
 *
 * The first mount goes through safe_mount() with MS_REMOUNT masked out. If
 * MS_REMOUNT or MS_BIND was requested a second mount() with MS_REMOUNT
 * follows. With HAVE_STATVFS, nosuid/nodev/ro/noexec found on @fsname via
 * statvfs() are added to the flags of that remount (nodev only when @dev is
 * 0), and the remount is skipped for a plain bind mount that needs no
 * additional flags.
 *
 * When @optional is non-zero a failed mount is logged and reported as
 * success.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	unsigned long rdonly_req = 0;

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
		       rootfs) < 0)
		goto mount_failed;

	/* Only bind and remount requests need the second pass. */
	if ((mountflags & (MS_REMOUNT | MS_BIND)) == 0)
		goto done;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
	      "options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	if (mountflags & MS_RDONLY)
		rdonly_req = MS_RDONLY;

#ifdef HAVE_STATVFS
	{
		struct statvfs sb;

		if (statvfs(fsname, &sb) == 0) {
			unsigned long extra = rdonly_req;

			if (sb.f_flag & MS_NOSUID)
				extra |= MS_NOSUID;

			if ((sb.f_flag & MS_NODEV) && !dev)
				extra |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				extra |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				extra |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, extra);

			/* If this was a bind mount request, and the extra
			 * flags do not contain anything that is not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(extra & ~mountflags) && rdonly_req == 0) {
				DEBUG("Mountflags already were %lu, "
				      "skipping remount", mountflags);
				goto done;
			}

			mountflags |= extra;
		}
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0)
		goto mount_failed;

done:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;

mount_failed:
	/* Shared by both mount attempts; errno still holds their error. */
	if (optional) {
		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
	return -1;
}
1925
/* Strip the liblxc-only options "create=dir", "create=file" and "optional"
 * from @mntent->mnt_opts in place.
 *
 * For each of the three, the first occurrence found with strstr() is removed
 * together with the comma following it. If nothing follows the option, the
 * string is cut off at the option's start.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (hit == NULL)
			continue;

		comma = strchr(hit, ',');
		if (comma != NULL)
			/* Close the gap, including the terminating NUL. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* Last option in the list: just truncate here. */
			*hit = '\0';
	}
}
1949
/* Create the mount target @path for @mntent if the entry asks for it.
 *
 * For "overlay" and "aufs" filesystem types the matching *_mkdir() helper is
 * called first. With "create=dir" the directory @path is created including
 * its parents. With "create=file" and a not yet existing @path, the parent
 * directories are created and an empty file is created at @path.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int ret = 0;

	if (!strncmp(mntent->mnt_type, "overlay", 7))
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (!strncmp(mntent->mnt_type, "aufs", 4))
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		int fd;
		char *path_copy, *parent;

		/* dirname() may modify its argument, so use a copy. */
		path_copy = strdup(path);
		if (!path_copy)
			return -1;

		parent = dirname(path_copy);

		ret = mkdir_p(parent, 0755);
		if (ret < 0 && errno != EEXIST) {
			/* Report the directory that actually failed, and do so
			 * before the copy (which @parent may point into) is
			 * released.
			 */
			SYSERROR("Failed to create directory \"%s\"", parent);
			free(path_copy);
			return -1;
		}
		free(path_copy);

		fd = open(path, O_CREAT, 0644);
		if (fd < 0)
			return -1;
		close(fd);
	}

	return 0;
}
1998
1999 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
2000 * without a rootfs. */
2001 static inline int mount_entry_on_generic(struct mntent *mntent,
2002 const char *path,
2003 const struct lxc_rootfs *rootfs,
2004 const char *lxc_name,
2005 const char *lxc_path)
2006 {
2007 int ret;
2008 unsigned long mntflags;
2009 char *mntdata;
2010 bool dev, optional;
2011 char *rootfs_path = NULL;
2012
2013 optional = hasmntopt(mntent, "optional") != NULL;
2014 dev = hasmntopt(mntent, "dev") != NULL;
2015
2016 if (rootfs && rootfs->path)
2017 rootfs_path = rootfs->mount;
2018
2019 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
2020 lxc_path);
2021 if (ret < 0) {
2022 if (optional)
2023 return 0;
2024
2025 return -1;
2026 }
2027 cull_mntent_opt(mntent);
2028
2029 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
2030 if (ret < 0)
2031 return -1;
2032
2033 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2034 mntdata, optional, dev, rootfs_path);
2035
2036 free(mntdata);
2037 return ret;
2038 }
2039
/* Mount @mntent for a container that has no rootfs.
 *
 * For such containers all mount targets are treated as absolute paths
 * starting at / on the host, so a leading '/' is added when the entry's
 * target does not have one.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the path does not
 * fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(target, sizeof(target), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target))
		return -1;

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
2057
2058 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2059 const struct lxc_rootfs *rootfs,
2060 const char *lxc_name,
2061 const char *lxc_path)
2062 {
2063 int offset;
2064 char *aux;
2065 const char *lxcpath;
2066 char path[MAXPATHLEN];
2067 int ret = 0;
2068
2069 lxcpath = lxc_global_config_value("lxc.lxcpath");
2070 if (!lxcpath)
2071 return -1;
2072
2073 /* If rootfs->path is a blockdev path, allow container fstab to use
2074 * <lxcpath>/<name>/rootfs" as the target prefix.
2075 */
2076 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2077 if (ret < 0 || ret >= MAXPATHLEN)
2078 goto skipvarlib;
2079
2080 aux = strstr(mntent->mnt_dir, path);
2081 if (aux) {
2082 offset = strlen(path);
2083 goto skipabs;
2084 }
2085
2086 skipvarlib:
2087 aux = strstr(mntent->mnt_dir, rootfs->path);
2088 if (!aux) {
2089 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
2090 return ret;
2091 }
2092 offset = strlen(rootfs->path);
2093
2094 skipabs:
2095 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
2096 if (ret < 0 || ret >= MAXPATHLEN)
2097 return -1;
2098
2099 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2100 }
2101
2102 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2103 const struct lxc_rootfs *rootfs,
2104 const char *lxc_name,
2105 const char *lxc_path)
2106 {
2107 char path[MAXPATHLEN];
2108 int ret;
2109
2110 /* relative to root mount point */
2111 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2112 if (ret < 0 || ret >= sizeof(path)) {
2113 ERROR("path name too long");
2114 return -1;
2115 }
2116
2117 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2118 }
2119
2120 /* This logs a NOTICE() when a user specifies mounts that would conflict with
2121 * devices liblxc sets up automatically.
2122 */
2123 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2124 const char *dest)
2125 {
2126 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2127 bool needs_warning = false;
2128
2129 clean_mnt_fsname = lxc_deslashify(src);
2130 if (!clean_mnt_fsname)
2131 return;
2132
2133 clean_mnt_dir = lxc_deslashify(dest);
2134 if (!clean_mnt_dir) {
2135 free(clean_mnt_fsname);
2136 return;
2137 }
2138
2139 tmp = clean_mnt_dir;
2140 if (*tmp == '/')
2141 tmp++;
2142
2143 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2144 free(clean_mnt_dir);
2145 free(clean_mnt_fsname);
2146 return;
2147 }
2148
2149 if (!conf->autodev && !conf->pts && !conf->tty &&
2150 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2151 free(clean_mnt_dir);
2152 free(clean_mnt_fsname);
2153 return;
2154 }
2155
2156 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2157 needs_warning = true;
2158 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2159 needs_warning = true;
2160 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2161 needs_warning = true;
2162 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2163 needs_warning = true;
2164 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2165 needs_warning = true;
2166 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2167 needs_warning = true;
2168 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2169 needs_warning = true;
2170 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2171 needs_warning = true;
2172 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2173 needs_warning = true;
2174 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2175 needs_warning = true;
2176 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2177 needs_warning = true;
2178
2179 if (needs_warning)
2180 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2181 "automatic device setup under \"/dev\"",
2182 clean_mnt_fsname, clean_mnt_dir);
2183
2184 free(clean_mnt_dir);
2185 free(clean_mnt_fsname);
2186 }
2187
2188 static int mount_file_entries(const struct lxc_conf *conf,
2189 const struct lxc_rootfs *rootfs, FILE *file,
2190 const char *lxc_name, const char *lxc_path)
2191 {
2192 struct mntent mntent;
2193 char buf[4096];
2194 int ret = -1;
2195
2196 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2197 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2198
2199 if (!rootfs->path)
2200 ret = mount_entry_on_systemfs(&mntent);
2201 else if (mntent.mnt_dir[0] != '/')
2202 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2203 lxc_name, lxc_path);
2204 else
2205 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2206 lxc_name, lxc_path);
2207 if (ret < 0)
2208 return -1;
2209 }
2210 ret = 0;
2211
2212 INFO("Set up mount entries");
2213 return ret;
2214 }
2215
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do.
 *
 * Returns 0 on success (or when there is no fstab), -1 if the file cannot be
 * opened, and otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fstab_file;
	int ret;

	if (fstab == NULL)
		return 0;

	fstab_file = setmntent(fstab, "r");
	if (fstab_file == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	ret = mount_file_entries(conf, rootfs, fstab_file, lxc_name, lxc_path);
	if (ret < 0)
		ERROR("Failed to set up mount entries");

	/* The stream is closed no matter how the mounting went. */
	endmntent(fstab_file);
	return ret;
}
2239
2240 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2241 {
2242 int ret;
2243 char *mount_entry;
2244 struct lxc_list *iterator;
2245 FILE *f;
2246 int fd = -1;
2247
2248 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2249 if (fd < 0) {
2250 if (errno != ENOSYS)
2251 return NULL;
2252 f = tmpfile();
2253 TRACE("Created temporary mount file");
2254 } else {
2255 f = fdopen(fd, "r+");
2256 TRACE("Created anonymous mount file");
2257 }
2258
2259 if (!f) {
2260 SYSERROR("Could not create mount file");
2261 if (fd != -1)
2262 close(fd);
2263 return NULL;
2264 }
2265
2266 lxc_list_for_each(iterator, mount) {
2267 mount_entry = iterator->elem;
2268 ret = fprintf(f, "%s\n", mount_entry);
2269 if (ret < strlen(mount_entry))
2270 WARN("Could not write mount entry to mount file");
2271 }
2272
2273 ret = fseek(f, 0, SEEK_SET);
2274 if (ret < 0) {
2275 SYSERROR("Failed to seek mount file");
2276 fclose(f);
2277 return NULL;
2278 }
2279
2280 return f;
2281 }
2282
/* Mount the entries held in the list @mount.
 *
 * The entries are first written to an anonymous file with
 * make_anonymous_mount_file() and then processed by mount_file_entries().
 *
 * Returns the result of mount_file_entries(), or -1 if the anonymous file
 * could not be created.
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret = -1;
	FILE *mount_file = make_anonymous_mount_file(mount);

	if (mount_file) {
		ret = mount_file_entries(conf, rootfs, mount_file, lxc_name,
					 lxc_path);
		fclose(mount_file);
	}

	return ret;
}
2300
2301 static int parse_cap(const char *cap)
2302 {
2303 char *ptr = NULL;
2304 size_t i;
2305 int capid = -1;
2306
2307 if (!strcmp(cap, "none"))
2308 return -2;
2309
2310 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2311
2312 if (strcmp(cap, caps_opt[i].name))
2313 continue;
2314
2315 capid = caps_opt[i].value;
2316 break;
2317 }
2318
2319 if (capid < 0) {
2320 /* try to see if it's numeric, so the user may specify
2321 * capabilities that the running kernel knows about but
2322 * we don't */
2323 errno = 0;
2324 capid = strtol(cap, &ptr, 10);
2325 if (!ptr || *ptr != '\0' || errno != 0)
2326 /* not a valid number */
2327 capid = -1;
2328 else if (capid > lxc_caps_last_cap())
2329 /* we have a number but it's not a valid
2330 * capability */
2331 capid = -1;
2332 }
2333
2334 return capid;
2335 }
2336
2337 int in_caplist(int cap, struct lxc_list *caps)
2338 {
2339 struct lxc_list *iterator;
2340 int capid;
2341
2342 lxc_list_for_each(iterator, caps) {
2343 capid = parse_cap(iterator->elem);
2344 if (capid == cap)
2345 return 1;
2346 }
2347
2348 return 0;
2349 }
2350
2351 static int setup_caps(struct lxc_list *caps)
2352 {
2353 struct lxc_list *iterator;
2354 char *drop_entry;
2355 int capid;
2356
2357 lxc_list_for_each(iterator, caps) {
2358
2359 drop_entry = iterator->elem;
2360
2361 capid = parse_cap(drop_entry);
2362
2363 if (capid < 0) {
2364 ERROR("unknown capability %s", drop_entry);
2365 return -1;
2366 }
2367
2368 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2369
2370 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2371 SYSERROR("failed to remove %s capability", drop_entry);
2372 return -1;
2373 }
2374
2375 }
2376
2377 DEBUG("capabilities have been setup");
2378
2379 return 0;
2380 }
2381
2382 static int dropcaps_except(struct lxc_list *caps)
2383 {
2384 struct lxc_list *iterator;
2385 char *keep_entry;
2386 int i, capid;
2387 int numcaps = lxc_caps_last_cap() + 1;
2388 INFO("found %d capabilities", numcaps);
2389
2390 if (numcaps <= 0 || numcaps > 200)
2391 return -1;
2392
2393 /* caplist[i] is 1 if we keep capability i */
2394 int *caplist = alloca(numcaps * sizeof(int));
2395 memset(caplist, 0, numcaps * sizeof(int));
2396
2397 lxc_list_for_each(iterator, caps) {
2398
2399 keep_entry = iterator->elem;
2400
2401 capid = parse_cap(keep_entry);
2402
2403 if (capid == -2)
2404 continue;
2405
2406 if (capid < 0) {
2407 ERROR("unknown capability %s", keep_entry);
2408 return -1;
2409 }
2410
2411 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2412
2413 caplist[capid] = 1;
2414 }
2415 for (i=0; i<numcaps; i++) {
2416 if (caplist[i])
2417 continue;
2418 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2419 SYSERROR("failed to remove capability %d", i);
2420 return -1;
2421 }
2422 }
2423
2424 DEBUG("capabilities have been setup");
2425
2426 return 0;
2427 }
2428
2429 static int parse_resource(const char *res) {
2430 size_t i;
2431 int resid = -1;
2432
2433 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2434 if (strcmp(res, limit_opt[i].name) == 0)
2435 return limit_opt[i].value;
2436 }
2437
2438 /* try to see if it's numeric, so the user may specify
2439 * resources that the running kernel knows about but
2440 * we don't */
2441 if (lxc_safe_int(res, &resid) == 0)
2442 return resid;
2443 return -1;
2444 }
2445
2446 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2447 struct lxc_list *it;
2448 struct lxc_limit *lim;
2449 int resid;
2450
2451 lxc_list_for_each(it, limits) {
2452 lim = it->elem;
2453
2454 resid = parse_resource(lim->resource);
2455 if (resid < 0) {
2456 ERROR("unknown resource %s", lim->resource);
2457 return -1;
2458 }
2459
2460 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2461 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2462 return -1;
2463 }
2464 }
2465 return 0;
2466 }
2467
2468 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2469
2470 struct lxc_conf *lxc_conf_init(void)
2471 {
2472 struct lxc_conf *new;
2473 int i;
2474
2475 new = malloc(sizeof(*new));
2476 if (!new) {
2477 ERROR("lxc_conf_init : %s", strerror(errno));
2478 return NULL;
2479 }
2480 memset(new, 0, sizeof(*new));
2481
2482 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2483 new->personality = -1;
2484 new->autodev = 1;
2485 new->console.log_path = NULL;
2486 new->console.log_fd = -1;
2487 new->console.path = NULL;
2488 new->console.peer = -1;
2489 new->console.peerpty.busy = -1;
2490 new->console.peerpty.master = -1;
2491 new->console.peerpty.slave = -1;
2492 new->console.master = -1;
2493 new->console.slave = -1;
2494 new->console.name[0] = '\0';
2495 new->maincmd_fd = -1;
2496 new->nbd_idx = -1;
2497 new->rootfs.mount = strdup(default_rootfs_mount);
2498 if (!new->rootfs.mount) {
2499 ERROR("lxc_conf_init : %s", strerror(errno));
2500 free(new);
2501 return NULL;
2502 }
2503 new->logfd = -1;
2504 lxc_list_init(&new->cgroup);
2505 lxc_list_init(&new->network);
2506 lxc_list_init(&new->mount_list);
2507 lxc_list_init(&new->caps);
2508 lxc_list_init(&new->keepcaps);
2509 lxc_list_init(&new->id_map);
2510 lxc_list_init(&new->includes);
2511 lxc_list_init(&new->aliens);
2512 lxc_list_init(&new->environment);
2513 lxc_list_init(&new->limits);
2514 for (i=0; i<NUM_LXC_HOOKS; i++)
2515 lxc_list_init(&new->hooks[i]);
2516 lxc_list_init(&new->groups);
2517 new->lsm_aa_profile = NULL;
2518 new->lsm_se_context = NULL;
2519 new->tmp_umount_proc = 0;
2520
2521 for (i = 0; i < LXC_NS_MAX; i++)
2522 new->inherit_ns_fd[i] = -1;
2523
2524 /* if running in a new user namespace, init and COMMAND
2525 * default to running as UID/GID 0 when using lxc-execute */
2526 new->init_uid = 0;
2527 new->init_gid = 0;
2528 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2529
2530 return new;
2531 }
2532
2533 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
2534 size_t buf_size)
2535 {
2536 char path[MAXPATHLEN];
2537 int fd, ret;
2538
2539 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
2540 idtype == ID_TYPE_UID ? 'u' : 'g');
2541 if (ret < 0 || ret >= MAXPATHLEN) {
2542 ERROR("failed to create path \"%s\"", path);
2543 return -E2BIG;
2544 }
2545
2546 fd = open(path, O_WRONLY);
2547 if (fd < 0) {
2548 SYSERROR("failed to open \"%s\"", path);
2549 return -1;
2550 }
2551
2552 errno = 0;
2553 ret = lxc_write_nointr(fd, buf, buf_size);
2554 if (ret != buf_size) {
2555 SYSERROR("failed to write %cid mapping to \"%s\"",
2556 idtype == ID_TYPE_UID ? 'u' : 'g', path);
2557 close(fd);
2558 return -1;
2559 }
2560 close(fd);
2561
2562 return 0;
2563 }
2564
2565 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
2566 *
2567 * @return 1 if functional binary was found
2568 * @return 0 if binary exists but is lacking privilege
2569 * @return -ENOENT if binary does not exist
2570 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
2571 *
2572 */
2573 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
2574 {
2575 char *path;
2576 int ret;
2577 struct stat st;
2578 int fret = 0;
2579
2580 if (cap != CAP_SETUID && cap != CAP_SETGID)
2581 return -EINVAL;
2582
2583 path = on_path(binary, NULL);
2584 if (!path)
2585 return -ENOENT;
2586
2587 ret = stat(path, &st);
2588 if (ret < 0) {
2589 fret = -errno;
2590 goto cleanup;
2591 }
2592
2593 /* Check if the binary is setuid. */
2594 if (st.st_mode & S_ISUID) {
2595 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
2596 fret = 1;
2597 goto cleanup;
2598 }
2599
2600 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
2601 /* Check if it has the CAP_SETUID capability. */
2602 if ((cap & CAP_SETUID) &&
2603 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
2604 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
2605 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
2606 "and CAP_PERMITTED sets.", path);
2607 fret = 1;
2608 goto cleanup;
2609 }
2610
2611 /* Check if it has the CAP_SETGID capability. */
2612 if ((cap & CAP_SETGID) &&
2613 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
2614 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
2615 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
2616 "and CAP_PERMITTED sets.", path);
2617 fret = 1;
2618 goto cleanup;
2619 }
2620 #else
2621 /* If we cannot check for file capabilities we need to give the benefit
2622 * of the doubt. Otherwise we might fail even though all the necessary
2623 * file capabilities are set.
2624 */
2625 DEBUG("Cannot check for file capabilites as full capability support is "
2626 "missing. Manual intervention needed.");
2627 fret = 1;
2628 #endif
2629
2630 cleanup:
2631 free(path);
2632 return fret;
2633 }
2634
/* Execute the command line passed in @args through "/bin/sh -c".
 *
 * Returns -1 if execl() fails; on success it does not return.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);

	/* Only reached when the exec failed. */
	return -1;
}
2640
2641 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
2642 {
2643 struct id_map *map;
2644 struct lxc_list *iterator;
2645 enum idtype type;
2646 char u_or_g;
2647 char *pos;
2648 int fill, left;
2649 char cmd_output[MAXPATHLEN];
2650 /* strlen("new@idmap") = 9
2651 * +
2652 * strlen(" ") = 1
2653 * +
2654 * LXC_NUMSTRLEN64
2655 * +
2656 * strlen(" ") = 1
2657 *
2658 * We add some additional space to make sure that we really have
2659 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
2660 */
2661 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
2662 int ret = 0, uidmap = 0, gidmap = 0;
2663 bool use_shadow = false, had_entry = false;
2664
2665 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
2666 * ranges, then insist that root also reserve ranges in subuid. This
2667 * will protected it by preventing another user from being handed the
2668 * range by shadow.
2669 */
2670 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
2671 if (uidmap == -ENOENT)
2672 WARN("newuidmap binary is missing");
2673 else if (!uidmap)
2674 WARN("newuidmap is lacking necessary privileges");
2675
2676 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
2677 if (gidmap == -ENOENT)
2678 WARN("newgidmap binary is missing");
2679 else if (!gidmap)
2680 WARN("newgidmap is lacking necessary privileges");
2681
2682 if (uidmap > 0 && gidmap > 0) {
2683 DEBUG("Functional newuidmap and newgidmap binary found.");
2684 use_shadow = true;
2685 } else {
2686 /* In case unprivileged users run application containers via
2687 * execute() or a start*() there are valid cases where they may
2688 * only want to map their own {g,u}id. Let's not block them from
2689 * doing so by requiring geteuid() == 0.
2690 */
2691 DEBUG("No newuidmap and newgidmap binary found. Trying to "
2692 "write directly with euid %d.", geteuid());
2693 }
2694
2695 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
2696 type++, u_or_g = 'g') {
2697 pos = mapbuf;
2698
2699 if (use_shadow)
2700 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
2701
2702 lxc_list_for_each(iterator, idmap) {
2703 /* The kernel only takes <= 4k for writes to
2704 * /proc/<nr>/[ug]id_map
2705 */
2706 map = iterator->elem;
2707 if (map->idtype != type)
2708 continue;
2709
2710 had_entry = true;
2711
2712 left = LXC_IDMAPLEN - (pos - mapbuf);
2713 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
2714 use_shadow ? " " : "", map->nsid,
2715 map->hostid, map->range,
2716 use_shadow ? "" : "\n");
2717 if (fill <= 0 || fill >= left)
2718 SYSERROR("Too many {g,u}id mappings defined.");
2719
2720 pos += fill;
2721 }
2722 if (!had_entry)
2723 continue;
2724
2725 /* Try to catch the ouput of new{g,u}idmap to make debugging
2726 * easier.
2727 */
2728 if (use_shadow) {
2729 ret = run_command(cmd_output, sizeof(cmd_output),
2730 lxc_map_ids_exec_wrapper,
2731 (void *)mapbuf);
2732 if (ret < 0) {
2733 ERROR("new%cidmap failed to write mapping \"%s\": %s",
2734 u_or_g, cmd_output, mapbuf);
2735 return -1;
2736 }
2737 TRACE("new%cidmap wrote mapping \"%s\"", u_or_g, mapbuf);
2738 } else {
2739 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
2740 if (ret < 0) {
2741 ERROR("Failed to write mapping \"%s\": %s",
2742 cmd_output, mapbuf);
2743 return -1;
2744 }
2745 TRACE("Wrote mapping \"%s\"", mapbuf);
2746 }
2747
2748 memset(mapbuf, 0, sizeof(mapbuf));
2749 }
2750
2751 return 0;
2752 }
2753
2754 /*
2755 * return the host uid/gid to which the container root is mapped in
2756 * *val.
2757 * Return true if id was found, false otherwise.
2758 */
2759 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
2760 unsigned long *val)
2761 {
2762 struct lxc_list *it;
2763 struct id_map *map;
2764
2765 lxc_list_for_each(it, &conf->id_map) {
2766 map = it->elem;
2767 if (map->idtype != idtype)
2768 continue;
2769 if (map->nsid != 0)
2770 continue;
2771 *val = map->hostid;
2772 return true;
2773 }
2774 return false;
2775 }
2776
2777 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
2778 {
2779 struct lxc_list *it;
2780 struct id_map *map;
2781 lxc_list_for_each(it, &conf->id_map) {
2782 map = it->elem;
2783 if (map->idtype != idtype)
2784 continue;
2785 if (id >= map->hostid && id < map->hostid + map->range)
2786 return (id - map->hostid) + map->nsid;
2787 }
2788 return -1;
2789 }
2790
2791 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
2792 {
2793 struct lxc_list *it;
2794 struct id_map *map;
2795 unsigned int freeid = 0;
2796 again:
2797 lxc_list_for_each(it, &conf->id_map) {
2798 map = it->elem;
2799 if (map->idtype != idtype)
2800 continue;
2801 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
2802 freeid = map->nsid + map->range;
2803 goto again;
2804 }
2805 }
2806 return freeid;
2807 }
2808
/* Execute lxc-usernsexec with the argument vector passed in @args.
 *
 * Returns -1 if execvp() fails; on success it does not return.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	/* Only reached when the exec failed. */
	return -1;
}
2814
2815 /*
2816 * chown_mapped_root: for an unprivileged user with uid/gid X to
2817 * chown a dir to subuid/subgid Y, he needs to run chown as root
2818 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
2819 * nsid Y is mapped to hostuid/hostgid X. That way, the container
2820 * root is privileged with respect to hostuid/hostgid X, allowing
2821 * him to do the chown.
2822 */
2823 int chown_mapped_root(char *path, struct lxc_conf *conf)
2824 {
2825 uid_t rootuid, rootgid;
2826 unsigned long val;
2827 int hostuid, hostgid, ret;
2828 struct stat sb;
2829 char map1[100], map2[100], map3[100], map4[100], map5[100];
2830 char ugid[100];
2831 char *args1[] = {"lxc-usernsexec",
2832 "-m", map1,
2833 "-m", map2,
2834 "-m", map3,
2835 "-m", map5,
2836 "--", "chown", ugid, path,
2837 NULL};
2838 char *args2[] = {"lxc-usernsexec",
2839 "-m", map1,
2840 "-m", map2,
2841 "-m", map3,
2842 "-m", map4,
2843 "-m", map5,
2844 "--", "chown", ugid, path,
2845 NULL};
2846 char cmd_output[MAXPATHLEN];
2847
2848 hostuid = geteuid();
2849 hostgid = getegid();
2850
2851 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
2852 ERROR("No uid mapping for container root");
2853 return -1;
2854 }
2855 rootuid = (uid_t)val;
2856 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
2857 ERROR("No gid mapping for container root");
2858 return -1;
2859 }
2860 rootgid = (gid_t)val;
2861
2862 if (hostuid == 0) {
2863 if (chown(path, rootuid, rootgid) < 0) {
2864 ERROR("Error chowning %s", path);
2865 return -1;
2866 }
2867 return 0;
2868 }
2869
2870 if (rootuid == hostuid) {
2871 /* nothing to do */
2872 INFO("Container root is our uid; no need to chown");
2873 return 0;
2874 }
2875
2876 /* save the current gid of "path" */
2877 if (stat(path, &sb) < 0) {
2878 ERROR("Error stat %s", path);
2879 return -1;
2880 }
2881
2882 /* Update the path argument in case this was overlayfs. */
2883 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
2884 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
2885
2886 /*
2887 * A file has to be group-owned by a gid mapped into the
2888 * container, or the container won't be privileged over it.
2889 */
2890 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
2891 if (sb.st_uid == hostuid &&
2892 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
2893 chown(path, -1, hostgid) < 0) {
2894 ERROR("Failed chgrping %s", path);
2895 return -1;
2896 }
2897
2898 /* "u:0:rootuid:1" */
2899 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
2900 if (ret < 0 || ret >= 100) {
2901 ERROR("Error uid printing map string");
2902 return -1;
2903 }
2904
2905 /* "u:hostuid:hostuid:1" */
2906 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
2907 if (ret < 0 || ret >= 100) {
2908 ERROR("Error uid printing map string");
2909 return -1;
2910 }
2911
2912 /* "g:0:rootgid:1" */
2913 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
2914 if (ret < 0 || ret >= 100) {
2915 ERROR("Error gid printing map string");
2916 return -1;
2917 }
2918
2919 /* "g:pathgid:rootgid+pathgid:1" */
2920 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
2921 rootgid + (gid_t)sb.st_gid);
2922 if (ret < 0 || ret >= 100) {
2923 ERROR("Error gid printing map string");
2924 return -1;
2925 }
2926
2927 /* "g:hostgid:hostgid:1" */
2928 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
2929 if (ret < 0 || ret >= 100) {
2930 ERROR("Error gid printing map string");
2931 return -1;
2932 }
2933
2934 /* "0:pathgid" (chown) */
2935 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
2936 if (ret < 0 || ret >= 100) {
2937 ERROR("Error owner printing format string for chown");
2938 return -1;
2939 }
2940
2941 if (hostgid == sb.st_gid)
2942 ret = run_command(cmd_output, sizeof(cmd_output),
2943 chown_mapped_root_exec_wrapper,
2944 (void *)args1);
2945 else
2946 ret = run_command(cmd_output, sizeof(cmd_output),
2947 chown_mapped_root_exec_wrapper,
2948 (void *)args2);
2949 if (ret < 0)
2950 ERROR("lxc-usernsexec failed: %s", cmd_output);
2951
2952 return ret;
2953 }
2954
2955 int lxc_ttys_shift_ids(struct lxc_conf *c)
2956 {
2957 if (lxc_list_empty(&c->id_map))
2958 return 0;
2959
2960 if (!strcmp(c->console.name, ""))
2961 return 0;
2962
2963 if (chown_mapped_root(c->console.name, c) < 0) {
2964 ERROR("failed to chown console \"%s\"", c->console.name);
2965 return -1;
2966 }
2967
2968 TRACE("chowned console \"%s\"", c->console.name);
2969
2970 return 0;
2971 }
2972
2973 /* NOTE: Must not be called from inside the container namespace! */
2974 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
2975 {
2976 int mounted;
2977
2978 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
2979 if (mounted == -1) {
2980 SYSERROR("failed to mount /proc in the container");
2981 /* continue only if there is no rootfs */
2982 if (conf->rootfs.path)
2983 return -1;
2984 } else if (mounted == 1) {
2985 conf->tmp_umount_proc = 1;
2986 }
2987
2988 return 0;
2989 }
2990
2991 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
2992 {
2993 if (lxc_conf->tmp_umount_proc == 1) {
2994 umount("/proc");
2995 lxc_conf->tmp_umount_proc = 0;
2996 }
2997 }
2998
/* Walk /proc/self/mountinfo and remount every entry whose options mention
 * "shared" with MS_SLAVE. Failures are logged and do not abort the walk.
 */
void remount_all_slave(void)
{
	size_t cap = 0;
	char *buf = NULL;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	for (;;) {
		char *mnt, *flags;

		if (getline(&buf, &cap, mountinfo) == -1)
			break;

		mnt = get_field(buf, 4);
		if (mnt == NULL)
			continue;

		flags = get_field(mnt, 2);
		if (flags == NULL)
			continue;

		null_endofword(flags);
		if (strstr(flags, "shared") == NULL)
			continue;

		null_endofword(mnt);
		if (mount(NULL, mnt, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mnt);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3032
3033 void lxc_execute_bind_init(struct lxc_conf *conf)
3034 {
3035 int ret;
3036 char path[PATH_MAX], destpath[PATH_MAX], *p;
3037
3038 /* If init exists in the container, don't bind mount a static one */
3039 p = choose_init(conf->rootfs.mount);
3040 if (p) {
3041 free(p);
3042 return;
3043 }
3044
3045 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3046 if (ret < 0 || ret >= PATH_MAX) {
3047 WARN("Path name too long searching for lxc.init.static");
3048 return;
3049 }
3050
3051 if (!file_exists(path)) {
3052 INFO("%s does not exist on host", path);
3053 return;
3054 }
3055
3056 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3057 if (ret < 0 || ret >= PATH_MAX) {
3058 WARN("Path name too long for container's lxc.init.static");
3059 return;
3060 }
3061
3062 if (!file_exists(destpath)) {
3063 FILE * pathfile = fopen(destpath, "wb");
3064 if (!pathfile) {
3065 SYSERROR("Failed to create mount target '%s'", destpath);
3066 return;
3067 }
3068 fclose(pathfile);
3069 }
3070
3071 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3072 if (ret < 0)
3073 SYSERROR("Failed to bind lxc.init.static into container");
3074 INFO("lxc.init.static bound into container at %s", path);
3075 }
3076
3077 /*
3078 * This does the work of remounting / if it is shared, calling the
3079 * container pre-mount hooks, and mounting the rootfs.
3080 */
3081 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3082 {
3083 if (conf->rootfs_setup) {
3084 /*
3085 * rootfs was set up in another namespace. bind-mount it
3086 * to give us a mount in our own ns so we can pivot_root to it
3087 */
3088 const char *path = conf->rootfs.mount;
3089 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3090 ERROR("Failed to bind-mount container / onto itself");
3091 return -1;
3092 }
3093 return 0;
3094 }
3095
3096 remount_all_slave();
3097
3098 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3099 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3100 return -1;
3101 }
3102
3103 if (lxc_setup_rootfs(conf)) {
3104 ERROR("failed to setup rootfs for '%s'", name);
3105 return -1;
3106 }
3107
3108 conf->rootfs_setup = true;
3109 return 0;
3110 }
3111
3112 static bool verify_start_hooks(struct lxc_conf *conf)
3113 {
3114 struct lxc_list *it;
3115 char path[MAXPATHLEN];
3116 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3117 char *hookname = it->elem;
3118 struct stat st;
3119 int ret;
3120
3121 ret = snprintf(path, MAXPATHLEN, "%s%s",
3122 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
3123 if (ret < 0 || ret >= MAXPATHLEN)
3124 return false;
3125 ret = stat(path, &st);
3126 if (ret) {
3127 SYSERROR("Start hook %s not found in container",
3128 hookname);
3129 return false;
3130 }
3131 return true;
3132 }
3133
3134 return true;
3135 }
3136
3137 int lxc_setup(struct lxc_handler *handler)
3138 {
3139 int ret;
3140 const char *name = handler->name;
3141 struct lxc_conf *lxc_conf = handler->conf;
3142 const char *lxcpath = handler->lxcpath;
3143
3144 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
3145 ERROR("Error setting up rootfs mount after spawn");
3146 return -1;
3147 }
3148
3149 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3150 if (setup_utsname(lxc_conf->utsname)) {
3151 ERROR("failed to setup the utsname for '%s'", name);
3152 return -1;
3153 }
3154 }
3155
3156 if (lxc_setup_network_in_child_namespaces(lxc_conf, &lxc_conf->network)) {
3157 ERROR("failed to setup the network for '%s'", name);
3158 return -1;
3159 }
3160
3161 if (lxc_network_send_name_and_ifindex_to_parent(handler) < 0) {
3162 ERROR("Failed to network device names and ifindices to parent");
3163 return -1;
3164 }
3165
3166 if (lxc_conf->autodev > 0) {
3167 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
3168 ERROR("failed to mount /dev in the container");
3169 return -1;
3170 }
3171 }
3172
3173 /* do automatic mounts (mainly /proc and /sys), but exclude
3174 * those that need to wait until other stuff has finished
3175 */
3176 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
3177 ERROR("failed to setup the automatic mounts for '%s'", name);
3178 return -1;
3179 }
3180
3181 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
3182 ERROR("failed to setup the mounts for '%s'", name);
3183 return -1;
3184 }
3185
3186 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
3187 ERROR("failed to setup the mount entries for '%s'", name);
3188 return -1;
3189 }
3190
3191 /* Make sure any start hooks are in the container */
3192 if (!verify_start_hooks(lxc_conf))
3193 return -1;
3194
3195 if (lxc_conf->is_execute)
3196 lxc_execute_bind_init(lxc_conf);
3197
3198 /* now mount only cgroup, if wanted;
3199 * before, /sys could not have been mounted
3200 * (is either mounted automatically or via fstab entries)
3201 */
3202 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
3203 ERROR("failed to setup the automatic mounts for '%s'", name);
3204 return -1;
3205 }
3206
3207 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
3208 ERROR("failed to run mount hooks for container '%s'.", name);
3209 return -1;
3210 }
3211
3212 if (lxc_conf->autodev > 0) {
3213 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
3214 ERROR("failed to run autodev hooks for container '%s'.", name);
3215 return -1;
3216 }
3217
3218 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
3219 ERROR("failed to populate /dev in the container");
3220 return -1;
3221 }
3222 }
3223
3224 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
3225 ERROR("failed to setup the console for '%s'", name);
3226 return -1;
3227 }
3228
3229 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3230 ERROR("failed to setup /dev symlinks for '%s'", name);
3231 return -1;
3232 }
3233
3234 /* mount /proc if it's not already there */
3235 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
3236 ERROR("failed to LSM mount proc for '%s'", name);
3237 return -1;
3238 }
3239
3240 if (setup_pivot_root(&lxc_conf->rootfs)) {
3241 ERROR("failed to set rootfs for '%s'", name);
3242 return -1;
3243 }
3244
3245 if (lxc_setup_devpts(lxc_conf->pts)) {
3246 ERROR("failed to setup the new pts instance");
3247 return -1;
3248 }
3249
3250 ret = lxc_create_ttys(handler);
3251 if (ret < 0)
3252 return -1;
3253
3254 if (setup_personality(lxc_conf->personality)) {
3255 ERROR("failed to setup personality");
3256 return -1;
3257 }
3258
3259 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3260 if (!lxc_list_empty(&lxc_conf->caps)) {
3261 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
3262 return -1;
3263 }
3264 if (dropcaps_except(&lxc_conf->keepcaps)) {
3265 ERROR("failed to keep requested caps");
3266 return -1;
3267 }
3268 } else if (setup_caps(&lxc_conf->caps)) {
3269 ERROR("failed to drop capabilities");
3270 return -1;
3271 }
3272
3273 NOTICE("Container \"%s\" is set up", name);
3274
3275 return 0;
3276 }
3277
3278 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3279 const char *lxcpath, char *argv[])
3280 {
3281 int which = -1;
3282 struct lxc_list *it;
3283
3284 if (strcmp(hook, "pre-start") == 0)
3285 which = LXCHOOK_PRESTART;
3286 else if (strcmp(hook, "pre-mount") == 0)
3287 which = LXCHOOK_PREMOUNT;
3288 else if (strcmp(hook, "mount") == 0)
3289 which = LXCHOOK_MOUNT;
3290 else if (strcmp(hook, "autodev") == 0)
3291 which = LXCHOOK_AUTODEV;
3292 else if (strcmp(hook, "start") == 0)
3293 which = LXCHOOK_START;
3294 else if (strcmp(hook, "stop") == 0)
3295 which = LXCHOOK_STOP;
3296 else if (strcmp(hook, "post-stop") == 0)
3297 which = LXCHOOK_POSTSTOP;
3298 else if (strcmp(hook, "clone") == 0)
3299 which = LXCHOOK_CLONE;
3300 else if (strcmp(hook, "destroy") == 0)
3301 which = LXCHOOK_DESTROY;
3302 else
3303 return -1;
3304 lxc_list_for_each(it, &conf->hooks[which]) {
3305 int ret;
3306 char *hookname = it->elem;
3307 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
3308 if (ret)
3309 return ret;
3310 }
3311 return 0;
3312 }
3313
3314 int lxc_clear_config_caps(struct lxc_conf *c)
3315 {
3316 struct lxc_list *it, *next;
3317
3318 lxc_list_for_each_safe(it, &c->caps, next) {
3319 lxc_list_del(it);
3320 free(it->elem);
3321 free(it);
3322 }
3323 return 0;
3324 }
3325
3326 static int lxc_free_idmap(struct lxc_list *id_map) {
3327 struct lxc_list *it, *next;
3328
3329 lxc_list_for_each_safe(it, id_map, next) {
3330 lxc_list_del(it);
3331 free(it->elem);
3332 free(it);
3333 }
3334 return 0;
3335 }
3336
3337 int lxc_clear_idmaps(struct lxc_conf *c)
3338 {
3339 return lxc_free_idmap(&c->id_map);
3340 }
3341
3342 int lxc_clear_config_keepcaps(struct lxc_conf *c)
3343 {
3344 struct lxc_list *it,*next;
3345
3346 lxc_list_for_each_safe(it, &c->keepcaps, next) {
3347 lxc_list_del(it);
3348 free(it->elem);
3349 free(it);
3350 }
3351 return 0;
3352 }
3353
3354 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
3355 {
3356 struct lxc_list *it,*next;
3357 bool all = false;
3358 const char *k = NULL;
3359
3360 if (strcmp(key, "lxc.cgroup") == 0)
3361 all = true;
3362 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
3363 k = key + sizeof("lxc.cgroup.")-1;
3364 else
3365 return -1;
3366
3367 lxc_list_for_each_safe(it, &c->cgroup, next) {
3368 struct lxc_cgroup *cg = it->elem;
3369 if (!all && strcmp(cg->subsystem, k) != 0)
3370 continue;
3371 lxc_list_del(it);
3372 free(cg->subsystem);
3373 free(cg->value);
3374 free(cg);
3375 free(it);
3376 }
3377 return 0;
3378 }
3379
3380 int lxc_clear_limits(struct lxc_conf *c, const char *key)
3381 {
3382 struct lxc_list *it, *next;
3383 bool all = false;
3384 const char *k = NULL;
3385
3386 if (strcmp(key, "lxc.limit") == 0
3387 || strcmp(key, "lxc.prlimit"))
3388 all = true;
3389 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
3390 k = key + sizeof("lxc.limit.")-1;
3391 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
3392 k = key + sizeof("lxc.prlimit.")-1;
3393 else
3394 return -1;
3395
3396 lxc_list_for_each_safe(it, &c->limits, next) {
3397 struct lxc_limit *lim = it->elem;
3398 if (!all && strcmp(lim->resource, k) != 0)
3399 continue;
3400 lxc_list_del(it);
3401 free(lim->resource);
3402 free(lim);
3403 free(it);
3404 }
3405 return 0;
3406 }
3407
3408 int lxc_clear_groups(struct lxc_conf *c)
3409 {
3410 struct lxc_list *it,*next;
3411
3412 lxc_list_for_each_safe(it, &c->groups, next) {
3413 lxc_list_del(it);
3414 free(it->elem);
3415 free(it);
3416 }
3417 return 0;
3418 }
3419
3420 int lxc_clear_environment(struct lxc_conf *c)
3421 {
3422 struct lxc_list *it,*next;
3423
3424 lxc_list_for_each_safe(it, &c->environment, next) {
3425 lxc_list_del(it);
3426 free(it->elem);
3427 free(it);
3428 }
3429 return 0;
3430 }
3431
3432 int lxc_clear_mount_entries(struct lxc_conf *c)
3433 {
3434 struct lxc_list *it,*next;
3435
3436 lxc_list_for_each_safe(it, &c->mount_list, next) {
3437 lxc_list_del(it);
3438 free(it->elem);
3439 free(it);
3440 }
3441 return 0;
3442 }
3443
3444 int lxc_clear_automounts(struct lxc_conf *c)
3445 {
3446 c->auto_mounts = 0;
3447 return 0;
3448 }
3449
3450 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
3451 {
3452 struct lxc_list *it,*next;
3453 bool all = false, done = false;
3454 const char *k = NULL;
3455 int i;
3456
3457 if (strcmp(key, "lxc.hook") == 0)
3458 all = true;
3459 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
3460 k = key + sizeof("lxc.hook.")-1;
3461 else
3462 return -1;
3463
3464 for (i=0; i<NUM_LXC_HOOKS; i++) {
3465 if (all || strcmp(k, lxchook_names[i]) == 0) {
3466 lxc_list_for_each_safe(it, &c->hooks[i], next) {
3467 lxc_list_del(it);
3468 free(it->elem);
3469 free(it);
3470 }
3471 done = true;
3472 }
3473 }
3474
3475 if (!done) {
3476 ERROR("Invalid hook key: %s", key);
3477 return -1;
3478 }
3479 return 0;
3480 }
3481
3482 static inline void lxc_clear_aliens(struct lxc_conf *conf)
3483 {
3484 struct lxc_list *it,*next;
3485
3486 lxc_list_for_each_safe(it, &conf->aliens, next) {
3487 lxc_list_del(it);
3488 free(it->elem);
3489 free(it);
3490 }
3491 }
3492
3493 void lxc_clear_includes(struct lxc_conf *conf)
3494 {
3495 struct lxc_list *it,*next;
3496
3497 lxc_list_for_each_safe(it, &conf->includes, next) {
3498 lxc_list_del(it);
3499 free(it->elem);
3500 free(it);
3501 }
3502 }
3503
3504 void lxc_conf_free(struct lxc_conf *conf)
3505 {
3506 if (!conf)
3507 return;
3508 if (current_config == conf)
3509 current_config = NULL;
3510 free(conf->console.log_path);
3511 free(conf->console.path);
3512 free(conf->rootfs.mount);
3513 free(conf->rootfs.bdev_type);
3514 free(conf->rootfs.options);
3515 free(conf->rootfs.path);
3516 free(conf->logfile);
3517 if (conf->logfd != -1)
3518 close(conf->logfd);
3519 free(conf->utsname);
3520 free(conf->ttydir);
3521 free(conf->fstab);
3522 free(conf->rcfile);
3523 free(conf->init_cmd);
3524 free(conf->unexpanded_config);
3525 free(conf->pty_names);
3526 free(conf->syslog);
3527 lxc_free_networks(&conf->network);
3528 free(conf->lsm_aa_profile);
3529 free(conf->lsm_se_context);
3530 lxc_seccomp_free(conf);
3531 lxc_clear_config_caps(conf);
3532 lxc_clear_config_keepcaps(conf);
3533 lxc_clear_cgroups(conf, "lxc.cgroup");
3534 lxc_clear_hooks(conf, "lxc.hook");
3535 lxc_clear_mount_entries(conf);
3536 lxc_clear_idmaps(conf);
3537 lxc_clear_groups(conf);
3538 lxc_clear_includes(conf);
3539 lxc_clear_aliens(conf);
3540 lxc_clear_environment(conf);
3541 lxc_clear_limits(conf, "lxc.prlimit");
3542 free(conf->cgroup_meta.dir);
3543 free(conf->cgroup_meta.controllers);
3544 free(conf);
3545 }
3546
/* Argument bundle handed to run_userns_fn() in the cloned child. */
struct userns_fn_data {
	/* Callback the child runs once it has been told to proceed. */
	int (*fn)(void *);
	/* Optional name of the callback, used for trace logging only. */
	const char *fn_name;
	/* Opaque argument passed to fn. */
	void *arg;
	/* Synchronisation pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
3553
3554 static int run_userns_fn(void *data)
3555 {
3556 struct userns_fn_data *d = data;
3557 char c;
3558
3559 /* Close write end of the pipe. */
3560 close(d->p[1]);
3561
3562 /* Wait for parent to finish establishing a new mapping in the user
3563 * namespace we are executing in.
3564 */
3565 if (read(d->p[0], &c, 1) != 1)
3566 return -1;
3567
3568 /* Close read end of the pipe. */
3569 close(d->p[0]);
3570
3571 if (d->fn_name)
3572 TRACE("calling function \"%s\"", d->fn_name);
3573 /* Call function to run. */
3574 return d->fn(d->arg);
3575 }
3576
3577 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
3578 enum idtype idtype)
3579 {
3580 struct lxc_list *it;
3581 struct id_map *map;
3582 struct id_map *retmap = NULL;
3583
3584 lxc_list_for_each(it, &conf->id_map) {
3585 map = it->elem;
3586 if (map->idtype != idtype)
3587 continue;
3588
3589 if (id >= map->hostid && id < map->hostid + map->range) {
3590 retmap = map;
3591 break;
3592 }
3593 }
3594
3595 if (!retmap)
3596 return NULL;
3597
3598 retmap = malloc(sizeof(*retmap));
3599 if (!retmap)
3600 return NULL;
3601
3602 memcpy(retmap, map, sizeof(*retmap));
3603 return retmap;
3604 }
3605
3606 /*
3607 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
3608 * existing one or establish a new one.
3609 */
3610 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
3611 {
3612 int hostid_mapped;
3613 struct id_map *entry = NULL;
3614
3615 /* Reuse existing mapping. */
3616 entry = mapped_hostid_entry(conf, id, type);
3617 if (entry)
3618 return entry;
3619
3620 /* Find new mapping. */
3621 hostid_mapped = find_unmapped_nsid(conf, type);
3622 if (hostid_mapped < 0) {
3623 DEBUG("failed to find free mapping for id %d", id);
3624 return NULL;
3625 }
3626
3627 entry = malloc(sizeof(*entry));
3628 if (!entry)
3629 return NULL;
3630
3631 entry->idtype = type;
3632 entry->nsid = hostid_mapped;
3633 entry->hostid = (unsigned long)id;
3634 entry->range = 1;
3635
3636 return entry;
3637 }
3638
3639 /* Run a function in a new user namespace.
3640 * The caller's euid/egid will be mapped if it is not already.
3641 * Afaict, userns_exec_1() is only used to operate based on privileges for the
3642 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
3643 * This means we require only to establish a mapping from:
3644 * - the container root {g,u}id as seen from the host > user's host {g,u}id
3645 * - the container root -> some sub{g,u}id
3646 * The former we add, if the user did not specifiy a mapping. The latter we
3647 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
3648 * there to start the container in the first place.
3649 */
3650 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
3651 const char *fn_name)
3652 {
3653 pid_t pid;
3654 uid_t euid, egid;
3655 struct userns_fn_data d;
3656 int p[2];
3657 struct lxc_list *it;
3658 struct id_map *map;
3659 char c = '1';
3660 int ret = -1;
3661 struct lxc_list *idmap = NULL, *tmplist = NULL;
3662 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
3663 *host_uid_map = NULL, *host_gid_map = NULL;
3664
3665 ret = pipe(p);
3666 if (ret < 0) {
3667 SYSERROR("opening pipe");
3668 return -1;
3669 }
3670 d.fn = fn;
3671 d.fn_name = fn_name;
3672 d.arg = data;
3673 d.p[0] = p[0];
3674 d.p[1] = p[1];
3675
3676 /* Clone child in new user namespace. */
3677 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
3678 if (pid < 0) {
3679 ERROR("failed to clone child process in new user namespace");
3680 goto on_error;
3681 }
3682
3683 close(p[0]);
3684 p[0] = -1;
3685
3686 /* Find container root. */
3687 lxc_list_for_each(it, &conf->id_map) {
3688 map = it->elem;
3689
3690 if (map->nsid != 0)
3691 continue;
3692
3693 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
3694 container_root_uid = malloc(sizeof(*container_root_uid));
3695 if (!container_root_uid)
3696 goto on_error;
3697 container_root_uid->idtype = map->idtype;
3698 container_root_uid->hostid = map->hostid;
3699 container_root_uid->nsid = 0;
3700 container_root_uid->range = map->range;
3701 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
3702 container_root_gid = malloc(sizeof(*container_root_gid));
3703 if (!container_root_gid)
3704 goto on_error;
3705 container_root_gid->idtype = map->idtype;
3706 container_root_gid->hostid = map->hostid;
3707 container_root_gid->nsid = 0;
3708 container_root_gid->range = map->range;
3709 }
3710
3711 /* Found container root. */
3712 if (container_root_uid && container_root_gid)
3713 break;
3714 }
3715
3716 /* This is actually checked earlier but it can't hurt. */
3717 if (!container_root_uid || !container_root_gid) {
3718 ERROR("no mapping for container root found");
3719 goto on_error;
3720 }
3721
3722 host_uid_map = container_root_uid;
3723 host_gid_map = container_root_gid;
3724
3725 /* Check whether the {g,u}id of the user has a mapping. */
3726 euid = geteuid();
3727 egid = getegid();
3728 if (euid != container_root_uid->hostid)
3729 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
3730
3731 if (egid != container_root_gid->hostid)
3732 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
3733
3734 if (!host_uid_map) {
3735 DEBUG("failed to find mapping for uid %d", euid);
3736 goto on_error;
3737 }
3738
3739 if (!host_gid_map) {
3740 DEBUG("failed to find mapping for gid %d", egid);
3741 goto on_error;
3742 }
3743
3744 /* Allocate new {g,u}id map list. */
3745 idmap = malloc(sizeof(*idmap));
3746 if (!idmap)
3747 goto on_error;
3748 lxc_list_init(idmap);
3749
3750 /* Add container root to the map. */
3751 tmplist = malloc(sizeof(*tmplist));
3752 if (!tmplist)
3753 goto on_error;
3754 lxc_list_add_elem(tmplist, container_root_uid);
3755 lxc_list_add_tail(idmap, tmplist);
3756
3757 if (host_uid_map && (host_uid_map != container_root_uid)) {
3758 /* idmap will now keep track of that memory. */
3759 container_root_uid = NULL;
3760
3761 /* Add container root to the map. */
3762 tmplist = malloc(sizeof(*tmplist));
3763 if (!tmplist)
3764 goto on_error;
3765 lxc_list_add_elem(tmplist, host_uid_map);
3766 lxc_list_add_tail(idmap, tmplist);
3767 }
3768 /* idmap will now keep track of that memory. */
3769 container_root_uid = NULL;
3770 /* idmap will now keep track of that memory. */
3771 host_uid_map = NULL;
3772
3773 tmplist = malloc(sizeof(*tmplist));
3774 if (!tmplist)
3775 goto on_error;
3776 lxc_list_add_elem(tmplist, container_root_gid);
3777 lxc_list_add_tail(idmap, tmplist);
3778
3779 if (host_gid_map && (host_gid_map != container_root_gid)) {
3780 /* idmap will now keep track of that memory. */
3781 container_root_gid = NULL;
3782
3783 tmplist = malloc(sizeof(*tmplist));
3784 if (!tmplist)
3785 goto on_error;
3786 lxc_list_add_elem(tmplist, host_gid_map);
3787 lxc_list_add_tail(idmap, tmplist);
3788 }
3789 /* idmap will now keep track of that memory. */
3790 container_root_gid = NULL;
3791 /* idmap will now keep track of that memory. */
3792 host_gid_map = NULL;
3793
3794 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
3795 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
3796 lxc_list_for_each(it, idmap) {
3797 map = it->elem;
3798 TRACE("establishing %cid mapping for \"%d\" in new "
3799 "user namespace: nsuid %lu - hostid %lu - range "
3800 "%lu",
3801 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
3802 map->nsid, map->hostid, map->range);
3803 }
3804 }
3805
3806 /* Set up {g,u}id mapping for user namespace of child process. */
3807 ret = lxc_map_ids(idmap, pid);
3808 if (ret < 0) {
3809 ERROR("error setting up {g,u}id mappings for child process "
3810 "\"%d\"",
3811 pid);
3812 goto on_error;
3813 }
3814
3815 /* Tell child to proceed. */
3816 if (write(p[1], &c, 1) != 1) {
3817 SYSERROR("failed telling child process \"%d\" to proceed", pid);
3818 goto on_error;
3819 }
3820
3821 /* Wait for child to finish. */
3822 ret = wait_for_pid(pid);
3823
3824 on_error:
3825 if (idmap)
3826 lxc_free_idmap(idmap);
3827 if (container_root_uid)
3828 free(container_root_uid);
3829 if (container_root_gid)
3830 free(container_root_gid);
3831 if (host_uid_map && (host_uid_map != container_root_uid))
3832 free(host_uid_map);
3833 if (host_gid_map && (host_gid_map != container_root_gid))
3834 free(host_gid_map);
3835
3836 if (p[0] != -1)
3837 close(p[0]);
3838 close(p[1]);
3839
3840 return ret;
3841 }
3842
/*
 * Return a freshly allocated copy of the login name belonging to the
 * effective uid, or NULL if the lookup or the allocation fails. The caller
 * owns the returned string.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
3854
/*
 * Return a freshly allocated copy of the group name belonging to the
 * effective gid, or NULL if the lookup or the allocation fails. The caller
 * owns the returned string.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
3866
3867 /* not thread-safe, do not use from api without first forking */
3868 void suggest_default_idmap(void)
3869 {
3870 FILE *f;
3871 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
3872 char *line = NULL;
3873 char *uname, *gname;
3874 size_t len = 0;
3875
3876 if (!(uname = getuname()))
3877 return;
3878
3879 if (!(gname = getgname())) {
3880 free(uname);
3881 return;
3882 }
3883
3884 f = fopen(subuidfile, "r");
3885 if (!f) {
3886 ERROR("Your system is not configured with subuids");
3887 free(gname);
3888 free(uname);
3889 return;
3890 }
3891 while (getline(&line, &len, f) != -1) {
3892 size_t no_newline = 0;
3893 char *p = strchr(line, ':'), *p2;
3894 if (*line == '#')
3895 continue;
3896 if (!p)
3897 continue;
3898 *p = '\0';
3899 p++;
3900 if (strcmp(line, uname))
3901 continue;
3902 p2 = strchr(p, ':');
3903 if (!p2)
3904 continue;
3905 *p2 = '\0';
3906 p2++;
3907 if (!*p2)
3908 continue;
3909 no_newline = strcspn(p2, "\n");
3910 p2[no_newline] = '\0';
3911
3912 if (lxc_safe_uint(p, &uid) < 0)
3913 WARN("Could not parse UID.");
3914 if (lxc_safe_uint(p2, &urange) < 0)
3915 WARN("Could not parse UID range.");
3916 }
3917 fclose(f);
3918
3919 f = fopen(subgidfile, "r");
3920 if (!f) {
3921 ERROR("Your system is not configured with subgids");
3922 free(gname);
3923 free(uname);
3924 return;
3925 }
3926 while (getline(&line, &len, f) != -1) {
3927 size_t no_newline = 0;
3928 char *p = strchr(line, ':'), *p2;
3929 if (*line == '#')
3930 continue;
3931 if (!p)
3932 continue;
3933 *p = '\0';
3934 p++;
3935 if (strcmp(line, uname))
3936 continue;
3937 p2 = strchr(p, ':');
3938 if (!p2)
3939 continue;
3940 *p2 = '\0';
3941 p2++;
3942 if (!*p2)
3943 continue;
3944 no_newline = strcspn(p2, "\n");
3945 p2[no_newline] = '\0';
3946
3947 if (lxc_safe_uint(p, &gid) < 0)
3948 WARN("Could not parse GID.");
3949 if (lxc_safe_uint(p2, &grange) < 0)
3950 WARN("Could not parse GID range.");
3951 }
3952 fclose(f);
3953
3954 free(line);
3955
3956 if (!urange || !grange) {
3957 ERROR("You do not have subuids or subgids allocated");
3958 ERROR("Unprivileged containers require subuids and subgids");
3959 return;
3960 }
3961
3962 ERROR("You must either run as root, or define uid mappings");
3963 ERROR("To pass uid mappings to lxc-create, you could create");
3964 ERROR("~/.config/lxc/default.conf:");
3965 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
3966 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
3967 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
3968
3969 free(gname);
3970 free(uname);
3971 }
3972
3973 static void free_cgroup_settings(struct lxc_list *result)
3974 {
3975 struct lxc_list *iterator, *next;
3976
3977 lxc_list_for_each_safe(iterator, result, next) {
3978 lxc_list_del(iterator);
3979 free(iterator);
3980 }
3981 free(result);
3982 }
3983
3984 /*
3985 * Return the list of cgroup_settings sorted according to the following rules
3986 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
3987 */
3988 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
3989 {
3990 struct lxc_list *result;
3991 struct lxc_list *memsw_limit = NULL;
3992 struct lxc_list *it = NULL;
3993 struct lxc_cgroup *cg = NULL;
3994 struct lxc_list *item = NULL;
3995
3996 result = malloc(sizeof(*result));
3997 if (!result) {
3998 ERROR("failed to allocate memory to sort cgroup settings");
3999 return NULL;
4000 }
4001 lxc_list_init(result);
4002
4003 /*Iterate over the cgroup settings and copy them to the output list*/
4004 lxc_list_for_each(it, cgroup_settings) {
4005 item = malloc(sizeof(*item));
4006 if (!item) {
4007 ERROR("failed to allocate memory to sort cgroup settings");
4008 free_cgroup_settings(result);
4009 return NULL;
4010 }
4011 item->elem = it->elem;
4012 cg = it->elem;
4013 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4014 /* Store the memsw_limit location */
4015 memsw_limit = item;
4016 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
4017 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
4018 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
4019 item->elem = memsw_limit->elem;
4020 memsw_limit->elem = it->elem;
4021 }
4022 lxc_list_add_tail(result, item);
4023 }
4024
4025 return result;
4026 }