]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
conf: non-functional fixup
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
/* Provide pivot_root() ourselves when the C library does not export it. */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available at build time: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/* Provide sethostname() ourselves when the C library does not export it. */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	/* No syscall number available at build time: report "not implemented". */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

/*
 * Syscall numbers used when the system headers do not define
 * __NR_memfd_create. Architectures not listed here leave the macro
 * undefined, in which case the wrapper below fails with ENOSYS.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/* Thin wrapper around the raw memfd_create syscall. */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}

#endif
236
237 char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
240
241 typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
242
/* One entry of the mount option table: option keyword and its MS_* flag. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" */
	int clear;	/* set to 1 on entries like "rw"/"suid"; NOTE(review):
			 * presumably means "clear the flag" - the consumer
			 * is not visible here */
	int flag;	/* MS_* bit(s) associated with the keyword */
};

/* One entry of the capability table: capability name and its CAP_* number. */
struct caps_opt {
	char *name;
	int value;
};

/* One entry of the resource limit table: limit name and its RLIMIT_* id. */
struct limit_opt {
	char *name;
	int value;
};
258
/*
 * Configuration of the container the current API call is operating on.
 * The error reporting calls make use of it. When thread-local storage is
 * available (HAVE_TLS) every thread gets its own copy.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
269
270 /* Declare this here, since we don't want to reshuffle the whole file. */
271 static int in_caplist(int cap, struct lxc_list *caps);
272
273 static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
274 static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
275 static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
276 static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
277 static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
278 static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
279
280 static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
281 [LXC_NET_VETH] = instantiate_veth,
282 [LXC_NET_MACVLAN] = instantiate_macvlan,
283 [LXC_NET_VLAN] = instantiate_vlan,
284 [LXC_NET_PHYS] = instantiate_phys,
285 [LXC_NET_EMPTY] = instantiate_empty,
286 [LXC_NET_NONE] = instantiate_none,
287 };
288
289 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
290 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
291 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
292 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
293 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
294 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
295
296 static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
297 [LXC_NET_VETH] = shutdown_veth,
298 [LXC_NET_MACVLAN] = shutdown_macvlan,
299 [LXC_NET_VLAN] = shutdown_vlan,
300 [LXC_NET_PHYS] = shutdown_phys,
301 [LXC_NET_EMPTY] = shutdown_empty,
302 [LXC_NET_NONE] = shutdown_none,
303 };
304
305 static struct mount_opt mount_opt[] = {
306 { "async", 1, MS_SYNCHRONOUS },
307 { "atime", 1, MS_NOATIME },
308 { "bind", 0, MS_BIND },
309 { "defaults", 0, 0 },
310 { "dev", 1, MS_NODEV },
311 { "diratime", 1, MS_NODIRATIME },
312 { "dirsync", 0, MS_DIRSYNC },
313 { "exec", 1, MS_NOEXEC },
314 { "lazytime", 0, MS_LAZYTIME },
315 { "mand", 0, MS_MANDLOCK },
316 { "noatime", 0, MS_NOATIME },
317 { "nodev", 0, MS_NODEV },
318 { "nodiratime", 0, MS_NODIRATIME },
319 { "noexec", 0, MS_NOEXEC },
320 { "nomand", 1, MS_MANDLOCK },
321 { "norelatime", 1, MS_RELATIME },
322 { "nostrictatime", 1, MS_STRICTATIME },
323 { "nosuid", 0, MS_NOSUID },
324 { "rbind", 0, MS_BIND|MS_REC },
325 { "relatime", 0, MS_RELATIME },
326 { "remount", 0, MS_REMOUNT },
327 { "ro", 0, MS_RDONLY },
328 { "rw", 1, MS_RDONLY },
329 { "strictatime", 0, MS_STRICTATIME },
330 { "suid", 1, MS_NOSUID },
331 { "sync", 0, MS_SYNCHRONOUS },
332 { NULL, 0, 0 },
333 };
334
335 #if HAVE_LIBCAP
336 static struct caps_opt caps_opt[] = {
337 { "chown", CAP_CHOWN },
338 { "dac_override", CAP_DAC_OVERRIDE },
339 { "dac_read_search", CAP_DAC_READ_SEARCH },
340 { "fowner", CAP_FOWNER },
341 { "fsetid", CAP_FSETID },
342 { "kill", CAP_KILL },
343 { "setgid", CAP_SETGID },
344 { "setuid", CAP_SETUID },
345 { "setpcap", CAP_SETPCAP },
346 { "linux_immutable", CAP_LINUX_IMMUTABLE },
347 { "net_bind_service", CAP_NET_BIND_SERVICE },
348 { "net_broadcast", CAP_NET_BROADCAST },
349 { "net_admin", CAP_NET_ADMIN },
350 { "net_raw", CAP_NET_RAW },
351 { "ipc_lock", CAP_IPC_LOCK },
352 { "ipc_owner", CAP_IPC_OWNER },
353 { "sys_module", CAP_SYS_MODULE },
354 { "sys_rawio", CAP_SYS_RAWIO },
355 { "sys_chroot", CAP_SYS_CHROOT },
356 { "sys_ptrace", CAP_SYS_PTRACE },
357 { "sys_pacct", CAP_SYS_PACCT },
358 { "sys_admin", CAP_SYS_ADMIN },
359 { "sys_boot", CAP_SYS_BOOT },
360 { "sys_nice", CAP_SYS_NICE },
361 { "sys_resource", CAP_SYS_RESOURCE },
362 { "sys_time", CAP_SYS_TIME },
363 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
364 { "mknod", CAP_MKNOD },
365 { "lease", CAP_LEASE },
366 #ifdef CAP_AUDIT_READ
367 { "audit_read", CAP_AUDIT_READ },
368 #endif
369 #ifdef CAP_AUDIT_WRITE
370 { "audit_write", CAP_AUDIT_WRITE },
371 #endif
372 #ifdef CAP_AUDIT_CONTROL
373 { "audit_control", CAP_AUDIT_CONTROL },
374 #endif
375 { "setfcap", CAP_SETFCAP },
376 { "mac_override", CAP_MAC_OVERRIDE },
377 { "mac_admin", CAP_MAC_ADMIN },
378 #ifdef CAP_SYSLOG
379 { "syslog", CAP_SYSLOG },
380 #endif
381 #ifdef CAP_WAKE_ALARM
382 { "wake_alarm", CAP_WAKE_ALARM },
383 #endif
384 #ifdef CAP_BLOCK_SUSPEND
385 { "block_suspend", CAP_BLOCK_SUSPEND },
386 #endif
387 };
388 #else
389 static struct caps_opt caps_opt[] = {};
390 #endif
391
392 static struct limit_opt limit_opt[] = {
393 #ifdef RLIMIT_AS
394 { "as", RLIMIT_AS },
395 #endif
396 #ifdef RLIMIT_CORE
397 { "core", RLIMIT_CORE },
398 #endif
399 #ifdef RLIMIT_CPU
400 { "cpu", RLIMIT_CPU },
401 #endif
402 #ifdef RLIMIT_DATA
403 { "data", RLIMIT_DATA },
404 #endif
405 #ifdef RLIMIT_FSIZE
406 { "fsize", RLIMIT_FSIZE },
407 #endif
408 #ifdef RLIMIT_LOCKS
409 { "locks", RLIMIT_LOCKS },
410 #endif
411 #ifdef RLIMIT_MEMLOCK
412 { "memlock", RLIMIT_MEMLOCK },
413 #endif
414 #ifdef RLIMIT_MSGQUEUE
415 { "msgqueue", RLIMIT_MSGQUEUE },
416 #endif
417 #ifdef RLIMIT_NICE
418 { "nice", RLIMIT_NICE },
419 #endif
420 #ifdef RLIMIT_NOFILE
421 { "nofile", RLIMIT_NOFILE },
422 #endif
423 #ifdef RLIMIT_NPROC
424 { "nproc", RLIMIT_NPROC },
425 #endif
426 #ifdef RLIMIT_RSS
427 { "rss", RLIMIT_RSS },
428 #endif
429 #ifdef RLIMIT_RTPRIO
430 { "rtprio", RLIMIT_RTPRIO },
431 #endif
432 #ifdef RLIMIT_RTTIME
433 { "rttime", RLIMIT_RTTIME },
434 #endif
435 #ifdef RLIMIT_SIGPENDING
436 { "sigpending", RLIMIT_SIGPENDING },
437 #endif
438 #ifdef RLIMIT_STACK
439 { "stack", RLIMIT_STACK },
440 #endif
441 };
442
443 static int run_buffer(char *buffer)
444 {
445 struct lxc_popen_FILE *f;
446 char *output;
447 int ret;
448
449 f = lxc_popen(buffer);
450 if (!f) {
451 SYSERROR("Failed to popen() %s.", buffer);
452 return -1;
453 }
454
455 output = malloc(LXC_LOG_BUFFER_SIZE);
456 if (!output) {
457 ERROR("Failed to allocate memory for %s.", buffer);
458 lxc_pclose(f);
459 return -1;
460 }
461
462 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
463 DEBUG("Script %s with output: %s.", buffer, output);
464
465 free(output);
466
467 ret = lxc_pclose(f);
468 if (ret == -1) {
469 SYSERROR("Script exited with error.");
470 return -1;
471 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
472 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
473 return -1;
474 } else if (WIFSIGNALED(ret)) {
475 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
476 return -1;
477 }
478
479 return 0;
480 }
481
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it via run_buffer().
 *
 * @name     container name, passed as first script argument
 * @section  config section, passed as second script argument
 * @script   the script to run
 * @hook     hook name, passed as third script argument
 * @lxcpath  currently unused
 * @argsin   optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line lives on the heap: its length is derived from
 * configuration-supplied strings, so a stack allocation via alloca() could
 * overflow the stack, and alloca() cannot report failure.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Every extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* snprintf() lengths are tracked in an int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret =
	    snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
532
/*
 * Build the command line "<script> <name> <section> [args...]" from a
 * NULL-terminated variable argument list of strings and execute it via
 * run_buffer().
 *
 * @name     container name, passed as first script argument
 * @section  config section, passed as second script argument
 * @script   the script to run
 * @...      extra "char *" arguments; the list MUST end with a NULL pointer
 *
 * The command line lives on the heap rather than in an alloca() buffer
 * because its size depends on caller-supplied strings, and every exit path
 * after va_start() goes through va_end().
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: every extra argument is preceded by one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* snprintf() lengths are tracked in an int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
584
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep
 * it open for the duration of the container run, to prevent the container
 * from marking the underlying fs readonly on shutdown. The file is
 * unlinked immediately so that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path
 *           cannot be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* Reject both output errors and truncated paths. */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open descriptor keeps the fs busy; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
627
/*
 * When a remount is requested, carry over the NOSUID, NODEV, RDONLY and
 * NOEXEC restrictions that are currently in effect on the filesystem, so
 * that the remount does not drop them.
 *
 * @s      source path to inspect; @d is used instead when @s is NULL
 * @d      destination path
 * @flags  requested mount flags
 *
 * Returns @flags unchanged if MS_REMOUNT is not set, if no path is given,
 * if statvfs() fails or if the build lacks statvfs support; otherwise
 * @flags with the inherited restriction bits added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	const unsigned long inherited = MS_NOSUID | MS_NODEV | MS_RDONLY | MS_NOEXEC;
	const char *target = s ? s : d;
	struct statvfs vfs;

	if ((flags & MS_REMOUNT) == 0 || target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	return flags | (vfs.f_flag & inherited);
#endif
}
664
665 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
666 {
667 int r;
668 int i;
669 static struct {
670 int match_mask;
671 int match_flag;
672 const char *source;
673 const char *destination;
674 const char *fstype;
675 unsigned long flags;
676 const char *options;
677 } default_mounts[] = {
678 /* Read-only bind-mounting... In older kernels, doing that required
679 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
680 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
681 * kernel 2.6.26 onwards. However, this apparently does not work on
682 * kernel 3.8. Unfortunately, on that very same kernel, doing the
683 * same trick as above doesn't seem to work either, there one needs
684 * to ALSO specify MS_BIND for the remount, otherwise the entire
685 * fs is remounted read-only or the mount fails because it's busy...
686 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
687 * 2.6.32...
688 */
689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
690 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
705 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
706 { 0, 0, NULL, NULL, NULL, 0, NULL }
707 };
708
709 for (i = 0; default_mounts[i].match_mask; i++) {
710 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
711 char *source = NULL;
712 char *destination = NULL;
713 int saved_errno;
714 unsigned long mflags;
715
716 if (default_mounts[i].source) {
717 /* will act like strdup if %r is not present */
718 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
719 if (!source) {
720 SYSERROR("memory allocation error");
721 return -1;
722 }
723 }
724 if (!default_mounts[i].destination) {
725 ERROR("BUG: auto mounts destination %d was NULL", i);
726 free(source);
727 return -1;
728 }
729 /* will act like strdup if %r is not present */
730 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
731 if (!destination) {
732 saved_errno = errno;
733 SYSERROR("memory allocation error");
734 free(source);
735 errno = saved_errno;
736 return -1;
737 }
738 mflags = add_required_remount_flags(source, destination,
739 default_mounts[i].flags);
740 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
741 saved_errno = errno;
742 if (r < 0 && errno == ENOENT) {
743 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
744 r = 0;
745 }
746 else if (r < 0)
747 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
748
749 free(source);
750 free(destination);
751 if (r < 0) {
752 errno = saved_errno;
753 return -1;
754 }
755 }
756 }
757
758 if (flags & LXC_AUTO_CGROUP_MASK) {
759 int cg_flags;
760
761 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
771 int has_sys_admin = 0;
772
773 if (!lxc_list_empty(&conf->keepcaps))
774 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
775 else
776 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
777
778 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
779 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
780 else
781 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
782 }
783
784 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
785 SYSERROR("error mounting /sys/fs/cgroup");
786 return -1;
787 }
788 }
789
790 return 0;
791 }
792
/*
 * Set the hostname to the nodename stored in @utsname.
 * A NULL @utsname means there is nothing to configure.
 *
 * Returns 0 on success (or when nothing was to be done), -1 if
 * sethostname() failed.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);
	return 0;
}
807
/* A symlink to create below /dev: link name and the target it points at. */
struct dev_symlinks {
	const char *oldpath;	/* symlink target */
	const char *name;	/* link name relative to /dev */
};

/* The standard /proc/self/fd based links. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
819
820 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
821 {
822 char path[MAXPATHLEN];
823 int ret,i;
824 struct stat s;
825
826
827 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
828 const struct dev_symlinks *d = &dev_symlinks[i];
829 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
830 if (ret < 0 || ret >= MAXPATHLEN)
831 return -1;
832
833 /*
834 * Stat the path first. If we don't get an error
835 * accept it as is and don't try to create it
836 */
837 if (!stat(path, &s)) {
838 continue;
839 }
840
841 ret = symlink(d->oldpath, path);
842
843 if (ret && errno != EEXIST) {
844 if ( errno == EROFS ) {
845 WARN("Warning: Read Only file system while creating %s", path);
846 } else {
847 SYSERROR("Error creating %s", path);
848 return -1;
849 }
850 }
851 }
852 return 0;
853 }
854
/*
 * Maintain the space-separated list of pty names that is handed to systemd.
 *
 * If *pp is NULL a new string "container_ttys=<name>" is allocated,
 * otherwise " <name>" is appended to the existing string. On allocation
 * failure false is returned and *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (*pp == NULL) {
		const size_t prefix_len = sizeof(prefix) - 1;

		grown = malloc(name_len + prefix_len + 1);
		if (grown == NULL)
			return false;

		memcpy(grown, prefix, prefix_len);
		/* Copies the terminating NUL byte as well. */
		memcpy(grown + prefix_len, name, name_len + 1);
		*pp = grown;
		return true;
	}

	cur_len = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (grown == NULL)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
877
878 static int lxc_setup_tty(struct lxc_conf *conf)
879 {
880 int i, ret;
881 const struct lxc_tty_info *tty_info = &conf->tty_info;
882 char *ttydir = conf->ttydir;
883 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
884
885 if (!conf->rootfs.path)
886 return 0;
887
888 for (i = 0; i < tty_info->nbtty; i++) {
889 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
890
891 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
892 if (ret < 0 || (size_t)ret >= sizeof(path)) {
893 ERROR("pathname too long for ttys");
894 return -1;
895 }
896
897 if (ttydir) {
898 /* create dev/lxc/tty%d" */
899 ret = snprintf(lxcpath, sizeof(lxcpath),
900 "/dev/%s/tty%d", ttydir, i + 1);
901 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
902 ERROR("pathname too long for ttys");
903 return -1;
904 }
905
906 ret = creat(lxcpath, 0660);
907 if (ret < 0 && errno != EEXIST) {
908 SYSERROR("failed to create \"%s\"", lxcpath);
909 return -1;
910 }
911 if (ret >= 0)
912 close(ret);
913
914 ret = unlink(path);
915 if (ret < 0 && errno != ENOENT) {
916 SYSERROR("failed to unlink \"%s\"", path);
917 return -1;
918 }
919
920 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
921 if (ret < 0) {
922 WARN("failed to bind mount \"%s\" onto \"%s\"",
923 pty_info->name, path);
924 continue;
925 }
926 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
927 path);
928
929 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
930 ttydir, i + 1);
931 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
932 ERROR("tty pathname too long");
933 return -1;
934 }
935
936 ret = symlink(lxcpath, path);
937 if (ret < 0) {
938 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
939 path, lxcpath);
940 return -1;
941 }
942 } else {
943 /* If we populated /dev, then we need to create
944 * /dev/ttyN
945 */
946 ret = access(path, F_OK);
947 if (ret < 0) {
948 ret = creat(path, 0660);
949 if (ret < 0) {
950 SYSERROR("failed to create \"%s\"", path);
951 /* this isn't fatal, continue */
952 } else {
953 close(ret);
954 }
955 }
956
957 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
958 if (ret < 0) {
959 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
960 continue;
961 }
962
963 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
964 path);
965 }
966
967 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
968 ERROR("Error setting up container_ttys string");
969 return -1;
970 }
971 }
972
973 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
974 return 0;
975 }
976
977 static int setup_rootfs_pivot_root(const char *rootfs)
978 {
979 int oldroot = -1, newroot = -1;
980
981 oldroot = open("/", O_DIRECTORY | O_RDONLY);
982 if (oldroot < 0) {
983 SYSERROR("Error opening old-/ for fchdir");
984 return -1;
985 }
986 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
987 if (newroot < 0) {
988 SYSERROR("Error opening new-/ for fchdir");
989 goto fail;
990 }
991
992 /* change into new root fs */
993 if (fchdir(newroot)) {
994 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
995 goto fail;
996 }
997
998 /* pivot_root into our new root fs */
999 if (pivot_root(".", ".")) {
1000 SYSERROR("pivot_root syscall failed");
1001 goto fail;
1002 }
1003
1004 /*
1005 * at this point the old-root is mounted on top of our new-root
1006 * To unmounted it we must not be chdir'd into it, so escape back
1007 * to old-root
1008 */
1009 if (fchdir(oldroot) < 0) {
1010 SYSERROR("Error entering oldroot");
1011 goto fail;
1012 }
1013 if (umount2(".", MNT_DETACH) < 0) {
1014 SYSERROR("Error detaching old root");
1015 goto fail;
1016 }
1017
1018 if (fchdir(newroot) < 0) {
1019 SYSERROR("Error re-entering newroot");
1020 goto fail;
1021 }
1022
1023 close(oldroot);
1024 close(newroot);
1025
1026 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1027
1028 return 0;
1029
1030 fail:
1031 if (oldroot != -1)
1032 close(oldroot);
1033 if (newroot != -1)
1034 close(newroot);
1035 return -1;
1036 }
1037
1038 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1039 * error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1042 const char *lxcpath)
1043 {
1044 int ret;
1045 size_t clen;
1046 char *path;
1047
1048 INFO("Preparing \"/dev\"");
1049
1050 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1051 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1052 path = alloca(clen);
1053
1054 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1055 if (ret < 0 || (size_t)ret >= clen)
1056 return -1;
1057
1058 if (!dir_exists(path)) {
1059 WARN("\"/dev\" directory does not exist. Proceeding without "
1060 "autodev being set up");
1061 return 0;
1062 }
1063
1064 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1065 rootfs->path ? rootfs->mount : NULL);
1066 if (ret < 0) {
1067 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1068 return -1;
1069 }
1070 INFO("Mounted tmpfs on \"%s\"", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || (size_t)ret >= clen)
1074 return -1;
1075
1076 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1077 * If not, then create it and exit if that fails...
1078 */
1079 if (!dir_exists(path)) {
1080 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1081 if (ret < 0) {
1082 SYSERROR("Failed to create directory \"%s\"", path);
1083 return -1;
1084 }
1085 }
1086
1087 INFO("Prepared \"/dev\"");
1088 return 0;
1089 }
1090
/* A device node to provide in an auto-populated /dev. */
struct lxc_devs {
	const char *name;	/* node name below /dev */
	mode_t mode;		/* file type and permission bits for mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* The basic character devices created by lxc_fill_autodev(). */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
};
1106
1107 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1108 {
1109 int ret;
1110 char path[MAXPATHLEN];
1111 int i;
1112 mode_t cmask;
1113
1114 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1115 rootfs->path ? rootfs->mount : "");
1116 if (ret < 0 || ret >= MAXPATHLEN)
1117 return -1;
1118
1119 /* ignore, just don't try to fill in */
1120 if (!dir_exists(path))
1121 return 0;
1122
1123 INFO("Populating \"/dev\"");
1124
1125 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1126 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1127 const struct lxc_devs *d = &lxc_devs[i];
1128
1129 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1130 rootfs->path ? rootfs->mount : "", d->name);
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
1133
1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1135 if (ret < 0) {
1136 FILE *pathfile;
1137 char hostpath[MAXPATHLEN];
1138
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150
1151 pathfile = fopen(path, "wb");
1152 if (!pathfile) {
1153 SYSERROR("Failed to create file \"%s\"", path);
1154 return -1;
1155 }
1156 fclose(pathfile);
1157
1158 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1159 rootfs->path ? rootfs->mount : NULL);
1160 if (ret < 0) {
1161 SYSERROR("Failed to bind mount \"%s\" from "
1162 "host into container",
1163 d->name);
1164 return -1;
1165 }
1166 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1167 path);
1168 } else {
1169 DEBUG("Created device node \"%s\"", path);
1170 }
1171 }
1172 umask(cmask);
1173
1174 INFO("Populated \"/dev\"");
1175 return 0;
1176 }
1177
1178 static int lxc_setup_rootfs(struct lxc_conf *conf)
1179 {
1180 int ret;
1181 struct lxc_storage *bdev;
1182 const struct lxc_rootfs *rootfs;
1183
1184 rootfs = &conf->rootfs;
1185 if (!rootfs->path) {
1186 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1187 SYSERROR("Failed to make / rslave.");
1188 return -1;
1189 }
1190 return 0;
1191 }
1192
1193 if (access(rootfs->mount, F_OK)) {
1194 SYSERROR("Failed to access to \"%s\". Check it is present.",
1195 rootfs->mount);
1196 return -1;
1197 }
1198
1199 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1200 if (!bdev) {
1201 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1202 rootfs->path, rootfs->mount,
1203 rootfs->options ? rootfs->options : "(null)");
1204 return -1;
1205 }
1206
1207 ret = bdev->ops->mount(bdev);
1208 storage_put(bdev);
1209 if (ret < 0) {
1210 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1211 rootfs->path, rootfs->mount,
1212 rootfs->options ? rootfs->options : "(null)");
1213 return -1;
1214 }
1215
1216 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1217 rootfs->path, rootfs->mount,
1218 rootfs->options ? rootfs->options : "(null)");
1219
1220 return 0;
1221 }
1222
1223 int prepare_ramfs_root(char *root)
1224 {
1225 char buf[LXC_LINELEN], *p;
1226 char nroot[PATH_MAX];
1227 FILE *f;
1228 int i;
1229 char *p2;
1230
1231 if (realpath(root, nroot) == NULL)
1232 return -errno;
1233
1234 if (chdir("/") == -1)
1235 return -errno;
1236
1237 /*
1238 * We could use here MS_MOVE, but in userns this mount is
1239 * locked and can't be moved.
1240 */
1241 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1242 SYSERROR("Failed to move %s into /", root);
1243 return -errno;
1244 }
1245
1246 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1247 SYSERROR("Failed to make . rprivate");
1248 return -errno;
1249 }
1250
1251 /*
1252 * The following code cleans up inhereted mounts which are not
1253 * required for CT.
1254 *
1255 * The mountinfo file shows not all mounts, if a few points have been
1256 * unmounted between read operations from the mountinfo. So we need to
1257 * read mountinfo a few times.
1258 *
1259 * This loop can be skipped if a container uses unserns, because all
1260 * inherited mounts are locked and we should live with all this trash.
1261 */
1262 while (1) {
1263 int progress = 0;
1264
1265 f = fopen("./proc/self/mountinfo", "r");
1266 if (!f) {
1267 SYSERROR("Unable to open /proc/self/mountinfo");
1268 return -1;
1269 }
1270 while (fgets(buf, LXC_LINELEN, f)) {
1271 for (p = buf, i=0; p && i < 4; i++)
1272 p = strchr(p+1, ' ');
1273 if (!p)
1274 continue;
1275 p2 = strchr(p+1, ' ');
1276 if (!p2)
1277 continue;
1278
1279 *p2 = '\0';
1280 *p = '.';
1281
1282 if (strcmp(p + 1, "/") == 0)
1283 continue;
1284 if (strcmp(p + 1, "/proc") == 0)
1285 continue;
1286
1287 if (umount2(p, MNT_DETACH) == 0)
1288 progress++;
1289 }
1290 fclose(f);
1291 if (!progress)
1292 break;
1293 }
1294
1295 /* This also can be skipped if a container uses unserns */
1296 umount2("./proc", MNT_DETACH);
1297
1298 /* It is weird, but chdir("..") moves us in a new root */
1299 if (chdir("..") == -1) {
1300 SYSERROR("Unable to change working directory");
1301 return -1;
1302 }
1303
1304 if (chroot(".") == -1) {
1305 SYSERROR("Unable to chroot");
1306 return -1;
1307 }
1308
1309 return 0;
1310 }
1311
1312 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1313 {
1314 if (!rootfs->path) {
1315 DEBUG("container does not have a rootfs, so not doing pivot root");
1316 return 0;
1317 }
1318
1319 if (detect_ramfs_rootfs()) {
1320 DEBUG("detected that container is on ramfs");
1321 if (prepare_ramfs_root(rootfs->mount)) {
1322 ERROR("failed to prepare minimal ramfs root");
1323 return -1;
1324 }
1325
1326 DEBUG("prepared ramfs root for container");
1327 return 0;
1328 }
1329
1330 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1331 ERROR("failed to pivot root");
1332 return -1;
1333 }
1334
1335 DEBUG("finished pivot root");
1336 return 0;
1337 }
1338
/* Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to the
 * ptmx node of that instance.
 *
 * @num_pts is appended to the mount options as "max=<num_pts>"; when it is
 * 0 nothing is mounted.
 *
 * /dev/ptmx is set up as a bind mount of /dev/pts/ptmx; if the bind mount
 * fails, a symlink is created instead.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	char devpts_mntopts[256];
	const char *default_devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int ret;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       default_devpts_mntopts, num_pts);
	if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
		return -1;

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* A stale /dev/ptmx would be in the way, so remove it. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file acts as target for the bind mount below. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: continue with the symlink fallback. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above has to go before the symlink can be
	 * created.
	 */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1429
/* Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality untouched". When the build
 * does not have <sys/personality.h> available this is a no-op.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1446
1447 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1448 const struct lxc_console *console)
1449 {
1450 char path[MAXPATHLEN];
1451 int ret, fd;
1452
1453 if (console->path && !strcmp(console->path, "none"))
1454 return 0;
1455
1456 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1457 if (ret < 0 || (size_t)ret >= sizeof(path))
1458 return -1;
1459
1460 /* When we are asked to setup a console we remove any previous
1461 * /dev/console bind-mounts.
1462 */
1463 if (file_exists(path)) {
1464 ret = lxc_unstack_mountpoint(path, false);
1465 if (ret < 0) {
1466 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1467 return -ret;
1468 } else {
1469 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1470 }
1471
1472 ret = unlink(path);
1473 if (ret < 0) {
1474 SYSERROR("error unlinking %s", path);
1475 return -errno;
1476 }
1477 }
1478
1479 /* For unprivileged containers autodev or automounts will already have
1480 * taken care of creating /dev/console.
1481 */
1482 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1483 if (fd < 0) {
1484 if (errno != EEXIST) {
1485 SYSERROR("failed to create console");
1486 return -errno;
1487 }
1488 } else {
1489 close(fd);
1490 }
1491
1492 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1493 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1494 return -errno;
1495 }
1496
1497 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1498 ERROR("failed to mount '%s' on '%s'", console->name, path);
1499 return -1;
1500 }
1501
1502 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1503 return 0;
1504 }
1505
1506 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1507 const struct lxc_console *console,
1508 char *ttydir)
1509 {
1510 int ret;
1511 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1512
1513 /* create rootfs/dev/<ttydir> directory */
1514 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1515 if (ret < 0 || (size_t)ret >= sizeof(path))
1516 return -1;
1517
1518 ret = mkdir(path, 0755);
1519 if (ret && errno != EEXIST) {
1520 SYSERROR("failed with errno %d to create %s", errno, path);
1521 return -errno;
1522 }
1523 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1524
1525 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1526 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1527 return -1;
1528
1529 ret = creat(lxcpath, 0660);
1530 if (ret == -1 && errno != EEXIST) {
1531 SYSERROR("error %d creating %s", errno, lxcpath);
1532 return -errno;
1533 }
1534 if (ret >= 0)
1535 close(ret);
1536
1537 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1538 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1539 return -1;
1540
1541 /* When we are asked to setup a console we remove any previous
1542 * /dev/console bind-mounts.
1543 */
1544 if (console->path && !strcmp(console->path, "none")) {
1545 struct stat st;
1546 ret = stat(path, &st);
1547 if (ret < 0) {
1548 if (errno == ENOENT)
1549 return 0;
1550 SYSERROR("failed stat() \"%s\"", path);
1551 return -errno;
1552 }
1553
1554 /* /dev/console must be character device with major number 5 and
1555 * minor number 1. If not, give benefit of the doubt and assume
1556 * the user has mounted something else right there on purpose.
1557 */
1558 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1559 return 0;
1560
1561 /* In case the user requested a bind-mount for /dev/console and
1562 * requests a ttydir we move the mount to the
1563 * /dev/<ttydir/console.
1564 * Note, we only move the uppermost mount and clear all other
1565 * mounts underneath for safety.
1566 * If it is a character device created via mknod() we simply
1567 * rename it.
1568 */
1569 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1570 if (ret < 0) {
1571 if (errno != EINVAL) {
1572 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1573 return -errno;
1574 }
1575 /* path was not a mountpoint */
1576 ret = rename(path, lxcpath);
1577 if (ret < 0) {
1578 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1579 return -errno;
1580 }
1581 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1582 } else {
1583 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1584 }
1585
1586 /* Clear all remaining bind-mounts. */
1587 ret = lxc_unstack_mountpoint(path, false);
1588 if (ret < 0) {
1589 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1590 return -ret;
1591 } else {
1592 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1593 }
1594 } else {
1595 if (file_exists(path)) {
1596 ret = lxc_unstack_mountpoint(path, false);
1597 if (ret < 0) {
1598 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1599 return -ret;
1600 } else {
1601 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1602 }
1603 }
1604
1605 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1606 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1607 return -1;
1608 }
1609 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1610 }
1611
1612 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1613 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1614 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1615 return -1;
1616
1617 ret = unlink(path);
1618 if (ret && errno != ENOENT) {
1619 SYSERROR("error unlinking %s", path);
1620 return -errno;
1621 }
1622
1623 ret = symlink(lxcpath, path);
1624 if (ret < 0) {
1625 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1626 return -1;
1627 }
1628
1629 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1630 return 0;
1631 }
1632
1633 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1634 const struct lxc_console *console, char *ttydir)
1635 {
1636 /* We don't have a rootfs, /dev/console will be shared. */
1637 if (!rootfs->path) {
1638 DEBUG("/dev/console will be shared with the host");
1639 return 0;
1640 }
1641
1642 if (!ttydir)
1643 return lxc_setup_dev_console(rootfs, console);
1644
1645 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1646 }
1647
1648 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1649 {
1650 struct mount_opt *mo;
1651
1652 /* If opt is found in mount_opt, set or clear flags.
1653 * Otherwise append it to data. */
1654
1655 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1656 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1657 if (mo->clear)
1658 *flags &= ~mo->flag;
1659 else
1660 *flags |= mo->flag;
1661 return;
1662 }
1663 }
1664
1665 if (strlen(*data))
1666 strcat(*data, ",");
1667 strcat(*data, opt);
1668 }
1669
/* Split the comma-separated option string @mntopts into mount flags and a
 * data string.
 *
 * @mntflags receives the flags. @mntdata receives a newly allocated string
 * holding the options that are not flags, or NULL if there are none; the
 * caller is responsible for freeing it.
 *
 * Returns 0 on success (including @mntopts == NULL), -1 if memory could not
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *data, *token;
	char *saveptr = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the option string. */
	data = malloc(strlen(copy) + 1);
	if (!data) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	*data = 0;

	token = strtok_r(copy, ",", &saveptr);
	while (token) {
		parse_mntopt(token, mntflags, &data);
		token = strtok_r(NULL, ",", &saveptr);
	}

	if (*data)
		*mntdata = data;
	else
		free(data);
	free(copy);

	return 0;
}
1708
/* Terminate @word at its first space or tab (or leave it unchanged if it
 * contains neither).
 */
static void null_endofword(char *word)
{
	char *end;

	for (end = word; *end != '\0'; end++) {
		if (*end == ' ' || *end == '\t')
			break;
	}

	*end = '\0';
}
1715
/*
 * Skip @nfields space- or tab-separated fields in @src.
 *
 * Returns a pointer to the character following the @nfields'th separator,
 * or to the terminating NUL byte if @src has fewer separators than that.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Advance to the next separator or the end of the string. */
		for (; *cur != '\0'; cur++) {
			if (*cur == ' ' || *cur == '\t')
				break;
		}

		if (*cur == '\0')
			break;

		/* Step over the separator itself. */
		cur++;
	}

	return cur;
}
1733
/* Mount @fsname on @target and, for bind or remount requests, remount it so
 * that the requested flags take effect.
 *
 * @optional: if non-zero a failing mount is only logged and 0 is returned.
 * @dev:      if zero, MS_NODEV found on the source is carried over to the
 *            remount (only evaluated when statvfs() is available).
 * @rootfs:   handed through to safe_mount().
 *
 * Returns 0 on success (or on a tolerated failure), -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	int ret;
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	/* The first mount never carries MS_REMOUNT. */
	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and explicit remounts need a second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto out;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
	      "options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rqd_flags = mountflags & MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		/* Carry over the restricting flags of the source. */
		if (sb.f_flag & MS_NOSUID)
			required_flags |= MS_NOSUID;

		if ((sb.f_flag & MS_NODEV) && !dev)
			required_flags |= MS_NODEV;

		if (sb.f_flag & MS_RDONLY)
			required_flags |= MS_RDONLY;

		if (sb.f_flag & MS_NOEXEC)
			required_flags |= MS_NOEXEC;

		DEBUG("Flags for \"%s\" were %lu, required extra flags "
		      "are %lu", fsname, sb.f_flag, required_flags);

		/* If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("Mountflags already were %lu, "
			      "skipping remount", mountflags);
			goto out;
		}

		mountflags |= required_flags;
	}
#endif

	ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
	if (ret < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" "
		     "(optional): %s", fsname, target,
		     strerror(errno));
		return 0;
	}

out:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1824
/* Remove "optional", "create=dir", and "create=file" from mntent->mnt_opts.
 *
 * The option string is edited in place. Only the first occurrence of each
 * option is removed; if it is the last option the string is simply cut off
 * where the option starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {"create=dir", "create=file", "optional"};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma) {
			/* Pull the remaining options over the culled one. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		} else {
			/* no more mntopts, so just chop it here */
			*hit = '\0';
		}
	}
}
1848
/* Create the mount target @path as requested by the entry's options.
 *
 * For "overlay*" and "aufs*" filesystem types the matching mkdir helper is
 * called first. "create=dir" creates @path as a directory, "create=file"
 * creates the parent directories and an empty file at @path unless @path
 * already exists.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int fd, ret = 0;
	char *path_copy, *parent;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0)
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(mntent->mnt_type, "aufs", 4) == 0)
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	/* Nothing more to do unless a file was requested and is missing. */
	if (!hasmntopt(mntent, "create=file") || access(path, F_OK) == 0)
		return 0;

	/* dirname() may modify its argument, so hand it a copy. */
	path_copy = strdup(path);
	if (!path_copy)
		return -1;

	parent = dirname(path_copy);

	ret = mkdir_p(parent, 0755);
	free(path_copy);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1897
1898 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1899 * without a rootfs. */
1900 static inline int mount_entry_on_generic(struct mntent *mntent,
1901 const char *path,
1902 const struct lxc_rootfs *rootfs,
1903 const char *lxc_name,
1904 const char *lxc_path)
1905 {
1906 int ret;
1907 unsigned long mntflags;
1908 char *mntdata;
1909 bool dev, optional;
1910 char *rootfs_path = NULL;
1911
1912 optional = hasmntopt(mntent, "optional") != NULL;
1913 dev = hasmntopt(mntent, "dev") != NULL;
1914
1915 if (rootfs && rootfs->path)
1916 rootfs_path = rootfs->mount;
1917
1918 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1919 lxc_path);
1920 if (ret < 0) {
1921 if (optional)
1922 return 0;
1923
1924 return -1;
1925 }
1926 cull_mntent_opt(mntent);
1927
1928 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1929 if (ret < 0)
1930 return -1;
1931
1932 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1933 mntdata, optional, dev, rootfs_path);
1934
1935 free(mntdata);
1936 return ret;
1937 }
1938
/* Mount an entry for a container that has no rootfs.
 *
 * Returns -1 if the target path does not fit into the buffer, otherwise the
 * result of mount_entry_on_generic().
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	int len;

	/* For containers created without a rootfs all mounts are treated as
	 * absolute paths starting at / on the host.
	 */
	if (mntent->mnt_dir[0] == '/')
		len = snprintf(path, sizeof(path), "%s", mntent->mnt_dir);
	else
		len = snprintf(path, sizeof(path), "/%s", mntent->mnt_dir);

	if (len < 0 || (size_t)len >= sizeof(path))
		return -1;

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1956
1957 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1958 const struct lxc_rootfs *rootfs,
1959 const char *lxc_name,
1960 const char *lxc_path)
1961 {
1962 int offset;
1963 char *aux;
1964 const char *lxcpath;
1965 char path[MAXPATHLEN];
1966 int ret = 0;
1967
1968 lxcpath = lxc_global_config_value("lxc.lxcpath");
1969 if (!lxcpath)
1970 return -1;
1971
1972 /* If rootfs->path is a blockdev path, allow container fstab to use
1973 * <lxcpath>/<name>/rootfs" as the target prefix.
1974 */
1975 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1976 if (ret < 0 || ret >= MAXPATHLEN)
1977 goto skipvarlib;
1978
1979 aux = strstr(mntent->mnt_dir, path);
1980 if (aux) {
1981 offset = strlen(path);
1982 goto skipabs;
1983 }
1984
1985 skipvarlib:
1986 aux = strstr(mntent->mnt_dir, rootfs->path);
1987 if (!aux) {
1988 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
1989 return ret;
1990 }
1991 offset = strlen(rootfs->path);
1992
1993 skipabs:
1994 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1995 if (ret < 0 || ret >= MAXPATHLEN)
1996 return -1;
1997
1998 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1999 }
2000
2001 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2002 const struct lxc_rootfs *rootfs,
2003 const char *lxc_name,
2004 const char *lxc_path)
2005 {
2006 char path[MAXPATHLEN];
2007 int ret;
2008
2009 /* relative to root mount point */
2010 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2011 if (ret < 0 || ret >= sizeof(path)) {
2012 ERROR("path name too long");
2013 return -1;
2014 }
2015
2016 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2017 }
2018
2019 /* This logs a NOTICE() when a user specifies mounts that would conflict with
2020 * devices liblxc sets up automatically.
2021 */
2022 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2023 const char *dest)
2024 {
2025 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2026 bool needs_warning = false;
2027
2028 clean_mnt_fsname = lxc_deslashify(src);
2029 if (!clean_mnt_fsname)
2030 return;
2031
2032 clean_mnt_dir = lxc_deslashify(dest);
2033 if (!clean_mnt_dir) {
2034 free(clean_mnt_fsname);
2035 return;
2036 }
2037
2038 tmp = clean_mnt_dir;
2039 if (*tmp == '/')
2040 tmp++;
2041
2042 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2043 free(clean_mnt_dir);
2044 free(clean_mnt_fsname);
2045 return;
2046 }
2047
2048 if (!conf->autodev && !conf->pts && !conf->tty &&
2049 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2050 free(clean_mnt_dir);
2051 free(clean_mnt_fsname);
2052 return;
2053 }
2054
2055 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2056 needs_warning = true;
2057 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2058 needs_warning = true;
2059 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2060 needs_warning = true;
2061 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2062 needs_warning = true;
2063 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2064 needs_warning = true;
2065 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2066 needs_warning = true;
2067 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2068 needs_warning = true;
2069 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2070 needs_warning = true;
2071 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2072 needs_warning = true;
2073 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2074 needs_warning = true;
2075 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2076 needs_warning = true;
2077
2078 if (needs_warning)
2079 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2080 "automatic device setup under \"/dev\"",
2081 clean_mnt_fsname, clean_mnt_dir);
2082
2083 free(clean_mnt_dir);
2084 free(clean_mnt_fsname);
2085 }
2086
2087 static int mount_file_entries(const struct lxc_conf *conf,
2088 const struct lxc_rootfs *rootfs, FILE *file,
2089 const char *lxc_name, const char *lxc_path)
2090 {
2091 struct mntent mntent;
2092 char buf[4096];
2093 int ret = -1;
2094
2095 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2096 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2097
2098 if (!rootfs->path)
2099 ret = mount_entry_on_systemfs(&mntent);
2100 else if (mntent.mnt_dir[0] != '/')
2101 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2102 lxc_name, lxc_path);
2103 else
2104 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2105 lxc_name, lxc_path);
2106 if (ret < 0)
2107 return -1;
2108 }
2109 ret = 0;
2110
2111 INFO("Set up mount entries");
2112 return ret;
2113 }
2114
/* Mount all entries listed in the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do.
 *
 * Returns 0 on success, -1 if the file cannot be opened or an entry fails.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *fstab_file;
	int ret;

	if (fstab == NULL)
		return 0;

	fstab_file = setmntent(fstab, "r");
	if (fstab_file == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	ret = mount_file_entries(conf, rootfs, fstab_file, lxc_name, lxc_path);
	if (ret < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fstab_file);
	return ret;
}
2138
2139 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2140 {
2141 int ret;
2142 char *mount_entry;
2143 struct lxc_list *iterator;
2144 FILE *f;
2145 int fd = -1;
2146
2147 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2148 if (fd < 0) {
2149 if (errno != ENOSYS)
2150 return NULL;
2151 f = tmpfile();
2152 TRACE("Created temporary mount file");
2153 } else {
2154 f = fdopen(fd, "r+");
2155 TRACE("Created anonymous mount file");
2156 }
2157
2158 if (!f) {
2159 SYSERROR("Could not create mount file");
2160 if (fd != -1)
2161 close(fd);
2162 return NULL;
2163 }
2164
2165 lxc_list_for_each(iterator, mount) {
2166 mount_entry = iterator->elem;
2167 ret = fprintf(f, "%s\n", mount_entry);
2168 if (ret < strlen(mount_entry))
2169 WARN("Could not write mount entry to mount file");
2170 }
2171
2172 ret = fseek(f, 0, SEEK_SET);
2173 if (ret < 0) {
2174 SYSERROR("Failed to seek mount file");
2175 fclose(f);
2176 return NULL;
2177 }
2178
2179 return f;
2180 }
2181
/* Mount all entries contained in the list @mount.
 *
 * The entries are first written to an anonymous file which is then processed
 * like an fstab file.
 *
 * Returns -1 if that file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret;
	FILE *entries;

	entries = make_anonymous_mount_file(mount);
	if (entries == NULL)
		return -1;

	ret = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return ret;
}
2199
2200 static int parse_cap(const char *cap)
2201 {
2202 char *ptr = NULL;
2203 size_t i;
2204 int capid = -1;
2205
2206 if (!strcmp(cap, "none"))
2207 return -2;
2208
2209 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2210
2211 if (strcmp(cap, caps_opt[i].name))
2212 continue;
2213
2214 capid = caps_opt[i].value;
2215 break;
2216 }
2217
2218 if (capid < 0) {
2219 /* try to see if it's numeric, so the user may specify
2220 * capabilities that the running kernel knows about but
2221 * we don't */
2222 errno = 0;
2223 capid = strtol(cap, &ptr, 10);
2224 if (!ptr || *ptr != '\0' || errno != 0)
2225 /* not a valid number */
2226 capid = -1;
2227 else if (capid > lxc_caps_last_cap())
2228 /* we have a number but it's not a valid
2229 * capability */
2230 capid = -1;
2231 }
2232
2233 return capid;
2234 }
2235
2236 int in_caplist(int cap, struct lxc_list *caps)
2237 {
2238 struct lxc_list *iterator;
2239 int capid;
2240
2241 lxc_list_for_each(iterator, caps) {
2242 capid = parse_cap(iterator->elem);
2243 if (capid == cap)
2244 return 1;
2245 }
2246
2247 return 0;
2248 }
2249
2250 static int setup_caps(struct lxc_list *caps)
2251 {
2252 struct lxc_list *iterator;
2253 char *drop_entry;
2254 int capid;
2255
2256 lxc_list_for_each(iterator, caps) {
2257
2258 drop_entry = iterator->elem;
2259
2260 capid = parse_cap(drop_entry);
2261
2262 if (capid < 0) {
2263 ERROR("unknown capability %s", drop_entry);
2264 return -1;
2265 }
2266
2267 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2268
2269 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2270 SYSERROR("failed to remove %s capability", drop_entry);
2271 return -1;
2272 }
2273
2274 }
2275
2276 DEBUG("capabilities have been setup");
2277
2278 return 0;
2279 }
2280
2281 static int dropcaps_except(struct lxc_list *caps)
2282 {
2283 struct lxc_list *iterator;
2284 char *keep_entry;
2285 int i, capid;
2286 int numcaps = lxc_caps_last_cap() + 1;
2287 INFO("found %d capabilities", numcaps);
2288
2289 if (numcaps <= 0 || numcaps > 200)
2290 return -1;
2291
2292 /* caplist[i] is 1 if we keep capability i */
2293 int *caplist = alloca(numcaps * sizeof(int));
2294 memset(caplist, 0, numcaps * sizeof(int));
2295
2296 lxc_list_for_each(iterator, caps) {
2297
2298 keep_entry = iterator->elem;
2299
2300 capid = parse_cap(keep_entry);
2301
2302 if (capid == -2)
2303 continue;
2304
2305 if (capid < 0) {
2306 ERROR("unknown capability %s", keep_entry);
2307 return -1;
2308 }
2309
2310 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2311
2312 caplist[capid] = 1;
2313 }
2314 for (i=0; i<numcaps; i++) {
2315 if (caplist[i])
2316 continue;
2317 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2318 SYSERROR("failed to remove capability %d", i);
2319 return -1;
2320 }
2321 }
2322
2323 DEBUG("capabilities have been setup");
2324
2325 return 0;
2326 }
2327
2328 static int setup_hw_addr(char *hwaddr, const char *ifname)
2329 {
2330 struct sockaddr sockaddr;
2331 struct ifreq ifr;
2332 int ret, fd, saved_errno;
2333
2334 ret = lxc_convert_mac(hwaddr, &sockaddr);
2335 if (ret) {
2336 ERROR("mac address '%s' conversion failed : %s",
2337 hwaddr, strerror(-ret));
2338 return -1;
2339 }
2340
2341 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2342 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2343 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2344
2345 fd = socket(AF_INET, SOCK_DGRAM, 0);
2346 if (fd < 0) {
2347 ERROR("socket failure : %s", strerror(errno));
2348 return -1;
2349 }
2350
2351 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2352 saved_errno = errno;
2353 close(fd);
2354 if (ret)
2355 ERROR("ioctl failure : %s", strerror(saved_errno));
2356
2357 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2358
2359 return ret;
2360 }
2361
2362 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2363 {
2364 struct lxc_list *iterator;
2365 struct lxc_inetdev *inetdev;
2366 int err;
2367
2368 lxc_list_for_each(iterator, ip) {
2369
2370 inetdev = iterator->elem;
2371
2372 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2373 &inetdev->bcast, inetdev->prefix);
2374 if (err) {
2375 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2376 ifindex, strerror(-err));
2377 return -1;
2378 }
2379 }
2380
2381 return 0;
2382 }
2383
2384 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2385 {
2386 struct lxc_list *iterator;
2387 struct lxc_inet6dev *inet6dev;
2388 int err;
2389
2390 lxc_list_for_each(iterator, ip) {
2391
2392 inet6dev = iterator->elem;
2393
2394 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2395 &inet6dev->mcast, &inet6dev->acast,
2396 inet6dev->prefix);
2397 if (err) {
2398 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2399 ifindex, strerror(-err));
2400 return -1;
2401 }
2402 }
2403
2404 return 0;
2405 }
2406
2407 static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
2408 {
2409 char ifname[IFNAMSIZ];
2410 int err;
2411 const char *net_type_name;
2412 char *current_ifname = ifname;
2413
2414 /* empty network namespace */
2415 if (!netdev->ifindex) {
2416 if (netdev->flags & IFF_UP) {
2417 err = lxc_netdev_up("lo");
2418 if (err) {
2419 ERROR("failed to set the loopback up : %s",
2420 strerror(-err));
2421 return -1;
2422 }
2423 }
2424
2425 if (netdev->type == LXC_NET_EMPTY)
2426 return 0;
2427
2428 if (netdev->type == LXC_NET_NONE)
2429 return 0;
2430
2431 if (netdev->type != LXC_NET_VETH) {
2432 net_type_name = lxc_net_type_to_str(netdev->type);
2433 ERROR("%s networks are not supported for containers "
2434 "not setup up by privileged users",
2435 net_type_name);
2436 return -1;
2437 }
2438
2439 netdev->ifindex = if_nametoindex(netdev->name);
2440 }
2441
2442 /* get the new ifindex in case of physical netdev */
2443 if (netdev->type == LXC_NET_PHYS) {
2444 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2445 ERROR("failed to get ifindex for %s",
2446 netdev->link);
2447 return -1;
2448 }
2449 }
2450
2451 /* retrieve the name of the interface */
2452 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2453 ERROR("no interface corresponding to index '%d'",
2454 netdev->ifindex);
2455 return -1;
2456 }
2457
2458 /* default: let the system to choose one interface name */
2459 if (!netdev->name)
2460 netdev->name = netdev->type == LXC_NET_PHYS ?
2461 netdev->link : "eth%d";
2462
2463 /* rename the interface name */
2464 if (strcmp(ifname, netdev->name) != 0) {
2465 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2466 if (err) {
2467 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2468 strerror(-err));
2469 return -1;
2470 }
2471 }
2472
2473 /* Re-read the name of the interface because its name has changed
2474 * and would be automatically allocated by the system
2475 */
2476 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2477 ERROR("no interface corresponding to index '%d'",
2478 netdev->ifindex);
2479 return -1;
2480 }
2481
2482 /* set a mac address */
2483 if (netdev->hwaddr) {
2484 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2485 ERROR("failed to setup hw address for '%s'",
2486 current_ifname);
2487 return -1;
2488 }
2489 }
2490
2491 /* setup ipv4 addresses on the interface */
2492 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2493 ERROR("failed to setup ip addresses for '%s'",
2494 ifname);
2495 return -1;
2496 }
2497
2498 /* setup ipv6 addresses on the interface */
2499 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2500 ERROR("failed to setup ipv6 addresses for '%s'",
2501 ifname);
2502 return -1;
2503 }
2504
2505 /* set the network device up */
2506 if (netdev->flags & IFF_UP) {
2507 int err;
2508
2509 err = lxc_netdev_up(current_ifname);
2510 if (err) {
2511 ERROR("failed to set '%s' up : %s", current_ifname,
2512 strerror(-err));
2513 return -1;
2514 }
2515
2516 /* the network is up, make the loopback up too */
2517 err = lxc_netdev_up("lo");
2518 if (err) {
2519 ERROR("failed to set the loopback up : %s",
2520 strerror(-err));
2521 return -1;
2522 }
2523 }
2524
2525 /* We can only set up the default routes after bringing
2526 * up the interface, sine bringing up the interface adds
2527 * the link-local routes and we can't add a default
2528 * route if the gateway is not reachable. */
2529
2530 /* setup ipv4 gateway on the interface */
2531 if (netdev->ipv4_gateway) {
2532 if (!(netdev->flags & IFF_UP)) {
2533 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2534 return -1;
2535 }
2536
2537 if (lxc_list_empty(&netdev->ipv4)) {
2538 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2539 return -1;
2540 }
2541
2542 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2543 if (err) {
2544 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2545 if (err) {
2546 ERROR("failed to add ipv4 dest for '%s': %s",
2547 ifname, strerror(-err));
2548 }
2549
2550 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2551 if (err) {
2552 ERROR("failed to setup ipv4 gateway for '%s': %s",
2553 ifname, strerror(-err));
2554 if (netdev->ipv4_gateway_auto) {
2555 char buf[INET_ADDRSTRLEN];
2556 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2557 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2558 }
2559 return -1;
2560 }
2561 }
2562 }
2563
2564 /* setup ipv6 gateway on the interface */
2565 if (netdev->ipv6_gateway) {
2566 if (!(netdev->flags & IFF_UP)) {
2567 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2568 return -1;
2569 }
2570
2571 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2572 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2573 return -1;
2574 }
2575
2576 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2577 if (err) {
2578 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2579 if (err) {
2580 ERROR("failed to add ipv6 dest for '%s': %s",
2581 ifname, strerror(-err));
2582 }
2583
2584 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2585 if (err) {
2586 ERROR("failed to setup ipv6 gateway for '%s': %s",
2587 ifname, strerror(-err));
2588 if (netdev->ipv6_gateway_auto) {
2589 char buf[INET6_ADDRSTRLEN];
2590 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2591 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2592 }
2593 return -1;
2594 }
2595 }
2596 }
2597
2598 DEBUG("'%s' has been setup", current_ifname);
2599
2600 return 0;
2601 }
2602
2603 static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2604 struct lxc_list *network)
2605 {
2606 struct lxc_list *iterator;
2607 struct lxc_netdev *netdev;
2608
2609 lxc_log_configured_netdevs(conf);
2610
2611 lxc_list_for_each(iterator, network) {
2612 netdev = iterator->elem;
2613
2614 /* REMOVE in LXC 3.0 */
2615 if (netdev->idx < 0) {
2616 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2617 "networks is DEPRECATED, please switch to using "
2618 "\"lxc.net.[i].* keys\"");
2619 }
2620
2621 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
2622 ERROR("failed to setup netdev");
2623 return -1;
2624 }
2625 }
2626
2627 if (!lxc_list_empty(network))
2628 INFO("network has been setup");
2629
2630 return 0;
2631 }
2632
2633 static int parse_resource(const char *res) {
2634 size_t i;
2635 int resid = -1;
2636
2637 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2638 if (strcmp(res, limit_opt[i].name) == 0)
2639 return limit_opt[i].value;
2640 }
2641
2642 /* try to see if it's numeric, so the user may specify
2643 * resources that the running kernel knows about but
2644 * we don't */
2645 if (lxc_safe_int(res, &resid) == 0)
2646 return resid;
2647 return -1;
2648 }
2649
2650 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2651 struct lxc_list *it;
2652 struct lxc_limit *lim;
2653 int resid;
2654
2655 lxc_list_for_each(it, limits) {
2656 lim = it->elem;
2657
2658 resid = parse_resource(lim->resource);
2659 if (resid < 0) {
2660 ERROR("unknown resource %s", lim->resource);
2661 return -1;
2662 }
2663
2664 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2665 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2666 return -1;
2667 }
2668 }
2669 return 0;
2670 }
2671
2672 /* try to move physical nics to the init netns */
2673 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2674 {
2675 int i, oldfd;
2676 char ifname[IFNAMSIZ];
2677
2678 if (netnsfd < 0 || conf->num_savednics == 0)
2679 return;
2680
2681 INFO("Running to reset %d nic names.", conf->num_savednics);
2682
2683 oldfd = lxc_preserve_ns(getpid(), "net");
2684 if (oldfd < 0) {
2685 SYSERROR("Failed to open monitor netns fd.");
2686 return;
2687 }
2688
2689 if (setns(netnsfd, 0) != 0) {
2690 SYSERROR("Failed to enter container netns to reset nics");
2691 close(oldfd);
2692 return;
2693 }
2694 for (i=0; i<conf->num_savednics; i++) {
2695 struct saved_nic *s = &conf->saved_nics[i];
2696 /* retrieve the name of the interface */
2697 if (!if_indextoname(s->ifindex, ifname)) {
2698 WARN("no interface corresponding to index '%d'", s->ifindex);
2699 continue;
2700 }
2701 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2702 WARN("Error moving nic name:%s back to host netns", ifname);
2703 free(s->orig_name);
2704 }
2705 conf->num_savednics = 0;
2706
2707 if (setns(oldfd, 0) != 0)
2708 SYSERROR("Failed to re-enter monitor's netns");
2709 close(oldfd);
2710 }
2711
2712 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2713
2714 struct lxc_conf *lxc_conf_init(void)
2715 {
2716 struct lxc_conf *new;
2717 int i;
2718
2719 new = malloc(sizeof(*new));
2720 if (!new) {
2721 ERROR("lxc_conf_init : %s", strerror(errno));
2722 return NULL;
2723 }
2724 memset(new, 0, sizeof(*new));
2725
2726 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2727 new->personality = -1;
2728 new->autodev = 1;
2729 new->console.log_path = NULL;
2730 new->console.log_fd = -1;
2731 new->console.path = NULL;
2732 new->console.peer = -1;
2733 new->console.peerpty.busy = -1;
2734 new->console.peerpty.master = -1;
2735 new->console.peerpty.slave = -1;
2736 new->console.master = -1;
2737 new->console.slave = -1;
2738 new->console.name[0] = '\0';
2739 new->maincmd_fd = -1;
2740 new->nbd_idx = -1;
2741 new->rootfs.mount = strdup(default_rootfs_mount);
2742 if (!new->rootfs.mount) {
2743 ERROR("lxc_conf_init : %s", strerror(errno));
2744 free(new);
2745 return NULL;
2746 }
2747 new->logfd = -1;
2748 lxc_list_init(&new->cgroup);
2749 lxc_list_init(&new->network);
2750 lxc_list_init(&new->mount_list);
2751 lxc_list_init(&new->caps);
2752 lxc_list_init(&new->keepcaps);
2753 lxc_list_init(&new->id_map);
2754 lxc_list_init(&new->includes);
2755 lxc_list_init(&new->aliens);
2756 lxc_list_init(&new->environment);
2757 lxc_list_init(&new->limits);
2758 for (i=0; i<NUM_LXC_HOOKS; i++)
2759 lxc_list_init(&new->hooks[i]);
2760 lxc_list_init(&new->groups);
2761 new->lsm_aa_profile = NULL;
2762 new->lsm_se_context = NULL;
2763 new->tmp_umount_proc = 0;
2764
2765 for (i = 0; i < LXC_NS_MAX; i++)
2766 new->inherit_ns_fd[i] = -1;
2767
2768 /* if running in a new user namespace, init and COMMAND
2769 * default to running as UID/GID 0 when using lxc-execute */
2770 new->init_uid = 0;
2771 new->init_gid = 0;
2772
2773 return new;
2774 }
2775
2776 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2777 {
2778 char *veth1, *veth2;
2779 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2780 int bridge_index, err;
2781 unsigned int mtu = 0;
2782
2783 if (netdev->priv.veth_attr.pair) {
2784 veth1 = netdev->priv.veth_attr.pair;
2785 if (handler->conf->reboot)
2786 lxc_netdev_delete_by_name(veth1);
2787 } else {
2788 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2789 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2790 ERROR("veth1 name too long");
2791 return -1;
2792 }
2793 veth1 = lxc_mkifname(veth1buf);
2794 if (!veth1) {
2795 ERROR("failed to allocate a temporary name");
2796 return -1;
2797 }
2798 /* store away for deconf */
2799 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2800 }
2801
2802 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2803 veth2 = lxc_mkifname(veth2buf);
2804 if (!veth2) {
2805 ERROR("failed to allocate a temporary name");
2806 goto out_delete;
2807 }
2808
2809 err = lxc_veth_create(veth1, veth2);
2810 if (err) {
2811 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2812 veth2, strerror(-err));
2813 goto out_delete;
2814 }
2815
2816 /* changing the high byte of the mac address to 0xfe, the bridge interface
2817 * will always keep the host's mac address and not take the mac address
2818 * of a container */
2819 err = setup_private_host_hw_addr(veth1);
2820 if (err) {
2821 ERROR("failed to change mac address of host interface \"%s\": %s",
2822 veth1, strerror(-err));
2823 goto out_delete;
2824 }
2825
2826 netdev->ifindex = if_nametoindex(veth2);
2827 if (!netdev->ifindex) {
2828 ERROR("failed to retrieve the index for \"%s\"", veth2);
2829 goto out_delete;
2830 }
2831
2832 if (netdev->mtu) {
2833 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2834 WARN("failed to parse mtu from");
2835 else
2836 INFO("retrieved mtu %d", mtu);
2837 } else if (netdev->link) {
2838 bridge_index = if_nametoindex(netdev->link);
2839 if (bridge_index) {
2840 mtu = netdev_get_mtu(bridge_index);
2841 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2842 } else {
2843 mtu = netdev_get_mtu(netdev->ifindex);
2844 INFO("retrieved mtu %d from %s", mtu, veth2);
2845 }
2846 }
2847
2848 if (mtu) {
2849 err = lxc_netdev_set_mtu(veth1, mtu);
2850 if (!err)
2851 err = lxc_netdev_set_mtu(veth2, mtu);
2852 if (err) {
2853 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2854 "and \"%s\": %s",
2855 mtu, veth1, veth2, strerror(-err));
2856 goto out_delete;
2857 }
2858 }
2859
2860 if (netdev->link) {
2861 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
2862 if (err) {
2863 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2864 veth1, netdev->link, strerror(-err));
2865 goto out_delete;
2866 }
2867 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2868 }
2869
2870 err = lxc_netdev_up(veth1);
2871 if (err) {
2872 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2873 goto out_delete;
2874 }
2875
2876 if (netdev->upscript) {
2877 err = run_script(handler->name, "net", netdev->upscript, "up",
2878 "veth", veth1, (char*) NULL);
2879 if (err)
2880 goto out_delete;
2881 }
2882
2883 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2884 netdev->ifindex);
2885
2886 return 0;
2887
2888 out_delete:
2889 if (netdev->ifindex != 0)
2890 lxc_netdev_delete_by_name(veth1);
2891 if (!netdev->priv.veth_attr.pair)
2892 free(veth1);
2893 free(veth2);
2894 return -1;
2895 }
2896
2897 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2898 {
2899 char *veth1;
2900 int err;
2901
2902 if (netdev->priv.veth_attr.pair)
2903 veth1 = netdev->priv.veth_attr.pair;
2904 else
2905 veth1 = netdev->priv.veth_attr.veth1;
2906
2907 if (netdev->downscript) {
2908 err = run_script(handler->name, "net", netdev->downscript,
2909 "down", "veth", veth1, (char*) NULL);
2910 if (err)
2911 return -1;
2912 }
2913 return 0;
2914 }
2915
2916 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2917 {
2918 char peerbuf[IFNAMSIZ], *peer;
2919 int err;
2920
2921 if (!netdev->link) {
2922 ERROR("no link specified for macvlan netdev");
2923 return -1;
2924 }
2925
2926 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2927 if (err >= sizeof(peerbuf))
2928 return -1;
2929
2930 peer = lxc_mkifname(peerbuf);
2931 if (!peer) {
2932 ERROR("failed to make a temporary name");
2933 return -1;
2934 }
2935
2936 err = lxc_macvlan_create(netdev->link, peer,
2937 netdev->priv.macvlan_attr.mode);
2938 if (err) {
2939 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2940 peer, netdev->link, strerror(-err));
2941 goto out;
2942 }
2943
2944 netdev->ifindex = if_nametoindex(peer);
2945 if (!netdev->ifindex) {
2946 ERROR("failed to retrieve the index for %s", peer);
2947 goto out;
2948 }
2949
2950 if (netdev->upscript) {
2951 err = run_script(handler->name, "net", netdev->upscript, "up",
2952 "macvlan", netdev->link, (char*) NULL);
2953 if (err)
2954 goto out;
2955 }
2956
2957 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2958 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2959
2960 return 0;
2961 out:
2962 lxc_netdev_delete_by_name(peer);
2963 free(peer);
2964 return -1;
2965 }
2966
2967 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2968 {
2969 int err;
2970
2971 if (netdev->downscript) {
2972 err = run_script(handler->name, "net", netdev->downscript,
2973 "down", "macvlan", netdev->link,
2974 (char*) NULL);
2975 if (err)
2976 return -1;
2977 }
2978 return 0;
2979 }
2980
2981 /* XXX: merge with instantiate_macvlan */
2982 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2983 {
2984 char peer[IFNAMSIZ];
2985 int err;
2986 static uint16_t vlan_cntr = 0;
2987 unsigned int mtu = 0;
2988
2989 if (!netdev->link) {
2990 ERROR("no link specified for vlan netdev");
2991 return -1;
2992 }
2993
2994 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2995 if (err >= sizeof(peer)) {
2996 ERROR("peer name too long");
2997 return -1;
2998 }
2999
3000 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
3001 if (err) {
3002 ERROR("failed to create vlan interface '%s' on '%s' : %s",
3003 peer, netdev->link, strerror(-err));
3004 return -1;
3005 }
3006
3007 netdev->ifindex = if_nametoindex(peer);
3008 if (!netdev->ifindex) {
3009 ERROR("failed to retrieve the ifindex for %s", peer);
3010 lxc_netdev_delete_by_name(peer);
3011 return -1;
3012 }
3013
3014 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
3015 netdev->ifindex);
3016 if (netdev->mtu) {
3017 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3018 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3019 netdev->ifindex, netdev->name);
3020 return -1;
3021 }
3022 err = lxc_netdev_set_mtu(peer, mtu);
3023 if (err) {
3024 ERROR("failed to set mtu '%s' for %s : %s",
3025 netdev->mtu, peer, strerror(-err));
3026 lxc_netdev_delete_by_name(peer);
3027 return -1;
3028 }
3029 }
3030
3031 return 0;
3032 }
3033
/* Shutdown hook for vlan devices: performs no work and always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3038
3039 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3040 {
3041 if (!netdev->link) {
3042 ERROR("no link specified for the physical interface");
3043 return -1;
3044 }
3045
3046 netdev->ifindex = if_nametoindex(netdev->link);
3047 if (!netdev->ifindex) {
3048 ERROR("failed to retrieve the index for %s", netdev->link);
3049 return -1;
3050 }
3051
3052 if (netdev->upscript) {
3053 int err;
3054 err = run_script(handler->name, "net", netdev->upscript,
3055 "up", "phys", netdev->link, (char*) NULL);
3056 if (err)
3057 return -1;
3058 }
3059
3060 return 0;
3061 }
3062
3063 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3064 {
3065 int err;
3066
3067 if (netdev->downscript) {
3068 err = run_script(handler->name, "net", netdev->downscript,
3069 "down", "phys", netdev->link, (char*) NULL);
3070 if (err)
3071 return -1;
3072 }
3073 return 0;
3074 }
3075
3076 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3077 {
3078 netdev->ifindex = 0;
3079 return 0;
3080 }
3081
3082 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3083 {
3084 netdev->ifindex = 0;
3085 if (netdev->upscript) {
3086 int err;
3087 err = run_script(handler->name, "net", netdev->upscript,
3088 "up", "empty", (char*) NULL);
3089 if (err)
3090 return -1;
3091 }
3092 return 0;
3093 }
3094
3095 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3096 {
3097 int err;
3098
3099 if (netdev->downscript) {
3100 err = run_script(handler->name, "net", netdev->downscript,
3101 "down", "empty", (char*) NULL);
3102 if (err)
3103 return -1;
3104 }
3105 return 0;
3106 }
3107
/* Shutdown hook for the "none" type: performs no work and always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3112
3113 int lxc_requests_empty_network(struct lxc_handler *handler)
3114 {
3115 struct lxc_list *network = &handler->conf->network;
3116 struct lxc_list *iterator;
3117 struct lxc_netdev *netdev;
3118 bool found_none = false, found_nic = false;
3119
3120 if (lxc_list_empty(network))
3121 return 0;
3122
3123 lxc_list_for_each(iterator, network) {
3124
3125 netdev = iterator->elem;
3126
3127 if (netdev->type == LXC_NET_NONE)
3128 found_none = true;
3129 else
3130 found_nic = true;
3131 }
3132 if (found_none && !found_nic)
3133 return 1;
3134 return 0;
3135 }
3136
3137 int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
3138 {
3139 bool am_root;
3140 struct lxc_netdev *netdev;
3141 struct lxc_list *iterator;
3142 struct lxc_list *network = &handler->conf->network;
3143
3144 /* We need to be root. */
3145 am_root = (getuid() == 0);
3146 if (!am_root)
3147 return 0;
3148
3149 lxc_list_for_each(iterator, network) {
3150 netdev = iterator->elem;
3151
3152 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3153 ERROR("invalid network configuration type '%d'",
3154 netdev->type);
3155 return -1;
3156 }
3157
3158 if (netdev->type != LXC_NET_MACVLAN &&
3159 netdev->priv.macvlan_attr.mode) {
3160 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
3161 return -1;
3162 }
3163
3164 if (netdev->type != LXC_NET_VETH &&
3165 netdev->priv.veth_attr.pair) {
3166 ERROR("Invalid veth pair for a non-veth netdev");
3167 return -1;
3168 }
3169
3170 if (netdev->type != LXC_NET_VLAN &&
3171 netdev->priv.vlan_attr.vid > 0) {
3172 ERROR("Invalid vlan.id for a non-macvlan netdev");
3173 return -1;
3174 }
3175
3176 if (netdev_conf[netdev->type](handler, netdev)) {
3177 ERROR("failed to create netdev");
3178 return -1;
3179 }
3180
3181 }
3182
3183 return 0;
3184 }
3185
3186 bool lxc_delete_network(struct lxc_handler *handler)
3187 {
3188 int ret;
3189 struct lxc_list *network = &handler->conf->network;
3190 struct lxc_list *iterator;
3191 struct lxc_netdev *netdev;
3192 bool deleted_all = true;
3193
3194 lxc_list_for_each(iterator, network) {
3195 netdev = iterator->elem;
3196
3197 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3198 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3199 WARN("Failed to rename interface with index %d "
3200 "to its initial name \"%s\".",
3201 netdev->ifindex, netdev->link);
3202 continue;
3203 }
3204
3205 if (netdev_deconf[netdev->type](handler, netdev)) {
3206 WARN("Failed to destroy netdev");
3207 }
3208
3209 /* Recent kernel remove the virtual interfaces when the network
3210 * namespace is destroyed but in case we did not moved the
3211 * interface to the network namespace, we have to destroy it
3212 */
3213 if (netdev->ifindex != 0) {
3214 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3215 if (-ret == ENODEV) {
3216 INFO("Interface \"%s\" with index %d already "
3217 "deleted or existing in different network "
3218 "namespace.",
3219 netdev->name ? netdev->name : "(null)",
3220 netdev->ifindex);
3221 } else if (ret < 0) {
3222 deleted_all = false;
3223 WARN("Failed to remove interface \"%s\" with "
3224 "index %d: %s.",
3225 netdev->name ? netdev->name : "(null)",
3226 netdev->ifindex, strerror(-ret));
3227 } else {
3228 INFO("Removed interface \"%s\" with index %d.",
3229 netdev->name ? netdev->name : "(null)",
3230 netdev->ifindex);
3231 }
3232 }
3233
3234 /* Explicitly delete host veth device to prevent lingering
3235 * devices. We had issues in LXD around this.
3236 */
3237 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
3238 char *hostveth;
3239 if (netdev->priv.veth_attr.pair) {
3240 hostveth = netdev->priv.veth_attr.pair;
3241 ret = lxc_netdev_delete_by_name(hostveth);
3242 if (ret < 0) {
3243 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3244 } else {
3245 INFO("Removed interface \"%s\" from host.", hostveth);
3246 }
3247 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
3248 hostveth = netdev->priv.veth_attr.veth1;
3249 ret = lxc_netdev_delete_by_name(hostveth);
3250 if (ret < 0) {
3251 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3252 } else {
3253 INFO("Removed interface \"%s\" from host.", hostveth);
3254 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3255 }
3256 }
3257 }
3258 }
3259
3260 return deleted_all;
3261 }
3262
3263 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3264
3265 /* lxc-user-nic returns "interface_name:interface_name\n" */
3266 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3267 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3268 struct lxc_netdev *netdev, pid_t pid)
3269 {
3270 pid_t child;
3271 int bytes, pipefd[2];
3272 char *token, *saveptr = NULL;
3273 char buffer[MAX_BUFFER_SIZE];
3274 char netdev_link[IFNAMSIZ + 1];
3275
3276 if (netdev->type != LXC_NET_VETH) {
3277 ERROR("nic type %d not support for unprivileged use",
3278 netdev->type);
3279 return -1;
3280 }
3281
3282 if (pipe(pipefd) < 0) {
3283 SYSERROR("pipe failed");
3284 return -1;
3285 }
3286
3287 child = fork();
3288 if (child < 0) {
3289 SYSERROR("fork");
3290 close(pipefd[0]);
3291 close(pipefd[1]);
3292 return -1;
3293 }
3294
3295 if (child == 0) { /* child */
3296 /* Call lxc-user-nic pid type bridge. */
3297 int ret;
3298 char pidstr[LXC_NUMSTRLEN64];
3299
3300 close(pipefd[0]); /* Close the read-end of the pipe. */
3301
3302 /* Redirect stdout to write-end of the pipe. */
3303 ret = dup2(pipefd[1], STDOUT_FILENO);
3304 close(pipefd[1]); /* Close the write-end of the pipe. */
3305 if (ret < 0) {
3306 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3307 exit(EXIT_FAILURE);
3308 }
3309
3310 if (netdev->link)
3311 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3312 else
3313 strncpy(netdev_link, "none", IFNAMSIZ);
3314
3315 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3316 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3317 exit(EXIT_FAILURE);
3318 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3319
3320 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3321 lxcname, pidstr, netdev_link, netdev->name);
3322 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
3323 pidstr, "veth", netdev_link, netdev->name, NULL);
3324
3325 SYSERROR("Failed to exec lxc-user-nic.");
3326 exit(EXIT_FAILURE);
3327 }
3328
3329 /* close the write-end of the pipe */
3330 close(pipefd[1]);
3331
3332 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3333 if (bytes < 0)
3334 SYSERROR("Failed to read from pipe file descriptor.");
3335 buffer[bytes - 1] = '\0';
3336
3337 if (wait_for_pid(child) != 0) {
3338 close(pipefd[0]);
3339 return -1;
3340 }
3341
3342 /* close the read-end of the pipe */
3343 close(pipefd[0]);
3344
3345 /* fill netdev->name field */
3346 token = strtok_r(buffer, ":", &saveptr);
3347 if (!token)
3348 return -1;
3349
3350 netdev->name = malloc(IFNAMSIZ + 1);
3351 if (!netdev->name) {
3352 SYSERROR("Failed to allocate memory.");
3353 return -1;
3354 }
3355 memset(netdev->name, 0, IFNAMSIZ + 1);
3356 strncpy(netdev->name, token, IFNAMSIZ);
3357
3358 /* fill netdev->veth_attr.pair field */
3359 token = strtok_r(NULL, ":", &saveptr);
3360 if (!token)
3361 return -1;
3362
3363 netdev->priv.veth_attr.pair = strdup(token);
3364 if (!netdev->priv.veth_attr.pair) {
3365 ERROR("Failed to allocate memory.");
3366 return -1;
3367 }
3368
3369 return 0;
3370 }
3371
3372 int lxc_assign_network(const char *lxcpath, char *lxcname,
3373 struct lxc_list *network, pid_t pid)
3374 {
3375 struct lxc_list *iterator;
3376 struct lxc_netdev *netdev;
3377 char ifname[IFNAMSIZ];
3378 int am_root = (getuid() == 0);
3379 int err;
3380
3381 lxc_list_for_each(iterator, network) {
3382
3383 netdev = iterator->elem;
3384
3385 if (netdev->type == LXC_NET_VETH && !am_root) {
3386 if (netdev->mtu)
3387 INFO("mtu ignored due to insufficient privilege");
3388 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3389 return -1;
3390 /* lxc-user-nic has moved the nic to the new ns.
3391 * unpriv_assign_nic() fills in netdev->name.
3392 * netdev->ifindex will be filed in at
3393 * lxc_setup_netdev_in_child_namespaces.
3394 */
3395 continue;
3396 }
3397
3398 /* empty network namespace, nothing to move */
3399 if (!netdev->ifindex)
3400 continue;
3401
3402 /* retrieve the name of the interface */
3403 if (!if_indextoname(netdev->ifindex, ifname)) {
3404 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3405 return -1;
3406 }
3407
3408 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3409 if (err) {
3410 ERROR("failed to move '%s' to the container : %s",
3411 netdev->link, strerror(-err));
3412 return -1;
3413 }
3414
3415 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3416 }
3417
3418 return 0;
3419 }
3420
3421 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3422 size_t buf_size)
3423 {
3424 char path[MAXPATHLEN];
3425 int fd, ret;
3426
3427 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3428 idtype == ID_TYPE_UID ? 'u' : 'g');
3429 if (ret < 0 || ret >= MAXPATHLEN) {
3430 ERROR("failed to create path \"%s\"", path);
3431 return -E2BIG;
3432 }
3433
3434 fd = open(path, O_WRONLY);
3435 if (fd < 0) {
3436 SYSERROR("failed to open \"%s\"", path);
3437 return -1;
3438 }
3439
3440 errno = 0;
3441 ret = lxc_write_nointr(fd, buf, buf_size);
3442 if (ret != buf_size) {
3443 SYSERROR("failed to write %cid mapping to \"%s\"",
3444 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3445 close(fd);
3446 return -1;
3447 }
3448 close(fd);
3449
3450 return 0;
3451 }
3452
3453 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3454 *
3455 * @return 1 if functional binary was found
3456 * @return 0 if binary exists but is lacking privilege
3457 * @return -ENOENT if binary does not exist
3458 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3459 *
3460 */
3461 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3462 {
3463 char *path;
3464 int ret;
3465 struct stat st;
3466 int fret = 0;
3467
3468 if (cap != CAP_SETUID && cap != CAP_SETGID)
3469 return -EINVAL;
3470
3471 path = on_path(binary, NULL);
3472 if (!path)
3473 return -ENOENT;
3474
3475 ret = stat(path, &st);
3476 if (ret < 0) {
3477 fret = -errno;
3478 goto cleanup;
3479 }
3480
3481 /* Check if the binary is setuid. */
3482 if (st.st_mode & S_ISUID) {
3483 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3484 fret = 1;
3485 goto cleanup;
3486 }
3487
3488 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3489 /* Check if it has the CAP_SETUID capability. */
3490 if ((cap & CAP_SETUID) &&
3491 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3492 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3493 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3494 "and CAP_PERMITTED sets.", path);
3495 fret = 1;
3496 goto cleanup;
3497 }
3498
3499 /* Check if it has the CAP_SETGID capability. */
3500 if ((cap & CAP_SETGID) &&
3501 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3502 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3503 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3504 "and CAP_PERMITTED sets.", path);
3505 fret = 1;
3506 goto cleanup;
3507 }
3508 #else
3509 /* If we cannot check for file capabilities we need to give the benefit
3510 * of the doubt. Otherwise we might fail even though all the necessary
3511 * file capabilities are set.
3512 */
3513 DEBUG("Cannot check for file capabilites as full capability support is "
3514 "missing. Manual intervention needed.");
3515 fret = 1;
3516 #endif
3517
3518 cleanup:
3519 free(path);
3520 return fret;
3521 }
3522
/* Hand the command line passed in @args to "/bin/sh -c".
 *
 * Only returns, with -1, when the exec itself failed.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);
	return -1;
}
3528
3529 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3530 {
3531 struct id_map *map;
3532 struct lxc_list *iterator;
3533 enum idtype type;
3534 char u_or_g;
3535 char *pos;
3536 int fill, left;
3537 char cmd_output[MAXPATHLEN];
3538 /* strlen("new@idmap") = 9
3539 * +
3540 * strlen(" ") = 1
3541 * +
3542 * LXC_NUMSTRLEN64
3543 * +
3544 * strlen(" ") = 1
3545 *
3546 * We add some additional space to make sure that we really have
3547 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3548 */
3549 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3550 int ret = 0, uidmap = 0, gidmap = 0;
3551 bool use_shadow = false, had_entry = false;
3552
3553 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3554 * ranges, then insist that root also reserve ranges in subuid. This
3555 * will protected it by preventing another user from being handed the
3556 * range by shadow.
3557 */
3558 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3559 if (uidmap == -ENOENT)
3560 WARN("newuidmap binary is missing");
3561 else if (!uidmap)
3562 WARN("newuidmap is lacking necessary privileges");
3563
3564 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3565 if (gidmap == -ENOENT)
3566 WARN("newgidmap binary is missing");
3567 else if (!gidmap)
3568 WARN("newgidmap is lacking necessary privileges");
3569
3570 if (uidmap > 0 && gidmap > 0) {
3571 DEBUG("Functional newuidmap and newgidmap binary found.");
3572 use_shadow = true;
3573 } else {
3574 /* In case unprivileged users run application containers via
3575 * execute() or a start*() there are valid cases where they may
3576 * only want to map their own {g,u}id. Let's not block them from
3577 * doing so by requiring geteuid() == 0.
3578 */
3579 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3580 "write directly with euid %d.", geteuid());
3581 }
3582
3583 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3584 type++, u_or_g = 'g') {
3585 pos = mapbuf;
3586
3587 if (use_shadow)
3588 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3589
3590 lxc_list_for_each(iterator, idmap) {
3591 /* The kernel only takes <= 4k for writes to
3592 * /proc/<nr>/[ug]id_map
3593 */
3594 map = iterator->elem;
3595 if (map->idtype != type)
3596 continue;
3597
3598 had_entry = true;
3599
3600 left = LXC_IDMAPLEN - (pos - mapbuf);
3601 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3602 use_shadow ? " " : "", map->nsid,
3603 map->hostid, map->range,
3604 use_shadow ? "" : "\n");
3605 if (fill <= 0 || fill >= left)
3606 SYSERROR("Too many {g,u}id mappings defined.");
3607
3608 pos += fill;
3609 }
3610 if (!had_entry)
3611 continue;
3612
3613 /* Try to catch the ouput of new{g,u}idmap to make debugging
3614 * easier.
3615 */
3616 if (use_shadow) {
3617 ret = run_command(cmd_output, sizeof(cmd_output),
3618 lxc_map_ids_exec_wrapper,
3619 (void *)mapbuf);
3620 if (ret < 0) {
3621 ERROR("new%cidmap failed to write mapping: %s",
3622 u_or_g, cmd_output);
3623 return -1;
3624 }
3625 } else {
3626 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3627 if (ret < 0)
3628 return -1;
3629 }
3630
3631 memset(mapbuf, 0, sizeof(mapbuf));
3632 }
3633
3634 return 0;
3635 }
3636
3637 /*
3638 * return the host uid/gid to which the container root is mapped in
3639 * *val.
3640 * Return true if id was found, false otherwise.
3641 */
3642 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3643 unsigned long *val)
3644 {
3645 struct lxc_list *it;
3646 struct id_map *map;
3647
3648 lxc_list_for_each(it, &conf->id_map) {
3649 map = it->elem;
3650 if (map->idtype != idtype)
3651 continue;
3652 if (map->nsid != 0)
3653 continue;
3654 *val = map->hostid;
3655 return true;
3656 }
3657 return false;
3658 }
3659
3660 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3661 {
3662 struct lxc_list *it;
3663 struct id_map *map;
3664 lxc_list_for_each(it, &conf->id_map) {
3665 map = it->elem;
3666 if (map->idtype != idtype)
3667 continue;
3668 if (id >= map->hostid && id < map->hostid + map->range)
3669 return (id - map->hostid) + map->nsid;
3670 }
3671 return -1;
3672 }
3673
3674 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3675 {
3676 struct lxc_list *it;
3677 struct id_map *map;
3678 unsigned int freeid = 0;
3679 again:
3680 lxc_list_for_each(it, &conf->id_map) {
3681 map = it->elem;
3682 if (map->idtype != idtype)
3683 continue;
3684 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3685 freeid = map->nsid + map->range;
3686 goto again;
3687 }
3688 }
3689 return freeid;
3690 }
3691
3692 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3693 {
3694 struct lxc_list *network = &handler->conf->network;
3695 struct lxc_list *iterator;
3696 struct lxc_netdev *netdev;
3697 int link_index;
3698
3699 lxc_list_for_each(iterator, network) {
3700 netdev = iterator->elem;
3701
3702 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3703 continue;
3704
3705 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3706 ERROR("gateway = auto only supported for "
3707 "veth and macvlan");
3708 return -1;
3709 }
3710
3711 if (!netdev->link) {
3712 ERROR("gateway = auto needs a link interface");
3713 return -1;
3714 }
3715
3716 link_index = if_nametoindex(netdev->link);
3717 if (!link_index)
3718 return -EINVAL;
3719
3720 if (netdev->ipv4_gateway_auto) {
3721 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3722 ERROR("failed to automatically find ipv4 gateway "
3723 "address from link interface '%s'", netdev->link);
3724 return -1;
3725 }
3726 }
3727
3728 if (netdev->ipv6_gateway_auto) {
3729 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3730 ERROR("failed to automatically find ipv6 gateway "
3731 "address from link interface '%s'", netdev->link);
3732 return -1;
3733 }
3734 }
3735 }
3736
3737 return 0;
3738 }
3739
3740 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3741 {
3742 struct lxc_tty_info *tty_info = &conf->tty_info;
3743 int i, ret;
3744
3745 /* no tty in the configuration */
3746 if (!conf->tty)
3747 return 0;
3748
3749 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3750 if (!tty_info->pty_info) {
3751 SYSERROR("failed to allocate struct *pty_info");
3752 return -ENOMEM;
3753 }
3754
3755 for (i = 0; i < conf->tty; i++) {
3756 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3757
3758 process_lock();
3759 ret = openpty(&pty_info->master, &pty_info->slave,
3760 pty_info->name, NULL, NULL);
3761 process_unlock();
3762 if (ret) {
3763 SYSERROR("failed to create pty device number %d", i);
3764 tty_info->nbtty = i;
3765 lxc_delete_tty(tty_info);
3766 return -ENOTTY;
3767 }
3768
3769 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3770 pty_info->name, pty_info->master, pty_info->slave);
3771
3772 /* Prevent leaking the file descriptors to the container */
3773 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3774 if (ret < 0)
3775 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3776 "pty device \"%s\": %s",
3777 pty_info->master, pty_info->name, strerror(errno));
3778
3779 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3780 if (ret < 0)
3781 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3782 "pty device \"%s\": %s",
3783 pty_info->slave, pty_info->name, strerror(errno));
3784
3785 pty_info->busy = 0;
3786 }
3787
3788 tty_info->nbtty = conf->tty;
3789
3790 INFO("finished allocating %d pts devices", conf->tty);
3791 return 0;
3792 }
3793
3794 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3795 {
3796 int i;
3797
3798 for (i = 0; i < tty_info->nbtty; i++) {
3799 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3800
3801 close(pty_info->master);
3802 close(pty_info->slave);
3803 }
3804
3805 free(tty_info->pty_info);
3806 tty_info->pty_info = NULL;
3807 tty_info->nbtty = 0;
3808 }
3809
3810
/* Exec "lxc-usernsexec" with the argument vector passed in @args.
 *
 * Only returns, with -1, when the exec itself failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3816
3817 /*
3818 * chown_mapped_root: for an unprivileged user with uid/gid X to
3819 * chown a dir to subuid/subgid Y, he needs to run chown as root
3820 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3821 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3822 * root is privileged with respect to hostuid/hostgid X, allowing
3823 * him to do the chown.
3824 */
3825 int chown_mapped_root(char *path, struct lxc_conf *conf)
3826 {
3827 uid_t rootuid, rootgid;
3828 unsigned long val;
3829 int hostuid, hostgid, ret;
3830 struct stat sb;
3831 char map1[100], map2[100], map3[100], map4[100], map5[100];
3832 char ugid[100];
3833 char *args1[] = {"lxc-usernsexec",
3834 "-m", map1,
3835 "-m", map2,
3836 "-m", map3,
3837 "-m", map5,
3838 "--", "chown", ugid, path,
3839 NULL};
3840 char *args2[] = {"lxc-usernsexec",
3841 "-m", map1,
3842 "-m", map2,
3843 "-m", map3,
3844 "-m", map4,
3845 "-m", map5,
3846 "--", "chown", ugid, path,
3847 NULL};
3848 char cmd_output[MAXPATHLEN];
3849
3850 hostuid = geteuid();
3851 hostgid = getegid();
3852
3853 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3854 ERROR("No uid mapping for container root");
3855 return -1;
3856 }
3857 rootuid = (uid_t)val;
3858 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3859 ERROR("No gid mapping for container root");
3860 return -1;
3861 }
3862 rootgid = (gid_t)val;
3863
3864 if (hostuid == 0) {
3865 if (chown(path, rootuid, rootgid) < 0) {
3866 ERROR("Error chowning %s", path);
3867 return -1;
3868 }
3869 return 0;
3870 }
3871
3872 if (rootuid == hostuid) {
3873 /* nothing to do */
3874 INFO("Container root is our uid; no need to chown");
3875 return 0;
3876 }
3877
3878 /* save the current gid of "path" */
3879 if (stat(path, &sb) < 0) {
3880 ERROR("Error stat %s", path);
3881 return -1;
3882 }
3883
3884 /* Update the path argument in case this was overlayfs. */
3885 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3886 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3887
3888 /*
3889 * A file has to be group-owned by a gid mapped into the
3890 * container, or the container won't be privileged over it.
3891 */
3892 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3893 if (sb.st_uid == hostuid &&
3894 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3895 chown(path, -1, hostgid) < 0) {
3896 ERROR("Failed chgrping %s", path);
3897 return -1;
3898 }
3899
3900 /* "u:0:rootuid:1" */
3901 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3902 if (ret < 0 || ret >= 100) {
3903 ERROR("Error uid printing map string");
3904 return -1;
3905 }
3906
3907 /* "u:hostuid:hostuid:1" */
3908 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3909 if (ret < 0 || ret >= 100) {
3910 ERROR("Error uid printing map string");
3911 return -1;
3912 }
3913
3914 /* "g:0:rootgid:1" */
3915 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3916 if (ret < 0 || ret >= 100) {
3917 ERROR("Error gid printing map string");
3918 return -1;
3919 }
3920
3921 /* "g:pathgid:rootgid+pathgid:1" */
3922 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3923 rootgid + (gid_t)sb.st_gid);
3924 if (ret < 0 || ret >= 100) {
3925 ERROR("Error gid printing map string");
3926 return -1;
3927 }
3928
3929 /* "g:hostgid:hostgid:1" */
3930 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3931 if (ret < 0 || ret >= 100) {
3932 ERROR("Error gid printing map string");
3933 return -1;
3934 }
3935
3936 /* "0:pathgid" (chown) */
3937 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3938 if (ret < 0 || ret >= 100) {
3939 ERROR("Error owner printing format string for chown");
3940 return -1;
3941 }
3942
3943 if (hostgid == sb.st_gid)
3944 ret = run_command(cmd_output, sizeof(cmd_output),
3945 chown_mapped_root_exec_wrapper,
3946 (void *)args1);
3947 else
3948 ret = run_command(cmd_output, sizeof(cmd_output),
3949 chown_mapped_root_exec_wrapper,
3950 (void *)args2);
3951 if (ret < 0)
3952 ERROR("lxc-usernsexec failed: %s", cmd_output);
3953
3954 return ret;
3955 }
3956
3957 int lxc_ttys_shift_ids(struct lxc_conf *c)
3958 {
3959 if (lxc_list_empty(&c->id_map))
3960 return 0;
3961
3962 if (!strcmp(c->console.name, ""))
3963 return 0;
3964
3965 if (chown_mapped_root(c->console.name, c) < 0) {
3966 ERROR("failed to chown console \"%s\"", c->console.name);
3967 return -1;
3968 }
3969
3970 TRACE("chowned console \"%s\"", c->console.name);
3971
3972 return 0;
3973 }
3974
3975 /* NOTE: Must not be called from inside the container namespace! */
3976 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3977 {
3978 int mounted;
3979
3980 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3981 if (mounted == -1) {
3982 SYSERROR("failed to mount /proc in the container");
3983 /* continue only if there is no rootfs */
3984 if (conf->rootfs.path)
3985 return -1;
3986 } else if (mounted == 1) {
3987 conf->tmp_umount_proc = 1;
3988 }
3989
3990 return 0;
3991 }
3992
3993 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3994 {
3995 if (lxc_conf->tmp_umount_proc == 1) {
3996 umount("/proc");
3997 lxc_conf->tmp_umount_proc = 0;
3998 }
3999 }
4000
/*
 * Walk /proc/self/mountinfo and change any shared entries to slave.
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	char *buf = NULL;
	size_t buflen = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	for (;;) {
		char *mntpoint, *flags;

		if (getline(&buf, &buflen, mountinfo) == -1)
			break;

		mntpoint = get_field(buf, 4);
		if (!mntpoint)
			continue;

		flags = get_field(mntpoint, 2);
		if (!flags)
			continue;

		null_endofword(flags);
		if (!strstr(flags, "shared"))
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
4034
4035 void lxc_execute_bind_init(struct lxc_conf *conf)
4036 {
4037 int ret;
4038 char path[PATH_MAX], destpath[PATH_MAX], *p;
4039
4040 /* If init exists in the container, don't bind mount a static one */
4041 p = choose_init(conf->rootfs.mount);
4042 if (p) {
4043 free(p);
4044 return;
4045 }
4046
4047 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4048 if (ret < 0 || ret >= PATH_MAX) {
4049 WARN("Path name too long searching for lxc.init.static");
4050 return;
4051 }
4052
4053 if (!file_exists(path)) {
4054 INFO("%s does not exist on host", path);
4055 return;
4056 }
4057
4058 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4059 if (ret < 0 || ret >= PATH_MAX) {
4060 WARN("Path name too long for container's lxc.init.static");
4061 return;
4062 }
4063
4064 if (!file_exists(destpath)) {
4065 FILE * pathfile = fopen(destpath, "wb");
4066 if (!pathfile) {
4067 SYSERROR("Failed to create mount target '%s'", destpath);
4068 return;
4069 }
4070 fclose(pathfile);
4071 }
4072
4073 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
4074 if (ret < 0)
4075 SYSERROR("Failed to bind lxc.init.static into container");
4076 INFO("lxc.init.static bound into container at %s", path);
4077 }
4078
4079 /*
4080 * This does the work of remounting / if it is shared, calling the
4081 * container pre-mount hooks, and mounting the rootfs.
4082 */
4083 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4084 {
4085 if (conf->rootfs_setup) {
4086 /*
4087 * rootfs was set up in another namespace. bind-mount it
4088 * to give us a mount in our own ns so we can pivot_root to it
4089 */
4090 const char *path = conf->rootfs.mount;
4091 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4092 ERROR("Failed to bind-mount container / onto itself");
4093 return -1;
4094 }
4095 return 0;
4096 }
4097
4098 remount_all_slave();
4099
4100 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4101 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4102 return -1;
4103 }
4104
4105 if (lxc_setup_rootfs(conf)) {
4106 ERROR("failed to setup rootfs for '%s'", name);
4107 return -1;
4108 }
4109
4110 conf->rootfs_setup = true;
4111 return 0;
4112 }
4113
4114 static bool verify_start_hooks(struct lxc_conf *conf)
4115 {
4116 struct lxc_list *it;
4117 char path[MAXPATHLEN];
4118 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4119 char *hookname = it->elem;
4120 struct stat st;
4121 int ret;
4122
4123 ret = snprintf(path, MAXPATHLEN, "%s%s",
4124 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4125 if (ret < 0 || ret >= MAXPATHLEN)
4126 return false;
4127 ret = stat(path, &st);
4128 if (ret) {
4129 SYSERROR("Start hook %s not found in container",
4130 hookname);
4131 return false;
4132 }
4133 return true;
4134 }
4135
4136 return true;
4137 }
4138
4139 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4140 {
4141 int i;
4142 int *ttyfds;
4143 struct lxc_pty_info *pty_info;
4144 struct lxc_conf *conf = handler->conf;
4145 const struct lxc_tty_info *tty_info = &conf->tty_info;
4146 int sock = handler->ttysock[0];
4147 int ret = -1;
4148 size_t num_ttyfds = (2 * conf->tty);
4149
4150 ttyfds = malloc(num_ttyfds * sizeof(int));
4151 if (!ttyfds)
4152 return -1;
4153
4154 for (i = 0; i < num_ttyfds; i++) {
4155 pty_info = &tty_info->pty_info[i / 2];
4156 ttyfds[i++] = pty_info->slave;
4157 ttyfds[i] = pty_info->master;
4158 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4159 "parent",
4160 pty_info->name, pty_info->master, pty_info->slave);
4161 }
4162
4163 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4164 if (ret < 0)
4165 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4166 strerror(errno));
4167 else
4168 TRACE("sent %d ttys to parent", conf->tty);
4169
4170 close(handler->ttysock[0]);
4171 close(handler->ttysock[1]);
4172
4173 for (i = 0; i < num_ttyfds; i++)
4174 close(ttyfds[i]);
4175
4176 free(ttyfds);
4177
4178 return ret;
4179 }
4180
4181 int lxc_setup(struct lxc_handler *handler)
4182 {
4183 const char *name = handler->name;
4184 struct lxc_conf *lxc_conf = handler->conf;
4185 const char *lxcpath = handler->lxcpath;
4186
4187 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4188 ERROR("Error setting up rootfs mount after spawn");
4189 return -1;
4190 }
4191
4192 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4193 if (setup_utsname(lxc_conf->utsname)) {
4194 ERROR("failed to setup the utsname for '%s'", name);
4195 return -1;
4196 }
4197 }
4198
4199 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4200 &lxc_conf->network)) {
4201 ERROR("failed to setup the network for '%s'", name);
4202 return -1;
4203 }
4204
4205 if (lxc_conf->autodev > 0) {
4206 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4207 ERROR("failed to mount /dev in the container");
4208 return -1;
4209 }
4210 }
4211
4212 /* do automatic mounts (mainly /proc and /sys), but exclude
4213 * those that need to wait until other stuff has finished
4214 */
4215 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4216 ERROR("failed to setup the automatic mounts for '%s'", name);
4217 return -1;
4218 }
4219
4220 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4221 ERROR("failed to setup the mounts for '%s'", name);
4222 return -1;
4223 }
4224
4225 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4226 ERROR("failed to setup the mount entries for '%s'", name);
4227 return -1;
4228 }
4229
4230 /* Make sure any start hooks are in the container */
4231 if (!verify_start_hooks(lxc_conf))
4232 return -1;
4233
4234 if (lxc_conf->is_execute)
4235 lxc_execute_bind_init(lxc_conf);
4236
4237 /* now mount only cgroup, if wanted;
4238 * before, /sys could not have been mounted
4239 * (is either mounted automatically or via fstab entries)
4240 */
4241 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4242 ERROR("failed to setup the automatic mounts for '%s'", name);
4243 return -1;
4244 }
4245
4246 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4247 ERROR("failed to run mount hooks for container '%s'.", name);
4248 return -1;
4249 }
4250
4251 if (lxc_conf->autodev > 0) {
4252 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4253 ERROR("failed to run autodev hooks for container '%s'.", name);
4254 return -1;
4255 }
4256
4257 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4258 ERROR("failed to populate /dev in the container");
4259 return -1;
4260 }
4261 }
4262
4263 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4264 ERROR("failed to setup the console for '%s'", name);
4265 return -1;
4266 }
4267
4268 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4269 ERROR("failed to setup /dev symlinks for '%s'", name);
4270 return -1;
4271 }
4272
4273 /* mount /proc if it's not already there */
4274 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4275 ERROR("failed to LSM mount proc for '%s'", name);
4276 return -1;
4277 }
4278
4279 if (setup_pivot_root(&lxc_conf->rootfs)) {
4280 ERROR("failed to set rootfs for '%s'", name);
4281 return -1;
4282 }
4283
4284 if (lxc_setup_devpts(lxc_conf->pts)) {
4285 ERROR("failed to setup the new pts instance");
4286 return -1;
4287 }
4288
4289 if (lxc_create_tty(name, lxc_conf)) {
4290 ERROR("failed to create the ttys");
4291 return -1;
4292 }
4293
4294 if (lxc_send_ttys_to_parent(handler) < 0) {
4295 ERROR("failure sending console info to parent");
4296 return -1;
4297 }
4298
4299 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4300 ERROR("failed to setup the ttys for '%s'", name);
4301 return -1;
4302 }
4303
4304 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4305 SYSERROR("failed to set environment variable for container ptys");
4306
4307
4308 if (setup_personality(lxc_conf->personality)) {
4309 ERROR("failed to setup personality");
4310 return -1;
4311 }
4312
4313 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4314 if (!lxc_list_empty(&lxc_conf->caps)) {
4315 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4316 return -1;
4317 }
4318 if (dropcaps_except(&lxc_conf->keepcaps)) {
4319 ERROR("failed to keep requested caps");
4320 return -1;
4321 }
4322 } else if (setup_caps(&lxc_conf->caps)) {
4323 ERROR("failed to drop capabilities");
4324 return -1;
4325 }
4326
4327 NOTICE("Container \"%s\" is set up", name);
4328
4329 return 0;
4330 }
4331
4332 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4333 const char *lxcpath, char *argv[])
4334 {
4335 int which = -1;
4336 struct lxc_list *it;
4337
4338 if (strcmp(hook, "pre-start") == 0)
4339 which = LXCHOOK_PRESTART;
4340 else if (strcmp(hook, "pre-mount") == 0)
4341 which = LXCHOOK_PREMOUNT;
4342 else if (strcmp(hook, "mount") == 0)
4343 which = LXCHOOK_MOUNT;
4344 else if (strcmp(hook, "autodev") == 0)
4345 which = LXCHOOK_AUTODEV;
4346 else if (strcmp(hook, "start") == 0)
4347 which = LXCHOOK_START;
4348 else if (strcmp(hook, "stop") == 0)
4349 which = LXCHOOK_STOP;
4350 else if (strcmp(hook, "post-stop") == 0)
4351 which = LXCHOOK_POSTSTOP;
4352 else if (strcmp(hook, "clone") == 0)
4353 which = LXCHOOK_CLONE;
4354 else if (strcmp(hook, "destroy") == 0)
4355 which = LXCHOOK_DESTROY;
4356 else
4357 return -1;
4358 lxc_list_for_each(it, &conf->hooks[which]) {
4359 int ret;
4360 char *hookname = it->elem;
4361 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4362 if (ret)
4363 return ret;
4364 }
4365 return 0;
4366 }
4367
4368 int lxc_clear_config_caps(struct lxc_conf *c)
4369 {
4370 struct lxc_list *it, *next;
4371
4372 lxc_list_for_each_safe(it, &c->caps, next) {
4373 lxc_list_del(it);
4374 free(it->elem);
4375 free(it);
4376 }
4377 return 0;
4378 }
4379
4380 static int lxc_free_idmap(struct lxc_list *id_map) {
4381 struct lxc_list *it, *next;
4382
4383 lxc_list_for_each_safe(it, id_map, next) {
4384 lxc_list_del(it);
4385 free(it->elem);
4386 free(it);
4387 }
4388 return 0;
4389 }
4390
4391 int lxc_clear_idmaps(struct lxc_conf *c)
4392 {
4393 return lxc_free_idmap(&c->id_map);
4394 }
4395
4396 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4397 {
4398 struct lxc_list *it,*next;
4399
4400 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4401 lxc_list_del(it);
4402 free(it->elem);
4403 free(it);
4404 }
4405 return 0;
4406 }
4407
4408 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4409 {
4410 struct lxc_list *it,*next;
4411 bool all = false;
4412 const char *k = NULL;
4413
4414 if (strcmp(key, "lxc.cgroup") == 0)
4415 all = true;
4416 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4417 k = key + sizeof("lxc.cgroup.")-1;
4418 else
4419 return -1;
4420
4421 lxc_list_for_each_safe(it, &c->cgroup, next) {
4422 struct lxc_cgroup *cg = it->elem;
4423 if (!all && strcmp(cg->subsystem, k) != 0)
4424 continue;
4425 lxc_list_del(it);
4426 free(cg->subsystem);
4427 free(cg->value);
4428 free(cg);
4429 free(it);
4430 }
4431 return 0;
4432 }
4433
4434 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4435 {
4436 struct lxc_list *it, *next;
4437 bool all = false;
4438 const char *k = NULL;
4439
4440 if (strcmp(key, "lxc.limit") == 0
4441 || strcmp(key, "lxc.prlimit"))
4442 all = true;
4443 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4444 k = key + sizeof("lxc.limit.")-1;
4445 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4446 k = key + sizeof("lxc.prlimit.")-1;
4447 else
4448 return -1;
4449
4450 lxc_list_for_each_safe(it, &c->limits, next) {
4451 struct lxc_limit *lim = it->elem;
4452 if (!all && strcmp(lim->resource, k) != 0)
4453 continue;
4454 lxc_list_del(it);
4455 free(lim->resource);
4456 free(lim);
4457 free(it);
4458 }
4459 return 0;
4460 }
4461
4462 int lxc_clear_groups(struct lxc_conf *c)
4463 {
4464 struct lxc_list *it,*next;
4465
4466 lxc_list_for_each_safe(it, &c->groups, next) {
4467 lxc_list_del(it);
4468 free(it->elem);
4469 free(it);
4470 }
4471 return 0;
4472 }
4473
4474 int lxc_clear_environment(struct lxc_conf *c)
4475 {
4476 struct lxc_list *it,*next;
4477
4478 lxc_list_for_each_safe(it, &c->environment, next) {
4479 lxc_list_del(it);
4480 free(it->elem);
4481 free(it);
4482 }
4483 return 0;
4484 }
4485
4486
4487 int lxc_clear_mount_entries(struct lxc_conf *c)
4488 {
4489 struct lxc_list *it,*next;
4490
4491 lxc_list_for_each_safe(it, &c->mount_list, next) {
4492 lxc_list_del(it);
4493 free(it->elem);
4494 free(it);
4495 }
4496 return 0;
4497 }
4498
4499 int lxc_clear_automounts(struct lxc_conf *c)
4500 {
4501 c->auto_mounts = 0;
4502 return 0;
4503 }
4504
4505 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4506 {
4507 struct lxc_list *it,*next;
4508 bool all = false, done = false;
4509 const char *k = NULL;
4510 int i;
4511
4512 if (strcmp(key, "lxc.hook") == 0)
4513 all = true;
4514 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4515 k = key + sizeof("lxc.hook.")-1;
4516 else
4517 return -1;
4518
4519 for (i=0; i<NUM_LXC_HOOKS; i++) {
4520 if (all || strcmp(k, lxchook_names[i]) == 0) {
4521 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4522 lxc_list_del(it);
4523 free(it->elem);
4524 free(it);
4525 }
4526 done = true;
4527 }
4528 }
4529
4530 if (!done) {
4531 ERROR("Invalid hook key: %s", key);
4532 return -1;
4533 }
4534 return 0;
4535 }
4536
4537 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4538 {
4539 int i;
4540
4541 if (!conf->saved_nics)
4542 return;
4543 for (i=0; i < conf->num_savednics; i++)
4544 free(conf->saved_nics[i].orig_name);
4545 free(conf->saved_nics);
4546 }
4547
4548 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4549 {
4550 struct lxc_list *it,*next;
4551
4552 lxc_list_for_each_safe(it, &conf->aliens, next) {
4553 lxc_list_del(it);
4554 free(it->elem);
4555 free(it);
4556 }
4557 }
4558
4559 void lxc_clear_includes(struct lxc_conf *conf)
4560 {
4561 struct lxc_list *it,*next;
4562
4563 lxc_list_for_each_safe(it, &conf->includes, next) {
4564 lxc_list_del(it);
4565 free(it->elem);
4566 free(it);
4567 }
4568 }
4569
4570 void lxc_conf_free(struct lxc_conf *conf)
4571 {
4572 if (!conf)
4573 return;
4574 if (current_config == conf)
4575 current_config = NULL;
4576 free(conf->console.log_path);
4577 free(conf->console.path);
4578 free(conf->rootfs.mount);
4579 free(conf->rootfs.bdev_type);
4580 free(conf->rootfs.options);
4581 free(conf->rootfs.path);
4582 free(conf->logfile);
4583 if (conf->logfd != -1)
4584 close(conf->logfd);
4585 free(conf->utsname);
4586 free(conf->ttydir);
4587 free(conf->fstab);
4588 free(conf->rcfile);
4589 free(conf->init_cmd);
4590 free(conf->unexpanded_config);
4591 free(conf->pty_names);
4592 free(conf->syslog);
4593 lxc_free_networks(&conf->network);
4594 free(conf->lsm_aa_profile);
4595 free(conf->lsm_se_context);
4596 lxc_seccomp_free(conf);
4597 lxc_clear_config_caps(conf);
4598 lxc_clear_config_keepcaps(conf);
4599 lxc_clear_cgroups(conf, "lxc.cgroup");
4600 lxc_clear_hooks(conf, "lxc.hook");
4601 lxc_clear_mount_entries(conf);
4602 lxc_clear_saved_nics(conf);
4603 lxc_clear_idmaps(conf);
4604 lxc_clear_groups(conf);
4605 lxc_clear_includes(conf);
4606 lxc_clear_aliens(conf);
4607 lxc_clear_environment(conf);
4608 lxc_clear_limits(conf, "lxc.prlimit");
4609 free(conf);
4610 }
4611
/* Arguments handed to run_userns_fn(). */
struct userns_fn_data {
	/* Function to call once the parent has signalled readiness. */
	int (*fn)(void *);
	/* Name of fn, used for trace logging only; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to fn. */
	void *arg;
	/* Pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4618
4619 static int run_userns_fn(void *data)
4620 {
4621 struct userns_fn_data *d = data;
4622 char c;
4623
4624 /* Close write end of the pipe. */
4625 close(d->p[1]);
4626
4627 /* Wait for parent to finish establishing a new mapping in the user
4628 * namespace we are executing in.
4629 */
4630 if (read(d->p[0], &c, 1) != 1)
4631 return -1;
4632
4633 /* Close read end of the pipe. */
4634 close(d->p[0]);
4635
4636 if (d->fn_name)
4637 TRACE("calling function \"%s\"", d->fn_name);
4638 /* Call function to run. */
4639 return d->fn(d->arg);
4640 }
4641
4642 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4643 enum idtype idtype)
4644 {
4645 struct lxc_list *it;
4646 struct id_map *map;
4647 struct id_map *retmap = NULL;
4648
4649 lxc_list_for_each(it, &conf->id_map) {
4650 map = it->elem;
4651 if (map->idtype != idtype)
4652 continue;
4653
4654 if (id >= map->hostid && id < map->hostid + map->range) {
4655 retmap = map;
4656 break;
4657 }
4658 }
4659
4660 if (!retmap)
4661 return NULL;
4662
4663 retmap = malloc(sizeof(*retmap));
4664 if (!retmap)
4665 return NULL;
4666
4667 memcpy(retmap, map, sizeof(*retmap));
4668 return retmap;
4669 }
4670
4671 /*
4672 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4673 * existing one or establish a new one.
4674 */
4675 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4676 {
4677 int hostid_mapped;
4678 struct id_map *entry = NULL;
4679
4680 /* Reuse existing mapping. */
4681 entry = mapped_hostid_entry(conf, id, type);
4682 if (entry)
4683 return entry;
4684
4685 /* Find new mapping. */
4686 hostid_mapped = find_unmapped_nsid(conf, type);
4687 if (hostid_mapped < 0) {
4688 DEBUG("failed to find free mapping for id %d", id);
4689 return NULL;
4690 }
4691
4692 entry = malloc(sizeof(*entry));
4693 if (!entry)
4694 return NULL;
4695
4696 entry->idtype = type;
4697 entry->nsid = hostid_mapped;
4698 entry->hostid = (unsigned long)id;
4699 entry->range = 1;
4700
4701 return entry;
4702 }
4703
4704 /* Run a function in a new user namespace.
4705 * The caller's euid/egid will be mapped if it is not already.
4706 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4707 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4708 * This means we require only to establish a mapping from:
4709 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4710 * - the container root -> some sub{g,u}id
4711 * The former we add, if the user did not specifiy a mapping. The latter we
4712 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4713 * there to start the container in the first place.
4714 */
4715 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4716 const char *fn_name)
4717 {
4718 pid_t pid;
4719 uid_t euid, egid;
4720 struct userns_fn_data d;
4721 int p[2];
4722 struct lxc_list *it;
4723 struct id_map *map;
4724 char c = '1';
4725 int ret = -1;
4726 struct lxc_list *idmap = NULL, *tmplist = NULL;
4727 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4728 *host_uid_map = NULL, *host_gid_map = NULL;
4729
4730 ret = pipe(p);
4731 if (ret < 0) {
4732 SYSERROR("opening pipe");
4733 return -1;
4734 }
4735 d.fn = fn;
4736 d.fn_name = fn_name;
4737 d.arg = data;
4738 d.p[0] = p[0];
4739 d.p[1] = p[1];
4740
4741 /* Clone child in new user namespace. */
4742 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4743 if (pid < 0) {
4744 ERROR("failed to clone child process in new user namespace");
4745 goto on_error;
4746 }
4747
4748 close(p[0]);
4749 p[0] = -1;
4750
4751 /* Find container root. */
4752 lxc_list_for_each(it, &conf->id_map) {
4753 map = it->elem;
4754
4755 if (map->nsid != 0)
4756 continue;
4757
4758 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4759 container_root_uid = malloc(sizeof(*container_root_uid));
4760 if (!container_root_uid)
4761 goto on_error;
4762 container_root_uid->idtype = map->idtype;
4763 container_root_uid->hostid = map->hostid;
4764 container_root_uid->nsid = 0;
4765 container_root_uid->range = map->range;
4766 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4767 container_root_gid = malloc(sizeof(*container_root_gid));
4768 if (!container_root_gid)
4769 goto on_error;
4770 container_root_gid->idtype = map->idtype;
4771 container_root_gid->hostid = map->hostid;
4772 container_root_gid->nsid = 0;
4773 container_root_gid->range = map->range;
4774 }
4775
4776 /* Found container root. */
4777 if (container_root_uid && container_root_gid)
4778 break;
4779 }
4780
4781 /* This is actually checked earlier but it can't hurt. */
4782 if (!container_root_uid || !container_root_gid) {
4783 ERROR("no mapping for container root found");
4784 goto on_error;
4785 }
4786
4787 host_uid_map = container_root_uid;
4788 host_gid_map = container_root_gid;
4789
4790 /* Check whether the {g,u}id of the user has a mapping. */
4791 euid = geteuid();
4792 egid = getegid();
4793 if (euid != container_root_uid->hostid)
4794 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4795
4796 if (egid != container_root_gid->hostid)
4797 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4798
4799 if (!host_uid_map) {
4800 DEBUG("failed to find mapping for uid %d", euid);
4801 goto on_error;
4802 }
4803
4804 if (!host_gid_map) {
4805 DEBUG("failed to find mapping for gid %d", egid);
4806 goto on_error;
4807 }
4808
4809 /* Allocate new {g,u}id map list. */
4810 idmap = malloc(sizeof(*idmap));
4811 if (!idmap)
4812 goto on_error;
4813 lxc_list_init(idmap);
4814
4815 /* Add container root to the map. */
4816 tmplist = malloc(sizeof(*tmplist));
4817 if (!tmplist)
4818 goto on_error;
4819 lxc_list_add_elem(tmplist, container_root_uid);
4820 lxc_list_add_tail(idmap, tmplist);
4821
4822 if (host_uid_map && (host_uid_map != container_root_uid)) {
4823 /* idmap will now keep track of that memory. */
4824 container_root_uid = NULL;
4825
4826 /* Add container root to the map. */
4827 tmplist = malloc(sizeof(*tmplist));
4828 if (!tmplist)
4829 goto on_error;
4830 lxc_list_add_elem(tmplist, host_uid_map);
4831 lxc_list_add_tail(idmap, tmplist);
4832 }
4833 /* idmap will now keep track of that memory. */
4834 container_root_uid = NULL;
4835 /* idmap will now keep track of that memory. */
4836 host_uid_map = NULL;
4837
4838 tmplist = malloc(sizeof(*tmplist));
4839 if (!tmplist)
4840 goto on_error;
4841 lxc_list_add_elem(tmplist, container_root_gid);
4842 lxc_list_add_tail(idmap, tmplist);
4843
4844 if (host_gid_map && (host_gid_map != container_root_gid)) {
4845 /* idmap will now keep track of that memory. */
4846 container_root_gid = NULL;
4847
4848 tmplist = malloc(sizeof(*tmplist));
4849 if (!tmplist)
4850 goto on_error;
4851 lxc_list_add_elem(tmplist, host_gid_map);
4852 lxc_list_add_tail(idmap, tmplist);
4853 }
4854 /* idmap will now keep track of that memory. */
4855 container_root_gid = NULL;
4856 /* idmap will now keep track of that memory. */
4857 host_gid_map = NULL;
4858
4859 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4860 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4861 lxc_list_for_each(it, idmap) {
4862 map = it->elem;
4863 TRACE("establishing %cid mapping for \"%d\" in new "
4864 "user namespace: nsuid %lu - hostid %lu - range "
4865 "%lu",
4866 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4867 map->nsid, map->hostid, map->range);
4868 }
4869 }
4870
4871 /* Set up {g,u}id mapping for user namespace of child process. */
4872 ret = lxc_map_ids(idmap, pid);
4873 if (ret < 0) {
4874 ERROR("error setting up {g,u}id mappings for child process "
4875 "\"%d\"",
4876 pid);
4877 goto on_error;
4878 }
4879
4880 /* Tell child to proceed. */
4881 if (write(p[1], &c, 1) != 1) {
4882 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4883 goto on_error;
4884 }
4885
4886 /* Wait for child to finish. */
4887 ret = wait_for_pid(pid);
4888
4889 on_error:
4890 if (idmap)
4891 lxc_free_idmap(idmap);
4892 if (container_root_uid)
4893 free(container_root_uid);
4894 if (container_root_gid)
4895 free(container_root_gid);
4896 if (host_uid_map && (host_uid_map != container_root_uid))
4897 free(host_uid_map);
4898 if (host_gid_map && (host_gid_map != container_root_gid))
4899 free(host_gid_map);
4900
4901 if (p[0] != -1)
4902 close(p[0]);
4903 close(p[1]);
4904
4905 return ret;
4906 }
4907
/* Return a newly allocated copy of the login name belonging to the effective
 * uid, or NULL if there is no passwd entry or the copy cannot be allocated.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4919
/* Return a newly allocated copy of the group name belonging to the effective
 * gid, or NULL if there is no group entry or the copy cannot be allocated.
 * The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4931
4932 /* not thread-safe, do not use from api without first forking */
4933 void suggest_default_idmap(void)
4934 {
4935 FILE *f;
4936 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4937 char *line = NULL;
4938 char *uname, *gname;
4939 size_t len = 0;
4940
4941 if (!(uname = getuname()))
4942 return;
4943
4944 if (!(gname = getgname())) {
4945 free(uname);
4946 return;
4947 }
4948
4949 f = fopen(subuidfile, "r");
4950 if (!f) {
4951 ERROR("Your system is not configured with subuids");
4952 free(gname);
4953 free(uname);
4954 return;
4955 }
4956 while (getline(&line, &len, f) != -1) {
4957 size_t no_newline = 0;
4958 char *p = strchr(line, ':'), *p2;
4959 if (*line == '#')
4960 continue;
4961 if (!p)
4962 continue;
4963 *p = '\0';
4964 p++;
4965 if (strcmp(line, uname))
4966 continue;
4967 p2 = strchr(p, ':');
4968 if (!p2)
4969 continue;
4970 *p2 = '\0';
4971 p2++;
4972 if (!*p2)
4973 continue;
4974 no_newline = strcspn(p2, "\n");
4975 p2[no_newline] = '\0';
4976
4977 if (lxc_safe_uint(p, &uid) < 0)
4978 WARN("Could not parse UID.");
4979 if (lxc_safe_uint(p2, &urange) < 0)
4980 WARN("Could not parse UID range.");
4981 }
4982 fclose(f);
4983
4984 f = fopen(subgidfile, "r");
4985 if (!f) {
4986 ERROR("Your system is not configured with subgids");
4987 free(gname);
4988 free(uname);
4989 return;
4990 }
4991 while (getline(&line, &len, f) != -1) {
4992 size_t no_newline = 0;
4993 char *p = strchr(line, ':'), *p2;
4994 if (*line == '#')
4995 continue;
4996 if (!p)
4997 continue;
4998 *p = '\0';
4999 p++;
5000 if (strcmp(line, uname))
5001 continue;
5002 p2 = strchr(p, ':');
5003 if (!p2)
5004 continue;
5005 *p2 = '\0';
5006 p2++;
5007 if (!*p2)
5008 continue;
5009 no_newline = strcspn(p2, "\n");
5010 p2[no_newline] = '\0';
5011
5012 if (lxc_safe_uint(p, &gid) < 0)
5013 WARN("Could not parse GID.");
5014 if (lxc_safe_uint(p2, &grange) < 0)
5015 WARN("Could not parse GID range.");
5016 }
5017 fclose(f);
5018
5019 free(line);
5020
5021 if (!urange || !grange) {
5022 ERROR("You do not have subuids or subgids allocated");
5023 ERROR("Unprivileged containers require subuids and subgids");
5024 return;
5025 }
5026
5027 ERROR("You must either run as root, or define uid mappings");
5028 ERROR("To pass uid mappings to lxc-create, you could create");
5029 ERROR("~/.config/lxc/default.conf:");
5030 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5031 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5032 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5033
5034 free(gname);
5035 free(uname);
5036 }
5037
5038 static void free_cgroup_settings(struct lxc_list *result)
5039 {
5040 struct lxc_list *iterator, *next;
5041
5042 lxc_list_for_each_safe(iterator, result, next) {
5043 lxc_list_del(iterator);
5044 free(iterator);
5045 }
5046 free(result);
5047 }
5048
5049 /*
5050 * Return the list of cgroup_settings sorted according to the following rules
5051 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5052 */
5053 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5054 {
5055 struct lxc_list *result;
5056 struct lxc_list *memsw_limit = NULL;
5057 struct lxc_list *it = NULL;
5058 struct lxc_cgroup *cg = NULL;
5059 struct lxc_list *item = NULL;
5060
5061 result = malloc(sizeof(*result));
5062 if (!result) {
5063 ERROR("failed to allocate memory to sort cgroup settings");
5064 return NULL;
5065 }
5066 lxc_list_init(result);
5067
5068 /*Iterate over the cgroup settings and copy them to the output list*/
5069 lxc_list_for_each(it, cgroup_settings) {
5070 item = malloc(sizeof(*item));
5071 if (!item) {
5072 ERROR("failed to allocate memory to sort cgroup settings");
5073 free_cgroup_settings(result);
5074 return NULL;
5075 }
5076 item->elem = it->elem;
5077 cg = it->elem;
5078 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5079 /* Store the memsw_limit location */
5080 memsw_limit = item;
5081 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5082 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5083 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5084 item->elem = memsw_limit->elem;
5085 memsw_limit->elem = it->elem;
5086 }
5087 lxc_list_add_tail(result, item);
5088 }
5089
5090 return result;
5091 }