]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
conf: increase lxc-user-nic buffer
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
148 /* Define pivot_root() if missing from the C library */
149 #ifndef HAVE_PIVOT_ROOT
/* Stand-in for a C library without pivot_root(): issue the raw system call
 * when its number is known at build time, otherwise fail with ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
159 #else
160 extern int pivot_root(const char * new_root, const char * put_old);
161 #endif
162
163 /* Define sethostname() if missing from the C library */
164 #ifndef HAVE_SETHOSTNAME
/* Stand-in for a C library without sethostname(): issue the raw system call
 * when its number is known at build time, otherwise fail with ENOSYS.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
174 #endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
193 #ifndef HAVE_MEMFD_CREATE
/* Kernel headers older than the system call do not define its number, so
 * supply it per architecture. Architectures not listed here are left without
 * a number and get the ENOSYS fallback below.
 */
#if !defined(__NR_memfd_create)
# if defined __i386__
#  define __NR_memfd_create 356
# elif defined __x86_64__
#  define __NR_memfd_create 319
# elif defined __arm__
#  define __NR_memfd_create 385
# elif defined __aarch64__
#  define __NR_memfd_create 279
# elif defined __s390__
#  define __NR_memfd_create 350
# elif defined __powerpc__
#  define __NR_memfd_create 360
# elif defined __sparc__
#  define __NR_memfd_create 348
# elif defined __blackfin__
#  define __NR_memfd_create 390
# elif defined __ia64__
#  define __NR_memfd_create 1340
# elif defined _MIPS_SIM
#  if _MIPS_SIM == _MIPS_SIM_ABI32
#   define __NR_memfd_create 4354
#  endif
#  if _MIPS_SIM == _MIPS_SIM_NABI32
#   define __NR_memfd_create 6318
#  endif
#  if _MIPS_SIM == _MIPS_SIM_ABI64
#   define __NR_memfd_create 5314
#  endif
# endif
#endif

/* Stand-in for a C library without memfd_create(): issue the raw system call
 * when its number is known, otherwise fail with ENOSYS.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
233 #else
234 extern int memfd_create(const char *name, unsigned int flags);
235 #endif
236
237 char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
238 "autodev", "start", "stop",
239 "post-stop", "clone", "destroy"};
240
241 typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
242
/* One row of the mount option table: an option keyword and the MS_* flag it
 * refers to.
 * NOTE(review): judging from the table contents, a nonzero "clear" marks
 * keywords that unset the flag instead of setting it - confirm against the
 * option parser.
 */
struct mount_opt {
	char *name;	/* option keyword */
	int clear;	/* see NOTE(review) above */
	int flag;	/* MS_* bit(s) associated with the keyword */
};
248
/* One row of the capability table: a capability name and its number. */
struct caps_opt {
	char *name;	/* capability name, e.g. "chown" */
	int value;	/* matching CAP_* constant */
};
253
/* One row of the resource limit table: a limit name and its resource id. */
struct limit_opt {
	char *name;	/* limit name, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
258
/*
 * The lxc_conf of the container that the API call in progress is working
 * on. It is consulted by the error calls. When thread-local storage is
 * available (HAVE_TLS) every thread has its own instance.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
269
270 /* Declare this here, since we don't want to reshuffle the whole file. */
271 static int in_caplist(int cap, struct lxc_list *caps);
272
273 static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
274 static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
275 static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
276 static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
277 static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
278 static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
279
280 static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
281 [LXC_NET_VETH] = instantiate_veth,
282 [LXC_NET_MACVLAN] = instantiate_macvlan,
283 [LXC_NET_VLAN] = instantiate_vlan,
284 [LXC_NET_PHYS] = instantiate_phys,
285 [LXC_NET_EMPTY] = instantiate_empty,
286 [LXC_NET_NONE] = instantiate_none,
287 };
288
289 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
290 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
291 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
292 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
293 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
294 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
295
296 static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
297 [LXC_NET_VETH] = shutdown_veth,
298 [LXC_NET_MACVLAN] = shutdown_macvlan,
299 [LXC_NET_VLAN] = shutdown_vlan,
300 [LXC_NET_PHYS] = shutdown_phys,
301 [LXC_NET_EMPTY] = shutdown_empty,
302 [LXC_NET_NONE] = shutdown_none,
303 };
304
305 static struct mount_opt mount_opt[] = {
306 { "async", 1, MS_SYNCHRONOUS },
307 { "atime", 1, MS_NOATIME },
308 { "bind", 0, MS_BIND },
309 { "defaults", 0, 0 },
310 { "dev", 1, MS_NODEV },
311 { "diratime", 1, MS_NODIRATIME },
312 { "dirsync", 0, MS_DIRSYNC },
313 { "exec", 1, MS_NOEXEC },
314 { "lazytime", 0, MS_LAZYTIME },
315 { "mand", 0, MS_MANDLOCK },
316 { "noatime", 0, MS_NOATIME },
317 { "nodev", 0, MS_NODEV },
318 { "nodiratime", 0, MS_NODIRATIME },
319 { "noexec", 0, MS_NOEXEC },
320 { "nomand", 1, MS_MANDLOCK },
321 { "norelatime", 1, MS_RELATIME },
322 { "nostrictatime", 1, MS_STRICTATIME },
323 { "nosuid", 0, MS_NOSUID },
324 { "rbind", 0, MS_BIND|MS_REC },
325 { "relatime", 0, MS_RELATIME },
326 { "remount", 0, MS_REMOUNT },
327 { "ro", 0, MS_RDONLY },
328 { "rw", 1, MS_RDONLY },
329 { "strictatime", 0, MS_STRICTATIME },
330 { "suid", 1, MS_NOSUID },
331 { "sync", 0, MS_SYNCHRONOUS },
332 { NULL, 0, 0 },
333 };
334
335 #if HAVE_LIBCAP
336 static struct caps_opt caps_opt[] = {
337 { "chown", CAP_CHOWN },
338 { "dac_override", CAP_DAC_OVERRIDE },
339 { "dac_read_search", CAP_DAC_READ_SEARCH },
340 { "fowner", CAP_FOWNER },
341 { "fsetid", CAP_FSETID },
342 { "kill", CAP_KILL },
343 { "setgid", CAP_SETGID },
344 { "setuid", CAP_SETUID },
345 { "setpcap", CAP_SETPCAP },
346 { "linux_immutable", CAP_LINUX_IMMUTABLE },
347 { "net_bind_service", CAP_NET_BIND_SERVICE },
348 { "net_broadcast", CAP_NET_BROADCAST },
349 { "net_admin", CAP_NET_ADMIN },
350 { "net_raw", CAP_NET_RAW },
351 { "ipc_lock", CAP_IPC_LOCK },
352 { "ipc_owner", CAP_IPC_OWNER },
353 { "sys_module", CAP_SYS_MODULE },
354 { "sys_rawio", CAP_SYS_RAWIO },
355 { "sys_chroot", CAP_SYS_CHROOT },
356 { "sys_ptrace", CAP_SYS_PTRACE },
357 { "sys_pacct", CAP_SYS_PACCT },
358 { "sys_admin", CAP_SYS_ADMIN },
359 { "sys_boot", CAP_SYS_BOOT },
360 { "sys_nice", CAP_SYS_NICE },
361 { "sys_resource", CAP_SYS_RESOURCE },
362 { "sys_time", CAP_SYS_TIME },
363 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
364 { "mknod", CAP_MKNOD },
365 { "lease", CAP_LEASE },
366 #ifdef CAP_AUDIT_READ
367 { "audit_read", CAP_AUDIT_READ },
368 #endif
369 #ifdef CAP_AUDIT_WRITE
370 { "audit_write", CAP_AUDIT_WRITE },
371 #endif
372 #ifdef CAP_AUDIT_CONTROL
373 { "audit_control", CAP_AUDIT_CONTROL },
374 #endif
375 { "setfcap", CAP_SETFCAP },
376 { "mac_override", CAP_MAC_OVERRIDE },
377 { "mac_admin", CAP_MAC_ADMIN },
378 #ifdef CAP_SYSLOG
379 { "syslog", CAP_SYSLOG },
380 #endif
381 #ifdef CAP_WAKE_ALARM
382 { "wake_alarm", CAP_WAKE_ALARM },
383 #endif
384 #ifdef CAP_BLOCK_SUSPEND
385 { "block_suspend", CAP_BLOCK_SUSPEND },
386 #endif
387 };
388 #else
389 static struct caps_opt caps_opt[] = {};
390 #endif
391
392 static struct limit_opt limit_opt[] = {
393 #ifdef RLIMIT_AS
394 { "as", RLIMIT_AS },
395 #endif
396 #ifdef RLIMIT_CORE
397 { "core", RLIMIT_CORE },
398 #endif
399 #ifdef RLIMIT_CPU
400 { "cpu", RLIMIT_CPU },
401 #endif
402 #ifdef RLIMIT_DATA
403 { "data", RLIMIT_DATA },
404 #endif
405 #ifdef RLIMIT_FSIZE
406 { "fsize", RLIMIT_FSIZE },
407 #endif
408 #ifdef RLIMIT_LOCKS
409 { "locks", RLIMIT_LOCKS },
410 #endif
411 #ifdef RLIMIT_MEMLOCK
412 { "memlock", RLIMIT_MEMLOCK },
413 #endif
414 #ifdef RLIMIT_MSGQUEUE
415 { "msgqueue", RLIMIT_MSGQUEUE },
416 #endif
417 #ifdef RLIMIT_NICE
418 { "nice", RLIMIT_NICE },
419 #endif
420 #ifdef RLIMIT_NOFILE
421 { "nofile", RLIMIT_NOFILE },
422 #endif
423 #ifdef RLIMIT_NPROC
424 { "nproc", RLIMIT_NPROC },
425 #endif
426 #ifdef RLIMIT_RSS
427 { "rss", RLIMIT_RSS },
428 #endif
429 #ifdef RLIMIT_RTPRIO
430 { "rtprio", RLIMIT_RTPRIO },
431 #endif
432 #ifdef RLIMIT_RTTIME
433 { "rttime", RLIMIT_RTTIME },
434 #endif
435 #ifdef RLIMIT_SIGPENDING
436 { "sigpending", RLIMIT_SIGPENDING },
437 #endif
438 #ifdef RLIMIT_STACK
439 { "stack", RLIMIT_STACK },
440 #endif
441 };
442
443 static int run_buffer(char *buffer)
444 {
445 struct lxc_popen_FILE *f;
446 char *output;
447 int ret;
448
449 f = lxc_popen(buffer);
450 if (!f) {
451 SYSERROR("Failed to popen() %s.", buffer);
452 return -1;
453 }
454
455 output = malloc(LXC_LOG_BUFFER_SIZE);
456 if (!output) {
457 ERROR("Failed to allocate memory for %s.", buffer);
458 lxc_pclose(f);
459 return -1;
460 }
461
462 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
463 DEBUG("Script %s with output: %s.", buffer, output);
464
465 free(output);
466
467 ret = lxc_pclose(f);
468 if (ret == -1) {
469 SYSERROR("Script exited with error.");
470 return -1;
471 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
472 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
473 return -1;
474 } else if (WIFSIGNALED(ret)) {
475 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
476 return -1;
477 }
478
479 return 0;
480 }
481
/* Run a hook script with an argument vector appended.
 *
 * The command line handed to run_buffer() is
 *   "<script> <name> <section> <hook> [<argsin[0]> <argsin[1]> ...]"
 *
 * @name    : container name
 * @section : config section the script belongs to
 * @script  : script to execute
 * @hook    : hook name
 * @lxcpath : not used by this function
 * @argsin  : NULL-terminated array of additional arguments, may be NULL
 *
 * Returns the result of run_buffer(), or -1 when the command line could not
 * be assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int i, ret;
	char *buffer;
	size_t pos;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* every extra argument is preceded by one blank */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two separating blanks and the terminating '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/* The size depends on the caller's strings and may get close to
	 * INT_MAX, so take the buffer from the heap: alloca() cannot report
	 * failure and would simply overrun the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto on_error;
	}
	pos = (size_t)ret;

	for (i = 0; argsin && argsin[i]; i++) {
		size_t len = size - pos;

		ret = snprintf(buffer + pos, len, " %s", argsin[i]);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Script args too long.");
			goto on_error;
		}
		pos += (size_t)ret;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;

on_error:
	free(buffer);
	return -1;
}
532
/* Run a script with a variable number of string arguments appended.
 *
 * The command line handed to run_buffer() is
 *   "<script> <name> <section> [<arg> ...]"
 *
 * @name    : container name
 * @section : config section the script belongs to
 * @script  : script to execute
 * @...     : additional "char *" arguments; the list MUST be terminated by
 *            a NULL pointer
 *
 * Returns the result of run_buffer(), or -1 when the command line could not
 * be assembled.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t pos;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* every extra argument is preceded by one blank */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two separating blanks and the terminating '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/* The size depends on the caller's strings and may get close to
	 * INT_MAX, so take the buffer from the heap: alloca() cannot report
	 * failure and would simply overrun the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}
	pos = (size_t)ret;

	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - pos;

		ret = snprintf(buffer + pos, len, " %s", p);
		if (ret < 0 || (size_t)ret >= len) {
			ERROR("Script args too long.");
			/* va_start() must be paired with va_end() on every
			 * path that leaves the function
			 */
			va_end(ap);
			free(buffer);
			return -1;
		}
		pos += (size_t)ret;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
584
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing so that the
 * caller can keep it open for the duration of the container run, to prevent
 * the container from marking the underlying fs readonly on shutdown. The
 * file is unlinked immediately so that no name pollution happens.
 * return -1 on error.
 * return -2 if nothing needed to be pinned.
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	/* a rootfs that cannot be resolved is treated as "nothing to pin" */
	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	/* only directory backed root filesystems get pinned */
	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, sizeof(absrootfspin), "%s/lxc.hold",
		       absrootfs);
	/* reject output errors as well as truncation */
	if (ret < 0 || (size_t)ret >= sizeof(absrootfspin))
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* the open descriptor is enough, the name is not needed any longer */
	(void)unlink(absrootfspin);
	return fd;
}
627
/*
 * If we are asking to remount something, make sure that any NOSUID, NODEV,
 * RDONLY or NOEXEC restriction the mount already carries is honored.
 *
 * @s     : mount source; when NULL the destination @d is inspected instead
 * @d     : mount destination
 * @flags : requested mount flags
 *
 * Returns @flags, with the restrictions reported by statvfs() OR-ed in when
 * MS_REMOUNT is requested. Returns @flags unchanged when this is no remount,
 * when there is no path to inspect, when statvfs() fails or when the build
 * has no statvfs() support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifndef HAVE_STATVFS
	return flags;
#else
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs vfs;
	size_t k;

	if (!(flags & MS_REMOUNT) || !target)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	for (k = 0; k < sizeof(sticky) / sizeof(sticky[0]); k++) {
		if (vfs.f_flag & sticky[k])
			flags |= sticky[k];
	}

	return flags;
#endif
}
664
665 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
666 {
667 int r;
668 int i;
669 static struct {
670 int match_mask;
671 int match_flag;
672 const char *source;
673 const char *destination;
674 const char *fstype;
675 unsigned long flags;
676 const char *options;
677 } default_mounts[] = {
678 /* Read-only bind-mounting... In older kernels, doing that required
679 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
680 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
681 * kernel 2.6.26 onwards. However, this apparently does not work on
682 * kernel 3.8. Unfortunately, on that very same kernel, doing the
683 * same trick as above doesn't seem to work either, there one needs
684 * to ALSO specify MS_BIND for the remount, otherwise the entire
685 * fs is remounted read-only or the mount fails because it's busy...
686 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
687 * 2.6.32...
688 */
689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
690 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
705 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
706 { 0, 0, NULL, NULL, NULL, 0, NULL }
707 };
708
709 for (i = 0; default_mounts[i].match_mask; i++) {
710 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
711 char *source = NULL;
712 char *destination = NULL;
713 int saved_errno;
714 unsigned long mflags;
715
716 if (default_mounts[i].source) {
717 /* will act like strdup if %r is not present */
718 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
719 if (!source) {
720 SYSERROR("memory allocation error");
721 return -1;
722 }
723 }
724 if (!default_mounts[i].destination) {
725 ERROR("BUG: auto mounts destination %d was NULL", i);
726 free(source);
727 return -1;
728 }
729 /* will act like strdup if %r is not present */
730 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
731 if (!destination) {
732 saved_errno = errno;
733 SYSERROR("memory allocation error");
734 free(source);
735 errno = saved_errno;
736 return -1;
737 }
738 mflags = add_required_remount_flags(source, destination,
739 default_mounts[i].flags);
740 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
741 saved_errno = errno;
742 if (r < 0 && errno == ENOENT) {
743 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
744 r = 0;
745 }
746 else if (r < 0)
747 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
748
749 free(source);
750 free(destination);
751 if (r < 0) {
752 errno = saved_errno;
753 return -1;
754 }
755 }
756 }
757
758 if (flags & LXC_AUTO_CGROUP_MASK) {
759 int cg_flags;
760
761 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
771 int has_sys_admin = 0;
772
773 if (!lxc_list_empty(&conf->keepcaps))
774 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
775 else
776 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
777
778 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
779 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
780 else
781 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
782 }
783
784 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
785 SYSERROR("error mounting /sys/fs/cgroup");
786 return -1;
787 }
788 }
789
790 return 0;
791 }
792
/* Set the hostname to the nodename stored in @utsname.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 when
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (!utsname)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
807
/* Description of one symbolic link to create below /dev. */
struct dev_symlinks {
	const char *oldpath;	/* what the link points to */
	const char *name;	/* name of the link, relative to /dev */
};
812
813 static const struct dev_symlinks dev_symlinks[] = {
814 {"/proc/self/fd", "fd"},
815 {"/proc/self/fd/0", "stdin"},
816 {"/proc/self/fd/1", "stdout"},
817 {"/proc/self/fd/2", "stderr"},
818 };
819
820 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
821 {
822 char path[MAXPATHLEN];
823 int ret,i;
824 struct stat s;
825
826
827 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
828 const struct dev_symlinks *d = &dev_symlinks[i];
829 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
830 if (ret < 0 || ret >= MAXPATHLEN)
831 return -1;
832
833 /*
834 * Stat the path first. If we don't get an error
835 * accept it as is and don't try to create it
836 */
837 if (!stat(path, &s)) {
838 continue;
839 }
840
841 ret = symlink(d->oldpath, path);
842
843 if (ret && errno != EEXIST) {
844 if ( errno == EROFS ) {
845 WARN("Warning: Read Only file system while creating %s", path);
846 } else {
847 SYSERROR("Error creating %s", path);
848 return -1;
849 }
850 }
851 }
852 return 0;
853 }
854
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * When *pp is NULL a new string "container_ttys=<name>" is allocated,
 * otherwise " <name>" is appended to the existing string, which is grown
 * with realloc().
 *
 * Returns true on success. Returns false when memory could not be obtained;
 * *pp is left untouched in that case.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t old_len;
	char *list;

	if (*pp == NULL) {
		list = malloc(prefix_len + name_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* copies the terminating '\0' as well */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	old_len = strlen(*pp);
	/* one byte for the blank, one for the terminating '\0' */
	list = realloc(*pp, old_len + name_len + 2);
	if (list == NULL)
		return false;

	list[old_len] = ' ';
	memcpy(list + old_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
877
878 static int lxc_setup_tty(struct lxc_conf *conf)
879 {
880 int i, ret;
881 const struct lxc_tty_info *tty_info = &conf->tty_info;
882 char *ttydir = conf->ttydir;
883 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
884
885 if (!conf->rootfs.path)
886 return 0;
887
888 for (i = 0; i < tty_info->nbtty; i++) {
889 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
890
891 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
892 if (ret < 0 || (size_t)ret >= sizeof(path)) {
893 ERROR("pathname too long for ttys");
894 return -1;
895 }
896
897 if (ttydir) {
898 /* create dev/lxc/tty%d" */
899 ret = snprintf(lxcpath, sizeof(lxcpath),
900 "/dev/%s/tty%d", ttydir, i + 1);
901 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
902 ERROR("pathname too long for ttys");
903 return -1;
904 }
905
906 ret = creat(lxcpath, 0660);
907 if (ret < 0 && errno != EEXIST) {
908 SYSERROR("failed to create \"%s\"", lxcpath);
909 return -1;
910 }
911 if (ret >= 0)
912 close(ret);
913
914 ret = unlink(path);
915 if (ret < 0 && errno != ENOENT) {
916 SYSERROR("failed to unlink \"%s\"", path);
917 return -1;
918 }
919
920 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
921 if (ret < 0) {
922 WARN("failed to bind mount \"%s\" onto \"%s\"",
923 pty_info->name, path);
924 continue;
925 }
926 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
927 path);
928
929 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
930 ttydir, i + 1);
931 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
932 ERROR("tty pathname too long");
933 return -1;
934 }
935
936 ret = symlink(lxcpath, path);
937 if (ret < 0) {
938 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
939 path, lxcpath);
940 return -1;
941 }
942 } else {
943 /* If we populated /dev, then we need to create
944 * /dev/ttyN
945 */
946 ret = access(path, F_OK);
947 if (ret < 0) {
948 ret = creat(path, 0660);
949 if (ret < 0) {
950 SYSERROR("failed to create \"%s\"", path);
951 /* this isn't fatal, continue */
952 } else {
953 close(ret);
954 }
955 }
956
957 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
958 if (ret < 0) {
959 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
960 continue;
961 }
962
963 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
964 path);
965 }
966
967 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
968 ERROR("Error setting up container_ttys string");
969 return -1;
970 }
971 }
972
973 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
974 return 0;
975 }
976
977 static int setup_rootfs_pivot_root(const char *rootfs)
978 {
979 int oldroot = -1, newroot = -1;
980
981 oldroot = open("/", O_DIRECTORY | O_RDONLY);
982 if (oldroot < 0) {
983 SYSERROR("Error opening old-/ for fchdir");
984 return -1;
985 }
986 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
987 if (newroot < 0) {
988 SYSERROR("Error opening new-/ for fchdir");
989 goto fail;
990 }
991
992 /* change into new root fs */
993 if (fchdir(newroot)) {
994 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
995 goto fail;
996 }
997
998 /* pivot_root into our new root fs */
999 if (pivot_root(".", ".")) {
1000 SYSERROR("pivot_root syscall failed");
1001 goto fail;
1002 }
1003
1004 /*
1005 * at this point the old-root is mounted on top of our new-root
1006 * To unmounted it we must not be chdir'd into it, so escape back
1007 * to old-root
1008 */
1009 if (fchdir(oldroot) < 0) {
1010 SYSERROR("Error entering oldroot");
1011 goto fail;
1012 }
1013 if (umount2(".", MNT_DETACH) < 0) {
1014 SYSERROR("Error detaching old root");
1015 goto fail;
1016 }
1017
1018 if (fchdir(newroot) < 0) {
1019 SYSERROR("Error re-entering newroot");
1020 goto fail;
1021 }
1022
1023 close(oldroot);
1024 close(newroot);
1025
1026 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1027
1028 return 0;
1029
1030 fail:
1031 if (oldroot != -1)
1032 close(oldroot);
1033 if (newroot != -1)
1034 close(newroot);
1035 return -1;
1036 }
1037
1038 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1039 * error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1042 const char *lxcpath)
1043 {
1044 int ret;
1045 size_t clen;
1046 char *path;
1047
1048 INFO("Preparing \"/dev\"");
1049
1050 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1051 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1052 path = alloca(clen);
1053
1054 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1055 if (ret < 0 || (size_t)ret >= clen)
1056 return -1;
1057
1058 if (!dir_exists(path)) {
1059 WARN("\"/dev\" directory does not exist. Proceeding without "
1060 "autodev being set up");
1061 return 0;
1062 }
1063
1064 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1065 rootfs->path ? rootfs->mount : NULL);
1066 if (ret < 0) {
1067 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1068 return -1;
1069 }
1070 INFO("Mounted tmpfs on \"%s\"", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || (size_t)ret >= clen)
1074 return -1;
1075
1076 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1077 * If not, then create it and exit if that fails...
1078 */
1079 if (!dir_exists(path)) {
1080 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1081 if (ret < 0) {
1082 SYSERROR("Failed to create directory \"%s\"", path);
1083 return -1;
1084 }
1085 }
1086
1087 INFO("Prepared \"/dev\"");
1088 return 0;
1089 }
1090
/* Description of one device node to provide below /dev. */
struct lxc_devs {
	const char *name;	/* node name, relative to /dev */
	mode_t mode;		/* file type and permission bits for mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};
1097
1098 static const struct lxc_devs lxc_devs[] = {
1099 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1100 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1101 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1102 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1103 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1104 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
1105 };
1106
1107 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1108 {
1109 int ret;
1110 char path[MAXPATHLEN];
1111 int i;
1112 mode_t cmask;
1113
1114 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1115 rootfs->path ? rootfs->mount : "");
1116 if (ret < 0 || ret >= MAXPATHLEN)
1117 return -1;
1118
1119 /* ignore, just don't try to fill in */
1120 if (!dir_exists(path))
1121 return 0;
1122
1123 INFO("Populating \"/dev\"");
1124
1125 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1126 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1127 const struct lxc_devs *d = &lxc_devs[i];
1128
1129 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1130 rootfs->path ? rootfs->mount : "", d->name);
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
1133
1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1135 if (ret < 0) {
1136 FILE *pathfile;
1137 char hostpath[MAXPATHLEN];
1138
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150
1151 pathfile = fopen(path, "wb");
1152 if (!pathfile) {
1153 SYSERROR("Failed to create file \"%s\"", path);
1154 return -1;
1155 }
1156 fclose(pathfile);
1157
1158 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1159 rootfs->path ? rootfs->mount : NULL);
1160 if (ret < 0) {
1161 SYSERROR("Failed to bind mount \"%s\" from "
1162 "host into container",
1163 d->name);
1164 return -1;
1165 }
1166 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1167 path);
1168 } else {
1169 DEBUG("Created device node \"%s\"", path);
1170 }
1171 }
1172 umask(cmask);
1173
1174 INFO("Populated \"/dev\"");
1175 return 0;
1176 }
1177
1178 static int lxc_setup_rootfs(struct lxc_conf *conf)
1179 {
1180 int ret;
1181 struct lxc_storage *bdev;
1182 const struct lxc_rootfs *rootfs;
1183
1184 rootfs = &conf->rootfs;
1185 if (!rootfs->path) {
1186 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1187 SYSERROR("Failed to make / rslave.");
1188 return -1;
1189 }
1190 return 0;
1191 }
1192
1193 if (access(rootfs->mount, F_OK)) {
1194 SYSERROR("Failed to access to \"%s\". Check it is present.",
1195 rootfs->mount);
1196 return -1;
1197 }
1198
1199 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1200 if (!bdev) {
1201 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1202 rootfs->path, rootfs->mount,
1203 rootfs->options ? rootfs->options : "(null)");
1204 return -1;
1205 }
1206
1207 ret = bdev->ops->mount(bdev);
1208 storage_put(bdev);
1209 if (ret < 0) {
1210 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1211 rootfs->path, rootfs->mount,
1212 rootfs->options ? rootfs->options : "(null)");
1213 return -1;
1214 }
1215
1216 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1217 rootfs->path, rootfs->mount,
1218 rootfs->options ? rootfs->options : "(null)");
1219
1220 return 0;
1221 }
1222
1223 int prepare_ramfs_root(char *root)
1224 {
1225 char buf[LXC_LINELEN], *p;
1226 char nroot[PATH_MAX];
1227 FILE *f;
1228 int i;
1229 char *p2;
1230
1231 if (realpath(root, nroot) == NULL)
1232 return -errno;
1233
1234 if (chdir("/") == -1)
1235 return -errno;
1236
1237 /*
1238 * We could use here MS_MOVE, but in userns this mount is
1239 * locked and can't be moved.
1240 */
1241 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1242 SYSERROR("Failed to move %s into /", root);
1243 return -errno;
1244 }
1245
1246 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1247 SYSERROR("Failed to make . rprivate");
1248 return -errno;
1249 }
1250
1251 /*
1252 * The following code cleans up inhereted mounts which are not
1253 * required for CT.
1254 *
1255 * The mountinfo file shows not all mounts, if a few points have been
1256 * unmounted between read operations from the mountinfo. So we need to
1257 * read mountinfo a few times.
1258 *
1259 * This loop can be skipped if a container uses unserns, because all
1260 * inherited mounts are locked and we should live with all this trash.
1261 */
1262 while (1) {
1263 int progress = 0;
1264
1265 f = fopen("./proc/self/mountinfo", "r");
1266 if (!f) {
1267 SYSERROR("Unable to open /proc/self/mountinfo");
1268 return -1;
1269 }
1270 while (fgets(buf, LXC_LINELEN, f)) {
1271 for (p = buf, i=0; p && i < 4; i++)
1272 p = strchr(p+1, ' ');
1273 if (!p)
1274 continue;
1275 p2 = strchr(p+1, ' ');
1276 if (!p2)
1277 continue;
1278
1279 *p2 = '\0';
1280 *p = '.';
1281
1282 if (strcmp(p + 1, "/") == 0)
1283 continue;
1284 if (strcmp(p + 1, "/proc") == 0)
1285 continue;
1286
1287 if (umount2(p, MNT_DETACH) == 0)
1288 progress++;
1289 }
1290 fclose(f);
1291 if (!progress)
1292 break;
1293 }
1294
1295 /* This also can be skipped if a container uses unserns */
1296 umount2("./proc", MNT_DETACH);
1297
1298 /* It is weird, but chdir("..") moves us in a new root */
1299 if (chdir("..") == -1) {
1300 SYSERROR("Unable to change working directory");
1301 return -1;
1302 }
1303
1304 if (chroot(".") == -1) {
1305 SYSERROR("Unable to chroot");
1306 return -1;
1307 }
1308
1309 return 0;
1310 }
1311
1312 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1313 {
1314 if (!rootfs->path) {
1315 DEBUG("container does not have a rootfs, so not doing pivot root");
1316 return 0;
1317 }
1318
1319 if (detect_ramfs_rootfs()) {
1320 DEBUG("detected that container is on ramfs");
1321 if (prepare_ramfs_root(rootfs->mount)) {
1322 ERROR("failed to prepare minimal ramfs root");
1323 return -1;
1324 }
1325
1326 DEBUG("prepared ramfs root for container");
1327 return 0;
1328 }
1329
1330 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1331 ERROR("failed to pivot root");
1332 return -1;
1333 }
1334
1335 DEBUG("finished pivot root");
1336 return 0;
1337 }
1338
/* Mount a new devpts instance on "/dev/pts" limited to @num_pts ptys and make
 * "/dev/ptmx" refer to its ptmx node.
 *
 * "/dev/ptmx" is preferably a bind mount of "/dev/pts/ptmx"; if the bind
 * mount fails a symlink is created instead.
 *
 * Returns 0 on success, or when @num_pts is 0, and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	static const char base_mntopts[] = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char mntopts[256];
	int len, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts devices are requested");
		return 0;
	}

	len = snprintf(mntopts, sizeof(mntopts), "%s,max=%d", base_mntopts, num_pts);
	if (len < 0 || (size_t)len >= sizeof(mntopts))
		return -1;

	/* An existing ptmx node means an old devpts instance is mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists; an existing one is fine. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* Get rid of whatever currently sits at /dev/ptmx. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty file serves as the target of the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: fall through to the symlink fallback. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to go before a symlink can take its place. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1429
/* Apply the execution domain @persona via personality().
 *
 * A value of -1 leaves the personality untouched. When the build lacks
 * HAVE_SYS_PERSONALITY_H this function does nothing.
 *
 * Returns 0 on success and -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1446
1447 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1448 const struct lxc_console *console)
1449 {
1450 char path[MAXPATHLEN];
1451 int ret, fd;
1452
1453 if (console->path && !strcmp(console->path, "none"))
1454 return 0;
1455
1456 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1457 if (ret < 0 || (size_t)ret >= sizeof(path))
1458 return -1;
1459
1460 /* When we are asked to setup a console we remove any previous
1461 * /dev/console bind-mounts.
1462 */
1463 if (file_exists(path)) {
1464 ret = lxc_unstack_mountpoint(path, false);
1465 if (ret < 0) {
1466 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1467 return -ret;
1468 } else {
1469 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1470 }
1471
1472 ret = unlink(path);
1473 if (ret < 0) {
1474 SYSERROR("error unlinking %s", path);
1475 return -errno;
1476 }
1477 }
1478
1479 /* For unprivileged containers autodev or automounts will already have
1480 * taken care of creating /dev/console.
1481 */
1482 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1483 if (fd < 0) {
1484 if (errno != EEXIST) {
1485 SYSERROR("failed to create console");
1486 return -errno;
1487 }
1488 } else {
1489 close(fd);
1490 }
1491
1492 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1493 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1494 return -errno;
1495 }
1496
1497 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1498 ERROR("failed to mount '%s' on '%s'", console->name, path);
1499 return -1;
1500 }
1501
1502 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1503 return 0;
1504 }
1505
1506 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1507 const struct lxc_console *console,
1508 char *ttydir)
1509 {
1510 int ret;
1511 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1512
1513 /* create rootfs/dev/<ttydir> directory */
1514 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1515 if (ret < 0 || (size_t)ret >= sizeof(path))
1516 return -1;
1517
1518 ret = mkdir(path, 0755);
1519 if (ret && errno != EEXIST) {
1520 SYSERROR("failed with errno %d to create %s", errno, path);
1521 return -errno;
1522 }
1523 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1524
1525 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1526 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1527 return -1;
1528
1529 ret = creat(lxcpath, 0660);
1530 if (ret == -1 && errno != EEXIST) {
1531 SYSERROR("error %d creating %s", errno, lxcpath);
1532 return -errno;
1533 }
1534 if (ret >= 0)
1535 close(ret);
1536
1537 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1538 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1539 return -1;
1540
1541 /* When we are asked to setup a console we remove any previous
1542 * /dev/console bind-mounts.
1543 */
1544 if (console->path && !strcmp(console->path, "none")) {
1545 struct stat st;
1546 ret = stat(path, &st);
1547 if (ret < 0) {
1548 if (errno == ENOENT)
1549 return 0;
1550 SYSERROR("failed stat() \"%s\"", path);
1551 return -errno;
1552 }
1553
1554 /* /dev/console must be character device with major number 5 and
1555 * minor number 1. If not, give benefit of the doubt and assume
1556 * the user has mounted something else right there on purpose.
1557 */
1558 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1559 return 0;
1560
1561 /* In case the user requested a bind-mount for /dev/console and
1562 * requests a ttydir we move the mount to the
1563 * /dev/<ttydir/console.
1564 * Note, we only move the uppermost mount and clear all other
1565 * mounts underneath for safety.
1566 * If it is a character device created via mknod() we simply
1567 * rename it.
1568 */
1569 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1570 if (ret < 0) {
1571 if (errno != EINVAL) {
1572 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1573 return -errno;
1574 }
1575 /* path was not a mountpoint */
1576 ret = rename(path, lxcpath);
1577 if (ret < 0) {
1578 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1579 return -errno;
1580 }
1581 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1582 } else {
1583 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1584 }
1585
1586 /* Clear all remaining bind-mounts. */
1587 ret = lxc_unstack_mountpoint(path, false);
1588 if (ret < 0) {
1589 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1590 return -ret;
1591 } else {
1592 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1593 }
1594 } else {
1595 if (file_exists(path)) {
1596 ret = lxc_unstack_mountpoint(path, false);
1597 if (ret < 0) {
1598 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1599 return -ret;
1600 } else {
1601 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1602 }
1603 }
1604
1605 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1606 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1607 return -1;
1608 }
1609 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1610 }
1611
1612 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1613 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1614 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1615 return -1;
1616
1617 ret = unlink(path);
1618 if (ret && errno != ENOENT) {
1619 SYSERROR("error unlinking %s", path);
1620 return -errno;
1621 }
1622
1623 ret = symlink(lxcpath, path);
1624 if (ret < 0) {
1625 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1626 return -1;
1627 }
1628
1629 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1630 return 0;
1631 }
1632
1633 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1634 const struct lxc_console *console, char *ttydir)
1635 {
1636 /* We don't have a rootfs, /dev/console will be shared. */
1637 if (!rootfs->path) {
1638 DEBUG("/dev/console will be shared with the host");
1639 return 0;
1640 }
1641
1642 if (!ttydir)
1643 return lxc_setup_dev_console(rootfs, console);
1644
1645 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1646 }
1647
1648 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1649 {
1650 struct mount_opt *mo;
1651
1652 /* If opt is found in mount_opt, set or clear flags.
1653 * Otherwise append it to data. */
1654
1655 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1656 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1657 if (mo->clear)
1658 *flags &= ~mo->flag;
1659 else
1660 *flags |= mo->flag;
1661 return;
1662 }
1663 }
1664
1665 if (strlen(*data))
1666 strcat(*data, ",");
1667 strcat(*data, opt);
1668 }
1669
/* Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * @mntflags receives the flags collected by parse_mntopt(). @mntdata
 * receives a newly allocated string with the remaining options, or NULL if
 * there are none; the caller frees it.
 *
 * Returns 0 on success and -1 if memory allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *token;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the full option string. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	token = strtok_r(copy, ",", &state);
	while (token != NULL) {
		parse_mntopt(token, mntflags, &extra);
		token = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;
	free(copy);

	return 0;
}
1708
/* Terminate @word at its first space or tab; a string without either is left
 * unchanged.
 */
static void null_endofword(char *word)
{
	size_t len = strcspn(word, " \t");

	word[len] = '\0';
}
1715
/*
 * Skip @nfields space or tab separated fields in @src.
 *
 * Returns a pointer to the character following the @nfields-th separator, or
 * to the terminating NUL byte if the string ends first.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Jump to the next separator (or the end of the string). */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1733
/* Mount @fsname on @target and, for bind or remount requests, remount it so
 * that the requested flags take effect.
 *
 * The first mount is done through safe_mount() without MS_REMOUNT. When
 * statvfs() is available the flags reported for @fsname (nosuid, nodev unless
 * @dev is set, read-only, noexec) are added to the remount flags. A pure bind
 * mount whose flags already cover them, and which is not read-only, skips the
 * remount.
 *
 * With @optional set, mount failures are logged and reported as success.
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	int ret;
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

	/* Neither a remount nor a bind mount: nothing left to adjust. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto done;

	DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		/* Carry over the restrictive flags of the source. */
		required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
		if (!dev)
			required_flags |= sb.f_flag & MS_NODEV;

		DEBUG("Flags for \"%s\" were %lu, required extra flags are %lu",
		      fsname, sb.f_flag, required_flags);

		/* If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("Mountflags already were %lu, skipping remount",
			      mountflags);
			goto done;
		}

		mountflags |= required_flags;
	}
#endif

	ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
	if (ret < 0) {
		if (!optional) {
			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}

		INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

done:
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1824
/* Remove the first occurrence of "create=dir", "create=file" and "optional"
 * from @mntent->mnt_opts, editing the string in place.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {"create=dir", "create=file", "optional"};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *next;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (hit == NULL)
			continue;

		next = strchr(hit, ',');
		if (next != NULL) {
			/* Pull the rest of the option list over the match. */
			memmove(hit, next + 1, strlen(next + 1) + 1);
		} else {
			/* no more mntopts, so just chop it here */
			*hit = '\0';
		}
	}
}
1848
/* Create the mount target @path as requested by the options of @mntent.
 *
 * Filesystem types starting with "overlay" or "aufs" first get their
 * directories prepared by ovl_mkdir() / aufs_mkdir(). "create=dir" creates
 * @path as a directory. "create=file" creates @path as an empty file,
 * including its parent directories, if it does not exist yet.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	char *dup, *parent;
	int fd;
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0)
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(mntent->mnt_type, "aufs", 4) == 0)
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	if (!hasmntopt(mntent, "create=file"))
		return 0;

	/* The target already exists, nothing to create. */
	if (access(path, F_OK) == 0)
		return 0;

	/* dirname() may modify its argument, so operate on a copy. */
	dup = strdup(path);
	if (!dup)
		return -1;

	parent = dirname(dup);
	ret = mkdir_p(parent, 0755);
	free(dup);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1897
1898 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1899 * without a rootfs. */
1900 static inline int mount_entry_on_generic(struct mntent *mntent,
1901 const char *path,
1902 const struct lxc_rootfs *rootfs,
1903 const char *lxc_name,
1904 const char *lxc_path)
1905 {
1906 int ret;
1907 unsigned long mntflags;
1908 char *mntdata;
1909 bool dev, optional;
1910 char *rootfs_path = NULL;
1911
1912 optional = hasmntopt(mntent, "optional") != NULL;
1913 dev = hasmntopt(mntent, "dev") != NULL;
1914
1915 if (rootfs && rootfs->path)
1916 rootfs_path = rootfs->mount;
1917
1918 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1919 lxc_path);
1920 if (ret < 0) {
1921 if (optional)
1922 return 0;
1923
1924 return -1;
1925 }
1926 cull_mntent_opt(mntent);
1927
1928 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1929 if (ret < 0)
1930 return -1;
1931
1932 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1933 mntdata, optional, dev, rootfs_path);
1934
1935 free(mntdata);
1936 return ret;
1937 }
1938
/* Mount @mntent for a container without a rootfs.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a relative mount directory
 * gets a leading "/".
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char target[MAXPATHLEN];
	const char *prefix = (mntent->mnt_dir[0] == '/') ? "" : "/";
	int len;

	len = snprintf(target, sizeof(target), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target))
		return -1;

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
1956
1957 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1958 const struct lxc_rootfs *rootfs,
1959 const char *lxc_name,
1960 const char *lxc_path)
1961 {
1962 int offset;
1963 char *aux;
1964 const char *lxcpath;
1965 char path[MAXPATHLEN];
1966 int ret = 0;
1967
1968 lxcpath = lxc_global_config_value("lxc.lxcpath");
1969 if (!lxcpath)
1970 return -1;
1971
1972 /* If rootfs->path is a blockdev path, allow container fstab to use
1973 * <lxcpath>/<name>/rootfs" as the target prefix.
1974 */
1975 ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1976 if (ret < 0 || ret >= MAXPATHLEN)
1977 goto skipvarlib;
1978
1979 aux = strstr(mntent->mnt_dir, path);
1980 if (aux) {
1981 offset = strlen(path);
1982 goto skipabs;
1983 }
1984
1985 skipvarlib:
1986 aux = strstr(mntent->mnt_dir, rootfs->path);
1987 if (!aux) {
1988 WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
1989 return ret;
1990 }
1991 offset = strlen(rootfs->path);
1992
1993 skipabs:
1994 ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
1995 if (ret < 0 || ret >= MAXPATHLEN)
1996 return -1;
1997
1998 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1999 }
2000
2001 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2002 const struct lxc_rootfs *rootfs,
2003 const char *lxc_name,
2004 const char *lxc_path)
2005 {
2006 char path[MAXPATHLEN];
2007 int ret;
2008
2009 /* relative to root mount point */
2010 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2011 if (ret < 0 || ret >= sizeof(path)) {
2012 ERROR("path name too long");
2013 return -1;
2014 }
2015
2016 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2017 }
2018
2019 /* This logs a NOTICE() when a user specifies mounts that would conflict with
2020 * devices liblxc sets up automatically.
2021 */
2022 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2023 const char *dest)
2024 {
2025 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2026 bool needs_warning = false;
2027
2028 clean_mnt_fsname = lxc_deslashify(src);
2029 if (!clean_mnt_fsname)
2030 return;
2031
2032 clean_mnt_dir = lxc_deslashify(dest);
2033 if (!clean_mnt_dir) {
2034 free(clean_mnt_fsname);
2035 return;
2036 }
2037
2038 tmp = clean_mnt_dir;
2039 if (*tmp == '/')
2040 tmp++;
2041
2042 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2043 free(clean_mnt_dir);
2044 free(clean_mnt_fsname);
2045 return;
2046 }
2047
2048 if (!conf->autodev && !conf->pts && !conf->tty &&
2049 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2050 free(clean_mnt_dir);
2051 free(clean_mnt_fsname);
2052 return;
2053 }
2054
2055 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2056 needs_warning = true;
2057 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2058 needs_warning = true;
2059 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2060 needs_warning = true;
2061 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2062 needs_warning = true;
2063 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2064 needs_warning = true;
2065 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2066 needs_warning = true;
2067 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2068 needs_warning = true;
2069 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2070 needs_warning = true;
2071 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2072 needs_warning = true;
2073 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2074 needs_warning = true;
2075 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2076 needs_warning = true;
2077
2078 if (needs_warning)
2079 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2080 "automatic device setup under \"/dev\"",
2081 clean_mnt_fsname, clean_mnt_dir);
2082
2083 free(clean_mnt_dir);
2084 free(clean_mnt_fsname);
2085 }
2086
2087 static int mount_file_entries(const struct lxc_conf *conf,
2088 const struct lxc_rootfs *rootfs, FILE *file,
2089 const char *lxc_name, const char *lxc_path)
2090 {
2091 struct mntent mntent;
2092 char buf[4096];
2093 int ret = -1;
2094
2095 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2096 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2097
2098 if (!rootfs->path)
2099 ret = mount_entry_on_systemfs(&mntent);
2100 else if (mntent.mnt_dir[0] != '/')
2101 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2102 lxc_name, lxc_path);
2103 else
2104 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2105 lxc_name, lxc_path);
2106 if (ret < 0)
2107 return -1;
2108 }
2109 ret = 0;
2110
2111 INFO("Set up mount entries");
2112 return ret;
2113 }
2114
/* Mount all entries of the fstab file @fstab; a NULL @fstab is a no-op.
 *
 * Returns 0 on success, -1 if the file cannot be opened, and otherwise the
 * result of mount_file_entries().
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *stream;
	int rc;

	if (fstab == NULL)
		return 0;

	stream = setmntent(fstab, "r");
	if (stream == NULL) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(stream);
	return rc;
}
2138
2139 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2140 {
2141 int ret;
2142 char *mount_entry;
2143 struct lxc_list *iterator;
2144 FILE *f;
2145 int fd = -1;
2146
2147 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2148 if (fd < 0) {
2149 if (errno != ENOSYS)
2150 return NULL;
2151 f = tmpfile();
2152 TRACE("Created temporary mount file");
2153 } else {
2154 f = fdopen(fd, "r+");
2155 TRACE("Created anonymous mount file");
2156 }
2157
2158 if (!f) {
2159 SYSERROR("Could not create mount file");
2160 if (fd != -1)
2161 close(fd);
2162 return NULL;
2163 }
2164
2165 lxc_list_for_each(iterator, mount) {
2166 mount_entry = iterator->elem;
2167 ret = fprintf(f, "%s\n", mount_entry);
2168 if (ret < strlen(mount_entry))
2169 WARN("Could not write mount entry to mount file");
2170 }
2171
2172 ret = fseek(f, 0, SEEK_SET);
2173 if (ret < 0) {
2174 SYSERROR("Failed to seek mount file");
2175 fclose(f);
2176 return NULL;
2177 }
2178
2179 return f;
2180 }
2181
/* Mount the entries of the list @mount by writing them into an anonymous
 * file and processing that file with mount_file_entries().
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *stream = make_anonymous_mount_file(mount);

	if (stream == NULL)
		return -1;

	rc = mount_file_entries(conf, rootfs, stream, lxc_name, lxc_path);
	fclose(stream);

	return rc;
}
2199
2200 static int parse_cap(const char *cap)
2201 {
2202 char *ptr = NULL;
2203 size_t i;
2204 int capid = -1;
2205
2206 if (!strcmp(cap, "none"))
2207 return -2;
2208
2209 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2210
2211 if (strcmp(cap, caps_opt[i].name))
2212 continue;
2213
2214 capid = caps_opt[i].value;
2215 break;
2216 }
2217
2218 if (capid < 0) {
2219 /* try to see if it's numeric, so the user may specify
2220 * capabilities that the running kernel knows about but
2221 * we don't */
2222 errno = 0;
2223 capid = strtol(cap, &ptr, 10);
2224 if (!ptr || *ptr != '\0' || errno != 0)
2225 /* not a valid number */
2226 capid = -1;
2227 else if (capid > lxc_caps_last_cap())
2228 /* we have a number but it's not a valid
2229 * capability */
2230 capid = -1;
2231 }
2232
2233 return capid;
2234 }
2235
2236 int in_caplist(int cap, struct lxc_list *caps)
2237 {
2238 struct lxc_list *iterator;
2239 int capid;
2240
2241 lxc_list_for_each(iterator, caps) {
2242 capid = parse_cap(iterator->elem);
2243 if (capid == cap)
2244 return 1;
2245 }
2246
2247 return 0;
2248 }
2249
2250 static int setup_caps(struct lxc_list *caps)
2251 {
2252 struct lxc_list *iterator;
2253 char *drop_entry;
2254 int capid;
2255
2256 lxc_list_for_each(iterator, caps) {
2257
2258 drop_entry = iterator->elem;
2259
2260 capid = parse_cap(drop_entry);
2261
2262 if (capid < 0) {
2263 ERROR("unknown capability %s", drop_entry);
2264 return -1;
2265 }
2266
2267 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2268
2269 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2270 SYSERROR("failed to remove %s capability", drop_entry);
2271 return -1;
2272 }
2273
2274 }
2275
2276 DEBUG("capabilities have been setup");
2277
2278 return 0;
2279 }
2280
2281 static int dropcaps_except(struct lxc_list *caps)
2282 {
2283 struct lxc_list *iterator;
2284 char *keep_entry;
2285 int i, capid;
2286 int numcaps = lxc_caps_last_cap() + 1;
2287 INFO("found %d capabilities", numcaps);
2288
2289 if (numcaps <= 0 || numcaps > 200)
2290 return -1;
2291
2292 /* caplist[i] is 1 if we keep capability i */
2293 int *caplist = alloca(numcaps * sizeof(int));
2294 memset(caplist, 0, numcaps * sizeof(int));
2295
2296 lxc_list_for_each(iterator, caps) {
2297
2298 keep_entry = iterator->elem;
2299
2300 capid = parse_cap(keep_entry);
2301
2302 if (capid == -2)
2303 continue;
2304
2305 if (capid < 0) {
2306 ERROR("unknown capability %s", keep_entry);
2307 return -1;
2308 }
2309
2310 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2311
2312 caplist[capid] = 1;
2313 }
2314 for (i=0; i<numcaps; i++) {
2315 if (caplist[i])
2316 continue;
2317 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2318 SYSERROR("failed to remove capability %d", i);
2319 return -1;
2320 }
2321 }
2322
2323 DEBUG("capabilities have been setup");
2324
2325 return 0;
2326 }
2327
/* Set the hardware (MAC) address @hwaddr on the network interface @ifname
 * through the SIOCSIFHWADDR ioctl.
 *
 * An interface name longer than IFNAMSIZ - 1 characters is truncated.
 *
 * Returns 0 on success, -1 if the address cannot be converted or no socket
 * can be created, and the ioctl() result if the ioctl fails.
 */
static int setup_hw_addr(char *hwaddr, const char *ifname)
{
	struct sockaddr sockaddr;
	struct ifreq ifr;
	size_t len;
	int ret, fd, saved_errno;

	ret = lxc_convert_mac(hwaddr, &sockaddr);
	if (ret) {
		ERROR("mac address '%s' conversion failed : %s",
		      hwaddr, strerror(-ret));
		return -1;
	}

	memset(&ifr, 0, sizeof(ifr));

	/* Copy at most IFNAMSIZ - 1 characters and never read beyond the end
	 * of ifname, which may be shorter than IFNAMSIZ bytes.
	 */
	len = strnlen(ifname, IFNAMSIZ - 1);
	memcpy(ifr.ifr_name, ifname, len);
	ifr.ifr_name[len] = '\0';
	memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		ERROR("socket failure : %s", strerror(errno));
		return -1;
	}

	ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
	/* close() may overwrite errno, keep the ioctl's value for the log. */
	saved_errno = errno;
	close(fd);
	if (ret) {
		ERROR("ioctl failure : %s", strerror(saved_errno));
		return ret;
	}

	/* Only claim success when the ioctl actually succeeded. */
	DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);

	return ret;
}
2361
2362 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2363 {
2364 struct lxc_list *iterator;
2365 struct lxc_inetdev *inetdev;
2366 int err;
2367
2368 lxc_list_for_each(iterator, ip) {
2369
2370 inetdev = iterator->elem;
2371
2372 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2373 &inetdev->bcast, inetdev->prefix);
2374 if (err) {
2375 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2376 ifindex, strerror(-err));
2377 return -1;
2378 }
2379 }
2380
2381 return 0;
2382 }
2383
2384 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2385 {
2386 struct lxc_list *iterator;
2387 struct lxc_inet6dev *inet6dev;
2388 int err;
2389
2390 lxc_list_for_each(iterator, ip) {
2391
2392 inet6dev = iterator->elem;
2393
2394 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2395 &inet6dev->mcast, &inet6dev->acast,
2396 inet6dev->prefix);
2397 if (err) {
2398 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2399 ifindex, strerror(-err));
2400 return -1;
2401 }
2402 }
2403
2404 return 0;
2405 }
2406
2407 static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
2408 {
2409 char ifname[IFNAMSIZ];
2410 int err;
2411 const char *net_type_name;
2412 char *current_ifname = ifname;
2413
2414 /* empty network namespace */
2415 if (!netdev->ifindex) {
2416 if (netdev->flags & IFF_UP) {
2417 err = lxc_netdev_up("lo");
2418 if (err) {
2419 ERROR("failed to set the loopback up : %s",
2420 strerror(-err));
2421 return -1;
2422 }
2423 }
2424
2425 if (netdev->type == LXC_NET_EMPTY)
2426 return 0;
2427
2428 if (netdev->type == LXC_NET_NONE)
2429 return 0;
2430
2431 if (netdev->type != LXC_NET_VETH) {
2432 net_type_name = lxc_net_type_to_str(netdev->type);
2433 ERROR("%s networks are not supported for containers "
2434 "not setup up by privileged users",
2435 net_type_name);
2436 return -1;
2437 }
2438
2439 netdev->ifindex = if_nametoindex(netdev->name);
2440 }
2441
2442 /* get the new ifindex in case of physical netdev */
2443 if (netdev->type == LXC_NET_PHYS) {
2444 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2445 ERROR("failed to get ifindex for %s",
2446 netdev->link);
2447 return -1;
2448 }
2449 }
2450
2451 /* retrieve the name of the interface */
2452 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2453 ERROR("no interface corresponding to index '%d'",
2454 netdev->ifindex);
2455 return -1;
2456 }
2457
2458 /* Default: let the system to choose one interface name.
2459 * When the IFLA_IFNAME attribute is passed something like "<prefix>%d"
2460 * netlink will replace the format specifier with an appropriate index.
2461 */
2462 if (!netdev->name)
2463 netdev->name = netdev->type == LXC_NET_PHYS ?
2464 netdev->link : "eth%d";
2465
2466 /* rename the interface name */
2467 if (strcmp(ifname, netdev->name) != 0) {
2468 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2469 if (err) {
2470 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2471 strerror(-err));
2472 return -1;
2473 }
2474 }
2475
2476 /* Re-read the name of the interface because its name has changed
2477 * and would be automatically allocated by the system
2478 */
2479 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2480 ERROR("no interface corresponding to index '%d'",
2481 netdev->ifindex);
2482 return -1;
2483 }
2484
2485 /* set a mac address */
2486 if (netdev->hwaddr) {
2487 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2488 ERROR("failed to setup hw address for '%s'",
2489 current_ifname);
2490 return -1;
2491 }
2492 }
2493
2494 /* setup ipv4 addresses on the interface */
2495 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2496 ERROR("failed to setup ip addresses for '%s'",
2497 ifname);
2498 return -1;
2499 }
2500
2501 /* setup ipv6 addresses on the interface */
2502 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2503 ERROR("failed to setup ipv6 addresses for '%s'",
2504 ifname);
2505 return -1;
2506 }
2507
2508 /* set the network device up */
2509 if (netdev->flags & IFF_UP) {
2510 int err;
2511
2512 err = lxc_netdev_up(current_ifname);
2513 if (err) {
2514 ERROR("failed to set '%s' up : %s", current_ifname,
2515 strerror(-err));
2516 return -1;
2517 }
2518
2519 /* the network is up, make the loopback up too */
2520 err = lxc_netdev_up("lo");
2521 if (err) {
2522 ERROR("failed to set the loopback up : %s",
2523 strerror(-err));
2524 return -1;
2525 }
2526 }
2527
2528 /* We can only set up the default routes after bringing
2529 * up the interface, sine bringing up the interface adds
2530 * the link-local routes and we can't add a default
2531 * route if the gateway is not reachable. */
2532
2533 /* setup ipv4 gateway on the interface */
2534 if (netdev->ipv4_gateway) {
2535 if (!(netdev->flags & IFF_UP)) {
2536 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2537 return -1;
2538 }
2539
2540 if (lxc_list_empty(&netdev->ipv4)) {
2541 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2542 return -1;
2543 }
2544
2545 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2546 if (err) {
2547 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2548 if (err) {
2549 ERROR("failed to add ipv4 dest for '%s': %s",
2550 ifname, strerror(-err));
2551 }
2552
2553 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2554 if (err) {
2555 ERROR("failed to setup ipv4 gateway for '%s': %s",
2556 ifname, strerror(-err));
2557 if (netdev->ipv4_gateway_auto) {
2558 char buf[INET_ADDRSTRLEN];
2559 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2560 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2561 }
2562 return -1;
2563 }
2564 }
2565 }
2566
2567 /* setup ipv6 gateway on the interface */
2568 if (netdev->ipv6_gateway) {
2569 if (!(netdev->flags & IFF_UP)) {
2570 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2571 return -1;
2572 }
2573
2574 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2575 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2576 return -1;
2577 }
2578
2579 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2580 if (err) {
2581 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2582 if (err) {
2583 ERROR("failed to add ipv6 dest for '%s': %s",
2584 ifname, strerror(-err));
2585 }
2586
2587 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2588 if (err) {
2589 ERROR("failed to setup ipv6 gateway for '%s': %s",
2590 ifname, strerror(-err));
2591 if (netdev->ipv6_gateway_auto) {
2592 char buf[INET6_ADDRSTRLEN];
2593 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2594 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2595 }
2596 return -1;
2597 }
2598 }
2599 }
2600
2601 DEBUG("'%s' has been setup", current_ifname);
2602
2603 return 0;
2604 }
2605
2606 static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2607 struct lxc_list *network)
2608 {
2609 struct lxc_list *iterator;
2610 struct lxc_netdev *netdev;
2611
2612 lxc_log_configured_netdevs(conf);
2613
2614 lxc_list_for_each(iterator, network) {
2615 netdev = iterator->elem;
2616
2617 /* REMOVE in LXC 3.0 */
2618 if (netdev->idx < 0) {
2619 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2620 "networks is DEPRECATED, please switch to using "
2621 "\"lxc.net.[i].* keys\"");
2622 }
2623
2624 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
2625 ERROR("failed to setup netdev");
2626 return -1;
2627 }
2628 }
2629
2630 if (!lxc_list_empty(network))
2631 INFO("network has been setup");
2632
2633 return 0;
2634 }
2635
2636 static int parse_resource(const char *res) {
2637 size_t i;
2638 int resid = -1;
2639
2640 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2641 if (strcmp(res, limit_opt[i].name) == 0)
2642 return limit_opt[i].value;
2643 }
2644
2645 /* try to see if it's numeric, so the user may specify
2646 * resources that the running kernel knows about but
2647 * we don't */
2648 if (lxc_safe_int(res, &resid) == 0)
2649 return resid;
2650 return -1;
2651 }
2652
2653 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2654 struct lxc_list *it;
2655 struct lxc_limit *lim;
2656 int resid;
2657
2658 lxc_list_for_each(it, limits) {
2659 lim = it->elem;
2660
2661 resid = parse_resource(lim->resource);
2662 if (resid < 0) {
2663 ERROR("unknown resource %s", lim->resource);
2664 return -1;
2665 }
2666
2667 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2668 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2669 return -1;
2670 }
2671 }
2672 return 0;
2673 }
2674
2675 /* try to move physical nics to the init netns */
2676 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2677 {
2678 int i, oldfd;
2679 char ifname[IFNAMSIZ];
2680
2681 if (netnsfd < 0 || conf->num_savednics == 0)
2682 return;
2683
2684 INFO("Running to reset %d nic names.", conf->num_savednics);
2685
2686 oldfd = lxc_preserve_ns(getpid(), "net");
2687 if (oldfd < 0) {
2688 SYSERROR("Failed to open monitor netns fd.");
2689 return;
2690 }
2691
2692 if (setns(netnsfd, 0) != 0) {
2693 SYSERROR("Failed to enter container netns to reset nics");
2694 close(oldfd);
2695 return;
2696 }
2697 for (i=0; i<conf->num_savednics; i++) {
2698 struct saved_nic *s = &conf->saved_nics[i];
2699 /* retrieve the name of the interface */
2700 if (!if_indextoname(s->ifindex, ifname)) {
2701 WARN("no interface corresponding to index '%d'", s->ifindex);
2702 continue;
2703 }
2704 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2705 WARN("Error moving nic name:%s back to host netns", ifname);
2706 free(s->orig_name);
2707 }
2708 conf->num_savednics = 0;
2709
2710 if (setns(oldfd, 0) != 0)
2711 SYSERROR("Failed to re-enter monitor's netns");
2712 close(oldfd);
2713 }
2714
2715 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2716
2717 struct lxc_conf *lxc_conf_init(void)
2718 {
2719 struct lxc_conf *new;
2720 int i;
2721
2722 new = malloc(sizeof(*new));
2723 if (!new) {
2724 ERROR("lxc_conf_init : %s", strerror(errno));
2725 return NULL;
2726 }
2727 memset(new, 0, sizeof(*new));
2728
2729 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2730 new->personality = -1;
2731 new->autodev = 1;
2732 new->console.log_path = NULL;
2733 new->console.log_fd = -1;
2734 new->console.path = NULL;
2735 new->console.peer = -1;
2736 new->console.peerpty.busy = -1;
2737 new->console.peerpty.master = -1;
2738 new->console.peerpty.slave = -1;
2739 new->console.master = -1;
2740 new->console.slave = -1;
2741 new->console.name[0] = '\0';
2742 new->maincmd_fd = -1;
2743 new->nbd_idx = -1;
2744 new->rootfs.mount = strdup(default_rootfs_mount);
2745 if (!new->rootfs.mount) {
2746 ERROR("lxc_conf_init : %s", strerror(errno));
2747 free(new);
2748 return NULL;
2749 }
2750 new->logfd = -1;
2751 lxc_list_init(&new->cgroup);
2752 lxc_list_init(&new->network);
2753 lxc_list_init(&new->mount_list);
2754 lxc_list_init(&new->caps);
2755 lxc_list_init(&new->keepcaps);
2756 lxc_list_init(&new->id_map);
2757 lxc_list_init(&new->includes);
2758 lxc_list_init(&new->aliens);
2759 lxc_list_init(&new->environment);
2760 lxc_list_init(&new->limits);
2761 for (i=0; i<NUM_LXC_HOOKS; i++)
2762 lxc_list_init(&new->hooks[i]);
2763 lxc_list_init(&new->groups);
2764 new->lsm_aa_profile = NULL;
2765 new->lsm_se_context = NULL;
2766 new->tmp_umount_proc = 0;
2767
2768 for (i = 0; i < LXC_NS_MAX; i++)
2769 new->inherit_ns_fd[i] = -1;
2770
2771 /* if running in a new user namespace, init and COMMAND
2772 * default to running as UID/GID 0 when using lxc-execute */
2773 new->init_uid = 0;
2774 new->init_gid = 0;
2775 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2776
2777 return new;
2778 }
2779
2780 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2781 {
2782 char *veth1, *veth2;
2783 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2784 int bridge_index, err;
2785 unsigned int mtu = 0;
2786
2787 if (netdev->priv.veth_attr.pair) {
2788 veth1 = netdev->priv.veth_attr.pair;
2789 if (handler->conf->reboot)
2790 lxc_netdev_delete_by_name(veth1);
2791 } else {
2792 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2793 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2794 ERROR("veth1 name too long");
2795 return -1;
2796 }
2797 veth1 = lxc_mkifname(veth1buf);
2798 if (!veth1) {
2799 ERROR("failed to allocate a temporary name");
2800 return -1;
2801 }
2802 /* store away for deconf */
2803 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2804 }
2805
2806 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2807 veth2 = lxc_mkifname(veth2buf);
2808 if (!veth2) {
2809 ERROR("failed to allocate a temporary name");
2810 goto out_delete;
2811 }
2812
2813 err = lxc_veth_create(veth1, veth2);
2814 if (err) {
2815 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2816 veth2, strerror(-err));
2817 goto out_delete;
2818 }
2819
2820 /* changing the high byte of the mac address to 0xfe, the bridge interface
2821 * will always keep the host's mac address and not take the mac address
2822 * of a container */
2823 err = setup_private_host_hw_addr(veth1);
2824 if (err) {
2825 ERROR("failed to change mac address of host interface \"%s\": %s",
2826 veth1, strerror(-err));
2827 goto out_delete;
2828 }
2829
2830 netdev->ifindex = if_nametoindex(veth2);
2831 if (!netdev->ifindex) {
2832 ERROR("failed to retrieve the index for \"%s\"", veth2);
2833 goto out_delete;
2834 }
2835
2836 if (netdev->mtu) {
2837 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2838 WARN("failed to parse mtu from");
2839 else
2840 INFO("retrieved mtu %d", mtu);
2841 } else if (netdev->link) {
2842 bridge_index = if_nametoindex(netdev->link);
2843 if (bridge_index) {
2844 mtu = netdev_get_mtu(bridge_index);
2845 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2846 } else {
2847 mtu = netdev_get_mtu(netdev->ifindex);
2848 INFO("retrieved mtu %d from %s", mtu, veth2);
2849 }
2850 }
2851
2852 if (mtu) {
2853 err = lxc_netdev_set_mtu(veth1, mtu);
2854 if (!err)
2855 err = lxc_netdev_set_mtu(veth2, mtu);
2856 if (err) {
2857 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2858 "and \"%s\": %s",
2859 mtu, veth1, veth2, strerror(-err));
2860 goto out_delete;
2861 }
2862 }
2863
2864 if (netdev->link) {
2865 err = lxc_bridge_attach(netdev->link, veth1);
2866 if (err) {
2867 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2868 veth1, netdev->link, strerror(-err));
2869 goto out_delete;
2870 }
2871 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2872 }
2873
2874 err = lxc_netdev_up(veth1);
2875 if (err) {
2876 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2877 goto out_delete;
2878 }
2879
2880 if (netdev->upscript) {
2881 err = run_script(handler->name, "net", netdev->upscript, "up",
2882 "veth", veth1, (char*) NULL);
2883 if (err)
2884 goto out_delete;
2885 }
2886
2887 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2888 netdev->ifindex);
2889
2890 return 0;
2891
2892 out_delete:
2893 if (netdev->ifindex != 0)
2894 lxc_netdev_delete_by_name(veth1);
2895 if (!netdev->priv.veth_attr.pair)
2896 free(veth1);
2897 free(veth2);
2898 return -1;
2899 }
2900
2901 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2902 {
2903 char *veth1;
2904 int err;
2905
2906 if (netdev->priv.veth_attr.pair)
2907 veth1 = netdev->priv.veth_attr.pair;
2908 else
2909 veth1 = netdev->priv.veth_attr.veth1;
2910
2911 if (netdev->downscript) {
2912 err = run_script(handler->name, "net", netdev->downscript,
2913 "down", "veth", veth1, (char*) NULL);
2914 if (err)
2915 return -1;
2916 }
2917 return 0;
2918 }
2919
2920 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2921 {
2922 char peerbuf[IFNAMSIZ], *peer;
2923 int err;
2924
2925 if (!netdev->link) {
2926 ERROR("no link specified for macvlan netdev");
2927 return -1;
2928 }
2929
2930 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2931 if (err >= sizeof(peerbuf))
2932 return -1;
2933
2934 peer = lxc_mkifname(peerbuf);
2935 if (!peer) {
2936 ERROR("failed to make a temporary name");
2937 return -1;
2938 }
2939
2940 err = lxc_macvlan_create(netdev->link, peer,
2941 netdev->priv.macvlan_attr.mode);
2942 if (err) {
2943 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2944 peer, netdev->link, strerror(-err));
2945 goto out;
2946 }
2947
2948 netdev->ifindex = if_nametoindex(peer);
2949 if (!netdev->ifindex) {
2950 ERROR("failed to retrieve the index for %s", peer);
2951 goto out;
2952 }
2953
2954 if (netdev->upscript) {
2955 err = run_script(handler->name, "net", netdev->upscript, "up",
2956 "macvlan", netdev->link, (char*) NULL);
2957 if (err)
2958 goto out;
2959 }
2960
2961 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2962 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2963
2964 return 0;
2965 out:
2966 lxc_netdev_delete_by_name(peer);
2967 free(peer);
2968 return -1;
2969 }
2970
2971 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2972 {
2973 int err;
2974
2975 if (netdev->downscript) {
2976 err = run_script(handler->name, "net", netdev->downscript,
2977 "down", "macvlan", netdev->link,
2978 (char*) NULL);
2979 if (err)
2980 return -1;
2981 }
2982 return 0;
2983 }
2984
2985 /* XXX: merge with instantiate_macvlan */
2986 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2987 {
2988 char peer[IFNAMSIZ];
2989 int err;
2990 static uint16_t vlan_cntr = 0;
2991 unsigned int mtu = 0;
2992
2993 if (!netdev->link) {
2994 ERROR("no link specified for vlan netdev");
2995 return -1;
2996 }
2997
2998 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2999 if (err >= sizeof(peer)) {
3000 ERROR("peer name too long");
3001 return -1;
3002 }
3003
3004 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
3005 if (err) {
3006 ERROR("failed to create vlan interface '%s' on '%s' : %s",
3007 peer, netdev->link, strerror(-err));
3008 return -1;
3009 }
3010
3011 netdev->ifindex = if_nametoindex(peer);
3012 if (!netdev->ifindex) {
3013 ERROR("failed to retrieve the ifindex for %s", peer);
3014 lxc_netdev_delete_by_name(peer);
3015 return -1;
3016 }
3017
3018 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
3019 netdev->ifindex);
3020 if (netdev->mtu) {
3021 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3022 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3023 netdev->ifindex, netdev->name);
3024 return -1;
3025 }
3026 err = lxc_netdev_set_mtu(peer, mtu);
3027 if (err) {
3028 ERROR("failed to set mtu '%s' for %s : %s",
3029 netdev->mtu, peer, strerror(-err));
3030 lxc_netdev_delete_by_name(peer);
3031 return -1;
3032 }
3033 }
3034
3035 return 0;
3036 }
3037
/*
 * Shutdown hook for vlan devices: nothing to do, always returns 0.
 */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3042
3043 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3044 {
3045 if (!netdev->link) {
3046 ERROR("no link specified for the physical interface");
3047 return -1;
3048 }
3049
3050 netdev->ifindex = if_nametoindex(netdev->link);
3051 if (!netdev->ifindex) {
3052 ERROR("failed to retrieve the index for %s", netdev->link);
3053 return -1;
3054 }
3055
3056 if (netdev->upscript) {
3057 int err;
3058 err = run_script(handler->name, "net", netdev->upscript,
3059 "up", "phys", netdev->link, (char*) NULL);
3060 if (err)
3061 return -1;
3062 }
3063
3064 return 0;
3065 }
3066
3067 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3068 {
3069 int err;
3070
3071 if (netdev->downscript) {
3072 err = run_script(handler->name, "net", netdev->downscript,
3073 "down", "phys", netdev->link, (char*) NULL);
3074 if (err)
3075 return -1;
3076 }
3077 return 0;
3078 }
3079
3080 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3081 {
3082 netdev->ifindex = 0;
3083 return 0;
3084 }
3085
3086 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3087 {
3088 netdev->ifindex = 0;
3089 if (netdev->upscript) {
3090 int err;
3091 err = run_script(handler->name, "net", netdev->upscript,
3092 "up", "empty", (char*) NULL);
3093 if (err)
3094 return -1;
3095 }
3096 return 0;
3097 }
3098
3099 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3100 {
3101 int err;
3102
3103 if (netdev->downscript) {
3104 err = run_script(handler->name, "net", netdev->downscript,
3105 "down", "empty", (char*) NULL);
3106 if (err)
3107 return -1;
3108 }
3109 return 0;
3110 }
3111
/*
 * Shutdown hook for the "none" network type: nothing to do, always
 * returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3116
3117 int lxc_requests_empty_network(struct lxc_handler *handler)
3118 {
3119 struct lxc_list *network = &handler->conf->network;
3120 struct lxc_list *iterator;
3121 struct lxc_netdev *netdev;
3122 bool found_none = false, found_nic = false;
3123
3124 if (lxc_list_empty(network))
3125 return 0;
3126
3127 lxc_list_for_each(iterator, network) {
3128
3129 netdev = iterator->elem;
3130
3131 if (netdev->type == LXC_NET_NONE)
3132 found_none = true;
3133 else
3134 found_nic = true;
3135 }
3136 if (found_none && !found_nic)
3137 return 1;
3138 return 0;
3139 }
3140
3141 int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
3142 {
3143 bool am_root;
3144 struct lxc_netdev *netdev;
3145 struct lxc_list *iterator;
3146 struct lxc_list *network = &handler->conf->network;
3147
3148 /* We need to be root. */
3149 am_root = (getuid() == 0);
3150 if (!am_root)
3151 return 0;
3152
3153 lxc_list_for_each(iterator, network) {
3154 netdev = iterator->elem;
3155
3156 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3157 ERROR("invalid network configuration type '%d'",
3158 netdev->type);
3159 return -1;
3160 }
3161
3162 if (netdev_conf[netdev->type](handler, netdev)) {
3163 ERROR("failed to create netdev");
3164 return -1;
3165 }
3166
3167 }
3168
3169 return 0;
3170 }
3171
3172 bool lxc_delete_network(struct lxc_handler *handler)
3173 {
3174 int ret;
3175 struct lxc_list *iterator;
3176 struct lxc_list *network = &handler->conf->network;
3177 bool deleted_all = true;
3178
3179 lxc_list_for_each(iterator, network) {
3180 char *hostveth = NULL;
3181 struct lxc_netdev *netdev = iterator->elem;
3182
3183 /* We can only delete devices whose ifindex we have. If we don't
3184 * have the index it means that we didn't create it.
3185 */
3186 if (!netdev->ifindex)
3187 continue;
3188
3189 if (netdev->type == LXC_NET_PHYS) {
3190 ret = lxc_netdev_rename_by_index(netdev->ifindex, netdev->link);
3191 if (ret < 0)
3192 WARN("Failed to rename interface with index %d "
3193 "to its initial name \"%s\"",
3194 netdev->ifindex, netdev->link);
3195 else
3196 TRACE("Renamed interface with index %d to its "
3197 "initial name \"%s\"",
3198 netdev->ifindex, netdev->link);
3199 continue;
3200 }
3201
3202 ret = netdev_deconf[netdev->type](handler, netdev);
3203 if (ret < 0)
3204 WARN("Failed to deconfigure network device");
3205
3206 /* Recent kernels remove the virtual interfaces when the network
3207 * namespace is destroyed but in case we did not move the
3208 * interface to the network namespace, we have to destroy it.
3209 */
3210 if (!am_unpriv()) {
3211 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3212 if (-ret == ENODEV) {
3213 INFO("Interface \"%s\" with index %d already "
3214 "deleted or existing in different network "
3215 "namespace",
3216 netdev->name ? netdev->name : "(null)",
3217 netdev->ifindex);
3218 } else if (ret < 0) {
3219 deleted_all = false;
3220 WARN("Failed to remove interface \"%s\" with "
3221 "index %d: %s",
3222 netdev->name ? netdev->name : "(null)",
3223 netdev->ifindex, strerror(-ret));
3224 continue;
3225 }
3226 INFO("Removed interface \"%s\" with index %d",
3227 netdev->name ? netdev->name : "(null)",
3228 netdev->ifindex);
3229 }
3230
3231 if (netdev->type != LXC_NET_VETH)
3232 continue;
3233
3234 if (am_unpriv()) {
3235 if (is_ovs_bridge(netdev->link)) {
3236 ret = lxc_unpriv_delete_nic(handler->lxcpath,
3237 handler->name,
3238 netdev, getpid());
3239 if (ret < 0)
3240 WARN("Failed to remove port \"%s\" "
3241 "from openvswitch bridge \"%s\"",
3242 netdev->priv.veth_attr.pair,
3243 netdev->link);
3244 }
3245
3246 continue;
3247 }
3248
3249 /* Explicitly delete host veth device to prevent lingering
3250 * devices. We had issues in LXD around this.
3251 */
3252 if (netdev->priv.veth_attr.pair)
3253 hostveth = netdev->priv.veth_attr.pair;
3254 else
3255 hostveth = netdev->priv.veth_attr.veth1;
3256 if (*hostveth == '\0')
3257 continue;
3258
3259 ret = lxc_netdev_delete_by_name(hostveth);
3260 if (ret < 0) {
3261 deleted_all = false;
3262 WARN("Failed to remove interface \"%s\" from \"%s\": %s",
3263 hostveth, netdev->link, strerror(-ret));
3264 continue;
3265 }
3266 INFO("Removed interface \"%s\" from \"%s\"", hostveth, netdev->link);
3267
3268 if (!is_ovs_bridge(netdev->link)) {
3269 netdev->priv.veth_attr.veth1[0] = '\0';
3270 continue;
3271 }
3272
3273 /* Delete the openvswitch port. */
3274 ret = lxc_ovs_delete_port(netdev->link, hostveth);
3275 if (ret < 0)
3276 WARN("Failed to remove port \"%s\" from openvswitch "
3277 "bridge \"%s\"", hostveth, netdev->link);
3278 else
3279 INFO("Removed port \"%s\" from openvswitch bridge \"%s\"",
3280 hostveth, netdev->link);
3281
3282 netdev->priv.veth_attr.veth1[0] = '\0';
3283 }
3284
3285 return deleted_all;
3286 }
3287
3288 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3289 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3290 struct lxc_netdev *netdev, pid_t pid)
3291 {
3292 int ret;
3293 pid_t child;
3294 int bytes, pipefd[2];
3295 char *token, *saveptr = NULL;
3296 char netdev_link[IFNAMSIZ + 1];
3297 char buffer[MAXPATHLEN] = {0};
3298
3299 if (netdev->type != LXC_NET_VETH) {
3300 ERROR("nic type %d not support for unprivileged use",
3301 netdev->type);
3302 return -1;
3303 }
3304
3305 if (pipe(pipefd) < 0) {
3306 SYSERROR("pipe failed");
3307 return -1;
3308 }
3309
3310 child = fork();
3311 if (child < 0) {
3312 SYSERROR("fork");
3313 close(pipefd[0]);
3314 close(pipefd[1]);
3315 return -1;
3316 }
3317
3318 if (child == 0) { /* child */
3319 /* Call lxc-user-nic pid type bridge. */
3320 int ret;
3321 char pidstr[LXC_NUMSTRLEN64];
3322
3323 close(pipefd[0]); /* Close the read-end of the pipe. */
3324
3325 /* Redirect stdout to write-end of the pipe. */
3326 ret = dup2(pipefd[1], STDOUT_FILENO);
3327 if (ret >= 0)
3328 ret = dup2(pipefd[1], STDERR_FILENO);
3329 close(pipefd[1]); /* Close the write-end of the pipe. */
3330 if (ret < 0) {
3331 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3332 exit(EXIT_FAILURE);
3333 }
3334
3335 if (netdev->link)
3336 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3337 else
3338 strncpy(netdev_link, "none", IFNAMSIZ);
3339
3340 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3341 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3342 exit(EXIT_FAILURE);
3343 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3344
3345 INFO("Execing lxc-user-nic create %s %s %s veth %s %s", lxcpath,
3346 lxcname, pidstr, netdev_link,
3347 netdev->name ? netdev->name : "(null)");
3348 if (netdev->name)
3349 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "create",
3350 lxcpath, lxcname, pidstr, "veth", netdev_link,
3351 netdev->name, (char *)NULL);
3352 else
3353 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "create",
3354 lxcpath, lxcname, pidstr, "veth", netdev_link,
3355 (char *)NULL);
3356 SYSERROR("Failed to exec lxc-user-nic.");
3357 exit(EXIT_FAILURE);
3358 }
3359
3360 /* close the write-end of the pipe */
3361 close(pipefd[1]);
3362
3363 bytes = read(pipefd[0], &buffer, MAXPATHLEN);
3364 if (bytes < 0) {
3365 SYSERROR("Failed to read from pipe file descriptor.");
3366 close(pipefd[0]);
3367 return -1;
3368 }
3369 buffer[bytes - 1] = '\0';
3370
3371 if (wait_for_pid(child) != 0) {
3372 ERROR("lxc-user-nic failed to configure requested network: %s",
3373 buffer[0] != '\0' ? buffer : "(null)");
3374 close(pipefd[0]);
3375 return -1;
3376 }
3377 TRACE("Received output \"%s\" from lxc-user-nic", buffer);
3378
3379 /* close the read-end of the pipe */
3380 close(pipefd[0]);
3381
3382 /* fill netdev->name field */
3383 token = strtok_r(buffer, ":", &saveptr);
3384 if (!token)
3385 return -1;
3386
3387 netdev->name = malloc(IFNAMSIZ + 1);
3388 if (!netdev->name) {
3389 SYSERROR("Failed to allocate memory.");
3390 return -1;
3391 }
3392 memset(netdev->name, 0, IFNAMSIZ + 1);
3393 strncpy(netdev->name, token, IFNAMSIZ);
3394
3395 /* fill netdev->veth_attr.pair field */
3396 token = strtok_r(NULL, ":", &saveptr);
3397 if (!token)
3398 return -1;
3399
3400 netdev->priv.veth_attr.pair = strdup(token);
3401 if (!netdev->priv.veth_attr.pair) {
3402 ERROR("Failed to allocate memory.");
3403 return -1;
3404 }
3405
3406 /* fill netdev->veth_attr.pair field */
3407 token = strtok_r(NULL, ":", &saveptr);
3408 if (!token)
3409 return -1;
3410
3411 ret = lxc_safe_int(token, &netdev->ifindex);
3412 if (ret < 0) {
3413 ERROR("Failed to parse ifindex for network device \"%s\"", netdev->name);
3414 return -1;
3415 }
3416
3417 return 0;
3418 }
3419
3420 int lxc_assign_network(const char *lxcpath, char *lxcname,
3421 struct lxc_list *network, pid_t pid)
3422 {
3423 struct lxc_list *iterator;
3424 struct lxc_netdev *netdev;
3425 char ifname[IFNAMSIZ];
3426 int am_root = (getuid() == 0);
3427 int err;
3428
3429 lxc_list_for_each(iterator, network) {
3430
3431 netdev = iterator->elem;
3432
3433 if (netdev->type == LXC_NET_VETH && !am_root) {
3434 if (netdev->mtu)
3435 INFO("mtu ignored due to insufficient privilege");
3436 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3437 return -1;
3438 /* lxc-user-nic has moved the nic to the new ns.
3439 * unpriv_assign_nic() fills in netdev->name.
3440 * netdev->ifindex will be filed in at
3441 * lxc_setup_netdev_in_child_namespaces.
3442 */
3443 continue;
3444 }
3445
3446 /* empty network namespace, nothing to move */
3447 if (!netdev->ifindex)
3448 continue;
3449
3450 /* retrieve the name of the interface */
3451 if (!if_indextoname(netdev->ifindex, ifname)) {
3452 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3453 return -1;
3454 }
3455
3456 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3457 if (err) {
3458 ERROR("failed to move '%s' to the container : %s",
3459 netdev->link, strerror(-err));
3460 return -1;
3461 }
3462
3463 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3464 }
3465
3466 return 0;
3467 }
3468
3469 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3470 size_t buf_size)
3471 {
3472 char path[MAXPATHLEN];
3473 int fd, ret;
3474
3475 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3476 idtype == ID_TYPE_UID ? 'u' : 'g');
3477 if (ret < 0 || ret >= MAXPATHLEN) {
3478 ERROR("failed to create path \"%s\"", path);
3479 return -E2BIG;
3480 }
3481
3482 fd = open(path, O_WRONLY);
3483 if (fd < 0) {
3484 SYSERROR("failed to open \"%s\"", path);
3485 return -1;
3486 }
3487
3488 errno = 0;
3489 ret = lxc_write_nointr(fd, buf, buf_size);
3490 if (ret != buf_size) {
3491 SYSERROR("failed to write %cid mapping to \"%s\"",
3492 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3493 close(fd);
3494 return -1;
3495 }
3496 close(fd);
3497
3498 return 0;
3499 }
3500
3501 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3502 *
3503 * @return 1 if functional binary was found
3504 * @return 0 if binary exists but is lacking privilege
3505 * @return -ENOENT if binary does not exist
3506 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3507 *
3508 */
3509 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3510 {
3511 char *path;
3512 int ret;
3513 struct stat st;
3514 int fret = 0;
3515
3516 if (cap != CAP_SETUID && cap != CAP_SETGID)
3517 return -EINVAL;
3518
3519 path = on_path(binary, NULL);
3520 if (!path)
3521 return -ENOENT;
3522
3523 ret = stat(path, &st);
3524 if (ret < 0) {
3525 fret = -errno;
3526 goto cleanup;
3527 }
3528
3529 /* Check if the binary is setuid. */
3530 if (st.st_mode & S_ISUID) {
3531 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3532 fret = 1;
3533 goto cleanup;
3534 }
3535
3536 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3537 /* Check if it has the CAP_SETUID capability. */
3538 if ((cap & CAP_SETUID) &&
3539 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3540 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3541 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3542 "and CAP_PERMITTED sets.", path);
3543 fret = 1;
3544 goto cleanup;
3545 }
3546
3547 /* Check if it has the CAP_SETGID capability. */
3548 if ((cap & CAP_SETGID) &&
3549 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3550 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3551 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3552 "and CAP_PERMITTED sets.", path);
3553 fret = 1;
3554 goto cleanup;
3555 }
3556 #else
3557 /* If we cannot check for file capabilities we need to give the benefit
3558 * of the doubt. Otherwise we might fail even though all the necessary
3559 * file capabilities are set.
3560 */
3561 DEBUG("Cannot check for file capabilites as full capability support is "
3562 "missing. Manual intervention needed.");
3563 fret = 1;
3564 #endif
3565
3566 cleanup:
3567 free(path);
3568 return fret;
3569 }
3570
/* Callback handed to run_command() by lxc_map_ids(): runs the command line
 * passed in @args through "/bin/sh -c".
 *
 * Only returns (with -1) when execl() itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = (char *)args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);

	/* Reached only if the exec did not happen. */
	return -1;
}
3576
3577 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3578 {
3579 struct id_map *map;
3580 struct lxc_list *iterator;
3581 enum idtype type;
3582 char u_or_g;
3583 char *pos;
3584 int fill, left;
3585 char cmd_output[MAXPATHLEN];
3586 /* strlen("new@idmap") = 9
3587 * +
3588 * strlen(" ") = 1
3589 * +
3590 * LXC_NUMSTRLEN64
3591 * +
3592 * strlen(" ") = 1
3593 *
3594 * We add some additional space to make sure that we really have
3595 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3596 */
3597 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3598 int ret = 0, uidmap = 0, gidmap = 0;
3599 bool use_shadow = false, had_entry = false;
3600
3601 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3602 * ranges, then insist that root also reserve ranges in subuid. This
3603 * will protected it by preventing another user from being handed the
3604 * range by shadow.
3605 */
3606 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3607 if (uidmap == -ENOENT)
3608 WARN("newuidmap binary is missing");
3609 else if (!uidmap)
3610 WARN("newuidmap is lacking necessary privileges");
3611
3612 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3613 if (gidmap == -ENOENT)
3614 WARN("newgidmap binary is missing");
3615 else if (!gidmap)
3616 WARN("newgidmap is lacking necessary privileges");
3617
3618 if (uidmap > 0 && gidmap > 0) {
3619 DEBUG("Functional newuidmap and newgidmap binary found.");
3620 use_shadow = true;
3621 } else {
3622 /* In case unprivileged users run application containers via
3623 * execute() or a start*() there are valid cases where they may
3624 * only want to map their own {g,u}id. Let's not block them from
3625 * doing so by requiring geteuid() == 0.
3626 */
3627 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3628 "write directly with euid %d.", geteuid());
3629 }
3630
3631 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3632 type++, u_or_g = 'g') {
3633 pos = mapbuf;
3634
3635 if (use_shadow)
3636 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3637
3638 lxc_list_for_each(iterator, idmap) {
3639 /* The kernel only takes <= 4k for writes to
3640 * /proc/<nr>/[ug]id_map
3641 */
3642 map = iterator->elem;
3643 if (map->idtype != type)
3644 continue;
3645
3646 had_entry = true;
3647
3648 left = LXC_IDMAPLEN - (pos - mapbuf);
3649 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3650 use_shadow ? " " : "", map->nsid,
3651 map->hostid, map->range,
3652 use_shadow ? "" : "\n");
3653 if (fill <= 0 || fill >= left)
3654 SYSERROR("Too many {g,u}id mappings defined.");
3655
3656 pos += fill;
3657 }
3658 if (!had_entry)
3659 continue;
3660
3661 /* Try to catch the ouput of new{g,u}idmap to make debugging
3662 * easier.
3663 */
3664 if (use_shadow) {
3665 ret = run_command(cmd_output, sizeof(cmd_output),
3666 lxc_map_ids_exec_wrapper,
3667 (void *)mapbuf);
3668 if (ret < 0) {
3669 ERROR("new%cidmap failed to write mapping: %s",
3670 u_or_g, cmd_output);
3671 return -1;
3672 }
3673 } else {
3674 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3675 if (ret < 0)
3676 return -1;
3677 }
3678
3679 memset(mapbuf, 0, sizeof(mapbuf));
3680 }
3681
3682 return 0;
3683 }
3684
3685 /*
3686 * return the host uid/gid to which the container root is mapped in
3687 * *val.
3688 * Return true if id was found, false otherwise.
3689 */
3690 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3691 unsigned long *val)
3692 {
3693 struct lxc_list *it;
3694 struct id_map *map;
3695
3696 lxc_list_for_each(it, &conf->id_map) {
3697 map = it->elem;
3698 if (map->idtype != idtype)
3699 continue;
3700 if (map->nsid != 0)
3701 continue;
3702 *val = map->hostid;
3703 return true;
3704 }
3705 return false;
3706 }
3707
3708 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3709 {
3710 struct lxc_list *it;
3711 struct id_map *map;
3712 lxc_list_for_each(it, &conf->id_map) {
3713 map = it->elem;
3714 if (map->idtype != idtype)
3715 continue;
3716 if (id >= map->hostid && id < map->hostid + map->range)
3717 return (id - map->hostid) + map->nsid;
3718 }
3719 return -1;
3720 }
3721
3722 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3723 {
3724 struct lxc_list *it;
3725 struct id_map *map;
3726 unsigned int freeid = 0;
3727 again:
3728 lxc_list_for_each(it, &conf->id_map) {
3729 map = it->elem;
3730 if (map->idtype != idtype)
3731 continue;
3732 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3733 freeid = map->nsid + map->range;
3734 goto again;
3735 }
3736 }
3737 return freeid;
3738 }
3739
3740 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3741 {
3742 struct lxc_list *network = &handler->conf->network;
3743 struct lxc_list *iterator;
3744 struct lxc_netdev *netdev;
3745 int link_index;
3746
3747 lxc_list_for_each(iterator, network) {
3748 netdev = iterator->elem;
3749
3750 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3751 continue;
3752
3753 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3754 ERROR("gateway = auto only supported for "
3755 "veth and macvlan");
3756 return -1;
3757 }
3758
3759 if (!netdev->link) {
3760 ERROR("gateway = auto needs a link interface");
3761 return -1;
3762 }
3763
3764 link_index = if_nametoindex(netdev->link);
3765 if (!link_index)
3766 return -EINVAL;
3767
3768 if (netdev->ipv4_gateway_auto) {
3769 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3770 ERROR("failed to automatically find ipv4 gateway "
3771 "address from link interface '%s'", netdev->link);
3772 return -1;
3773 }
3774 }
3775
3776 if (netdev->ipv6_gateway_auto) {
3777 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3778 ERROR("failed to automatically find ipv6 gateway "
3779 "address from link interface '%s'", netdev->link);
3780 return -1;
3781 }
3782 }
3783 }
3784
3785 return 0;
3786 }
3787
3788 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3789 {
3790 struct lxc_tty_info *tty_info = &conf->tty_info;
3791 int i, ret;
3792
3793 /* no tty in the configuration */
3794 if (!conf->tty)
3795 return 0;
3796
3797 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3798 if (!tty_info->pty_info) {
3799 SYSERROR("failed to allocate struct *pty_info");
3800 return -ENOMEM;
3801 }
3802
3803 for (i = 0; i < conf->tty; i++) {
3804 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3805
3806 process_lock();
3807 ret = openpty(&pty_info->master, &pty_info->slave,
3808 pty_info->name, NULL, NULL);
3809 process_unlock();
3810 if (ret) {
3811 SYSERROR("failed to create pty device number %d", i);
3812 tty_info->nbtty = i;
3813 lxc_delete_tty(tty_info);
3814 return -ENOTTY;
3815 }
3816
3817 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3818 pty_info->name, pty_info->master, pty_info->slave);
3819
3820 /* Prevent leaking the file descriptors to the container */
3821 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3822 if (ret < 0)
3823 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3824 "pty device \"%s\": %s",
3825 pty_info->master, pty_info->name, strerror(errno));
3826
3827 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3828 if (ret < 0)
3829 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3830 "pty device \"%s\": %s",
3831 pty_info->slave, pty_info->name, strerror(errno));
3832
3833 pty_info->busy = 0;
3834 }
3835
3836 tty_info->nbtty = conf->tty;
3837
3838 INFO("finished allocating %d pts devices", conf->tty);
3839 return 0;
3840 }
3841
3842 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3843 {
3844 int i;
3845
3846 for (i = 0; i < tty_info->nbtty; i++) {
3847 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3848
3849 close(pty_info->master);
3850 close(pty_info->slave);
3851 }
3852
3853 free(tty_info->pty_info);
3854 tty_info->pty_info = NULL;
3855 tty_info->nbtty = 0;
3856 }
3857
3858
/* Callback handed to run_command() by chown_mapped_root(): executes
 * lxc-usernsexec with the argument vector passed in @args.
 *
 * Only returns (with -1) when execvp() itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);

	/* Reached only if the exec did not happen. */
	return -1;
}
3864
3865 /*
3866 * chown_mapped_root: for an unprivileged user with uid/gid X to
3867 * chown a dir to subuid/subgid Y, he needs to run chown as root
3868 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3869 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3870 * root is privileged with respect to hostuid/hostgid X, allowing
3871 * him to do the chown.
3872 */
3873 int chown_mapped_root(char *path, struct lxc_conf *conf)
3874 {
3875 uid_t rootuid, rootgid;
3876 unsigned long val;
3877 int hostuid, hostgid, ret;
3878 struct stat sb;
3879 char map1[100], map2[100], map3[100], map4[100], map5[100];
3880 char ugid[100];
3881 char *args1[] = {"lxc-usernsexec",
3882 "-m", map1,
3883 "-m", map2,
3884 "-m", map3,
3885 "-m", map5,
3886 "--", "chown", ugid, path,
3887 NULL};
3888 char *args2[] = {"lxc-usernsexec",
3889 "-m", map1,
3890 "-m", map2,
3891 "-m", map3,
3892 "-m", map4,
3893 "-m", map5,
3894 "--", "chown", ugid, path,
3895 NULL};
3896 char cmd_output[MAXPATHLEN];
3897
3898 hostuid = geteuid();
3899 hostgid = getegid();
3900
3901 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3902 ERROR("No uid mapping for container root");
3903 return -1;
3904 }
3905 rootuid = (uid_t)val;
3906 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3907 ERROR("No gid mapping for container root");
3908 return -1;
3909 }
3910 rootgid = (gid_t)val;
3911
3912 if (hostuid == 0) {
3913 if (chown(path, rootuid, rootgid) < 0) {
3914 ERROR("Error chowning %s", path);
3915 return -1;
3916 }
3917 return 0;
3918 }
3919
3920 if (rootuid == hostuid) {
3921 /* nothing to do */
3922 INFO("Container root is our uid; no need to chown");
3923 return 0;
3924 }
3925
3926 /* save the current gid of "path" */
3927 if (stat(path, &sb) < 0) {
3928 ERROR("Error stat %s", path);
3929 return -1;
3930 }
3931
3932 /* Update the path argument in case this was overlayfs. */
3933 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3934 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3935
3936 /*
3937 * A file has to be group-owned by a gid mapped into the
3938 * container, or the container won't be privileged over it.
3939 */
3940 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3941 if (sb.st_uid == hostuid &&
3942 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3943 chown(path, -1, hostgid) < 0) {
3944 ERROR("Failed chgrping %s", path);
3945 return -1;
3946 }
3947
3948 /* "u:0:rootuid:1" */
3949 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3950 if (ret < 0 || ret >= 100) {
3951 ERROR("Error uid printing map string");
3952 return -1;
3953 }
3954
3955 /* "u:hostuid:hostuid:1" */
3956 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3957 if (ret < 0 || ret >= 100) {
3958 ERROR("Error uid printing map string");
3959 return -1;
3960 }
3961
3962 /* "g:0:rootgid:1" */
3963 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3964 if (ret < 0 || ret >= 100) {
3965 ERROR("Error gid printing map string");
3966 return -1;
3967 }
3968
3969 /* "g:pathgid:rootgid+pathgid:1" */
3970 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3971 rootgid + (gid_t)sb.st_gid);
3972 if (ret < 0 || ret >= 100) {
3973 ERROR("Error gid printing map string");
3974 return -1;
3975 }
3976
3977 /* "g:hostgid:hostgid:1" */
3978 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3979 if (ret < 0 || ret >= 100) {
3980 ERROR("Error gid printing map string");
3981 return -1;
3982 }
3983
3984 /* "0:pathgid" (chown) */
3985 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3986 if (ret < 0 || ret >= 100) {
3987 ERROR("Error owner printing format string for chown");
3988 return -1;
3989 }
3990
3991 if (hostgid == sb.st_gid)
3992 ret = run_command(cmd_output, sizeof(cmd_output),
3993 chown_mapped_root_exec_wrapper,
3994 (void *)args1);
3995 else
3996 ret = run_command(cmd_output, sizeof(cmd_output),
3997 chown_mapped_root_exec_wrapper,
3998 (void *)args2);
3999 if (ret < 0)
4000 ERROR("lxc-usernsexec failed: %s", cmd_output);
4001
4002 return ret;
4003 }
4004
4005 int lxc_ttys_shift_ids(struct lxc_conf *c)
4006 {
4007 if (lxc_list_empty(&c->id_map))
4008 return 0;
4009
4010 if (!strcmp(c->console.name, ""))
4011 return 0;
4012
4013 if (chown_mapped_root(c->console.name, c) < 0) {
4014 ERROR("failed to chown console \"%s\"", c->console.name);
4015 return -1;
4016 }
4017
4018 TRACE("chowned console \"%s\"", c->console.name);
4019
4020 return 0;
4021 }
4022
4023 /* NOTE: Must not be called from inside the container namespace! */
4024 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
4025 {
4026 int mounted;
4027
4028 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
4029 if (mounted == -1) {
4030 SYSERROR("failed to mount /proc in the container");
4031 /* continue only if there is no rootfs */
4032 if (conf->rootfs.path)
4033 return -1;
4034 } else if (mounted == 1) {
4035 conf->tmp_umount_proc = 1;
4036 }
4037
4038 return 0;
4039 }
4040
4041 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
4042 {
4043 if (lxc_conf->tmp_umount_proc == 1) {
4044 umount("/proc");
4045 lxc_conf->tmp_umount_proc = 0;
4046 }
4047 }
4048
/* Walk /proc/self/mountinfo and remount every entry whose options mention
 * "shared" with MS_SLAVE.
 *
 * Failures are logged and do not stop the walk. If mountinfo cannot be
 * opened nothing is changed.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &cap, mountinfo) != -1) {
		char *target = get_field(line, 4);
		char *opts = target ? get_field(target, 2) : NULL;

		/* Skip lines that lack either of the two fields. */
		if (!opts)
			continue;

		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(line);
}
4082
4083 void lxc_execute_bind_init(struct lxc_conf *conf)
4084 {
4085 int ret;
4086 char path[PATH_MAX], destpath[PATH_MAX], *p;
4087
4088 /* If init exists in the container, don't bind mount a static one */
4089 p = choose_init(conf->rootfs.mount);
4090 if (p) {
4091 free(p);
4092 return;
4093 }
4094
4095 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4096 if (ret < 0 || ret >= PATH_MAX) {
4097 WARN("Path name too long searching for lxc.init.static");
4098 return;
4099 }
4100
4101 if (!file_exists(path)) {
4102 INFO("%s does not exist on host", path);
4103 return;
4104 }
4105
4106 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4107 if (ret < 0 || ret >= PATH_MAX) {
4108 WARN("Path name too long for container's lxc.init.static");
4109 return;
4110 }
4111
4112 if (!file_exists(destpath)) {
4113 FILE * pathfile = fopen(destpath, "wb");
4114 if (!pathfile) {
4115 SYSERROR("Failed to create mount target '%s'", destpath);
4116 return;
4117 }
4118 fclose(pathfile);
4119 }
4120
4121 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
4122 if (ret < 0)
4123 SYSERROR("Failed to bind lxc.init.static into container");
4124 INFO("lxc.init.static bound into container at %s", path);
4125 }
4126
4127 /*
4128 * This does the work of remounting / if it is shared, calling the
4129 * container pre-mount hooks, and mounting the rootfs.
4130 */
4131 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4132 {
4133 if (conf->rootfs_setup) {
4134 /*
4135 * rootfs was set up in another namespace. bind-mount it
4136 * to give us a mount in our own ns so we can pivot_root to it
4137 */
4138 const char *path = conf->rootfs.mount;
4139 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4140 ERROR("Failed to bind-mount container / onto itself");
4141 return -1;
4142 }
4143 return 0;
4144 }
4145
4146 remount_all_slave();
4147
4148 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4149 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4150 return -1;
4151 }
4152
4153 if (lxc_setup_rootfs(conf)) {
4154 ERROR("failed to setup rootfs for '%s'", name);
4155 return -1;
4156 }
4157
4158 conf->rootfs_setup = true;
4159 return 0;
4160 }
4161
4162 static bool verify_start_hooks(struct lxc_conf *conf)
4163 {
4164 struct lxc_list *it;
4165 char path[MAXPATHLEN];
4166 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4167 char *hookname = it->elem;
4168 struct stat st;
4169 int ret;
4170
4171 ret = snprintf(path, MAXPATHLEN, "%s%s",
4172 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4173 if (ret < 0 || ret >= MAXPATHLEN)
4174 return false;
4175 ret = stat(path, &st);
4176 if (ret) {
4177 SYSERROR("Start hook %s not found in container",
4178 hookname);
4179 return false;
4180 }
4181 return true;
4182 }
4183
4184 return true;
4185 }
4186
4187 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4188 {
4189 int i;
4190 int *ttyfds;
4191 struct lxc_pty_info *pty_info;
4192 struct lxc_conf *conf = handler->conf;
4193 const struct lxc_tty_info *tty_info = &conf->tty_info;
4194 int sock = handler->ttysock[0];
4195 int ret = -1;
4196 size_t num_ttyfds = (2 * conf->tty);
4197
4198 ttyfds = malloc(num_ttyfds * sizeof(int));
4199 if (!ttyfds)
4200 return -1;
4201
4202 for (i = 0; i < num_ttyfds; i++) {
4203 pty_info = &tty_info->pty_info[i / 2];
4204 ttyfds[i++] = pty_info->slave;
4205 ttyfds[i] = pty_info->master;
4206 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4207 "parent",
4208 pty_info->name, pty_info->master, pty_info->slave);
4209 }
4210
4211 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4212 if (ret < 0)
4213 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4214 strerror(errno));
4215 else
4216 TRACE("sent %d ttys to parent", conf->tty);
4217
4218 close(handler->ttysock[0]);
4219 close(handler->ttysock[1]);
4220
4221 for (i = 0; i < num_ttyfds; i++)
4222 close(ttyfds[i]);
4223
4224 free(ttyfds);
4225
4226 return ret;
4227 }
4228
4229 int lxc_setup(struct lxc_handler *handler)
4230 {
4231 const char *name = handler->name;
4232 struct lxc_conf *lxc_conf = handler->conf;
4233 const char *lxcpath = handler->lxcpath;
4234
4235 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4236 ERROR("Error setting up rootfs mount after spawn");
4237 return -1;
4238 }
4239
4240 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4241 if (setup_utsname(lxc_conf->utsname)) {
4242 ERROR("failed to setup the utsname for '%s'", name);
4243 return -1;
4244 }
4245 }
4246
4247 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4248 &lxc_conf->network)) {
4249 ERROR("failed to setup the network for '%s'", name);
4250 return -1;
4251 }
4252
4253 if (lxc_conf->autodev > 0) {
4254 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4255 ERROR("failed to mount /dev in the container");
4256 return -1;
4257 }
4258 }
4259
4260 /* do automatic mounts (mainly /proc and /sys), but exclude
4261 * those that need to wait until other stuff has finished
4262 */
4263 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4264 ERROR("failed to setup the automatic mounts for '%s'", name);
4265 return -1;
4266 }
4267
4268 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4269 ERROR("failed to setup the mounts for '%s'", name);
4270 return -1;
4271 }
4272
4273 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4274 ERROR("failed to setup the mount entries for '%s'", name);
4275 return -1;
4276 }
4277
4278 /* Make sure any start hooks are in the container */
4279 if (!verify_start_hooks(lxc_conf))
4280 return -1;
4281
4282 if (lxc_conf->is_execute)
4283 lxc_execute_bind_init(lxc_conf);
4284
4285 /* now mount only cgroup, if wanted;
4286 * before, /sys could not have been mounted
4287 * (is either mounted automatically or via fstab entries)
4288 */
4289 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4290 ERROR("failed to setup the automatic mounts for '%s'", name);
4291 return -1;
4292 }
4293
4294 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4295 ERROR("failed to run mount hooks for container '%s'.", name);
4296 return -1;
4297 }
4298
4299 if (lxc_conf->autodev > 0) {
4300 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4301 ERROR("failed to run autodev hooks for container '%s'.", name);
4302 return -1;
4303 }
4304
4305 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4306 ERROR("failed to populate /dev in the container");
4307 return -1;
4308 }
4309 }
4310
4311 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4312 ERROR("failed to setup the console for '%s'", name);
4313 return -1;
4314 }
4315
4316 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4317 ERROR("failed to setup /dev symlinks for '%s'", name);
4318 return -1;
4319 }
4320
4321 /* mount /proc if it's not already there */
4322 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4323 ERROR("failed to LSM mount proc for '%s'", name);
4324 return -1;
4325 }
4326
4327 if (setup_pivot_root(&lxc_conf->rootfs)) {
4328 ERROR("failed to set rootfs for '%s'", name);
4329 return -1;
4330 }
4331
4332 if (lxc_setup_devpts(lxc_conf->pts)) {
4333 ERROR("failed to setup the new pts instance");
4334 return -1;
4335 }
4336
4337 if (lxc_create_tty(name, lxc_conf)) {
4338 ERROR("failed to create the ttys");
4339 return -1;
4340 }
4341
4342 if (lxc_send_ttys_to_parent(handler) < 0) {
4343 ERROR("failure sending console info to parent");
4344 return -1;
4345 }
4346
4347 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4348 ERROR("failed to setup the ttys for '%s'", name);
4349 return -1;
4350 }
4351
4352 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4353 SYSERROR("failed to set environment variable for container ptys");
4354
4355
4356 if (setup_personality(lxc_conf->personality)) {
4357 ERROR("failed to setup personality");
4358 return -1;
4359 }
4360
4361 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4362 if (!lxc_list_empty(&lxc_conf->caps)) {
4363 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4364 return -1;
4365 }
4366 if (dropcaps_except(&lxc_conf->keepcaps)) {
4367 ERROR("failed to keep requested caps");
4368 return -1;
4369 }
4370 } else if (setup_caps(&lxc_conf->caps)) {
4371 ERROR("failed to drop capabilities");
4372 return -1;
4373 }
4374
4375 NOTICE("Container \"%s\" is set up", name);
4376
4377 return 0;
4378 }
4379
4380 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4381 const char *lxcpath, char *argv[])
4382 {
4383 int which = -1;
4384 struct lxc_list *it;
4385
4386 if (strcmp(hook, "pre-start") == 0)
4387 which = LXCHOOK_PRESTART;
4388 else if (strcmp(hook, "pre-mount") == 0)
4389 which = LXCHOOK_PREMOUNT;
4390 else if (strcmp(hook, "mount") == 0)
4391 which = LXCHOOK_MOUNT;
4392 else if (strcmp(hook, "autodev") == 0)
4393 which = LXCHOOK_AUTODEV;
4394 else if (strcmp(hook, "start") == 0)
4395 which = LXCHOOK_START;
4396 else if (strcmp(hook, "stop") == 0)
4397 which = LXCHOOK_STOP;
4398 else if (strcmp(hook, "post-stop") == 0)
4399 which = LXCHOOK_POSTSTOP;
4400 else if (strcmp(hook, "clone") == 0)
4401 which = LXCHOOK_CLONE;
4402 else if (strcmp(hook, "destroy") == 0)
4403 which = LXCHOOK_DESTROY;
4404 else
4405 return -1;
4406 lxc_list_for_each(it, &conf->hooks[which]) {
4407 int ret;
4408 char *hookname = it->elem;
4409 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4410 if (ret)
4411 return ret;
4412 }
4413 return 0;
4414 }
4415
4416 int lxc_clear_config_caps(struct lxc_conf *c)
4417 {
4418 struct lxc_list *it, *next;
4419
4420 lxc_list_for_each_safe(it, &c->caps, next) {
4421 lxc_list_del(it);
4422 free(it->elem);
4423 free(it);
4424 }
4425 return 0;
4426 }
4427
4428 static int lxc_free_idmap(struct lxc_list *id_map) {
4429 struct lxc_list *it, *next;
4430
4431 lxc_list_for_each_safe(it, id_map, next) {
4432 lxc_list_del(it);
4433 free(it->elem);
4434 free(it);
4435 }
4436 return 0;
4437 }
4438
4439 int lxc_clear_idmaps(struct lxc_conf *c)
4440 {
4441 return lxc_free_idmap(&c->id_map);
4442 }
4443
4444 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4445 {
4446 struct lxc_list *it,*next;
4447
4448 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4449 lxc_list_del(it);
4450 free(it->elem);
4451 free(it);
4452 }
4453 return 0;
4454 }
4455
4456 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4457 {
4458 struct lxc_list *it,*next;
4459 bool all = false;
4460 const char *k = NULL;
4461
4462 if (strcmp(key, "lxc.cgroup") == 0)
4463 all = true;
4464 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4465 k = key + sizeof("lxc.cgroup.")-1;
4466 else
4467 return -1;
4468
4469 lxc_list_for_each_safe(it, &c->cgroup, next) {
4470 struct lxc_cgroup *cg = it->elem;
4471 if (!all && strcmp(cg->subsystem, k) != 0)
4472 continue;
4473 lxc_list_del(it);
4474 free(cg->subsystem);
4475 free(cg->value);
4476 free(cg);
4477 free(it);
4478 }
4479 return 0;
4480 }
4481
4482 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4483 {
4484 struct lxc_list *it, *next;
4485 bool all = false;
4486 const char *k = NULL;
4487
4488 if (strcmp(key, "lxc.limit") == 0
4489 || strcmp(key, "lxc.prlimit"))
4490 all = true;
4491 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4492 k = key + sizeof("lxc.limit.")-1;
4493 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4494 k = key + sizeof("lxc.prlimit.")-1;
4495 else
4496 return -1;
4497
4498 lxc_list_for_each_safe(it, &c->limits, next) {
4499 struct lxc_limit *lim = it->elem;
4500 if (!all && strcmp(lim->resource, k) != 0)
4501 continue;
4502 lxc_list_del(it);
4503 free(lim->resource);
4504 free(lim);
4505 free(it);
4506 }
4507 return 0;
4508 }
4509
4510 int lxc_clear_groups(struct lxc_conf *c)
4511 {
4512 struct lxc_list *it,*next;
4513
4514 lxc_list_for_each_safe(it, &c->groups, next) {
4515 lxc_list_del(it);
4516 free(it->elem);
4517 free(it);
4518 }
4519 return 0;
4520 }
4521
4522 int lxc_clear_environment(struct lxc_conf *c)
4523 {
4524 struct lxc_list *it,*next;
4525
4526 lxc_list_for_each_safe(it, &c->environment, next) {
4527 lxc_list_del(it);
4528 free(it->elem);
4529 free(it);
4530 }
4531 return 0;
4532 }
4533
4534
4535 int lxc_clear_mount_entries(struct lxc_conf *c)
4536 {
4537 struct lxc_list *it,*next;
4538
4539 lxc_list_for_each_safe(it, &c->mount_list, next) {
4540 lxc_list_del(it);
4541 free(it->elem);
4542 free(it);
4543 }
4544 return 0;
4545 }
4546
4547 int lxc_clear_automounts(struct lxc_conf *c)
4548 {
4549 c->auto_mounts = 0;
4550 return 0;
4551 }
4552
4553 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4554 {
4555 struct lxc_list *it,*next;
4556 bool all = false, done = false;
4557 const char *k = NULL;
4558 int i;
4559
4560 if (strcmp(key, "lxc.hook") == 0)
4561 all = true;
4562 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4563 k = key + sizeof("lxc.hook.")-1;
4564 else
4565 return -1;
4566
4567 for (i=0; i<NUM_LXC_HOOKS; i++) {
4568 if (all || strcmp(k, lxchook_names[i]) == 0) {
4569 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4570 lxc_list_del(it);
4571 free(it->elem);
4572 free(it);
4573 }
4574 done = true;
4575 }
4576 }
4577
4578 if (!done) {
4579 ERROR("Invalid hook key: %s", key);
4580 return -1;
4581 }
4582 return 0;
4583 }
4584
4585 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4586 {
4587 int i;
4588
4589 if (!conf->saved_nics)
4590 return;
4591 for (i=0; i < conf->num_savednics; i++)
4592 free(conf->saved_nics[i].orig_name);
4593 free(conf->saved_nics);
4594 }
4595
4596 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4597 {
4598 struct lxc_list *it,*next;
4599
4600 lxc_list_for_each_safe(it, &conf->aliens, next) {
4601 lxc_list_del(it);
4602 free(it->elem);
4603 free(it);
4604 }
4605 }
4606
4607 void lxc_clear_includes(struct lxc_conf *conf)
4608 {
4609 struct lxc_list *it,*next;
4610
4611 lxc_list_for_each_safe(it, &conf->includes, next) {
4612 lxc_list_del(it);
4613 free(it->elem);
4614 free(it);
4615 }
4616 }
4617
4618 void lxc_conf_free(struct lxc_conf *conf)
4619 {
4620 if (!conf)
4621 return;
4622 if (current_config == conf)
4623 current_config = NULL;
4624 free(conf->console.log_path);
4625 free(conf->console.path);
4626 free(conf->rootfs.mount);
4627 free(conf->rootfs.bdev_type);
4628 free(conf->rootfs.options);
4629 free(conf->rootfs.path);
4630 free(conf->logfile);
4631 if (conf->logfd != -1)
4632 close(conf->logfd);
4633 free(conf->utsname);
4634 free(conf->ttydir);
4635 free(conf->fstab);
4636 free(conf->rcfile);
4637 free(conf->init_cmd);
4638 free(conf->unexpanded_config);
4639 free(conf->pty_names);
4640 free(conf->syslog);
4641 lxc_free_networks(&conf->network);
4642 free(conf->lsm_aa_profile);
4643 free(conf->lsm_se_context);
4644 lxc_seccomp_free(conf);
4645 lxc_clear_config_caps(conf);
4646 lxc_clear_config_keepcaps(conf);
4647 lxc_clear_cgroups(conf, "lxc.cgroup");
4648 lxc_clear_hooks(conf, "lxc.hook");
4649 lxc_clear_mount_entries(conf);
4650 lxc_clear_saved_nics(conf);
4651 lxc_clear_idmaps(conf);
4652 lxc_clear_groups(conf);
4653 lxc_clear_includes(conf);
4654 lxc_clear_aliens(conf);
4655 lxc_clear_environment(conf);
4656 lxc_clear_limits(conf, "lxc.prlimit");
4657 free(conf->cgroup_meta.dir);
4658 free(conf->cgroup_meta.controllers);
4659 free(conf);
4660 }
4661
/*
 * Argument bundle handed to run_userns_fn() by userns_exec_1().
 */
struct userns_fn_data {
	/* Callback executed once the parent has signalled over the pipe. */
	int (*fn)(void *);
	/* Optional name of the callback, used only for trace logging. */
	const char *fn_name;
	/* Opaque argument forwarded to fn. */
	void *arg;
	/* Synchronization pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4668
4669 static int run_userns_fn(void *data)
4670 {
4671 struct userns_fn_data *d = data;
4672 char c;
4673
4674 /* Close write end of the pipe. */
4675 close(d->p[1]);
4676
4677 /* Wait for parent to finish establishing a new mapping in the user
4678 * namespace we are executing in.
4679 */
4680 if (read(d->p[0], &c, 1) != 1)
4681 return -1;
4682
4683 /* Close read end of the pipe. */
4684 close(d->p[0]);
4685
4686 if (d->fn_name)
4687 TRACE("calling function \"%s\"", d->fn_name);
4688 /* Call function to run. */
4689 return d->fn(d->arg);
4690 }
4691
4692 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4693 enum idtype idtype)
4694 {
4695 struct lxc_list *it;
4696 struct id_map *map;
4697 struct id_map *retmap = NULL;
4698
4699 lxc_list_for_each(it, &conf->id_map) {
4700 map = it->elem;
4701 if (map->idtype != idtype)
4702 continue;
4703
4704 if (id >= map->hostid && id < map->hostid + map->range) {
4705 retmap = map;
4706 break;
4707 }
4708 }
4709
4710 if (!retmap)
4711 return NULL;
4712
4713 retmap = malloc(sizeof(*retmap));
4714 if (!retmap)
4715 return NULL;
4716
4717 memcpy(retmap, map, sizeof(*retmap));
4718 return retmap;
4719 }
4720
4721 /*
4722 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4723 * existing one or establish a new one.
4724 */
4725 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4726 {
4727 int hostid_mapped;
4728 struct id_map *entry = NULL;
4729
4730 /* Reuse existing mapping. */
4731 entry = mapped_hostid_entry(conf, id, type);
4732 if (entry)
4733 return entry;
4734
4735 /* Find new mapping. */
4736 hostid_mapped = find_unmapped_nsid(conf, type);
4737 if (hostid_mapped < 0) {
4738 DEBUG("failed to find free mapping for id %d", id);
4739 return NULL;
4740 }
4741
4742 entry = malloc(sizeof(*entry));
4743 if (!entry)
4744 return NULL;
4745
4746 entry->idtype = type;
4747 entry->nsid = hostid_mapped;
4748 entry->hostid = (unsigned long)id;
4749 entry->range = 1;
4750
4751 return entry;
4752 }
4753
4754 /* Run a function in a new user namespace.
4755 * The caller's euid/egid will be mapped if it is not already.
4756 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4757 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4758 * This means we require only to establish a mapping from:
4759 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4760 * - the container root -> some sub{g,u}id
4761 * The former we add, if the user did not specifiy a mapping. The latter we
4762 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4763 * there to start the container in the first place.
4764 */
4765 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4766 const char *fn_name)
4767 {
4768 pid_t pid;
4769 uid_t euid, egid;
4770 struct userns_fn_data d;
4771 int p[2];
4772 struct lxc_list *it;
4773 struct id_map *map;
4774 char c = '1';
4775 int ret = -1;
4776 struct lxc_list *idmap = NULL, *tmplist = NULL;
4777 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4778 *host_uid_map = NULL, *host_gid_map = NULL;
4779
4780 ret = pipe(p);
4781 if (ret < 0) {
4782 SYSERROR("opening pipe");
4783 return -1;
4784 }
4785 d.fn = fn;
4786 d.fn_name = fn_name;
4787 d.arg = data;
4788 d.p[0] = p[0];
4789 d.p[1] = p[1];
4790
4791 /* Clone child in new user namespace. */
4792 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4793 if (pid < 0) {
4794 ERROR("failed to clone child process in new user namespace");
4795 goto on_error;
4796 }
4797
4798 close(p[0]);
4799 p[0] = -1;
4800
4801 /* Find container root. */
4802 lxc_list_for_each(it, &conf->id_map) {
4803 map = it->elem;
4804
4805 if (map->nsid != 0)
4806 continue;
4807
4808 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4809 container_root_uid = malloc(sizeof(*container_root_uid));
4810 if (!container_root_uid)
4811 goto on_error;
4812 container_root_uid->idtype = map->idtype;
4813 container_root_uid->hostid = map->hostid;
4814 container_root_uid->nsid = 0;
4815 container_root_uid->range = map->range;
4816 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4817 container_root_gid = malloc(sizeof(*container_root_gid));
4818 if (!container_root_gid)
4819 goto on_error;
4820 container_root_gid->idtype = map->idtype;
4821 container_root_gid->hostid = map->hostid;
4822 container_root_gid->nsid = 0;
4823 container_root_gid->range = map->range;
4824 }
4825
4826 /* Found container root. */
4827 if (container_root_uid && container_root_gid)
4828 break;
4829 }
4830
4831 /* This is actually checked earlier but it can't hurt. */
4832 if (!container_root_uid || !container_root_gid) {
4833 ERROR("no mapping for container root found");
4834 goto on_error;
4835 }
4836
4837 host_uid_map = container_root_uid;
4838 host_gid_map = container_root_gid;
4839
4840 /* Check whether the {g,u}id of the user has a mapping. */
4841 euid = geteuid();
4842 egid = getegid();
4843 if (euid != container_root_uid->hostid)
4844 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4845
4846 if (egid != container_root_gid->hostid)
4847 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4848
4849 if (!host_uid_map) {
4850 DEBUG("failed to find mapping for uid %d", euid);
4851 goto on_error;
4852 }
4853
4854 if (!host_gid_map) {
4855 DEBUG("failed to find mapping for gid %d", egid);
4856 goto on_error;
4857 }
4858
4859 /* Allocate new {g,u}id map list. */
4860 idmap = malloc(sizeof(*idmap));
4861 if (!idmap)
4862 goto on_error;
4863 lxc_list_init(idmap);
4864
4865 /* Add container root to the map. */
4866 tmplist = malloc(sizeof(*tmplist));
4867 if (!tmplist)
4868 goto on_error;
4869 lxc_list_add_elem(tmplist, container_root_uid);
4870 lxc_list_add_tail(idmap, tmplist);
4871
4872 if (host_uid_map && (host_uid_map != container_root_uid)) {
4873 /* idmap will now keep track of that memory. */
4874 container_root_uid = NULL;
4875
4876 /* Add container root to the map. */
4877 tmplist = malloc(sizeof(*tmplist));
4878 if (!tmplist)
4879 goto on_error;
4880 lxc_list_add_elem(tmplist, host_uid_map);
4881 lxc_list_add_tail(idmap, tmplist);
4882 }
4883 /* idmap will now keep track of that memory. */
4884 container_root_uid = NULL;
4885 /* idmap will now keep track of that memory. */
4886 host_uid_map = NULL;
4887
4888 tmplist = malloc(sizeof(*tmplist));
4889 if (!tmplist)
4890 goto on_error;
4891 lxc_list_add_elem(tmplist, container_root_gid);
4892 lxc_list_add_tail(idmap, tmplist);
4893
4894 if (host_gid_map && (host_gid_map != container_root_gid)) {
4895 /* idmap will now keep track of that memory. */
4896 container_root_gid = NULL;
4897
4898 tmplist = malloc(sizeof(*tmplist));
4899 if (!tmplist)
4900 goto on_error;
4901 lxc_list_add_elem(tmplist, host_gid_map);
4902 lxc_list_add_tail(idmap, tmplist);
4903 }
4904 /* idmap will now keep track of that memory. */
4905 container_root_gid = NULL;
4906 /* idmap will now keep track of that memory. */
4907 host_gid_map = NULL;
4908
4909 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4910 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4911 lxc_list_for_each(it, idmap) {
4912 map = it->elem;
4913 TRACE("establishing %cid mapping for \"%d\" in new "
4914 "user namespace: nsuid %lu - hostid %lu - range "
4915 "%lu",
4916 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4917 map->nsid, map->hostid, map->range);
4918 }
4919 }
4920
4921 /* Set up {g,u}id mapping for user namespace of child process. */
4922 ret = lxc_map_ids(idmap, pid);
4923 if (ret < 0) {
4924 ERROR("error setting up {g,u}id mappings for child process "
4925 "\"%d\"",
4926 pid);
4927 goto on_error;
4928 }
4929
4930 /* Tell child to proceed. */
4931 if (write(p[1], &c, 1) != 1) {
4932 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4933 goto on_error;
4934 }
4935
4936 /* Wait for child to finish. */
4937 ret = wait_for_pid(pid);
4938
4939 on_error:
4940 if (idmap)
4941 lxc_free_idmap(idmap);
4942 if (container_root_uid)
4943 free(container_root_uid);
4944 if (container_root_gid)
4945 free(container_root_gid);
4946 if (host_uid_map && (host_uid_map != container_root_uid))
4947 free(host_uid_map);
4948 if (host_gid_map && (host_gid_map != container_root_gid))
4949 free(host_gid_map);
4950
4951 if (p[0] != -1)
4952 close(p[0]);
4953 close(p[1]);
4954
4955 return ret;
4956 }
4957
/*
 * Return a malloc()ed copy of the login name belonging to the effective uid,
 * or NULL if there is no passwd entry (or the copy cannot be allocated).
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4969
/*
 * Return a malloc()ed copy of the group name belonging to the effective gid,
 * or NULL if there is no group entry (or the copy cannot be allocated).
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4981
4982 /* not thread-safe, do not use from api without first forking */
4983 void suggest_default_idmap(void)
4984 {
4985 FILE *f;
4986 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4987 char *line = NULL;
4988 char *uname, *gname;
4989 size_t len = 0;
4990
4991 if (!(uname = getuname()))
4992 return;
4993
4994 if (!(gname = getgname())) {
4995 free(uname);
4996 return;
4997 }
4998
4999 f = fopen(subuidfile, "r");
5000 if (!f) {
5001 ERROR("Your system is not configured with subuids");
5002 free(gname);
5003 free(uname);
5004 return;
5005 }
5006 while (getline(&line, &len, f) != -1) {
5007 size_t no_newline = 0;
5008 char *p = strchr(line, ':'), *p2;
5009 if (*line == '#')
5010 continue;
5011 if (!p)
5012 continue;
5013 *p = '\0';
5014 p++;
5015 if (strcmp(line, uname))
5016 continue;
5017 p2 = strchr(p, ':');
5018 if (!p2)
5019 continue;
5020 *p2 = '\0';
5021 p2++;
5022 if (!*p2)
5023 continue;
5024 no_newline = strcspn(p2, "\n");
5025 p2[no_newline] = '\0';
5026
5027 if (lxc_safe_uint(p, &uid) < 0)
5028 WARN("Could not parse UID.");
5029 if (lxc_safe_uint(p2, &urange) < 0)
5030 WARN("Could not parse UID range.");
5031 }
5032 fclose(f);
5033
5034 f = fopen(subgidfile, "r");
5035 if (!f) {
5036 ERROR("Your system is not configured with subgids");
5037 free(gname);
5038 free(uname);
5039 return;
5040 }
5041 while (getline(&line, &len, f) != -1) {
5042 size_t no_newline = 0;
5043 char *p = strchr(line, ':'), *p2;
5044 if (*line == '#')
5045 continue;
5046 if (!p)
5047 continue;
5048 *p = '\0';
5049 p++;
5050 if (strcmp(line, uname))
5051 continue;
5052 p2 = strchr(p, ':');
5053 if (!p2)
5054 continue;
5055 *p2 = '\0';
5056 p2++;
5057 if (!*p2)
5058 continue;
5059 no_newline = strcspn(p2, "\n");
5060 p2[no_newline] = '\0';
5061
5062 if (lxc_safe_uint(p, &gid) < 0)
5063 WARN("Could not parse GID.");
5064 if (lxc_safe_uint(p2, &grange) < 0)
5065 WARN("Could not parse GID range.");
5066 }
5067 fclose(f);
5068
5069 free(line);
5070
5071 if (!urange || !grange) {
5072 ERROR("You do not have subuids or subgids allocated");
5073 ERROR("Unprivileged containers require subuids and subgids");
5074 return;
5075 }
5076
5077 ERROR("You must either run as root, or define uid mappings");
5078 ERROR("To pass uid mappings to lxc-create, you could create");
5079 ERROR("~/.config/lxc/default.conf:");
5080 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5081 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5082 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5083
5084 free(gname);
5085 free(uname);
5086 }
5087
5088 static void free_cgroup_settings(struct lxc_list *result)
5089 {
5090 struct lxc_list *iterator, *next;
5091
5092 lxc_list_for_each_safe(iterator, result, next) {
5093 lxc_list_del(iterator);
5094 free(iterator);
5095 }
5096 free(result);
5097 }
5098
5099 /*
5100 * Return the list of cgroup_settings sorted according to the following rules
5101 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5102 */
5103 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5104 {
5105 struct lxc_list *result;
5106 struct lxc_list *memsw_limit = NULL;
5107 struct lxc_list *it = NULL;
5108 struct lxc_cgroup *cg = NULL;
5109 struct lxc_list *item = NULL;
5110
5111 result = malloc(sizeof(*result));
5112 if (!result) {
5113 ERROR("failed to allocate memory to sort cgroup settings");
5114 return NULL;
5115 }
5116 lxc_list_init(result);
5117
5118 /*Iterate over the cgroup settings and copy them to the output list*/
5119 lxc_list_for_each(it, cgroup_settings) {
5120 item = malloc(sizeof(*item));
5121 if (!item) {
5122 ERROR("failed to allocate memory to sort cgroup settings");
5123 free_cgroup_settings(result);
5124 return NULL;
5125 }
5126 item->elem = it->elem;
5127 cg = it->elem;
5128 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5129 /* Store the memsw_limit location */
5130 memsw_limit = item;
5131 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5132 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5133 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5134 item->elem = memsw_limit->elem;
5135 memsw_limit->elem = it->elem;
5136 }
5137 lxc_list_add_tail(result, item);
5138 }
5139
5140 return result;
5141 }
5142
5143 int lxc_unpriv_delete_nic(const char *lxcpath, char *lxcname,
5144 struct lxc_netdev *netdev, pid_t pid)
5145 {
5146 pid_t child;
5147 int bytes, pipefd[2];
5148 char buffer[MAXPATHLEN] = {0};
5149
5150 if (netdev->type != LXC_NET_VETH) {
5151 ERROR("nic type %d not support for unprivileged use",
5152 netdev->type);
5153 return -1;
5154 }
5155
5156 if (pipe(pipefd) < 0) {
5157 SYSERROR("pipe failed");
5158 return -1;
5159 }
5160
5161 child = fork();
5162 if (child < 0) {
5163 SYSERROR("fork");
5164 close(pipefd[0]);
5165 close(pipefd[1]);
5166 return -1;
5167 }
5168
5169 if (child == 0) { /* child */
5170 /* Call lxc-user-nic pid type bridge. */
5171 int ret;
5172 char pidstr[LXC_NUMSTRLEN64];
5173
5174 close(pipefd[0]); /* Close the read-end of the pipe. */
5175
5176 /* Redirect stdout to write-end of the pipe. */
5177 ret = dup2(pipefd[1], STDOUT_FILENO);
5178 if (ret >= 0)
5179 ret = dup2(pipefd[1], STDERR_FILENO);
5180 close(pipefd[1]); /* Close the write-end of the pipe. */
5181 if (ret < 0) {
5182 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
5183 exit(EXIT_FAILURE);
5184 }
5185
5186 if (!netdev->link)
5187 SYSERROR("Network link for network device \"%s\" is "
5188 "missing", netdev->priv.veth_attr.pair);
5189
5190 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
5191 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
5192 exit(EXIT_FAILURE);
5193 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
5194
5195 INFO("Execing lxc-user-nic delete %s %s %s veth %s %s", lxcpath,
5196 lxcname, pidstr, netdev->link, netdev->priv.veth_attr.pair);
5197 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "delete", lxcpath,
5198 lxcname, pidstr, "veth", netdev->link,
5199 netdev->priv.veth_attr.pair, (char *)NULL);
5200 SYSERROR("Failed to exec lxc-user-nic.");
5201 exit(EXIT_FAILURE);
5202 }
5203
5204 /* close the write-end of the pipe */
5205 close(pipefd[1]);
5206
5207 bytes = read(pipefd[0], &buffer, MAXPATHLEN);
5208 if (bytes < 0) {
5209 SYSERROR("Failed to read from pipe file descriptor.");
5210 close(pipefd[0]);
5211 return -1;
5212 }
5213 buffer[bytes - 1] = '\0';
5214
5215 if (wait_for_pid(child) != 0) {
5216 ERROR("lxc-user-nic failed to delete requested network: %s",
5217 buffer[0] != '\0' ? buffer : "(null)");
5218 close(pipefd[0]);
5219 return -1;
5220 }
5221 TRACE("Received output \"%s\" from lxc-user-nic", buffer);
5222
5223 /* close the read-end of the pipe */
5224 close(pipefd[0]);
5225
5226 return 0;
5227 }