git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
network: delete ovs for unprivileged networks
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "cgroup.h"
78 #include "conf.h"
79 #include "confile_utils.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxclock.h"
83 #include "lxcseccomp.h"
84 #include "namespace.h"
85 #include "network.h"
86 #include "parse.h"
87 #include "storage.h"
88 #include "storage/aufs.h"
89 #include "storage/overlay.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
148 /* Define pivot_root() if missing from the C library */
149 #ifndef HAVE_PIVOT_ROOT
/* Fallback pivot_root(2) wrapper used when the C library does not provide
 * one: issue the raw syscall if the kernel headers expose its number,
 * otherwise fail with ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
159 #else
160 extern int pivot_root(const char * new_root, const char * put_old);
161 #endif
162
163 /* Define sethostname() if missing from the C library */
164 #ifndef HAVE_SETHOSTNAME
/* Fallback sethostname(2) wrapper used when the C library does not provide
 * one: issue the raw syscall if its number is known, otherwise fail with
 * ENOSYS.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
174 #endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
193 #ifndef HAVE_MEMFD_CREATE
/* Fallback memfd_create(2) wrapper used when the C library does not provide
 * one. If the kernel headers do not define the syscall number, a
 * per-architecture value is supplied below; architectures without a known
 * number get ENOSYS.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#if !defined(__NR_memfd_create)
	#if defined(__i386__)
		#define __NR_memfd_create 356
	#elif defined(__x86_64__)
		#define __NR_memfd_create 319
	#elif defined(__arm__)
		#define __NR_memfd_create 385
	#elif defined(__aarch64__)
		#define __NR_memfd_create 279
	#elif defined(__s390__)
		#define __NR_memfd_create 350
	#elif defined(__powerpc__)
		#define __NR_memfd_create 360
	#elif defined(__sparc__)
		#define __NR_memfd_create 348
	#elif defined(__blackfin__)
		#define __NR_memfd_create 390
	#elif defined(__ia64__)
		#define __NR_memfd_create 1340
	#elif defined(_MIPS_SIM)
		/* The MIPS syscall number depends on the ABI in use. */
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#elif _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#elif _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

#if !defined(__NR_memfd_create)
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
233 #else
234 extern int memfd_create(const char *name, unsigned int flags);
235 #endif
236
/* Human-readable names of the container hooks, one entry per hook slot.
 * NOTE(review): presumably indexed by the hook enum that defines
 * NUM_LXC_HOOKS in conf.h, so the order must match it — confirm there.
 */
char *lxchook_names[NUM_LXC_HOOKS] = {"pre-start", "pre-mount", "mount",
	"autodev", "start", "stop",
	"post-stop", "clone", "destroy"};
240
241 typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
242
/* One entry of the mount option table: an option name and the MS_* flag it
 * refers to. In the mount_opt[] table, entries with clear == 1 name the
 * inverse of their flag (e.g. "rw" is paired with MS_RDONLY).
 * NOTE(review): presumably clear tells the option parser to clear rather
 * than set the flag — confirm at the use site.
 */
struct mount_opt {
	char *name;
	int clear;
	int flag;
};

/* Maps a capability name to its CAP_* number. */
struct caps_opt {
	char *name;
	int value;
};

/* Maps a resource limit name to its RLIMIT_* number. */
struct limit_opt {
	char *name;
	int value;
};
258
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * It is used by the error-reporting calls. When thread-local storage is
 * available (HAVE_TLS) each thread gets its own pointer.
 */
#ifdef HAVE_TLS
__thread struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
269
270 /* Declare this here, since we don't want to reshuffle the whole file. */
271 static int in_caplist(int cap, struct lxc_list *caps);
272
273 static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
274 static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
275 static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
276 static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
277 static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
278 static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
279
/* Callbacks that instantiate a network device, indexed by LXC_NET_* type. */
static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instantiate_veth,
	[LXC_NET_MACVLAN] = instantiate_macvlan,
	[LXC_NET_VLAN] = instantiate_vlan,
	[LXC_NET_PHYS] = instantiate_phys,
	[LXC_NET_EMPTY] = instantiate_empty,
	[LXC_NET_NONE] = instantiate_none,
};
288
289 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
290 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
291 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
292 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
293 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
294 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
295
/* Callbacks that shut a network device down, indexed by LXC_NET_* type.
 * They share the instantiate_cb signature with the setup callbacks.
 */
static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
304
/* Table of mount option names and the MS_* flag each one refers to.
 * Columns are { name, clear, flag }. Entries with clear == 1 are the
 * "positive" spellings of a negative flag (e.g. "exec" vs. MS_NOEXEC).
 * The table is terminated by an entry whose name is NULL.
 */
static struct mount_opt mount_opt[] = {
	{ "async", 1, MS_SYNCHRONOUS },
	{ "atime", 1, MS_NOATIME },
	{ "bind", 0, MS_BIND },
	{ "defaults", 0, 0 },
	{ "dev", 1, MS_NODEV },
	{ "diratime", 1, MS_NODIRATIME },
	{ "dirsync", 0, MS_DIRSYNC },
	{ "exec", 1, MS_NOEXEC },
	{ "lazytime", 0, MS_LAZYTIME },
	{ "mand", 0, MS_MANDLOCK },
	{ "noatime", 0, MS_NOATIME },
	{ "nodev", 0, MS_NODEV },
	{ "nodiratime", 0, MS_NODIRATIME },
	{ "noexec", 0, MS_NOEXEC },
	{ "nomand", 1, MS_MANDLOCK },
	{ "norelatime", 1, MS_RELATIME },
	{ "nostrictatime", 1, MS_STRICTATIME },
	{ "nosuid", 0, MS_NOSUID },
	{ "rbind", 0, MS_BIND|MS_REC },
	{ "relatime", 0, MS_RELATIME },
	{ "remount", 0, MS_REMOUNT },
	{ "ro", 0, MS_RDONLY },
	{ "rw", 1, MS_RDONLY },
	{ "strictatime", 0, MS_STRICTATIME },
	{ "suid", 1, MS_NOSUID },
	{ "sync", 0, MS_SYNCHRONOUS },
	{ NULL, 0, 0 },
};
334
/* Table mapping capability names to CAP_* numbers. Capabilities that may be
 * missing from older headers are only listed when their macro is defined.
 * Without libcap support the table is empty.
 */
#if HAVE_LIBCAP
static struct caps_opt caps_opt[] = {
	{ "chown", CAP_CHOWN },
	{ "dac_override", CAP_DAC_OVERRIDE },
	{ "dac_read_search", CAP_DAC_READ_SEARCH },
	{ "fowner", CAP_FOWNER },
	{ "fsetid", CAP_FSETID },
	{ "kill", CAP_KILL },
	{ "setgid", CAP_SETGID },
	{ "setuid", CAP_SETUID },
	{ "setpcap", CAP_SETPCAP },
	{ "linux_immutable", CAP_LINUX_IMMUTABLE },
	{ "net_bind_service", CAP_NET_BIND_SERVICE },
	{ "net_broadcast", CAP_NET_BROADCAST },
	{ "net_admin", CAP_NET_ADMIN },
	{ "net_raw", CAP_NET_RAW },
	{ "ipc_lock", CAP_IPC_LOCK },
	{ "ipc_owner", CAP_IPC_OWNER },
	{ "sys_module", CAP_SYS_MODULE },
	{ "sys_rawio", CAP_SYS_RAWIO },
	{ "sys_chroot", CAP_SYS_CHROOT },
	{ "sys_ptrace", CAP_SYS_PTRACE },
	{ "sys_pacct", CAP_SYS_PACCT },
	{ "sys_admin", CAP_SYS_ADMIN },
	{ "sys_boot", CAP_SYS_BOOT },
	{ "sys_nice", CAP_SYS_NICE },
	{ "sys_resource", CAP_SYS_RESOURCE },
	{ "sys_time", CAP_SYS_TIME },
	{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
	{ "mknod", CAP_MKNOD },
	{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_READ
	{ "audit_read", CAP_AUDIT_READ },
#endif
#ifdef CAP_AUDIT_WRITE
	{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
	{ "audit_control", CAP_AUDIT_CONTROL },
#endif
	{ "setfcap", CAP_SETFCAP },
	{ "mac_override", CAP_MAC_OVERRIDE },
	{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
	{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
	{ "wake_alarm", CAP_WAKE_ALARM },
#endif
#ifdef CAP_BLOCK_SUSPEND
	{ "block_suspend", CAP_BLOCK_SUSPEND },
#endif
};
#else
static struct caps_opt caps_opt[] = {};
#endif
391
/* Table mapping resource limit names to RLIMIT_* numbers. Each entry is
 * only compiled in when the platform headers define the matching macro.
 */
static struct limit_opt limit_opt[] = {
#ifdef RLIMIT_AS
	{ "as", RLIMIT_AS },
#endif
#ifdef RLIMIT_CORE
	{ "core", RLIMIT_CORE },
#endif
#ifdef RLIMIT_CPU
	{ "cpu", RLIMIT_CPU },
#endif
#ifdef RLIMIT_DATA
	{ "data", RLIMIT_DATA },
#endif
#ifdef RLIMIT_FSIZE
	{ "fsize", RLIMIT_FSIZE },
#endif
#ifdef RLIMIT_LOCKS
	{ "locks", RLIMIT_LOCKS },
#endif
#ifdef RLIMIT_MEMLOCK
	{ "memlock", RLIMIT_MEMLOCK },
#endif
#ifdef RLIMIT_MSGQUEUE
	{ "msgqueue", RLIMIT_MSGQUEUE },
#endif
#ifdef RLIMIT_NICE
	{ "nice", RLIMIT_NICE },
#endif
#ifdef RLIMIT_NOFILE
	{ "nofile", RLIMIT_NOFILE },
#endif
#ifdef RLIMIT_NPROC
	{ "nproc", RLIMIT_NPROC },
#endif
#ifdef RLIMIT_RSS
	{ "rss", RLIMIT_RSS },
#endif
#ifdef RLIMIT_RTPRIO
	{ "rtprio", RLIMIT_RTPRIO },
#endif
#ifdef RLIMIT_RTTIME
	{ "rttime", RLIMIT_RTTIME },
#endif
#ifdef RLIMIT_SIGPENDING
	{ "sigpending", RLIMIT_SIGPENDING },
#endif
#ifdef RLIMIT_STACK
	{ "stack", RLIMIT_STACK },
#endif
};
442
443 static int run_buffer(char *buffer)
444 {
445 struct lxc_popen_FILE *f;
446 char *output;
447 int ret;
448
449 f = lxc_popen(buffer);
450 if (!f) {
451 SYSERROR("Failed to popen() %s.", buffer);
452 return -1;
453 }
454
455 output = malloc(LXC_LOG_BUFFER_SIZE);
456 if (!output) {
457 ERROR("Failed to allocate memory for %s.", buffer);
458 lxc_pclose(f);
459 return -1;
460 }
461
462 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
463 DEBUG("Script %s with output: %s.", buffer, output);
464
465 free(output);
466
467 ret = lxc_pclose(f);
468 if (ret == -1) {
469 SYSERROR("Script exited with error.");
470 return -1;
471 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
472 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
473 return -1;
474 } else if (WIFSIGNALED(ret)) {
475 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
476 return -1;
477 }
478
479 return 0;
480 }
481
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it through run_buffer().
 *
 * @name    container name, passed as first argument to the script
 * @section config section, passed as second argument
 * @script  script (command) to run
 * @hook    hook name, passed as third argument
 * @lxcpath unused here; kept for the callers' signature
 * @argsin  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 *
 * The command line is built on the heap: its length is controlled by the
 * caller-supplied strings, so a stack allocation could overflow the stack.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Every extra argument is emitted as " <arg>". */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	/* The "+ 1" accounts for the terminating NUL byte. */
	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Three separating spaces. */
	size += 3;

	/* snprintf() reports lengths as int. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t len = size - (size_t)ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || (size_t)rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
532
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it through run_buffer().
 *
 * @name    container name, passed as first argument to the script
 * @section config section, passed as second argument
 * @script  script (command) to run
 * @...     extra arguments as char pointers; the list MUST be terminated
 *          by a NULL pointer
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 *
 * The command line is built on the heap: its length is controlled by the
 * caller-supplied strings, so a stack allocation could overflow the stack.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: each one is emitted as " <arg>". */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* snprintf() reports lengths as int. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - (size_t)ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || (size_t)rc >= len) {
			ERROR("Script args too long.");
			/* Every va_start() needs its va_end(), also on error. */
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
584
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs read-only on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, the path
 * cannot be resolved, or it is not a directory).
 * Returns an open fd (>= 0) if we pinned it; the caller owns the fd.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Reject both an output error (negative) and truncation. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR | S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the file alive; the name is no longer needed. */
	(void)unlink(absrootfspin);
	return fd;
}
627
/*
 * When a remount is requested, make sure the restrictive flags already in
 * effect on the mount (nosuid, nodev, read-only, noexec) are carried over.
 *
 * @s     mount source; used for the statvfs() lookup when not NULL
 * @d     mount destination; used for the lookup when @s is NULL
 * @flags requested mount flags
 *
 * Returns @flags unchanged if MS_REMOUNT is not set, if neither path is
 * given, if statvfs() fails, or if statvfs support is not compiled in.
 * Otherwise returns @flags with the restrictive flags found on the existing
 * mount added.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	struct statvfs vfs;
	const char *target;
	unsigned long extra = 0;
	size_t idx;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	target = s ? s : d;
	if (target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	for (idx = 0; idx < sizeof(sticky) / sizeof(sticky[0]); idx++) {
		if (vfs.f_flag & sticky[idx])
			extra |= sticky[idx];
	}

	return flags | extra;
#else
	return flags;
#endif
}
664
665 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
666 {
667 int r;
668 int i;
669 static struct {
670 int match_mask;
671 int match_flag;
672 const char *source;
673 const char *destination;
674 const char *fstype;
675 unsigned long flags;
676 const char *options;
677 } default_mounts[] = {
678 /* Read-only bind-mounting... In older kernels, doing that required
679 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
680 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
681 * kernel 2.6.26 onwards. However, this apparently does not work on
682 * kernel 3.8. Unfortunately, on that very same kernel, doing the
683 * same trick as above doesn't seem to work either, there one needs
684 * to ALSO specify MS_BIND for the remount, otherwise the entire
685 * fs is remounted read-only or the mount fails because it's busy...
686 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
687 * 2.6.32...
688 */
689 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
690 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
705 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
706 { 0, 0, NULL, NULL, NULL, 0, NULL }
707 };
708
709 for (i = 0; default_mounts[i].match_mask; i++) {
710 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
711 char *source = NULL;
712 char *destination = NULL;
713 int saved_errno;
714 unsigned long mflags;
715
716 if (default_mounts[i].source) {
717 /* will act like strdup if %r is not present */
718 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
719 if (!source) {
720 SYSERROR("memory allocation error");
721 return -1;
722 }
723 }
724 if (!default_mounts[i].destination) {
725 ERROR("BUG: auto mounts destination %d was NULL", i);
726 free(source);
727 return -1;
728 }
729 /* will act like strdup if %r is not present */
730 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
731 if (!destination) {
732 saved_errno = errno;
733 SYSERROR("memory allocation error");
734 free(source);
735 errno = saved_errno;
736 return -1;
737 }
738 mflags = add_required_remount_flags(source, destination,
739 default_mounts[i].flags);
740 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
741 saved_errno = errno;
742 if (r < 0 && errno == ENOENT) {
743 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
744 r = 0;
745 }
746 else if (r < 0)
747 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
748
749 free(source);
750 free(destination);
751 if (r < 0) {
752 errno = saved_errno;
753 return -1;
754 }
755 }
756 }
757
758 if (flags & LXC_AUTO_CGROUP_MASK) {
759 int cg_flags;
760
761 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
771 int has_sys_admin = 0;
772
773 if (!lxc_list_empty(&conf->keepcaps))
774 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
775 else
776 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
777
778 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
779 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
780 else
781 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
782 }
783
784 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
785 SYSERROR("error mounting /sys/fs/cgroup");
786 return -1;
787 }
788 }
789
790 return 0;
791 }
792
/*
 * Set the hostname to utsname->nodename.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	int ret;

	if (utsname == NULL)
		return 0;

	ret = sethostname(utsname->nodename, strlen(utsname->nodename));
	if (ret != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);
	return 0;
}
807
/* Describes one symlink to create under the container's /dev:
 * @oldpath is the link target, @name the link name relative to /dev.
 */
struct dev_symlinks {
	const char *oldpath;
	const char *name;
};

/* Standard /dev symlinks pointing into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{"/proc/self/fd", "fd"},
	{"/proc/self/fd/0", "stdin"},
	{"/proc/self/fd/1", "stdout"},
	{"/proc/self/fd/2", "stderr"},
};
819
820 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
821 {
822 char path[MAXPATHLEN];
823 int ret,i;
824 struct stat s;
825
826
827 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
828 const struct dev_symlinks *d = &dev_symlinks[i];
829 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
830 if (ret < 0 || ret >= MAXPATHLEN)
831 return -1;
832
833 /*
834 * Stat the path first. If we don't get an error
835 * accept it as is and don't try to create it
836 */
837 if (!stat(path, &s)) {
838 continue;
839 }
840
841 ret = symlink(d->oldpath, path);
842
843 if (ret && errno != EEXIST) {
844 if ( errno == EROFS ) {
845 WARN("Warning: Read Only file system while creating %s", path);
846 } else {
847 SYSERROR("Error creating %s", path);
848 return -1;
849 }
850 }
851 }
852 return 0;
853 }
854
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * @pp   address of the list string. If *pp is NULL a new string
 *       "container_ttys=<name>" is allocated; otherwise " <name>" is
 *       appended to the existing string, which is reallocated.
 * @name pty name to add
 *
 * Returns true on success, false if memory allocation fails (an existing
 * list is left untouched in that case). The caller owns *pp.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefixlen = sizeof(prefix) - 1;
	const size_t namelen = strlen(name);
	size_t oldlen;
	char *list;

	if (*pp == NULL) {
		/* First entry: "<prefix><name>" plus the NUL byte. */
		list = malloc(namelen + prefixlen + 1);
		*pp = list;
		if (list == NULL)
			return false;
		memcpy(list, prefix, prefixlen);
		memcpy(list + prefixlen, name, namelen + 1);
		return true;
	}

	/* Later entries: room for the separator, the name and the NUL byte. */
	oldlen = strlen(*pp);
	list = realloc(*pp, oldlen + namelen + 2);
	if (list == NULL)
		return false;

	list[oldlen] = ' ';
	memcpy(list + oldlen + 1, name, namelen + 1);
	*pp = list;
	return true;
}
877
878 static int lxc_setup_tty(struct lxc_conf *conf)
879 {
880 int i, ret;
881 const struct lxc_tty_info *tty_info = &conf->tty_info;
882 char *ttydir = conf->ttydir;
883 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
884
885 if (!conf->rootfs.path)
886 return 0;
887
888 for (i = 0; i < tty_info->nbtty; i++) {
889 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
890
891 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
892 if (ret < 0 || (size_t)ret >= sizeof(path)) {
893 ERROR("pathname too long for ttys");
894 return -1;
895 }
896
897 if (ttydir) {
898 /* create dev/lxc/tty%d" */
899 ret = snprintf(lxcpath, sizeof(lxcpath),
900 "/dev/%s/tty%d", ttydir, i + 1);
901 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
902 ERROR("pathname too long for ttys");
903 return -1;
904 }
905
906 ret = creat(lxcpath, 0660);
907 if (ret < 0 && errno != EEXIST) {
908 SYSERROR("failed to create \"%s\"", lxcpath);
909 return -1;
910 }
911 if (ret >= 0)
912 close(ret);
913
914 ret = unlink(path);
915 if (ret < 0 && errno != ENOENT) {
916 SYSERROR("failed to unlink \"%s\"", path);
917 return -1;
918 }
919
920 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
921 if (ret < 0) {
922 WARN("failed to bind mount \"%s\" onto \"%s\"",
923 pty_info->name, path);
924 continue;
925 }
926 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
927 path);
928
929 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
930 ttydir, i + 1);
931 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
932 ERROR("tty pathname too long");
933 return -1;
934 }
935
936 ret = symlink(lxcpath, path);
937 if (ret < 0) {
938 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
939 path, lxcpath);
940 return -1;
941 }
942 } else {
943 /* If we populated /dev, then we need to create
944 * /dev/ttyN
945 */
946 ret = access(path, F_OK);
947 if (ret < 0) {
948 ret = creat(path, 0660);
949 if (ret < 0) {
950 SYSERROR("failed to create \"%s\"", path);
951 /* this isn't fatal, continue */
952 } else {
953 close(ret);
954 }
955 }
956
957 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
958 if (ret < 0) {
959 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
960 continue;
961 }
962
963 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
964 path);
965 }
966
967 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
968 ERROR("Error setting up container_ttys string");
969 return -1;
970 }
971 }
972
973 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
974 return 0;
975 }
976
977 static int setup_rootfs_pivot_root(const char *rootfs)
978 {
979 int oldroot = -1, newroot = -1;
980
981 oldroot = open("/", O_DIRECTORY | O_RDONLY);
982 if (oldroot < 0) {
983 SYSERROR("Error opening old-/ for fchdir");
984 return -1;
985 }
986 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
987 if (newroot < 0) {
988 SYSERROR("Error opening new-/ for fchdir");
989 goto fail;
990 }
991
992 /* change into new root fs */
993 if (fchdir(newroot)) {
994 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
995 goto fail;
996 }
997
998 /* pivot_root into our new root fs */
999 if (pivot_root(".", ".")) {
1000 SYSERROR("pivot_root syscall failed");
1001 goto fail;
1002 }
1003
1004 /*
1005 * at this point the old-root is mounted on top of our new-root
1006 * To unmounted it we must not be chdir'd into it, so escape back
1007 * to old-root
1008 */
1009 if (fchdir(oldroot) < 0) {
1010 SYSERROR("Error entering oldroot");
1011 goto fail;
1012 }
1013 if (umount2(".", MNT_DETACH) < 0) {
1014 SYSERROR("Error detaching old root");
1015 goto fail;
1016 }
1017
1018 if (fchdir(newroot) < 0) {
1019 SYSERROR("Error re-entering newroot");
1020 goto fail;
1021 }
1022
1023 close(oldroot);
1024 close(newroot);
1025
1026 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1027
1028 return 0;
1029
1030 fail:
1031 if (oldroot != -1)
1032 close(oldroot);
1033 if (newroot != -1)
1034 close(newroot);
1035 return -1;
1036 }
1037
1038 /* Just create a path for /dev under $lxcpath/$name and in rootfs If we hit an
1039 * error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs,
1042 const char *lxcpath)
1043 {
1044 int ret;
1045 size_t clen;
1046 char *path;
1047
1048 INFO("Preparing \"/dev\"");
1049
1050 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1051 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1052 path = alloca(clen);
1053
1054 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1055 if (ret < 0 || (size_t)ret >= clen)
1056 return -1;
1057
1058 if (!dir_exists(path)) {
1059 WARN("\"/dev\" directory does not exist. Proceeding without "
1060 "autodev being set up");
1061 return 0;
1062 }
1063
1064 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1065 rootfs->path ? rootfs->mount : NULL);
1066 if (ret < 0) {
1067 SYSERROR("Failed to mount tmpfs on \"%s\"", path);
1068 return -1;
1069 }
1070 INFO("Mounted tmpfs on \"%s\"", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || (size_t)ret >= clen)
1074 return -1;
1075
1076 /* If we are running on a devtmpfs mapping, dev/pts may already exist.
1077 * If not, then create it and exit if that fails...
1078 */
1079 if (!dir_exists(path)) {
1080 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1081 if (ret < 0) {
1082 SYSERROR("Failed to create directory \"%s\"", path);
1083 return -1;
1084 }
1085 }
1086
1087 INFO("Prepared \"/dev\"");
1088 return 0;
1089 }
1090
/* Describes one device node: its name below /dev, the mode passed to
 * mknod() (file type plus permission bits) and its major/minor numbers.
 */
struct lxc_devs {
	const char *name;
	mode_t mode;
	int maj;
	int min;
};

/* Character devices that lxc_fill_autodev() creates in the container's /dev. */
static const struct lxc_devs lxc_devs[] = {
	{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
	{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
};
1106
1107 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1108 {
1109 int ret;
1110 char path[MAXPATHLEN];
1111 int i;
1112 mode_t cmask;
1113
1114 ret = snprintf(path, MAXPATHLEN, "%s/dev",
1115 rootfs->path ? rootfs->mount : "");
1116 if (ret < 0 || ret >= MAXPATHLEN)
1117 return -1;
1118
1119 /* ignore, just don't try to fill in */
1120 if (!dir_exists(path))
1121 return 0;
1122
1123 INFO("Populating \"/dev\"");
1124
1125 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1126 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1127 const struct lxc_devs *d = &lxc_devs[i];
1128
1129 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s",
1130 rootfs->path ? rootfs->mount : "", d->name);
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
1133
1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1135 if (ret < 0) {
1136 FILE *pathfile;
1137 char hostpath[MAXPATHLEN];
1138
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150
1151 pathfile = fopen(path, "wb");
1152 if (!pathfile) {
1153 SYSERROR("Failed to create file \"%s\"", path);
1154 return -1;
1155 }
1156 fclose(pathfile);
1157
1158 ret = safe_mount(hostpath, path, 0, MS_BIND, NULL,
1159 rootfs->path ? rootfs->mount : NULL);
1160 if (ret < 0) {
1161 SYSERROR("Failed to bind mount \"%s\" from "
1162 "host into container",
1163 d->name);
1164 return -1;
1165 }
1166 DEBUG("Bind mounted \"%s\" onto \"%s\"", hostpath,
1167 path);
1168 } else {
1169 DEBUG("Created device node \"%s\"", path);
1170 }
1171 }
1172 umask(cmask);
1173
1174 INFO("Populated \"/dev\"");
1175 return 0;
1176 }
1177
1178 static int lxc_setup_rootfs(struct lxc_conf *conf)
1179 {
1180 int ret;
1181 struct lxc_storage *bdev;
1182 const struct lxc_rootfs *rootfs;
1183
1184 rootfs = &conf->rootfs;
1185 if (!rootfs->path) {
1186 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1187 SYSERROR("Failed to make / rslave.");
1188 return -1;
1189 }
1190 return 0;
1191 }
1192
1193 if (access(rootfs->mount, F_OK)) {
1194 SYSERROR("Failed to access to \"%s\". Check it is present.",
1195 rootfs->mount);
1196 return -1;
1197 }
1198
1199 bdev = storage_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1200 if (!bdev) {
1201 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1202 rootfs->path, rootfs->mount,
1203 rootfs->options ? rootfs->options : "(null)");
1204 return -1;
1205 }
1206
1207 ret = bdev->ops->mount(bdev);
1208 storage_put(bdev);
1209 if (ret < 0) {
1210 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1211 rootfs->path, rootfs->mount,
1212 rootfs->options ? rootfs->options : "(null)");
1213 return -1;
1214 }
1215
1216 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1217 rootfs->path, rootfs->mount,
1218 rootfs->options ? rootfs->options : "(null)");
1219
1220 return 0;
1221 }
1222
/* Switch the root to @root without pivot_root(); setup_pivot_root() calls this
 * when detect_ramfs_rootfs() reports a ramfs root.
 *
 * @root is bind mounted recursively onto "/", the tree is made recursively
 * private, every mount listed in ./proc/self/mountinfo except "/" and "/proc"
 * is lazily unmounted, ./proc is detached, and finally the process changes
 * directory and chroot()s into the new root.
 *
 * Returns 0 on success and a negative value (-errno or -1) on failure.
 */
int prepare_ramfs_root(char *root)
{
	char buf[LXC_LINELEN], *p;
	char nroot[PATH_MAX];
	FILE *f;
	int i;
	char *p2;

	/* Only checks that @root can be resolved; the resolved path stored in
	 * nroot is not used afterwards.
	 */
	if (realpath(root, nroot) == NULL)
		return -errno;

	if (chdir("/") == -1)
		return -errno;

	/*
	 * We could use MS_MOVE here, but in a user namespace this mount is
	 * locked and can't be moved.
	 */
	if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
		SYSERROR("Failed to move %s into /", root);
		return -errno;
	}

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
		SYSERROR("Failed to make . rprivate");
		return -errno;
	}

	/*
	 * The following code cleans up inherited mounts which are not
	 * required for the container.
	 *
	 * The mountinfo file does not show all mounts if some mount points
	 * have been unmounted between read operations on it. So we need to
	 * read mountinfo several times, until a pass unmounts nothing.
	 *
	 * This loop could be skipped if the container uses a user namespace,
	 * because all inherited mounts are locked there and we have to live
	 * with them.
	 */
	while (1) {
		int progress = 0;

		/* Relative path: the current directory is still the old root. */
		f = fopen("./proc/self/mountinfo", "r");
		if (!f) {
			SYSERROR("Unable to open /proc/self/mountinfo");
			return -1;
		}
		while (fgets(buf, LXC_LINELEN, f)) {
			/* Advance to the fourth space, i.e. the one in front of
			 * the fifth field, which is used as the mount point.
			 */
			for (p = buf, i=0; p && i < 4; i++)
				p = strchr(p+1, ' ');
			if (!p)
				continue;
			p2 = strchr(p+1, ' ');
			if (!p2)
				continue;

			/* Terminate the field and replace the leading space
			 * with '.', so that p is the mount point relative to
			 * the current directory ("./...").
			 */
			*p2 = '\0';
			*p = '.';

			if (strcmp(p + 1, "/") == 0)
				continue;
			/* /proc is still needed to re-read mountinfo. */
			if (strcmp(p + 1, "/proc") == 0)
				continue;

			if (umount2(p, MNT_DETACH) == 0)
				progress++;
		}
		fclose(f);
		if (!progress)
			break;
	}

	/* This can also be skipped if the container uses a user namespace. */
	umount2("./proc", MNT_DETACH);

	/* It is weird, but chdir("..") moves us into the new root. */
	if (chdir("..") == -1) {
		SYSERROR("Unable to change working directory");
		return -1;
	}

	if (chroot(".") == -1) {
		SYSERROR("Unable to chroot");
		return -1;
	}

	return 0;
}
1311
1312 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1313 {
1314 if (!rootfs->path) {
1315 DEBUG("container does not have a rootfs, so not doing pivot root");
1316 return 0;
1317 }
1318
1319 if (detect_ramfs_rootfs()) {
1320 DEBUG("detected that container is on ramfs");
1321 if (prepare_ramfs_root(rootfs->mount)) {
1322 ERROR("failed to prepare minimal ramfs root");
1323 return -1;
1324 }
1325
1326 DEBUG("prepared ramfs root for container");
1327 return 0;
1328 }
1329
1330 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1331 ERROR("failed to pivot root");
1332 return -1;
1333 }
1334
1335 DEBUG("finished pivot root");
1336 return 0;
1337 }
1338
/* Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to it.
 *
 * @num_pts is passed to the devpts mount as the "max=" option; if it is 0 no
 * devpts instance is mounted at all. /dev/ptmx is first set up as a bind mount
 * of /dev/pts/ptmx; if the bind mount fails a symlink is created instead.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	int ret;
	const char *default_devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char devpts_mntopts[256];

	if (!num_pts) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	ret = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       default_devpts_mntopts, num_pts);
	if (ret < 0 || (size_t)ret >= sizeof(devpts_mntopts))
		return -1;

	/* Unmount old devpts instance. */
	ret = access("/dev/pts/ptmx", F_OK);
	if (!ret) {
		ret = umount("/dev/pts");
		if (ret < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Create mountpoint for devpts instance. */
	ret = mkdir("/dev/pts", 0755);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount new devpts instance. */
	ret = mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts);
	if (ret < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Remove any pre-existing /dev/ptmx file. */
	ret = access("/dev/ptmx", F_OK);
	if (!ret) {
		ret = remove("/dev/ptmx");
		if (ret < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
	ret = open("/dev/ptmx", O_CREAT, 0666);
	if (ret < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(ret);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	ret = mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL);
	if (!ret) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	} else {
		/* Fallthrough and try to create a symlink. */
		ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
	}

	/* Remove the dummy /dev/ptmx file we created above. */
	ret = remove("/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
	ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
	if (ret < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1429
/* Apply the requested execution domain via personality().
 *
 * A value of -1 means "leave unchanged". When the build lacks
 * <sys/personality.h> this is a no-op.
 *
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1446
1447 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1448 const struct lxc_console *console)
1449 {
1450 char path[MAXPATHLEN];
1451 int ret, fd;
1452
1453 if (console->path && !strcmp(console->path, "none"))
1454 return 0;
1455
1456 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1457 if (ret < 0 || (size_t)ret >= sizeof(path))
1458 return -1;
1459
1460 /* When we are asked to setup a console we remove any previous
1461 * /dev/console bind-mounts.
1462 */
1463 if (file_exists(path)) {
1464 ret = lxc_unstack_mountpoint(path, false);
1465 if (ret < 0) {
1466 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1467 return -ret;
1468 } else {
1469 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1470 }
1471
1472 ret = unlink(path);
1473 if (ret < 0) {
1474 SYSERROR("error unlinking %s", path);
1475 return -errno;
1476 }
1477 }
1478
1479 /* For unprivileged containers autodev or automounts will already have
1480 * taken care of creating /dev/console.
1481 */
1482 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1483 if (fd < 0) {
1484 if (errno != EEXIST) {
1485 SYSERROR("failed to create console");
1486 return -errno;
1487 }
1488 } else {
1489 close(fd);
1490 }
1491
1492 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1493 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1494 return -errno;
1495 }
1496
1497 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1498 ERROR("failed to mount '%s' on '%s'", console->name, path);
1499 return -1;
1500 }
1501
1502 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1503 return 0;
1504 }
1505
1506 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1507 const struct lxc_console *console,
1508 char *ttydir)
1509 {
1510 int ret;
1511 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1512
1513 /* create rootfs/dev/<ttydir> directory */
1514 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1515 if (ret < 0 || (size_t)ret >= sizeof(path))
1516 return -1;
1517
1518 ret = mkdir(path, 0755);
1519 if (ret && errno != EEXIST) {
1520 SYSERROR("failed with errno %d to create %s", errno, path);
1521 return -errno;
1522 }
1523 DEBUG("Created directory for console and tty devices at \"%s\"", path);
1524
1525 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1526 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1527 return -1;
1528
1529 ret = creat(lxcpath, 0660);
1530 if (ret == -1 && errno != EEXIST) {
1531 SYSERROR("error %d creating %s", errno, lxcpath);
1532 return -errno;
1533 }
1534 if (ret >= 0)
1535 close(ret);
1536
1537 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1538 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1539 return -1;
1540
1541 /* When we are asked to setup a console we remove any previous
1542 * /dev/console bind-mounts.
1543 */
1544 if (console->path && !strcmp(console->path, "none")) {
1545 struct stat st;
1546 ret = stat(path, &st);
1547 if (ret < 0) {
1548 if (errno == ENOENT)
1549 return 0;
1550 SYSERROR("failed stat() \"%s\"", path);
1551 return -errno;
1552 }
1553
1554 /* /dev/console must be character device with major number 5 and
1555 * minor number 1. If not, give benefit of the doubt and assume
1556 * the user has mounted something else right there on purpose.
1557 */
1558 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1559 return 0;
1560
1561 /* In case the user requested a bind-mount for /dev/console and
1562 * requests a ttydir we move the mount to the
1563 * /dev/<ttydir/console.
1564 * Note, we only move the uppermost mount and clear all other
1565 * mounts underneath for safety.
1566 * If it is a character device created via mknod() we simply
1567 * rename it.
1568 */
1569 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1570 if (ret < 0) {
1571 if (errno != EINVAL) {
1572 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1573 return -errno;
1574 }
1575 /* path was not a mountpoint */
1576 ret = rename(path, lxcpath);
1577 if (ret < 0) {
1578 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1579 return -errno;
1580 }
1581 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1582 } else {
1583 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1584 }
1585
1586 /* Clear all remaining bind-mounts. */
1587 ret = lxc_unstack_mountpoint(path, false);
1588 if (ret < 0) {
1589 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1590 return -ret;
1591 } else {
1592 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1593 }
1594 } else {
1595 if (file_exists(path)) {
1596 ret = lxc_unstack_mountpoint(path, false);
1597 if (ret < 0) {
1598 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1599 return -ret;
1600 } else {
1601 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1602 }
1603 }
1604
1605 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1606 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1607 return -1;
1608 }
1609 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1610 }
1611
1612 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1613 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1614 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1615 return -1;
1616
1617 ret = unlink(path);
1618 if (ret && errno != ENOENT) {
1619 SYSERROR("error unlinking %s", path);
1620 return -errno;
1621 }
1622
1623 ret = symlink(lxcpath, path);
1624 if (ret < 0) {
1625 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1626 return -1;
1627 }
1628
1629 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1630 return 0;
1631 }
1632
1633 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1634 const struct lxc_console *console, char *ttydir)
1635 {
1636 /* We don't have a rootfs, /dev/console will be shared. */
1637 if (!rootfs->path) {
1638 DEBUG("/dev/console will be shared with the host");
1639 return 0;
1640 }
1641
1642 if (!ttydir)
1643 return lxc_setup_dev_console(rootfs, console);
1644
1645 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1646 }
1647
1648 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1649 {
1650 struct mount_opt *mo;
1651
1652 /* If opt is found in mount_opt, set or clear flags.
1653 * Otherwise append it to data. */
1654
1655 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1656 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1657 if (mo->clear)
1658 *flags &= ~mo->flag;
1659 else
1660 *flags |= mo->flag;
1661 return;
1662 }
1663 }
1664
1665 if (strlen(*data))
1666 strcat(*data, ",");
1667 strcat(*data, opt);
1668 }
1669
/* Split a comma-separated mount option string into mount flags and data.
 *
 * On return *@mntflags holds the flags recognised by parse_mntopt() and
 * *@mntdata is either NULL (no remaining options) or a malloc()ed string of
 * the remaining options which the caller must free().
 *
 * Returns 0 on success (also for a NULL @mntopts) and -1 if memory could not
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1708
/* Terminate @word in place at its first space or tab (if any). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1715
/*
 * Skip @nfields whitespace-separated fields in @src.
 *
 * Each field is followed by exactly one separator (space or tab), so
 * consecutive separators delimit empty fields. Returns a pointer to the start
 * of the following field, or to the terminating NUL byte if @src has fewer
 * separators than requested.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			return cur;

		cur++;
		skipped++;
	}

	return cur;
}
1733
/* Perform a single mount, followed by a remount where needed.
 *
 * @fsname is first mounted on @target through safe_mount() with MS_REMOUNT
 * masked out of @mountflags. If the request is a bind mount or a remount, a
 * second mount() call with MS_REMOUNT added is issued afterwards. When
 * statvfs() is available the flags reported for @fsname (nosuid, nodev,
 * read-only, noexec) are carried over into that remount; nodev is not carried
 * over when @dev is non-zero.
 *
 * With @optional set, mount failures are only logged and 0 is returned.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev,
		       const char *rootfs)
{
	int ret;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	ret = safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data,
			 rootfs);
	if (ret < 0) {
		if (optional) {
			INFO("Failed to mount \"%s\" on \"%s\" (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}

		SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
		return -1;
	}

	if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
		/* Flags that always force the remount below. */
		unsigned long rqd_flags = 0;

		DEBUG("Remounting \"%s\" on \"%s\" to respect bind or remount "
		      "options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

		if (mountflags & MS_RDONLY)
			rqd_flags |= MS_RDONLY;
#ifdef HAVE_STATVFS
		if (statvfs(fsname, &sb) == 0) {
			/* Flags of the source that the remount has to keep. */
			unsigned long required_flags = rqd_flags;

			if (sb.f_flag & MS_NOSUID)
				required_flags |= MS_NOSUID;

			if (sb.f_flag & MS_NODEV && !dev)
				required_flags |= MS_NODEV;

			if (sb.f_flag & MS_RDONLY)
				required_flags |= MS_RDONLY;

			if (sb.f_flag & MS_NOEXEC)
				required_flags |= MS_NOEXEC;

			DEBUG("Flags for \"%s\" were %lu, required extra flags "
			      "are %lu", fsname, sb.f_flag, required_flags);

			/* If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount.
			 */
			if (!(mountflags & MS_REMOUNT)) {
				if (!(required_flags & ~mountflags) &&
				    rqd_flags == 0) {
					DEBUG("Mountflags already were %lu, "
					      "skipping remount", mountflags);
					goto skipremount;
				}
			}

			mountflags |= required_flags;
		}
#endif

		ret = mount(fsname, target, fstype, mountflags | MS_REMOUNT, data);
		if (ret < 0) {
			if (optional) {
				INFO("Failed to mount \"%s\" on \"%s\" "
				     "(optional): %s", fsname, target,
				     strerror(errno));
				return 0;
			}

			SYSERROR("Failed to mount \"%s\" on \"%s\"", fsname, target);
			return -1;
		}
	}

	/* The label only exists when the goto above is compiled in. */
#ifdef HAVE_STATVFS
skipremount:
#endif
	DEBUG("Mounted \"%s\" on \"%s\" with filesystem type \"%s\"", fsname,
	      target, fstype);

	return 0;
}
1824
/* Strip the LXC-only options "create=dir", "create=file" and "optional" from
 * mntent->mnt_opts in place, so they are not handed on to mount().
 *
 * Only the first occurrence of each option is removed. An option that is
 * followed by a comma is cut out together with that comma; an option at the
 * end of the string is simply chopped off.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const lxc_only[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(lxc_only) / sizeof(lxc_only[0]); idx++) {
		char *start, *comma;

		start = strstr(mntent->mnt_opts, lxc_only[idx]);
		if (!start)
			continue;

		comma = strchr(start, ',');
		if (comma) {
			/* Shift the remainder (including the NUL byte) left. */
			memmove(start, comma + 1, strlen(comma + 1) + 1);
		} else {
			/* no more mntopts, so just chop it here */
			*start = '\0';
		}
	}
}
1848
/* Create whatever a mount entry needs before it can be mounted.
 *
 * For "overlay*" and "aufs*" filesystem types the respective *_mkdir() helper
 * is called first. Then, depending on the "create=dir" / "create=file" mount
 * options, the mount target @path is created as a directory or as an empty
 * file (including its parent directories).
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path,
				       const struct lxc_rootfs *rootfs,
				       const char *lxc_name,
				       const char *lxc_path)
{
	int fd;
	int ret = 0;
	char *pathcopy, *parent;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0)
		ret = ovl_mkdir(mntent, rootfs, lxc_name, lxc_path);
	else if (strncmp(mntent->mnt_type, "aufs", 4) == 0)
		ret = aufs_mkdir(mntent, rootfs, lxc_name, lxc_path);
	if (ret < 0)
		return -1;

	if (hasmntopt(mntent, "create=dir")) {
		ret = mkdir_p(path, 0755);
		if (ret < 0 && errno != EEXIST) {
			SYSERROR("Failed to create directory \"%s\"", path);
			return -1;
		}
	}

	/* Nothing more to do unless a file was requested and is missing. */
	if (!hasmntopt(mntent, "create=file") || access(path, F_OK) == 0)
		return 0;

	/* dirname() may modify its argument, so operate on a copy. */
	pathcopy = strdup(path);
	if (!pathcopy)
		return -1;

	parent = dirname(pathcopy);

	ret = mkdir_p(parent, 0755);
	free(pathcopy);
	if (ret < 0 && errno != EEXIST) {
		SYSERROR("Failed to create directory \"%s\"", path);
		return -1;
	}

	fd = open(path, O_CREAT, 0644);
	if (fd < 0)
		return -1;
	close(fd);

	return 0;
}
1897
1898 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1899 * without a rootfs. */
1900 static inline int mount_entry_on_generic(struct mntent *mntent,
1901 const char *path,
1902 const struct lxc_rootfs *rootfs,
1903 const char *lxc_name,
1904 const char *lxc_path)
1905 {
1906 int ret;
1907 unsigned long mntflags;
1908 char *mntdata;
1909 bool dev, optional;
1910 char *rootfs_path = NULL;
1911
1912 optional = hasmntopt(mntent, "optional") != NULL;
1913 dev = hasmntopt(mntent, "dev") != NULL;
1914
1915 if (rootfs && rootfs->path)
1916 rootfs_path = rootfs->mount;
1917
1918 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name,
1919 lxc_path);
1920 if (ret < 0) {
1921 if (optional)
1922 return 0;
1923
1924 return -1;
1925 }
1926 cull_mntent_opt(mntent);
1927
1928 ret = parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata);
1929 if (ret < 0)
1930 return -1;
1931
1932 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1933 mntdata, optional, dev, rootfs_path);
1934
1935 free(mntdata);
1936 return ret;
1937 }
1938
/* Mount an entry for a container that has no rootfs.
 *
 * For containers created without a rootfs all mounts are treated as absolute
 * paths starting at / on the host, so a leading '/' is added to relative
 * targets.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the path is too
 * long.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	int len;
	char target[MAXPATHLEN];
	const char *prefix = (mntent->mnt_dir[0] == '/') ? "" : "/";

	len = snprintf(target, sizeof(target), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target))
		return -1;

	return mount_entry_on_generic(mntent, target, NULL, NULL, NULL);
}
1956
/* Mount an entry whose target is given as an absolute path.
 *
 * The target is only accepted if it contains either
 * "<lxc.lxcpath>/<lxc_name>/rootfs" or rootfs->path; the part following that
 * prefix is then re-rooted under rootfs->mount. Targets containing neither
 * prefix are ignored with a warning and 0 is returned.
 *
 * Returns the result of mount_entry_on_generic(), 0 for ignored entries, or
 * -1 on failure.
 */
static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
					  const struct lxc_rootfs *rootfs,
					  const char *lxc_name,
					  const char *lxc_path)
{
	int offset;
	char *aux;
	const char *lxcpath;
	char path[MAXPATHLEN];
	int ret = 0;

	lxcpath = lxc_global_config_value("lxc.lxcpath");
	if (!lxcpath)
		return -1;

	/* If rootfs->path is a blockdev path, allow container fstab to use
	 * "<lxcpath>/<name>/rootfs" as the target prefix.
	 */
	ret = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
	if (ret < 0 || ret >= MAXPATHLEN)
		goto skipvarlib;

	aux = strstr(mntent->mnt_dir, path);
	if (aux) {
		offset = strlen(path);
		goto skipabs;
	}

skipvarlib:
	/* Fall back to matching against the configured rootfs path. */
	aux = strstr(mntent->mnt_dir, rootfs->path);
	if (!aux) {
		WARN("Ignoring mount point \"%s\"", mntent->mnt_dir);
		return ret;
	}
	offset = strlen(rootfs->path);

skipabs:
	/* aux + offset is the part of the target that follows the prefix. */
	ret = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
}
2000
2001 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2002 const struct lxc_rootfs *rootfs,
2003 const char *lxc_name,
2004 const char *lxc_path)
2005 {
2006 char path[MAXPATHLEN];
2007 int ret;
2008
2009 /* relative to root mount point */
2010 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2011 if (ret < 0 || ret >= sizeof(path)) {
2012 ERROR("path name too long");
2013 return -1;
2014 }
2015
2016 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2017 }
2018
2019 /* This logs a NOTICE() when a user specifies mounts that would conflict with
2020 * devices liblxc sets up automatically.
2021 */
2022 static void log_notice_on_conflict(const struct lxc_conf *conf, const char *src,
2023 const char *dest)
2024 {
2025 char *clean_mnt_fsname, *clean_mnt_dir, *tmp;
2026 bool needs_warning = false;
2027
2028 clean_mnt_fsname = lxc_deslashify(src);
2029 if (!clean_mnt_fsname)
2030 return;
2031
2032 clean_mnt_dir = lxc_deslashify(dest);
2033 if (!clean_mnt_dir) {
2034 free(clean_mnt_fsname);
2035 return;
2036 }
2037
2038 tmp = clean_mnt_dir;
2039 if (*tmp == '/')
2040 tmp++;
2041
2042 if (strncmp(src, "/dev", 4) || strncmp(tmp, "dev", 3)) {
2043 free(clean_mnt_dir);
2044 free(clean_mnt_fsname);
2045 return;
2046 }
2047
2048 if (!conf->autodev && !conf->pts && !conf->tty &&
2049 (!conf->console.path || !strcmp(conf->console.path, "none"))) {
2050 free(clean_mnt_dir);
2051 free(clean_mnt_fsname);
2052 return;
2053 }
2054
2055 if (!strcmp(tmp, "dev") && conf->autodev > 0)
2056 needs_warning = true;
2057 else if (!strcmp(tmp, "dev/pts") && (conf->autodev > 0 || conf->pts > 0))
2058 needs_warning = true;
2059 else if (!strcmp(tmp, "dev/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2060 needs_warning = true;
2061 else if (!strcmp(tmp, "dev/pts/ptmx") && (conf->autodev > 0 || conf->pts > 0))
2062 needs_warning = true;
2063 else if (!strcmp(tmp, "dev/null") && conf->autodev > 0)
2064 needs_warning = true;
2065 else if (!strcmp(tmp, "dev/zero") && conf->autodev > 0)
2066 needs_warning = true;
2067 else if (!strcmp(tmp, "dev/full") && conf->autodev > 0)
2068 needs_warning = true;
2069 else if (!strcmp(tmp, "dev/urandom") && conf->autodev > 0)
2070 needs_warning = true;
2071 else if (!strcmp(tmp, "dev/random") && conf->autodev > 0)
2072 needs_warning = true;
2073 else if (!strcmp(tmp, "dev/tty") && conf->autodev > 0)
2074 needs_warning = true;
2075 else if (!strncmp(tmp, "dev/tty", 7) && (conf->autodev > 0 || conf->tty > 0))
2076 needs_warning = true;
2077
2078 if (needs_warning)
2079 NOTICE("Requesting to mount \"%s\" on \"%s\" while requesting "
2080 "automatic device setup under \"/dev\"",
2081 clean_mnt_fsname, clean_mnt_dir);
2082
2083 free(clean_mnt_dir);
2084 free(clean_mnt_fsname);
2085 }
2086
2087 static int mount_file_entries(const struct lxc_conf *conf,
2088 const struct lxc_rootfs *rootfs, FILE *file,
2089 const char *lxc_name, const char *lxc_path)
2090 {
2091 struct mntent mntent;
2092 char buf[4096];
2093 int ret = -1;
2094
2095 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2096 log_notice_on_conflict(conf, mntent.mnt_fsname, mntent.mnt_dir);
2097
2098 if (!rootfs->path)
2099 ret = mount_entry_on_systemfs(&mntent);
2100 else if (mntent.mnt_dir[0] != '/')
2101 ret = mount_entry_on_relative_rootfs(&mntent, rootfs,
2102 lxc_name, lxc_path);
2103 else
2104 ret = mount_entry_on_absolute_rootfs(&mntent, rootfs,
2105 lxc_name, lxc_path);
2106 if (ret < 0)
2107 return -1;
2108 }
2109 ret = 0;
2110
2111 INFO("Set up mount entries");
2112 return ret;
2113 }
2114
/* Mount all entries of the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do.
 *
 * Returns 0 on success, -1 if the file cannot be opened or an entry fails.
 */
static int setup_mount(const struct lxc_conf *conf,
		       const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	int rc;
	FILE *fp;

	if (!fstab)
		return 0;

	fp = setmntent(fstab, "r");
	if (!fp) {
		SYSERROR("Failed to open \"%s\"", fstab);
		return -1;
	}

	rc = mount_file_entries(conf, rootfs, fp, lxc_name, lxc_path);
	if (rc < 0)
		ERROR("Failed to set up mount entries");

	endmntent(fp);
	return rc;
}
2138
2139 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2140 {
2141 int ret;
2142 char *mount_entry;
2143 struct lxc_list *iterator;
2144 FILE *f;
2145 int fd = -1;
2146
2147 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2148 if (fd < 0) {
2149 if (errno != ENOSYS)
2150 return NULL;
2151 f = tmpfile();
2152 TRACE("Created temporary mount file");
2153 } else {
2154 f = fdopen(fd, "r+");
2155 TRACE("Created anonymous mount file");
2156 }
2157
2158 if (!f) {
2159 SYSERROR("Could not create mount file");
2160 if (fd != -1)
2161 close(fd);
2162 return NULL;
2163 }
2164
2165 lxc_list_for_each(iterator, mount) {
2166 mount_entry = iterator->elem;
2167 ret = fprintf(f, "%s\n", mount_entry);
2168 if (ret < strlen(mount_entry))
2169 WARN("Could not write mount entry to mount file");
2170 }
2171
2172 ret = fseek(f, 0, SEEK_SET);
2173 if (ret < 0) {
2174 SYSERROR("Failed to seek mount file");
2175 fclose(f);
2176 return NULL;
2177 }
2178
2179 return f;
2180 }
2181
/* Mount all entries of the list @mount.
 *
 * The entries are written to an anonymous file first so that they can be
 * parsed like a regular fstab.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_mount_entries(const struct lxc_conf *conf,
			       const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int rc;
	FILE *entries = make_anonymous_mount_file(mount);

	if (!entries)
		return -1;

	rc = mount_file_entries(conf, rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return rc;
}
2199
2200 static int parse_cap(const char *cap)
2201 {
2202 char *ptr = NULL;
2203 size_t i;
2204 int capid = -1;
2205
2206 if (!strcmp(cap, "none"))
2207 return -2;
2208
2209 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2210
2211 if (strcmp(cap, caps_opt[i].name))
2212 continue;
2213
2214 capid = caps_opt[i].value;
2215 break;
2216 }
2217
2218 if (capid < 0) {
2219 /* try to see if it's numeric, so the user may specify
2220 * capabilities that the running kernel knows about but
2221 * we don't */
2222 errno = 0;
2223 capid = strtol(cap, &ptr, 10);
2224 if (!ptr || *ptr != '\0' || errno != 0)
2225 /* not a valid number */
2226 capid = -1;
2227 else if (capid > lxc_caps_last_cap())
2228 /* we have a number but it's not a valid
2229 * capability */
2230 capid = -1;
2231 }
2232
2233 return capid;
2234 }
2235
2236 int in_caplist(int cap, struct lxc_list *caps)
2237 {
2238 struct lxc_list *iterator;
2239 int capid;
2240
2241 lxc_list_for_each(iterator, caps) {
2242 capid = parse_cap(iterator->elem);
2243 if (capid == cap)
2244 return 1;
2245 }
2246
2247 return 0;
2248 }
2249
2250 static int setup_caps(struct lxc_list *caps)
2251 {
2252 struct lxc_list *iterator;
2253 char *drop_entry;
2254 int capid;
2255
2256 lxc_list_for_each(iterator, caps) {
2257
2258 drop_entry = iterator->elem;
2259
2260 capid = parse_cap(drop_entry);
2261
2262 if (capid < 0) {
2263 ERROR("unknown capability %s", drop_entry);
2264 return -1;
2265 }
2266
2267 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2268
2269 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2270 SYSERROR("failed to remove %s capability", drop_entry);
2271 return -1;
2272 }
2273
2274 }
2275
2276 DEBUG("capabilities have been setup");
2277
2278 return 0;
2279 }
2280
2281 static int dropcaps_except(struct lxc_list *caps)
2282 {
2283 struct lxc_list *iterator;
2284 char *keep_entry;
2285 int i, capid;
2286 int numcaps = lxc_caps_last_cap() + 1;
2287 INFO("found %d capabilities", numcaps);
2288
2289 if (numcaps <= 0 || numcaps > 200)
2290 return -1;
2291
2292 /* caplist[i] is 1 if we keep capability i */
2293 int *caplist = alloca(numcaps * sizeof(int));
2294 memset(caplist, 0, numcaps * sizeof(int));
2295
2296 lxc_list_for_each(iterator, caps) {
2297
2298 keep_entry = iterator->elem;
2299
2300 capid = parse_cap(keep_entry);
2301
2302 if (capid == -2)
2303 continue;
2304
2305 if (capid < 0) {
2306 ERROR("unknown capability %s", keep_entry);
2307 return -1;
2308 }
2309
2310 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2311
2312 caplist[capid] = 1;
2313 }
2314 for (i=0; i<numcaps; i++) {
2315 if (caplist[i])
2316 continue;
2317 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2318 SYSERROR("failed to remove capability %d", i);
2319 return -1;
2320 }
2321 }
2322
2323 DEBUG("capabilities have been setup");
2324
2325 return 0;
2326 }
2327
2328 static int setup_hw_addr(char *hwaddr, const char *ifname)
2329 {
2330 struct sockaddr sockaddr;
2331 struct ifreq ifr;
2332 int ret, fd, saved_errno;
2333
2334 ret = lxc_convert_mac(hwaddr, &sockaddr);
2335 if (ret) {
2336 ERROR("mac address '%s' conversion failed : %s",
2337 hwaddr, strerror(-ret));
2338 return -1;
2339 }
2340
2341 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2342 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2343 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2344
2345 fd = socket(AF_INET, SOCK_DGRAM, 0);
2346 if (fd < 0) {
2347 ERROR("socket failure : %s", strerror(errno));
2348 return -1;
2349 }
2350
2351 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2352 saved_errno = errno;
2353 close(fd);
2354 if (ret)
2355 ERROR("ioctl failure : %s", strerror(saved_errno));
2356
2357 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2358
2359 return ret;
2360 }
2361
2362 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2363 {
2364 struct lxc_list *iterator;
2365 struct lxc_inetdev *inetdev;
2366 int err;
2367
2368 lxc_list_for_each(iterator, ip) {
2369
2370 inetdev = iterator->elem;
2371
2372 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2373 &inetdev->bcast, inetdev->prefix);
2374 if (err) {
2375 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2376 ifindex, strerror(-err));
2377 return -1;
2378 }
2379 }
2380
2381 return 0;
2382 }
2383
2384 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2385 {
2386 struct lxc_list *iterator;
2387 struct lxc_inet6dev *inet6dev;
2388 int err;
2389
2390 lxc_list_for_each(iterator, ip) {
2391
2392 inet6dev = iterator->elem;
2393
2394 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2395 &inet6dev->mcast, &inet6dev->acast,
2396 inet6dev->prefix);
2397 if (err) {
2398 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2399 ifindex, strerror(-err));
2400 return -1;
2401 }
2402 }
2403
2404 return 0;
2405 }
2406
2407 static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
2408 {
2409 char ifname[IFNAMSIZ];
2410 int err;
2411 const char *net_type_name;
2412 char *current_ifname = ifname;
2413
2414 /* empty network namespace */
2415 if (!netdev->ifindex) {
2416 if (netdev->flags & IFF_UP) {
2417 err = lxc_netdev_up("lo");
2418 if (err) {
2419 ERROR("failed to set the loopback up : %s",
2420 strerror(-err));
2421 return -1;
2422 }
2423 }
2424
2425 if (netdev->type == LXC_NET_EMPTY)
2426 return 0;
2427
2428 if (netdev->type == LXC_NET_NONE)
2429 return 0;
2430
2431 if (netdev->type != LXC_NET_VETH) {
2432 net_type_name = lxc_net_type_to_str(netdev->type);
2433 ERROR("%s networks are not supported for containers "
2434 "not setup up by privileged users",
2435 net_type_name);
2436 return -1;
2437 }
2438
2439 netdev->ifindex = if_nametoindex(netdev->name);
2440 }
2441
2442 /* get the new ifindex in case of physical netdev */
2443 if (netdev->type == LXC_NET_PHYS) {
2444 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2445 ERROR("failed to get ifindex for %s",
2446 netdev->link);
2447 return -1;
2448 }
2449 }
2450
2451 /* retrieve the name of the interface */
2452 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2453 ERROR("no interface corresponding to index '%d'",
2454 netdev->ifindex);
2455 return -1;
2456 }
2457
2458 /* Default: let the system to choose one interface name.
2459 * When the IFLA_IFNAME attribute is passed something like "<prefix>%d"
2460 * netlink will replace the format specifier with an appropriate index.
2461 */
2462 if (!netdev->name)
2463 netdev->name = netdev->type == LXC_NET_PHYS ?
2464 netdev->link : "eth%d";
2465
2466 /* rename the interface name */
2467 if (strcmp(ifname, netdev->name) != 0) {
2468 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2469 if (err) {
2470 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2471 strerror(-err));
2472 return -1;
2473 }
2474 }
2475
2476 /* Re-read the name of the interface because its name has changed
2477 * and would be automatically allocated by the system
2478 */
2479 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2480 ERROR("no interface corresponding to index '%d'",
2481 netdev->ifindex);
2482 return -1;
2483 }
2484
2485 /* set a mac address */
2486 if (netdev->hwaddr) {
2487 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2488 ERROR("failed to setup hw address for '%s'",
2489 current_ifname);
2490 return -1;
2491 }
2492 }
2493
2494 /* setup ipv4 addresses on the interface */
2495 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2496 ERROR("failed to setup ip addresses for '%s'",
2497 ifname);
2498 return -1;
2499 }
2500
2501 /* setup ipv6 addresses on the interface */
2502 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2503 ERROR("failed to setup ipv6 addresses for '%s'",
2504 ifname);
2505 return -1;
2506 }
2507
2508 /* set the network device up */
2509 if (netdev->flags & IFF_UP) {
2510 int err;
2511
2512 err = lxc_netdev_up(current_ifname);
2513 if (err) {
2514 ERROR("failed to set '%s' up : %s", current_ifname,
2515 strerror(-err));
2516 return -1;
2517 }
2518
2519 /* the network is up, make the loopback up too */
2520 err = lxc_netdev_up("lo");
2521 if (err) {
2522 ERROR("failed to set the loopback up : %s",
2523 strerror(-err));
2524 return -1;
2525 }
2526 }
2527
2528 /* We can only set up the default routes after bringing
2529 * up the interface, sine bringing up the interface adds
2530 * the link-local routes and we can't add a default
2531 * route if the gateway is not reachable. */
2532
2533 /* setup ipv4 gateway on the interface */
2534 if (netdev->ipv4_gateway) {
2535 if (!(netdev->flags & IFF_UP)) {
2536 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2537 return -1;
2538 }
2539
2540 if (lxc_list_empty(&netdev->ipv4)) {
2541 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2542 return -1;
2543 }
2544
2545 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2546 if (err) {
2547 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2548 if (err) {
2549 ERROR("failed to add ipv4 dest for '%s': %s",
2550 ifname, strerror(-err));
2551 }
2552
2553 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2554 if (err) {
2555 ERROR("failed to setup ipv4 gateway for '%s': %s",
2556 ifname, strerror(-err));
2557 if (netdev->ipv4_gateway_auto) {
2558 char buf[INET_ADDRSTRLEN];
2559 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2560 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2561 }
2562 return -1;
2563 }
2564 }
2565 }
2566
2567 /* setup ipv6 gateway on the interface */
2568 if (netdev->ipv6_gateway) {
2569 if (!(netdev->flags & IFF_UP)) {
2570 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2571 return -1;
2572 }
2573
2574 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2575 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2576 return -1;
2577 }
2578
2579 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2580 if (err) {
2581 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2582 if (err) {
2583 ERROR("failed to add ipv6 dest for '%s': %s",
2584 ifname, strerror(-err));
2585 }
2586
2587 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2588 if (err) {
2589 ERROR("failed to setup ipv6 gateway for '%s': %s",
2590 ifname, strerror(-err));
2591 if (netdev->ipv6_gateway_auto) {
2592 char buf[INET6_ADDRSTRLEN];
2593 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2594 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2595 }
2596 return -1;
2597 }
2598 }
2599 }
2600
2601 DEBUG("'%s' has been setup", current_ifname);
2602
2603 return 0;
2604 }
2605
2606 static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2607 struct lxc_list *network)
2608 {
2609 struct lxc_list *iterator;
2610 struct lxc_netdev *netdev;
2611
2612 lxc_log_configured_netdevs(conf);
2613
2614 lxc_list_for_each(iterator, network) {
2615 netdev = iterator->elem;
2616
2617 /* REMOVE in LXC 3.0 */
2618 if (netdev->idx < 0) {
2619 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2620 "networks is DEPRECATED, please switch to using "
2621 "\"lxc.net.[i].* keys\"");
2622 }
2623
2624 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
2625 ERROR("failed to setup netdev");
2626 return -1;
2627 }
2628 }
2629
2630 if (!lxc_list_empty(network))
2631 INFO("network has been setup");
2632
2633 return 0;
2634 }
2635
2636 static int parse_resource(const char *res) {
2637 size_t i;
2638 int resid = -1;
2639
2640 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2641 if (strcmp(res, limit_opt[i].name) == 0)
2642 return limit_opt[i].value;
2643 }
2644
2645 /* try to see if it's numeric, so the user may specify
2646 * resources that the running kernel knows about but
2647 * we don't */
2648 if (lxc_safe_int(res, &resid) == 0)
2649 return resid;
2650 return -1;
2651 }
2652
2653 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2654 struct lxc_list *it;
2655 struct lxc_limit *lim;
2656 int resid;
2657
2658 lxc_list_for_each(it, limits) {
2659 lim = it->elem;
2660
2661 resid = parse_resource(lim->resource);
2662 if (resid < 0) {
2663 ERROR("unknown resource %s", lim->resource);
2664 return -1;
2665 }
2666
2667 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2668 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2669 return -1;
2670 }
2671 }
2672 return 0;
2673 }
2674
2675 /* try to move physical nics to the init netns */
2676 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2677 {
2678 int i, oldfd;
2679 char ifname[IFNAMSIZ];
2680
2681 if (netnsfd < 0 || conf->num_savednics == 0)
2682 return;
2683
2684 INFO("Running to reset %d nic names.", conf->num_savednics);
2685
2686 oldfd = lxc_preserve_ns(getpid(), "net");
2687 if (oldfd < 0) {
2688 SYSERROR("Failed to open monitor netns fd.");
2689 return;
2690 }
2691
2692 if (setns(netnsfd, 0) != 0) {
2693 SYSERROR("Failed to enter container netns to reset nics");
2694 close(oldfd);
2695 return;
2696 }
2697 for (i=0; i<conf->num_savednics; i++) {
2698 struct saved_nic *s = &conf->saved_nics[i];
2699 /* retrieve the name of the interface */
2700 if (!if_indextoname(s->ifindex, ifname)) {
2701 WARN("no interface corresponding to index '%d'", s->ifindex);
2702 continue;
2703 }
2704 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2705 WARN("Error moving nic name:%s back to host netns", ifname);
2706 free(s->orig_name);
2707 }
2708 conf->num_savednics = 0;
2709
2710 if (setns(oldfd, 0) != 0)
2711 SYSERROR("Failed to re-enter monitor's netns");
2712 close(oldfd);
2713 }
2714
2715 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2716
2717 struct lxc_conf *lxc_conf_init(void)
2718 {
2719 struct lxc_conf *new;
2720 int i;
2721
2722 new = malloc(sizeof(*new));
2723 if (!new) {
2724 ERROR("lxc_conf_init : %s", strerror(errno));
2725 return NULL;
2726 }
2727 memset(new, 0, sizeof(*new));
2728
2729 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2730 new->personality = -1;
2731 new->autodev = 1;
2732 new->console.log_path = NULL;
2733 new->console.log_fd = -1;
2734 new->console.path = NULL;
2735 new->console.peer = -1;
2736 new->console.peerpty.busy = -1;
2737 new->console.peerpty.master = -1;
2738 new->console.peerpty.slave = -1;
2739 new->console.master = -1;
2740 new->console.slave = -1;
2741 new->console.name[0] = '\0';
2742 new->maincmd_fd = -1;
2743 new->nbd_idx = -1;
2744 new->rootfs.mount = strdup(default_rootfs_mount);
2745 if (!new->rootfs.mount) {
2746 ERROR("lxc_conf_init : %s", strerror(errno));
2747 free(new);
2748 return NULL;
2749 }
2750 new->logfd = -1;
2751 lxc_list_init(&new->cgroup);
2752 lxc_list_init(&new->network);
2753 lxc_list_init(&new->mount_list);
2754 lxc_list_init(&new->caps);
2755 lxc_list_init(&new->keepcaps);
2756 lxc_list_init(&new->id_map);
2757 lxc_list_init(&new->includes);
2758 lxc_list_init(&new->aliens);
2759 lxc_list_init(&new->environment);
2760 lxc_list_init(&new->limits);
2761 for (i=0; i<NUM_LXC_HOOKS; i++)
2762 lxc_list_init(&new->hooks[i]);
2763 lxc_list_init(&new->groups);
2764 new->lsm_aa_profile = NULL;
2765 new->lsm_se_context = NULL;
2766 new->tmp_umount_proc = 0;
2767
2768 for (i = 0; i < LXC_NS_MAX; i++)
2769 new->inherit_ns_fd[i] = -1;
2770
2771 /* if running in a new user namespace, init and COMMAND
2772 * default to running as UID/GID 0 when using lxc-execute */
2773 new->init_uid = 0;
2774 new->init_gid = 0;
2775 memset(&new->cgroup_meta, 0, sizeof(struct lxc_cgroup));
2776
2777 return new;
2778 }
2779
2780 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2781 {
2782 char *veth1, *veth2;
2783 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2784 int bridge_index, err;
2785 unsigned int mtu = 0;
2786
2787 if (netdev->priv.veth_attr.pair) {
2788 veth1 = netdev->priv.veth_attr.pair;
2789 if (handler->conf->reboot)
2790 lxc_netdev_delete_by_name(veth1);
2791 } else {
2792 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2793 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2794 ERROR("veth1 name too long");
2795 return -1;
2796 }
2797 veth1 = lxc_mkifname(veth1buf);
2798 if (!veth1) {
2799 ERROR("failed to allocate a temporary name");
2800 return -1;
2801 }
2802 /* store away for deconf */
2803 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2804 }
2805
2806 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2807 veth2 = lxc_mkifname(veth2buf);
2808 if (!veth2) {
2809 ERROR("failed to allocate a temporary name");
2810 goto out_delete;
2811 }
2812
2813 err = lxc_veth_create(veth1, veth2);
2814 if (err) {
2815 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2816 veth2, strerror(-err));
2817 goto out_delete;
2818 }
2819
2820 /* changing the high byte of the mac address to 0xfe, the bridge interface
2821 * will always keep the host's mac address and not take the mac address
2822 * of a container */
2823 err = setup_private_host_hw_addr(veth1);
2824 if (err) {
2825 ERROR("failed to change mac address of host interface \"%s\": %s",
2826 veth1, strerror(-err));
2827 goto out_delete;
2828 }
2829
2830 netdev->ifindex = if_nametoindex(veth2);
2831 if (!netdev->ifindex) {
2832 ERROR("failed to retrieve the index for \"%s\"", veth2);
2833 goto out_delete;
2834 }
2835
2836 if (netdev->mtu) {
2837 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2838 WARN("failed to parse mtu from");
2839 else
2840 INFO("retrieved mtu %d", mtu);
2841 } else if (netdev->link) {
2842 bridge_index = if_nametoindex(netdev->link);
2843 if (bridge_index) {
2844 mtu = netdev_get_mtu(bridge_index);
2845 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2846 } else {
2847 mtu = netdev_get_mtu(netdev->ifindex);
2848 INFO("retrieved mtu %d from %s", mtu, veth2);
2849 }
2850 }
2851
2852 if (mtu) {
2853 err = lxc_netdev_set_mtu(veth1, mtu);
2854 if (!err)
2855 err = lxc_netdev_set_mtu(veth2, mtu);
2856 if (err) {
2857 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2858 "and \"%s\": %s",
2859 mtu, veth1, veth2, strerror(-err));
2860 goto out_delete;
2861 }
2862 }
2863
2864 if (netdev->link) {
2865 err = lxc_bridge_attach(netdev->link, veth1);
2866 if (err) {
2867 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2868 veth1, netdev->link, strerror(-err));
2869 goto out_delete;
2870 }
2871 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2872 }
2873
2874 err = lxc_netdev_up(veth1);
2875 if (err) {
2876 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2877 goto out_delete;
2878 }
2879
2880 if (netdev->upscript) {
2881 err = run_script(handler->name, "net", netdev->upscript, "up",
2882 "veth", veth1, (char*) NULL);
2883 if (err)
2884 goto out_delete;
2885 }
2886
2887 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2888 netdev->ifindex);
2889
2890 return 0;
2891
2892 out_delete:
2893 if (netdev->ifindex != 0)
2894 lxc_netdev_delete_by_name(veth1);
2895 if (!netdev->priv.veth_attr.pair)
2896 free(veth1);
2897 free(veth2);
2898 return -1;
2899 }
2900
2901 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2902 {
2903 char *veth1;
2904 int err;
2905
2906 if (netdev->priv.veth_attr.pair)
2907 veth1 = netdev->priv.veth_attr.pair;
2908 else
2909 veth1 = netdev->priv.veth_attr.veth1;
2910
2911 if (netdev->downscript) {
2912 err = run_script(handler->name, "net", netdev->downscript,
2913 "down", "veth", veth1, (char*) NULL);
2914 if (err)
2915 return -1;
2916 }
2917 return 0;
2918 }
2919
2920 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2921 {
2922 char peerbuf[IFNAMSIZ], *peer;
2923 int err;
2924
2925 if (!netdev->link) {
2926 ERROR("no link specified for macvlan netdev");
2927 return -1;
2928 }
2929
2930 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2931 if (err >= sizeof(peerbuf))
2932 return -1;
2933
2934 peer = lxc_mkifname(peerbuf);
2935 if (!peer) {
2936 ERROR("failed to make a temporary name");
2937 return -1;
2938 }
2939
2940 err = lxc_macvlan_create(netdev->link, peer,
2941 netdev->priv.macvlan_attr.mode);
2942 if (err) {
2943 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2944 peer, netdev->link, strerror(-err));
2945 goto out;
2946 }
2947
2948 netdev->ifindex = if_nametoindex(peer);
2949 if (!netdev->ifindex) {
2950 ERROR("failed to retrieve the index for %s", peer);
2951 goto out;
2952 }
2953
2954 if (netdev->upscript) {
2955 err = run_script(handler->name, "net", netdev->upscript, "up",
2956 "macvlan", netdev->link, (char*) NULL);
2957 if (err)
2958 goto out;
2959 }
2960
2961 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2962 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2963
2964 return 0;
2965 out:
2966 lxc_netdev_delete_by_name(peer);
2967 free(peer);
2968 return -1;
2969 }
2970
2971 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2972 {
2973 int err;
2974
2975 if (netdev->downscript) {
2976 err = run_script(handler->name, "net", netdev->downscript,
2977 "down", "macvlan", netdev->link,
2978 (char*) NULL);
2979 if (err)
2980 return -1;
2981 }
2982 return 0;
2983 }
2984
2985 /* XXX: merge with instantiate_macvlan */
2986 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2987 {
2988 char peer[IFNAMSIZ];
2989 int err;
2990 static uint16_t vlan_cntr = 0;
2991 unsigned int mtu = 0;
2992
2993 if (!netdev->link) {
2994 ERROR("no link specified for vlan netdev");
2995 return -1;
2996 }
2997
2998 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2999 if (err >= sizeof(peer)) {
3000 ERROR("peer name too long");
3001 return -1;
3002 }
3003
3004 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
3005 if (err) {
3006 ERROR("failed to create vlan interface '%s' on '%s' : %s",
3007 peer, netdev->link, strerror(-err));
3008 return -1;
3009 }
3010
3011 netdev->ifindex = if_nametoindex(peer);
3012 if (!netdev->ifindex) {
3013 ERROR("failed to retrieve the ifindex for %s", peer);
3014 lxc_netdev_delete_by_name(peer);
3015 return -1;
3016 }
3017
3018 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
3019 netdev->ifindex);
3020 if (netdev->mtu) {
3021 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3022 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3023 netdev->ifindex, netdev->name);
3024 return -1;
3025 }
3026 err = lxc_netdev_set_mtu(peer, mtu);
3027 if (err) {
3028 ERROR("failed to set mtu '%s' for %s : %s",
3029 netdev->mtu, peer, strerror(-err));
3030 lxc_netdev_delete_by_name(peer);
3031 return -1;
3032 }
3033 }
3034
3035 return 0;
3036 }
3037
/* Shutdown hook for vlan devices: nothing to do, always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3042
3043 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3044 {
3045 if (!netdev->link) {
3046 ERROR("no link specified for the physical interface");
3047 return -1;
3048 }
3049
3050 netdev->ifindex = if_nametoindex(netdev->link);
3051 if (!netdev->ifindex) {
3052 ERROR("failed to retrieve the index for %s", netdev->link);
3053 return -1;
3054 }
3055
3056 if (netdev->upscript) {
3057 int err;
3058 err = run_script(handler->name, "net", netdev->upscript,
3059 "up", "phys", netdev->link, (char*) NULL);
3060 if (err)
3061 return -1;
3062 }
3063
3064 return 0;
3065 }
3066
3067 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3068 {
3069 int err;
3070
3071 if (netdev->downscript) {
3072 err = run_script(handler->name, "net", netdev->downscript,
3073 "down", "phys", netdev->link, (char*) NULL);
3074 if (err)
3075 return -1;
3076 }
3077 return 0;
3078 }
3079
3080 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3081 {
3082 netdev->ifindex = 0;
3083 return 0;
3084 }
3085
3086 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3087 {
3088 netdev->ifindex = 0;
3089 if (netdev->upscript) {
3090 int err;
3091 err = run_script(handler->name, "net", netdev->upscript,
3092 "up", "empty", (char*) NULL);
3093 if (err)
3094 return -1;
3095 }
3096 return 0;
3097 }
3098
3099 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3100 {
3101 int err;
3102
3103 if (netdev->downscript) {
3104 err = run_script(handler->name, "net", netdev->downscript,
3105 "down", "empty", (char*) NULL);
3106 if (err)
3107 return -1;
3108 }
3109 return 0;
3110 }
3111
/* Shutdown hook for the "none" network type: nothing to do, always
 * returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3116
3117 int lxc_requests_empty_network(struct lxc_handler *handler)
3118 {
3119 struct lxc_list *network = &handler->conf->network;
3120 struct lxc_list *iterator;
3121 struct lxc_netdev *netdev;
3122 bool found_none = false, found_nic = false;
3123
3124 if (lxc_list_empty(network))
3125 return 0;
3126
3127 lxc_list_for_each(iterator, network) {
3128
3129 netdev = iterator->elem;
3130
3131 if (netdev->type == LXC_NET_NONE)
3132 found_none = true;
3133 else
3134 found_nic = true;
3135 }
3136 if (found_none && !found_nic)
3137 return 1;
3138 return 0;
3139 }
3140
3141 int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
3142 {
3143 bool am_root;
3144 struct lxc_netdev *netdev;
3145 struct lxc_list *iterator;
3146 struct lxc_list *network = &handler->conf->network;
3147
3148 /* We need to be root. */
3149 am_root = (getuid() == 0);
3150 if (!am_root)
3151 return 0;
3152
3153 lxc_list_for_each(iterator, network) {
3154 netdev = iterator->elem;
3155
3156 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3157 ERROR("invalid network configuration type '%d'",
3158 netdev->type);
3159 return -1;
3160 }
3161
3162 if (netdev_conf[netdev->type](handler, netdev)) {
3163 ERROR("failed to create netdev");
3164 return -1;
3165 }
3166
3167 }
3168
3169 return 0;
3170 }
3171
3172 bool lxc_delete_network(struct lxc_handler *handler)
3173 {
3174 int ret;
3175 struct lxc_list *iterator;
3176 struct lxc_list *network = &handler->conf->network;
3177 bool deleted_all = true;
3178
3179 lxc_list_for_each(iterator, network) {
3180 char *hostveth = NULL;
3181 struct lxc_netdev *netdev = iterator->elem;
3182
3183 /* We can only delete devices whose ifindex we have. If we don't
3184 * have the index it means that we didn't create it.
3185 */
3186 if (!netdev->ifindex)
3187 continue;
3188
3189 if (netdev->type == LXC_NET_PHYS) {
3190 ret = lxc_netdev_rename_by_index(netdev->ifindex, netdev->link);
3191 if (ret < 0)
3192 WARN("Failed to rename interface with index %d "
3193 "to its initial name \"%s\"",
3194 netdev->ifindex, netdev->link);
3195 else
3196 TRACE("Renamed interface with index %d to its "
3197 "initial name \"%s\"",
3198 netdev->ifindex, netdev->link);
3199 continue;
3200 }
3201
3202 ret = netdev_deconf[netdev->type](handler, netdev);
3203 if (ret < 0)
3204 WARN("Failed to deconfigure network device");
3205
3206 /* Recent kernels remove the virtual interfaces when the network
3207 * namespace is destroyed but in case we did not move the
3208 * interface to the network namespace, we have to destroy it.
3209 */
3210 if (!am_unpriv()) {
3211 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3212 if (-ret == ENODEV) {
3213 INFO("Interface \"%s\" with index %d already "
3214 "deleted or existing in different network "
3215 "namespace",
3216 netdev->name ? netdev->name : "(null)",
3217 netdev->ifindex);
3218 } else if (ret < 0) {
3219 deleted_all = false;
3220 WARN("Failed to remove interface \"%s\" with "
3221 "index %d: %s",
3222 netdev->name ? netdev->name : "(null)",
3223 netdev->ifindex, strerror(-ret));
3224 continue;
3225 }
3226 INFO("Removed interface \"%s\" with index %d",
3227 netdev->name ? netdev->name : "(null)",
3228 netdev->ifindex);
3229 }
3230
3231 if (netdev->type != LXC_NET_VETH)
3232 continue;
3233
3234 if (am_unpriv()) {
3235 if (is_ovs_bridge(netdev->link)) {
3236 ret = lxc_unpriv_delete_nic(handler->lxcpath,
3237 handler->name, "ovs",
3238 netdev, getpid());
3239 if (ret < 0)
3240 WARN("Failed to remove port \"%s\" "
3241 "from openvswitch bridge \"%s\"",
3242 netdev->priv.veth_attr.pair,
3243 netdev->link);
3244 }
3245
3246 continue;
3247 }
3248
3249 /* Explicitly delete host veth device to prevent lingering
3250 * devices. We had issues in LXD around this.
3251 */
3252 if (netdev->priv.veth_attr.pair)
3253 hostveth = netdev->priv.veth_attr.pair;
3254 else
3255 hostveth = netdev->priv.veth_attr.veth1;
3256 if (*hostveth == '\0')
3257 continue;
3258
3259 ret = lxc_netdev_delete_by_name(hostveth);
3260 if (ret < 0) {
3261 deleted_all = false;
3262 WARN("Failed to remove interface \"%s\" from \"%s\": %s",
3263 hostveth, netdev->link, strerror(-ret));
3264 continue;
3265 }
3266 INFO("Removed interface \"%s\" from \"%s\"", hostveth, netdev->link);
3267
3268 if (!is_ovs_bridge(netdev->link)) {
3269 netdev->priv.veth_attr.veth1[0] = '\0';
3270 continue;
3271 }
3272
3273 /* Delete the openvswitch port. */
3274 ret = lxc_ovs_delete_port(netdev->link, hostveth);
3275 if (ret < 0)
3276 WARN("Failed to remove port \"%s\" from openvswitch "
3277 "bridge \"%s\"", hostveth, netdev->link);
3278 else
3279 INFO("Removed port \"%s\" from openvswitch bridge \"%s\"",
3280 hostveth, netdev->link);
3281
3282 netdev->priv.veth_attr.veth1[0] = '\0';
3283 }
3284
3285 return deleted_all;
3286 }
3287
3288 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3289
3290 /* lxc-user-nic returns "interface_name:interface_name\n" */
3291 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3292 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3293 struct lxc_netdev *netdev, pid_t pid)
3294 {
3295 int ret;
3296 pid_t child;
3297 int bytes, pipefd[2];
3298 char *token, *saveptr = NULL;
3299 char netdev_link[IFNAMSIZ + 1];
3300 char buffer[MAX_BUFFER_SIZE] = {0};
3301
3302 if (netdev->type != LXC_NET_VETH) {
3303 ERROR("nic type %d not support for unprivileged use",
3304 netdev->type);
3305 return -1;
3306 }
3307
3308 if (pipe(pipefd) < 0) {
3309 SYSERROR("pipe failed");
3310 return -1;
3311 }
3312
3313 child = fork();
3314 if (child < 0) {
3315 SYSERROR("fork");
3316 close(pipefd[0]);
3317 close(pipefd[1]);
3318 return -1;
3319 }
3320
3321 if (child == 0) { /* child */
3322 /* Call lxc-user-nic pid type bridge. */
3323 int ret;
3324 char pidstr[LXC_NUMSTRLEN64];
3325
3326 close(pipefd[0]); /* Close the read-end of the pipe. */
3327
3328 /* Redirect stdout to write-end of the pipe. */
3329 ret = dup2(pipefd[1], STDOUT_FILENO);
3330 if (ret >= 0)
3331 ret = dup2(pipefd[1], STDERR_FILENO);
3332 close(pipefd[1]); /* Close the write-end of the pipe. */
3333 if (ret < 0) {
3334 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3335 exit(EXIT_FAILURE);
3336 }
3337
3338 if (netdev->link)
3339 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3340 else
3341 strncpy(netdev_link, "none", IFNAMSIZ);
3342
3343 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3344 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3345 exit(EXIT_FAILURE);
3346 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3347
3348 INFO("Execing lxc-user-nic create %s %s %s veth %s %s", lxcpath,
3349 lxcname, pidstr, netdev_link,
3350 netdev->name ? netdev->name : "(null)");
3351 if (netdev->name)
3352 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "create",
3353 lxcpath, lxcname, pidstr, "veth", netdev_link,
3354 netdev->name, (char *)NULL);
3355 else
3356 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "create",
3357 lxcpath, lxcname, pidstr, "veth", netdev_link,
3358 (char *)NULL);
3359 SYSERROR("Failed to exec lxc-user-nic.");
3360 exit(EXIT_FAILURE);
3361 }
3362
3363 /* close the write-end of the pipe */
3364 close(pipefd[1]);
3365
3366 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3367 if (bytes < 0) {
3368 SYSERROR("Failed to read from pipe file descriptor.");
3369 close(pipefd[0]);
3370 return -1;
3371 }
3372 buffer[bytes - 1] = '\0';
3373
3374 if (wait_for_pid(child) != 0) {
3375 ERROR("lxc-user-nic failed to configure requested network: %s",
3376 buffer[0] != '\0' ? buffer : "(null)");
3377 close(pipefd[0]);
3378 return -1;
3379 }
3380 TRACE("Received output \"%s\" from lxc-user-nic", buffer);
3381
3382 /* close the read-end of the pipe */
3383 close(pipefd[0]);
3384
3385 /* fill netdev->name field */
3386 token = strtok_r(buffer, ":", &saveptr);
3387 if (!token)
3388 return -1;
3389
3390 netdev->name = malloc(IFNAMSIZ + 1);
3391 if (!netdev->name) {
3392 SYSERROR("Failed to allocate memory.");
3393 return -1;
3394 }
3395 memset(netdev->name, 0, IFNAMSIZ + 1);
3396 strncpy(netdev->name, token, IFNAMSIZ);
3397
3398 /* fill netdev->veth_attr.pair field */
3399 token = strtok_r(NULL, ":", &saveptr);
3400 if (!token)
3401 return -1;
3402
3403 netdev->priv.veth_attr.pair = strdup(token);
3404 if (!netdev->priv.veth_attr.pair) {
3405 ERROR("Failed to allocate memory.");
3406 return -1;
3407 }
3408
3409 /* fill netdev->veth_attr.pair field */
3410 token = strtok_r(NULL, ":", &saveptr);
3411 if (!token)
3412 return -1;
3413
3414 ret = lxc_safe_int(token, &netdev->ifindex);
3415 if (ret < 0) {
3416 ERROR("Failed to parse ifindex for network device \"%s\"", netdev->name);
3417 return -1;
3418 }
3419
3420 return 0;
3421 }
3422
3423 int lxc_assign_network(const char *lxcpath, char *lxcname,
3424 struct lxc_list *network, pid_t pid)
3425 {
3426 struct lxc_list *iterator;
3427 struct lxc_netdev *netdev;
3428 char ifname[IFNAMSIZ];
3429 int am_root = (getuid() == 0);
3430 int err;
3431
3432 lxc_list_for_each(iterator, network) {
3433
3434 netdev = iterator->elem;
3435
3436 if (netdev->type == LXC_NET_VETH && !am_root) {
3437 if (netdev->mtu)
3438 INFO("mtu ignored due to insufficient privilege");
3439 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3440 return -1;
3441 /* lxc-user-nic has moved the nic to the new ns.
3442 * unpriv_assign_nic() fills in netdev->name.
3443 * netdev->ifindex will be filed in at
3444 * lxc_setup_netdev_in_child_namespaces.
3445 */
3446 continue;
3447 }
3448
3449 /* empty network namespace, nothing to move */
3450 if (!netdev->ifindex)
3451 continue;
3452
3453 /* retrieve the name of the interface */
3454 if (!if_indextoname(netdev->ifindex, ifname)) {
3455 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3456 return -1;
3457 }
3458
3459 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3460 if (err) {
3461 ERROR("failed to move '%s' to the container : %s",
3462 netdev->link, strerror(-err));
3463 return -1;
3464 }
3465
3466 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3467 }
3468
3469 return 0;
3470 }
3471
3472 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3473 size_t buf_size)
3474 {
3475 char path[MAXPATHLEN];
3476 int fd, ret;
3477
3478 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3479 idtype == ID_TYPE_UID ? 'u' : 'g');
3480 if (ret < 0 || ret >= MAXPATHLEN) {
3481 ERROR("failed to create path \"%s\"", path);
3482 return -E2BIG;
3483 }
3484
3485 fd = open(path, O_WRONLY);
3486 if (fd < 0) {
3487 SYSERROR("failed to open \"%s\"", path);
3488 return -1;
3489 }
3490
3491 errno = 0;
3492 ret = lxc_write_nointr(fd, buf, buf_size);
3493 if (ret != buf_size) {
3494 SYSERROR("failed to write %cid mapping to \"%s\"",
3495 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3496 close(fd);
3497 return -1;
3498 }
3499 close(fd);
3500
3501 return 0;
3502 }
3503
3504 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3505 *
3506 * @return 1 if functional binary was found
3507 * @return 0 if binary exists but is lacking privilege
3508 * @return -ENOENT if binary does not exist
3509 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3510 *
3511 */
3512 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3513 {
3514 char *path;
3515 int ret;
3516 struct stat st;
3517 int fret = 0;
3518
3519 if (cap != CAP_SETUID && cap != CAP_SETGID)
3520 return -EINVAL;
3521
3522 path = on_path(binary, NULL);
3523 if (!path)
3524 return -ENOENT;
3525
3526 ret = stat(path, &st);
3527 if (ret < 0) {
3528 fret = -errno;
3529 goto cleanup;
3530 }
3531
3532 /* Check if the binary is setuid. */
3533 if (st.st_mode & S_ISUID) {
3534 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3535 fret = 1;
3536 goto cleanup;
3537 }
3538
3539 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3540 /* Check if it has the CAP_SETUID capability. */
3541 if ((cap & CAP_SETUID) &&
3542 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3543 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3544 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3545 "and CAP_PERMITTED sets.", path);
3546 fret = 1;
3547 goto cleanup;
3548 }
3549
3550 /* Check if it has the CAP_SETGID capability. */
3551 if ((cap & CAP_SETGID) &&
3552 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3553 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3554 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3555 "and CAP_PERMITTED sets.", path);
3556 fret = 1;
3557 goto cleanup;
3558 }
3559 #else
3560 /* If we cannot check for file capabilities we need to give the benefit
3561 * of the doubt. Otherwise we might fail even though all the necessary
3562 * file capabilities are set.
3563 */
3564 DEBUG("Cannot check for file capabilites as full capability support is "
3565 "missing. Manual intervention needed.");
3566 fret = 1;
3567 #endif
3568
3569 cleanup:
3570 free(path);
3571 return fret;
3572 }
3573
/*
 * Run the command line passed in @args through "/bin/sh -c".
 * Only returns (with -1) when execl() itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);
	return -1;
}
3579
3580 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3581 {
3582 struct id_map *map;
3583 struct lxc_list *iterator;
3584 enum idtype type;
3585 char u_or_g;
3586 char *pos;
3587 int fill, left;
3588 char cmd_output[MAXPATHLEN];
3589 /* strlen("new@idmap") = 9
3590 * +
3591 * strlen(" ") = 1
3592 * +
3593 * LXC_NUMSTRLEN64
3594 * +
3595 * strlen(" ") = 1
3596 *
3597 * We add some additional space to make sure that we really have
3598 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3599 */
3600 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3601 int ret = 0, uidmap = 0, gidmap = 0;
3602 bool use_shadow = false, had_entry = false;
3603
3604 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3605 * ranges, then insist that root also reserve ranges in subuid. This
3606 * will protected it by preventing another user from being handed the
3607 * range by shadow.
3608 */
3609 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3610 if (uidmap == -ENOENT)
3611 WARN("newuidmap binary is missing");
3612 else if (!uidmap)
3613 WARN("newuidmap is lacking necessary privileges");
3614
3615 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3616 if (gidmap == -ENOENT)
3617 WARN("newgidmap binary is missing");
3618 else if (!gidmap)
3619 WARN("newgidmap is lacking necessary privileges");
3620
3621 if (uidmap > 0 && gidmap > 0) {
3622 DEBUG("Functional newuidmap and newgidmap binary found.");
3623 use_shadow = true;
3624 } else {
3625 /* In case unprivileged users run application containers via
3626 * execute() or a start*() there are valid cases where they may
3627 * only want to map their own {g,u}id. Let's not block them from
3628 * doing so by requiring geteuid() == 0.
3629 */
3630 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3631 "write directly with euid %d.", geteuid());
3632 }
3633
3634 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3635 type++, u_or_g = 'g') {
3636 pos = mapbuf;
3637
3638 if (use_shadow)
3639 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3640
3641 lxc_list_for_each(iterator, idmap) {
3642 /* The kernel only takes <= 4k for writes to
3643 * /proc/<nr>/[ug]id_map
3644 */
3645 map = iterator->elem;
3646 if (map->idtype != type)
3647 continue;
3648
3649 had_entry = true;
3650
3651 left = LXC_IDMAPLEN - (pos - mapbuf);
3652 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3653 use_shadow ? " " : "", map->nsid,
3654 map->hostid, map->range,
3655 use_shadow ? "" : "\n");
3656 if (fill <= 0 || fill >= left)
3657 SYSERROR("Too many {g,u}id mappings defined.");
3658
3659 pos += fill;
3660 }
3661 if (!had_entry)
3662 continue;
3663
3664 /* Try to catch the ouput of new{g,u}idmap to make debugging
3665 * easier.
3666 */
3667 if (use_shadow) {
3668 ret = run_command(cmd_output, sizeof(cmd_output),
3669 lxc_map_ids_exec_wrapper,
3670 (void *)mapbuf);
3671 if (ret < 0) {
3672 ERROR("new%cidmap failed to write mapping: %s",
3673 u_or_g, cmd_output);
3674 return -1;
3675 }
3676 } else {
3677 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3678 if (ret < 0)
3679 return -1;
3680 }
3681
3682 memset(mapbuf, 0, sizeof(mapbuf));
3683 }
3684
3685 return 0;
3686 }
3687
3688 /*
3689 * return the host uid/gid to which the container root is mapped in
3690 * *val.
3691 * Return true if id was found, false otherwise.
3692 */
3693 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3694 unsigned long *val)
3695 {
3696 struct lxc_list *it;
3697 struct id_map *map;
3698
3699 lxc_list_for_each(it, &conf->id_map) {
3700 map = it->elem;
3701 if (map->idtype != idtype)
3702 continue;
3703 if (map->nsid != 0)
3704 continue;
3705 *val = map->hostid;
3706 return true;
3707 }
3708 return false;
3709 }
3710
3711 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3712 {
3713 struct lxc_list *it;
3714 struct id_map *map;
3715 lxc_list_for_each(it, &conf->id_map) {
3716 map = it->elem;
3717 if (map->idtype != idtype)
3718 continue;
3719 if (id >= map->hostid && id < map->hostid + map->range)
3720 return (id - map->hostid) + map->nsid;
3721 }
3722 return -1;
3723 }
3724
3725 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3726 {
3727 struct lxc_list *it;
3728 struct id_map *map;
3729 unsigned int freeid = 0;
3730 again:
3731 lxc_list_for_each(it, &conf->id_map) {
3732 map = it->elem;
3733 if (map->idtype != idtype)
3734 continue;
3735 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3736 freeid = map->nsid + map->range;
3737 goto again;
3738 }
3739 }
3740 return freeid;
3741 }
3742
3743 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3744 {
3745 struct lxc_list *network = &handler->conf->network;
3746 struct lxc_list *iterator;
3747 struct lxc_netdev *netdev;
3748 int link_index;
3749
3750 lxc_list_for_each(iterator, network) {
3751 netdev = iterator->elem;
3752
3753 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3754 continue;
3755
3756 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3757 ERROR("gateway = auto only supported for "
3758 "veth and macvlan");
3759 return -1;
3760 }
3761
3762 if (!netdev->link) {
3763 ERROR("gateway = auto needs a link interface");
3764 return -1;
3765 }
3766
3767 link_index = if_nametoindex(netdev->link);
3768 if (!link_index)
3769 return -EINVAL;
3770
3771 if (netdev->ipv4_gateway_auto) {
3772 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3773 ERROR("failed to automatically find ipv4 gateway "
3774 "address from link interface '%s'", netdev->link);
3775 return -1;
3776 }
3777 }
3778
3779 if (netdev->ipv6_gateway_auto) {
3780 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3781 ERROR("failed to automatically find ipv6 gateway "
3782 "address from link interface '%s'", netdev->link);
3783 return -1;
3784 }
3785 }
3786 }
3787
3788 return 0;
3789 }
3790
3791 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3792 {
3793 struct lxc_tty_info *tty_info = &conf->tty_info;
3794 int i, ret;
3795
3796 /* no tty in the configuration */
3797 if (!conf->tty)
3798 return 0;
3799
3800 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3801 if (!tty_info->pty_info) {
3802 SYSERROR("failed to allocate struct *pty_info");
3803 return -ENOMEM;
3804 }
3805
3806 for (i = 0; i < conf->tty; i++) {
3807 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3808
3809 process_lock();
3810 ret = openpty(&pty_info->master, &pty_info->slave,
3811 pty_info->name, NULL, NULL);
3812 process_unlock();
3813 if (ret) {
3814 SYSERROR("failed to create pty device number %d", i);
3815 tty_info->nbtty = i;
3816 lxc_delete_tty(tty_info);
3817 return -ENOTTY;
3818 }
3819
3820 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3821 pty_info->name, pty_info->master, pty_info->slave);
3822
3823 /* Prevent leaking the file descriptors to the container */
3824 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3825 if (ret < 0)
3826 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3827 "pty device \"%s\": %s",
3828 pty_info->master, pty_info->name, strerror(errno));
3829
3830 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3831 if (ret < 0)
3832 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3833 "pty device \"%s\": %s",
3834 pty_info->slave, pty_info->name, strerror(errno));
3835
3836 pty_info->busy = 0;
3837 }
3838
3839 tty_info->nbtty = conf->tty;
3840
3841 INFO("finished allocating %d pts devices", conf->tty);
3842 return 0;
3843 }
3844
3845 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3846 {
3847 int i;
3848
3849 for (i = 0; i < tty_info->nbtty; i++) {
3850 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3851
3852 close(pty_info->master);
3853 close(pty_info->slave);
3854 }
3855
3856 free(tty_info->pty_info);
3857 tty_info->pty_info = NULL;
3858 tty_info->nbtty = 0;
3859 }
3860
3861
/*
 * Execute lxc-usernsexec with the NULL-terminated argument vector passed in
 * @args. Only returns (with -1) when execvp() fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3867
3868 /*
3869 * chown_mapped_root: for an unprivileged user with uid/gid X to
3870 * chown a dir to subuid/subgid Y, he needs to run chown as root
3871 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3872 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3873 * root is privileged with respect to hostuid/hostgid X, allowing
3874 * him to do the chown.
3875 */
3876 int chown_mapped_root(char *path, struct lxc_conf *conf)
3877 {
3878 uid_t rootuid, rootgid;
3879 unsigned long val;
3880 int hostuid, hostgid, ret;
3881 struct stat sb;
3882 char map1[100], map2[100], map3[100], map4[100], map5[100];
3883 char ugid[100];
3884 char *args1[] = {"lxc-usernsexec",
3885 "-m", map1,
3886 "-m", map2,
3887 "-m", map3,
3888 "-m", map5,
3889 "--", "chown", ugid, path,
3890 NULL};
3891 char *args2[] = {"lxc-usernsexec",
3892 "-m", map1,
3893 "-m", map2,
3894 "-m", map3,
3895 "-m", map4,
3896 "-m", map5,
3897 "--", "chown", ugid, path,
3898 NULL};
3899 char cmd_output[MAXPATHLEN];
3900
3901 hostuid = geteuid();
3902 hostgid = getegid();
3903
3904 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3905 ERROR("No uid mapping for container root");
3906 return -1;
3907 }
3908 rootuid = (uid_t)val;
3909 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3910 ERROR("No gid mapping for container root");
3911 return -1;
3912 }
3913 rootgid = (gid_t)val;
3914
3915 if (hostuid == 0) {
3916 if (chown(path, rootuid, rootgid) < 0) {
3917 ERROR("Error chowning %s", path);
3918 return -1;
3919 }
3920 return 0;
3921 }
3922
3923 if (rootuid == hostuid) {
3924 /* nothing to do */
3925 INFO("Container root is our uid; no need to chown");
3926 return 0;
3927 }
3928
3929 /* save the current gid of "path" */
3930 if (stat(path, &sb) < 0) {
3931 ERROR("Error stat %s", path);
3932 return -1;
3933 }
3934
3935 /* Update the path argument in case this was overlayfs. */
3936 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3937 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3938
3939 /*
3940 * A file has to be group-owned by a gid mapped into the
3941 * container, or the container won't be privileged over it.
3942 */
3943 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3944 if (sb.st_uid == hostuid &&
3945 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3946 chown(path, -1, hostgid) < 0) {
3947 ERROR("Failed chgrping %s", path);
3948 return -1;
3949 }
3950
3951 /* "u:0:rootuid:1" */
3952 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3953 if (ret < 0 || ret >= 100) {
3954 ERROR("Error uid printing map string");
3955 return -1;
3956 }
3957
3958 /* "u:hostuid:hostuid:1" */
3959 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3960 if (ret < 0 || ret >= 100) {
3961 ERROR("Error uid printing map string");
3962 return -1;
3963 }
3964
3965 /* "g:0:rootgid:1" */
3966 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3967 if (ret < 0 || ret >= 100) {
3968 ERROR("Error gid printing map string");
3969 return -1;
3970 }
3971
3972 /* "g:pathgid:rootgid+pathgid:1" */
3973 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3974 rootgid + (gid_t)sb.st_gid);
3975 if (ret < 0 || ret >= 100) {
3976 ERROR("Error gid printing map string");
3977 return -1;
3978 }
3979
3980 /* "g:hostgid:hostgid:1" */
3981 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3982 if (ret < 0 || ret >= 100) {
3983 ERROR("Error gid printing map string");
3984 return -1;
3985 }
3986
3987 /* "0:pathgid" (chown) */
3988 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3989 if (ret < 0 || ret >= 100) {
3990 ERROR("Error owner printing format string for chown");
3991 return -1;
3992 }
3993
3994 if (hostgid == sb.st_gid)
3995 ret = run_command(cmd_output, sizeof(cmd_output),
3996 chown_mapped_root_exec_wrapper,
3997 (void *)args1);
3998 else
3999 ret = run_command(cmd_output, sizeof(cmd_output),
4000 chown_mapped_root_exec_wrapper,
4001 (void *)args2);
4002 if (ret < 0)
4003 ERROR("lxc-usernsexec failed: %s", cmd_output);
4004
4005 return ret;
4006 }
4007
4008 int lxc_ttys_shift_ids(struct lxc_conf *c)
4009 {
4010 if (lxc_list_empty(&c->id_map))
4011 return 0;
4012
4013 if (!strcmp(c->console.name, ""))
4014 return 0;
4015
4016 if (chown_mapped_root(c->console.name, c) < 0) {
4017 ERROR("failed to chown console \"%s\"", c->console.name);
4018 return -1;
4019 }
4020
4021 TRACE("chowned console \"%s\"", c->console.name);
4022
4023 return 0;
4024 }
4025
4026 /* NOTE: Must not be called from inside the container namespace! */
4027 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
4028 {
4029 int mounted;
4030
4031 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
4032 if (mounted == -1) {
4033 SYSERROR("failed to mount /proc in the container");
4034 /* continue only if there is no rootfs */
4035 if (conf->rootfs.path)
4036 return -1;
4037 } else if (mounted == 1) {
4038 conf->tmp_umount_proc = 1;
4039 }
4040
4041 return 0;
4042 }
4043
4044 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
4045 {
4046 if (lxc_conf->tmp_umount_proc == 1) {
4047 umount("/proc");
4048 lxc_conf->tmp_umount_proc = 0;
4049 }
4050 }
4051
/*
 * Walk /proc/self/mountinfo and remount with MS_SLAVE every mount whose
 * options field contains "shared". Failures are logged and do not abort the
 * walk or container startup.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *mntpoint, *propagation;

		mntpoint = get_field(line, 4);
		if (!mntpoint)
			continue;

		/* The options field is looked up relative to the mount point. */
		propagation = get_field(mntpoint, 2);
		if (!propagation)
			continue;

		null_endofword(propagation);
		if (strstr(propagation, "shared")) {
			null_endofword(mntpoint);
			if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL)) {
				SYSERROR("Failed to make %s rslave", mntpoint);
				ERROR("Continuing...");
			}
		}
	}

	free(line);
	fclose(f);
}
4085
4086 void lxc_execute_bind_init(struct lxc_conf *conf)
4087 {
4088 int ret;
4089 char path[PATH_MAX], destpath[PATH_MAX], *p;
4090
4091 /* If init exists in the container, don't bind mount a static one */
4092 p = choose_init(conf->rootfs.mount);
4093 if (p) {
4094 free(p);
4095 return;
4096 }
4097
4098 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4099 if (ret < 0 || ret >= PATH_MAX) {
4100 WARN("Path name too long searching for lxc.init.static");
4101 return;
4102 }
4103
4104 if (!file_exists(path)) {
4105 INFO("%s does not exist on host", path);
4106 return;
4107 }
4108
4109 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4110 if (ret < 0 || ret >= PATH_MAX) {
4111 WARN("Path name too long for container's lxc.init.static");
4112 return;
4113 }
4114
4115 if (!file_exists(destpath)) {
4116 FILE * pathfile = fopen(destpath, "wb");
4117 if (!pathfile) {
4118 SYSERROR("Failed to create mount target '%s'", destpath);
4119 return;
4120 }
4121 fclose(pathfile);
4122 }
4123
4124 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
4125 if (ret < 0)
4126 SYSERROR("Failed to bind lxc.init.static into container");
4127 INFO("lxc.init.static bound into container at %s", path);
4128 }
4129
4130 /*
4131 * This does the work of remounting / if it is shared, calling the
4132 * container pre-mount hooks, and mounting the rootfs.
4133 */
4134 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4135 {
4136 if (conf->rootfs_setup) {
4137 /*
4138 * rootfs was set up in another namespace. bind-mount it
4139 * to give us a mount in our own ns so we can pivot_root to it
4140 */
4141 const char *path = conf->rootfs.mount;
4142 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4143 ERROR("Failed to bind-mount container / onto itself");
4144 return -1;
4145 }
4146 return 0;
4147 }
4148
4149 remount_all_slave();
4150
4151 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4152 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4153 return -1;
4154 }
4155
4156 if (lxc_setup_rootfs(conf)) {
4157 ERROR("failed to setup rootfs for '%s'", name);
4158 return -1;
4159 }
4160
4161 conf->rootfs_setup = true;
4162 return 0;
4163 }
4164
4165 static bool verify_start_hooks(struct lxc_conf *conf)
4166 {
4167 struct lxc_list *it;
4168 char path[MAXPATHLEN];
4169 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4170 char *hookname = it->elem;
4171 struct stat st;
4172 int ret;
4173
4174 ret = snprintf(path, MAXPATHLEN, "%s%s",
4175 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4176 if (ret < 0 || ret >= MAXPATHLEN)
4177 return false;
4178 ret = stat(path, &st);
4179 if (ret) {
4180 SYSERROR("Start hook %s not found in container",
4181 hookname);
4182 return false;
4183 }
4184 return true;
4185 }
4186
4187 return true;
4188 }
4189
4190 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4191 {
4192 int i;
4193 int *ttyfds;
4194 struct lxc_pty_info *pty_info;
4195 struct lxc_conf *conf = handler->conf;
4196 const struct lxc_tty_info *tty_info = &conf->tty_info;
4197 int sock = handler->ttysock[0];
4198 int ret = -1;
4199 size_t num_ttyfds = (2 * conf->tty);
4200
4201 ttyfds = malloc(num_ttyfds * sizeof(int));
4202 if (!ttyfds)
4203 return -1;
4204
4205 for (i = 0; i < num_ttyfds; i++) {
4206 pty_info = &tty_info->pty_info[i / 2];
4207 ttyfds[i++] = pty_info->slave;
4208 ttyfds[i] = pty_info->master;
4209 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4210 "parent",
4211 pty_info->name, pty_info->master, pty_info->slave);
4212 }
4213
4214 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4215 if (ret < 0)
4216 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4217 strerror(errno));
4218 else
4219 TRACE("sent %d ttys to parent", conf->tty);
4220
4221 close(handler->ttysock[0]);
4222 close(handler->ttysock[1]);
4223
4224 for (i = 0; i < num_ttyfds; i++)
4225 close(ttyfds[i]);
4226
4227 free(ttyfds);
4228
4229 return ret;
4230 }
4231
4232 int lxc_setup(struct lxc_handler *handler)
4233 {
4234 const char *name = handler->name;
4235 struct lxc_conf *lxc_conf = handler->conf;
4236 const char *lxcpath = handler->lxcpath;
4237
4238 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4239 ERROR("Error setting up rootfs mount after spawn");
4240 return -1;
4241 }
4242
4243 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4244 if (setup_utsname(lxc_conf->utsname)) {
4245 ERROR("failed to setup the utsname for '%s'", name);
4246 return -1;
4247 }
4248 }
4249
4250 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4251 &lxc_conf->network)) {
4252 ERROR("failed to setup the network for '%s'", name);
4253 return -1;
4254 }
4255
4256 if (lxc_conf->autodev > 0) {
4257 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4258 ERROR("failed to mount /dev in the container");
4259 return -1;
4260 }
4261 }
4262
4263 /* do automatic mounts (mainly /proc and /sys), but exclude
4264 * those that need to wait until other stuff has finished
4265 */
4266 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4267 ERROR("failed to setup the automatic mounts for '%s'", name);
4268 return -1;
4269 }
4270
4271 if (setup_mount(lxc_conf, &lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4272 ERROR("failed to setup the mounts for '%s'", name);
4273 return -1;
4274 }
4275
4276 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(lxc_conf, &lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4277 ERROR("failed to setup the mount entries for '%s'", name);
4278 return -1;
4279 }
4280
4281 /* Make sure any start hooks are in the container */
4282 if (!verify_start_hooks(lxc_conf))
4283 return -1;
4284
4285 if (lxc_conf->is_execute)
4286 lxc_execute_bind_init(lxc_conf);
4287
4288 /* now mount only cgroup, if wanted;
4289 * before, /sys could not have been mounted
4290 * (is either mounted automatically or via fstab entries)
4291 */
4292 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4293 ERROR("failed to setup the automatic mounts for '%s'", name);
4294 return -1;
4295 }
4296
4297 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4298 ERROR("failed to run mount hooks for container '%s'.", name);
4299 return -1;
4300 }
4301
4302 if (lxc_conf->autodev > 0) {
4303 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4304 ERROR("failed to run autodev hooks for container '%s'.", name);
4305 return -1;
4306 }
4307
4308 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4309 ERROR("failed to populate /dev in the container");
4310 return -1;
4311 }
4312 }
4313
4314 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4315 ERROR("failed to setup the console for '%s'", name);
4316 return -1;
4317 }
4318
4319 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4320 ERROR("failed to setup /dev symlinks for '%s'", name);
4321 return -1;
4322 }
4323
4324 /* mount /proc if it's not already there */
4325 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4326 ERROR("failed to LSM mount proc for '%s'", name);
4327 return -1;
4328 }
4329
4330 if (setup_pivot_root(&lxc_conf->rootfs)) {
4331 ERROR("failed to set rootfs for '%s'", name);
4332 return -1;
4333 }
4334
4335 if (lxc_setup_devpts(lxc_conf->pts)) {
4336 ERROR("failed to setup the new pts instance");
4337 return -1;
4338 }
4339
4340 if (lxc_create_tty(name, lxc_conf)) {
4341 ERROR("failed to create the ttys");
4342 return -1;
4343 }
4344
4345 if (lxc_send_ttys_to_parent(handler) < 0) {
4346 ERROR("failure sending console info to parent");
4347 return -1;
4348 }
4349
4350 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4351 ERROR("failed to setup the ttys for '%s'", name);
4352 return -1;
4353 }
4354
4355 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4356 SYSERROR("failed to set environment variable for container ptys");
4357
4358
4359 if (setup_personality(lxc_conf->personality)) {
4360 ERROR("failed to setup personality");
4361 return -1;
4362 }
4363
4364 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4365 if (!lxc_list_empty(&lxc_conf->caps)) {
4366 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4367 return -1;
4368 }
4369 if (dropcaps_except(&lxc_conf->keepcaps)) {
4370 ERROR("failed to keep requested caps");
4371 return -1;
4372 }
4373 } else if (setup_caps(&lxc_conf->caps)) {
4374 ERROR("failed to drop capabilities");
4375 return -1;
4376 }
4377
4378 NOTICE("Container \"%s\" is set up", name);
4379
4380 return 0;
4381 }
4382
4383 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4384 const char *lxcpath, char *argv[])
4385 {
4386 int which = -1;
4387 struct lxc_list *it;
4388
4389 if (strcmp(hook, "pre-start") == 0)
4390 which = LXCHOOK_PRESTART;
4391 else if (strcmp(hook, "pre-mount") == 0)
4392 which = LXCHOOK_PREMOUNT;
4393 else if (strcmp(hook, "mount") == 0)
4394 which = LXCHOOK_MOUNT;
4395 else if (strcmp(hook, "autodev") == 0)
4396 which = LXCHOOK_AUTODEV;
4397 else if (strcmp(hook, "start") == 0)
4398 which = LXCHOOK_START;
4399 else if (strcmp(hook, "stop") == 0)
4400 which = LXCHOOK_STOP;
4401 else if (strcmp(hook, "post-stop") == 0)
4402 which = LXCHOOK_POSTSTOP;
4403 else if (strcmp(hook, "clone") == 0)
4404 which = LXCHOOK_CLONE;
4405 else if (strcmp(hook, "destroy") == 0)
4406 which = LXCHOOK_DESTROY;
4407 else
4408 return -1;
4409 lxc_list_for_each(it, &conf->hooks[which]) {
4410 int ret;
4411 char *hookname = it->elem;
4412 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4413 if (ret)
4414 return ret;
4415 }
4416 return 0;
4417 }
4418
4419 int lxc_clear_config_caps(struct lxc_conf *c)
4420 {
4421 struct lxc_list *it, *next;
4422
4423 lxc_list_for_each_safe(it, &c->caps, next) {
4424 lxc_list_del(it);
4425 free(it->elem);
4426 free(it);
4427 }
4428 return 0;
4429 }
4430
4431 static int lxc_free_idmap(struct lxc_list *id_map) {
4432 struct lxc_list *it, *next;
4433
4434 lxc_list_for_each_safe(it, id_map, next) {
4435 lxc_list_del(it);
4436 free(it->elem);
4437 free(it);
4438 }
4439 return 0;
4440 }
4441
4442 int lxc_clear_idmaps(struct lxc_conf *c)
4443 {
4444 return lxc_free_idmap(&c->id_map);
4445 }
4446
4447 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4448 {
4449 struct lxc_list *it,*next;
4450
4451 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4452 lxc_list_del(it);
4453 free(it->elem);
4454 free(it);
4455 }
4456 return 0;
4457 }
4458
4459 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4460 {
4461 struct lxc_list *it,*next;
4462 bool all = false;
4463 const char *k = NULL;
4464
4465 if (strcmp(key, "lxc.cgroup") == 0)
4466 all = true;
4467 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4468 k = key + sizeof("lxc.cgroup.")-1;
4469 else
4470 return -1;
4471
4472 lxc_list_for_each_safe(it, &c->cgroup, next) {
4473 struct lxc_cgroup *cg = it->elem;
4474 if (!all && strcmp(cg->subsystem, k) != 0)
4475 continue;
4476 lxc_list_del(it);
4477 free(cg->subsystem);
4478 free(cg->value);
4479 free(cg);
4480 free(it);
4481 }
4482 return 0;
4483 }
4484
4485 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4486 {
4487 struct lxc_list *it, *next;
4488 bool all = false;
4489 const char *k = NULL;
4490
4491 if (strcmp(key, "lxc.limit") == 0
4492 || strcmp(key, "lxc.prlimit"))
4493 all = true;
4494 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4495 k = key + sizeof("lxc.limit.")-1;
4496 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4497 k = key + sizeof("lxc.prlimit.")-1;
4498 else
4499 return -1;
4500
4501 lxc_list_for_each_safe(it, &c->limits, next) {
4502 struct lxc_limit *lim = it->elem;
4503 if (!all && strcmp(lim->resource, k) != 0)
4504 continue;
4505 lxc_list_del(it);
4506 free(lim->resource);
4507 free(lim);
4508 free(it);
4509 }
4510 return 0;
4511 }
4512
4513 int lxc_clear_groups(struct lxc_conf *c)
4514 {
4515 struct lxc_list *it,*next;
4516
4517 lxc_list_for_each_safe(it, &c->groups, next) {
4518 lxc_list_del(it);
4519 free(it->elem);
4520 free(it);
4521 }
4522 return 0;
4523 }
4524
4525 int lxc_clear_environment(struct lxc_conf *c)
4526 {
4527 struct lxc_list *it,*next;
4528
4529 lxc_list_for_each_safe(it, &c->environment, next) {
4530 lxc_list_del(it);
4531 free(it->elem);
4532 free(it);
4533 }
4534 return 0;
4535 }
4536
4537
4538 int lxc_clear_mount_entries(struct lxc_conf *c)
4539 {
4540 struct lxc_list *it,*next;
4541
4542 lxc_list_for_each_safe(it, &c->mount_list, next) {
4543 lxc_list_del(it);
4544 free(it->elem);
4545 free(it);
4546 }
4547 return 0;
4548 }
4549
4550 int lxc_clear_automounts(struct lxc_conf *c)
4551 {
4552 c->auto_mounts = 0;
4553 return 0;
4554 }
4555
4556 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4557 {
4558 struct lxc_list *it,*next;
4559 bool all = false, done = false;
4560 const char *k = NULL;
4561 int i;
4562
4563 if (strcmp(key, "lxc.hook") == 0)
4564 all = true;
4565 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4566 k = key + sizeof("lxc.hook.")-1;
4567 else
4568 return -1;
4569
4570 for (i=0; i<NUM_LXC_HOOKS; i++) {
4571 if (all || strcmp(k, lxchook_names[i]) == 0) {
4572 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4573 lxc_list_del(it);
4574 free(it->elem);
4575 free(it);
4576 }
4577 done = true;
4578 }
4579 }
4580
4581 if (!done) {
4582 ERROR("Invalid hook key: %s", key);
4583 return -1;
4584 }
4585 return 0;
4586 }
4587
4588 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4589 {
4590 int i;
4591
4592 if (!conf->saved_nics)
4593 return;
4594 for (i=0; i < conf->num_savednics; i++)
4595 free(conf->saved_nics[i].orig_name);
4596 free(conf->saved_nics);
4597 }
4598
4599 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4600 {
4601 struct lxc_list *it,*next;
4602
4603 lxc_list_for_each_safe(it, &conf->aliens, next) {
4604 lxc_list_del(it);
4605 free(it->elem);
4606 free(it);
4607 }
4608 }
4609
4610 void lxc_clear_includes(struct lxc_conf *conf)
4611 {
4612 struct lxc_list *it,*next;
4613
4614 lxc_list_for_each_safe(it, &conf->includes, next) {
4615 lxc_list_del(it);
4616 free(it->elem);
4617 free(it);
4618 }
4619 }
4620
4621 void lxc_conf_free(struct lxc_conf *conf)
4622 {
4623 if (!conf)
4624 return;
4625 if (current_config == conf)
4626 current_config = NULL;
4627 free(conf->console.log_path);
4628 free(conf->console.path);
4629 free(conf->rootfs.mount);
4630 free(conf->rootfs.bdev_type);
4631 free(conf->rootfs.options);
4632 free(conf->rootfs.path);
4633 free(conf->logfile);
4634 if (conf->logfd != -1)
4635 close(conf->logfd);
4636 free(conf->utsname);
4637 free(conf->ttydir);
4638 free(conf->fstab);
4639 free(conf->rcfile);
4640 free(conf->init_cmd);
4641 free(conf->unexpanded_config);
4642 free(conf->pty_names);
4643 free(conf->syslog);
4644 lxc_free_networks(&conf->network);
4645 free(conf->lsm_aa_profile);
4646 free(conf->lsm_se_context);
4647 lxc_seccomp_free(conf);
4648 lxc_clear_config_caps(conf);
4649 lxc_clear_config_keepcaps(conf);
4650 lxc_clear_cgroups(conf, "lxc.cgroup");
4651 lxc_clear_hooks(conf, "lxc.hook");
4652 lxc_clear_mount_entries(conf);
4653 lxc_clear_saved_nics(conf);
4654 lxc_clear_idmaps(conf);
4655 lxc_clear_groups(conf);
4656 lxc_clear_includes(conf);
4657 lxc_clear_aliens(conf);
4658 lxc_clear_environment(conf);
4659 lxc_clear_limits(conf, "lxc.prlimit");
4660 free(conf->cgroup_meta.dir);
4661 free(conf->cgroup_meta.controllers);
4662 free(conf);
4663 }
4664
/* Argument bundle handed to run_userns_fn() through lxc_clone(). */
struct userns_fn_data {
	/* Callback executed by the child once it has been told to proceed. */
	int (*fn)(void *);
	/* Name of the callback, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to fn. */
	void *arg;
	/* Synchronisation pipe: p[0] is the read end, p[1] the write end. */
	int p[2];
};
4671
4672 static int run_userns_fn(void *data)
4673 {
4674 struct userns_fn_data *d = data;
4675 char c;
4676
4677 /* Close write end of the pipe. */
4678 close(d->p[1]);
4679
4680 /* Wait for parent to finish establishing a new mapping in the user
4681 * namespace we are executing in.
4682 */
4683 if (read(d->p[0], &c, 1) != 1)
4684 return -1;
4685
4686 /* Close read end of the pipe. */
4687 close(d->p[0]);
4688
4689 if (d->fn_name)
4690 TRACE("calling function \"%s\"", d->fn_name);
4691 /* Call function to run. */
4692 return d->fn(d->arg);
4693 }
4694
4695 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4696 enum idtype idtype)
4697 {
4698 struct lxc_list *it;
4699 struct id_map *map;
4700 struct id_map *retmap = NULL;
4701
4702 lxc_list_for_each(it, &conf->id_map) {
4703 map = it->elem;
4704 if (map->idtype != idtype)
4705 continue;
4706
4707 if (id >= map->hostid && id < map->hostid + map->range) {
4708 retmap = map;
4709 break;
4710 }
4711 }
4712
4713 if (!retmap)
4714 return NULL;
4715
4716 retmap = malloc(sizeof(*retmap));
4717 if (!retmap)
4718 return NULL;
4719
4720 memcpy(retmap, map, sizeof(*retmap));
4721 return retmap;
4722 }
4723
4724 /*
4725 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4726 * existing one or establish a new one.
4727 */
4728 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4729 {
4730 int hostid_mapped;
4731 struct id_map *entry = NULL;
4732
4733 /* Reuse existing mapping. */
4734 entry = mapped_hostid_entry(conf, id, type);
4735 if (entry)
4736 return entry;
4737
4738 /* Find new mapping. */
4739 hostid_mapped = find_unmapped_nsid(conf, type);
4740 if (hostid_mapped < 0) {
4741 DEBUG("failed to find free mapping for id %d", id);
4742 return NULL;
4743 }
4744
4745 entry = malloc(sizeof(*entry));
4746 if (!entry)
4747 return NULL;
4748
4749 entry->idtype = type;
4750 entry->nsid = hostid_mapped;
4751 entry->hostid = (unsigned long)id;
4752 entry->range = 1;
4753
4754 return entry;
4755 }
4756
4757 /* Run a function in a new user namespace.
4758 * The caller's euid/egid will be mapped if it is not already.
4759 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4760 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4761 * This means we require only to establish a mapping from:
4762 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4763 * - the container root -> some sub{g,u}id
4764 * The former we add, if the user did not specifiy a mapping. The latter we
4765 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4766 * there to start the container in the first place.
4767 */
4768 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4769 const char *fn_name)
4770 {
4771 pid_t pid;
4772 uid_t euid, egid;
4773 struct userns_fn_data d;
4774 int p[2];
4775 struct lxc_list *it;
4776 struct id_map *map;
4777 char c = '1';
4778 int ret = -1;
4779 struct lxc_list *idmap = NULL, *tmplist = NULL;
4780 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4781 *host_uid_map = NULL, *host_gid_map = NULL;
4782
4783 ret = pipe(p);
4784 if (ret < 0) {
4785 SYSERROR("opening pipe");
4786 return -1;
4787 }
4788 d.fn = fn;
4789 d.fn_name = fn_name;
4790 d.arg = data;
4791 d.p[0] = p[0];
4792 d.p[1] = p[1];
4793
4794 /* Clone child in new user namespace. */
4795 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4796 if (pid < 0) {
4797 ERROR("failed to clone child process in new user namespace");
4798 goto on_error;
4799 }
4800
4801 close(p[0]);
4802 p[0] = -1;
4803
4804 /* Find container root. */
4805 lxc_list_for_each(it, &conf->id_map) {
4806 map = it->elem;
4807
4808 if (map->nsid != 0)
4809 continue;
4810
4811 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4812 container_root_uid = malloc(sizeof(*container_root_uid));
4813 if (!container_root_uid)
4814 goto on_error;
4815 container_root_uid->idtype = map->idtype;
4816 container_root_uid->hostid = map->hostid;
4817 container_root_uid->nsid = 0;
4818 container_root_uid->range = map->range;
4819 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4820 container_root_gid = malloc(sizeof(*container_root_gid));
4821 if (!container_root_gid)
4822 goto on_error;
4823 container_root_gid->idtype = map->idtype;
4824 container_root_gid->hostid = map->hostid;
4825 container_root_gid->nsid = 0;
4826 container_root_gid->range = map->range;
4827 }
4828
4829 /* Found container root. */
4830 if (container_root_uid && container_root_gid)
4831 break;
4832 }
4833
4834 /* This is actually checked earlier but it can't hurt. */
4835 if (!container_root_uid || !container_root_gid) {
4836 ERROR("no mapping for container root found");
4837 goto on_error;
4838 }
4839
4840 host_uid_map = container_root_uid;
4841 host_gid_map = container_root_gid;
4842
4843 /* Check whether the {g,u}id of the user has a mapping. */
4844 euid = geteuid();
4845 egid = getegid();
4846 if (euid != container_root_uid->hostid)
4847 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4848
4849 if (egid != container_root_gid->hostid)
4850 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4851
4852 if (!host_uid_map) {
4853 DEBUG("failed to find mapping for uid %d", euid);
4854 goto on_error;
4855 }
4856
4857 if (!host_gid_map) {
4858 DEBUG("failed to find mapping for gid %d", egid);
4859 goto on_error;
4860 }
4861
4862 /* Allocate new {g,u}id map list. */
4863 idmap = malloc(sizeof(*idmap));
4864 if (!idmap)
4865 goto on_error;
4866 lxc_list_init(idmap);
4867
4868 /* Add container root to the map. */
4869 tmplist = malloc(sizeof(*tmplist));
4870 if (!tmplist)
4871 goto on_error;
4872 lxc_list_add_elem(tmplist, container_root_uid);
4873 lxc_list_add_tail(idmap, tmplist);
4874
4875 if (host_uid_map && (host_uid_map != container_root_uid)) {
4876 /* idmap will now keep track of that memory. */
4877 container_root_uid = NULL;
4878
4879 /* Add container root to the map. */
4880 tmplist = malloc(sizeof(*tmplist));
4881 if (!tmplist)
4882 goto on_error;
4883 lxc_list_add_elem(tmplist, host_uid_map);
4884 lxc_list_add_tail(idmap, tmplist);
4885 }
4886 /* idmap will now keep track of that memory. */
4887 container_root_uid = NULL;
4888 /* idmap will now keep track of that memory. */
4889 host_uid_map = NULL;
4890
4891 tmplist = malloc(sizeof(*tmplist));
4892 if (!tmplist)
4893 goto on_error;
4894 lxc_list_add_elem(tmplist, container_root_gid);
4895 lxc_list_add_tail(idmap, tmplist);
4896
4897 if (host_gid_map && (host_gid_map != container_root_gid)) {
4898 /* idmap will now keep track of that memory. */
4899 container_root_gid = NULL;
4900
4901 tmplist = malloc(sizeof(*tmplist));
4902 if (!tmplist)
4903 goto on_error;
4904 lxc_list_add_elem(tmplist, host_gid_map);
4905 lxc_list_add_tail(idmap, tmplist);
4906 }
4907 /* idmap will now keep track of that memory. */
4908 container_root_gid = NULL;
4909 /* idmap will now keep track of that memory. */
4910 host_gid_map = NULL;
4911
4912 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4913 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4914 lxc_list_for_each(it, idmap) {
4915 map = it->elem;
4916 TRACE("establishing %cid mapping for \"%d\" in new "
4917 "user namespace: nsuid %lu - hostid %lu - range "
4918 "%lu",
4919 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4920 map->nsid, map->hostid, map->range);
4921 }
4922 }
4923
4924 /* Set up {g,u}id mapping for user namespace of child process. */
4925 ret = lxc_map_ids(idmap, pid);
4926 if (ret < 0) {
4927 ERROR("error setting up {g,u}id mappings for child process "
4928 "\"%d\"",
4929 pid);
4930 goto on_error;
4931 }
4932
4933 /* Tell child to proceed. */
4934 if (write(p[1], &c, 1) != 1) {
4935 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4936 goto on_error;
4937 }
4938
4939 /* Wait for child to finish. */
4940 ret = wait_for_pid(pid);
4941
4942 on_error:
4943 if (idmap)
4944 lxc_free_idmap(idmap);
4945 if (container_root_uid)
4946 free(container_root_uid);
4947 if (container_root_gid)
4948 free(container_root_gid);
4949 if (host_uid_map && (host_uid_map != container_root_uid))
4950 free(host_uid_map);
4951 if (host_gid_map && (host_gid_map != container_root_gid))
4952 free(host_gid_map);
4953
4954 if (p[0] != -1)
4955 close(p[0]);
4956 close(p[1]);
4957
4958 return ret;
4959 }
4960
/* not thread-safe, do not use from api without first forking
 *
 * Returns a strdup()ed copy of the passwd name belonging to the effective uid
 * of the caller, or NULL when getpwuid() finds no entry. The caller frees it.
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4972
/* not thread-safe, do not use from api without first forking
 *
 * Returns a strdup()ed copy of the group name belonging to the effective gid
 * of the caller, or NULL when getgrgid() finds no entry. The caller frees it.
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4984
4985 /* not thread-safe, do not use from api without first forking */
4986 void suggest_default_idmap(void)
4987 {
4988 FILE *f;
4989 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4990 char *line = NULL;
4991 char *uname, *gname;
4992 size_t len = 0;
4993
4994 if (!(uname = getuname()))
4995 return;
4996
4997 if (!(gname = getgname())) {
4998 free(uname);
4999 return;
5000 }
5001
5002 f = fopen(subuidfile, "r");
5003 if (!f) {
5004 ERROR("Your system is not configured with subuids");
5005 free(gname);
5006 free(uname);
5007 return;
5008 }
5009 while (getline(&line, &len, f) != -1) {
5010 size_t no_newline = 0;
5011 char *p = strchr(line, ':'), *p2;
5012 if (*line == '#')
5013 continue;
5014 if (!p)
5015 continue;
5016 *p = '\0';
5017 p++;
5018 if (strcmp(line, uname))
5019 continue;
5020 p2 = strchr(p, ':');
5021 if (!p2)
5022 continue;
5023 *p2 = '\0';
5024 p2++;
5025 if (!*p2)
5026 continue;
5027 no_newline = strcspn(p2, "\n");
5028 p2[no_newline] = '\0';
5029
5030 if (lxc_safe_uint(p, &uid) < 0)
5031 WARN("Could not parse UID.");
5032 if (lxc_safe_uint(p2, &urange) < 0)
5033 WARN("Could not parse UID range.");
5034 }
5035 fclose(f);
5036
5037 f = fopen(subgidfile, "r");
5038 if (!f) {
5039 ERROR("Your system is not configured with subgids");
5040 free(gname);
5041 free(uname);
5042 return;
5043 }
5044 while (getline(&line, &len, f) != -1) {
5045 size_t no_newline = 0;
5046 char *p = strchr(line, ':'), *p2;
5047 if (*line == '#')
5048 continue;
5049 if (!p)
5050 continue;
5051 *p = '\0';
5052 p++;
5053 if (strcmp(line, uname))
5054 continue;
5055 p2 = strchr(p, ':');
5056 if (!p2)
5057 continue;
5058 *p2 = '\0';
5059 p2++;
5060 if (!*p2)
5061 continue;
5062 no_newline = strcspn(p2, "\n");
5063 p2[no_newline] = '\0';
5064
5065 if (lxc_safe_uint(p, &gid) < 0)
5066 WARN("Could not parse GID.");
5067 if (lxc_safe_uint(p2, &grange) < 0)
5068 WARN("Could not parse GID range.");
5069 }
5070 fclose(f);
5071
5072 free(line);
5073
5074 if (!urange || !grange) {
5075 ERROR("You do not have subuids or subgids allocated");
5076 ERROR("Unprivileged containers require subuids and subgids");
5077 return;
5078 }
5079
5080 ERROR("You must either run as root, or define uid mappings");
5081 ERROR("To pass uid mappings to lxc-create, you could create");
5082 ERROR("~/.config/lxc/default.conf:");
5083 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5084 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5085 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5086
5087 free(gname);
5088 free(uname);
5089 }
5090
5091 static void free_cgroup_settings(struct lxc_list *result)
5092 {
5093 struct lxc_list *iterator, *next;
5094
5095 lxc_list_for_each_safe(iterator, result, next) {
5096 lxc_list_del(iterator);
5097 free(iterator);
5098 }
5099 free(result);
5100 }
5101
5102 /*
5103 * Return the list of cgroup_settings sorted according to the following rules
5104 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5105 */
5106 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5107 {
5108 struct lxc_list *result;
5109 struct lxc_list *memsw_limit = NULL;
5110 struct lxc_list *it = NULL;
5111 struct lxc_cgroup *cg = NULL;
5112 struct lxc_list *item = NULL;
5113
5114 result = malloc(sizeof(*result));
5115 if (!result) {
5116 ERROR("failed to allocate memory to sort cgroup settings");
5117 return NULL;
5118 }
5119 lxc_list_init(result);
5120
5121 /*Iterate over the cgroup settings and copy them to the output list*/
5122 lxc_list_for_each(it, cgroup_settings) {
5123 item = malloc(sizeof(*item));
5124 if (!item) {
5125 ERROR("failed to allocate memory to sort cgroup settings");
5126 free_cgroup_settings(result);
5127 return NULL;
5128 }
5129 item->elem = it->elem;
5130 cg = it->elem;
5131 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5132 /* Store the memsw_limit location */
5133 memsw_limit = item;
5134 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5135 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5136 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5137 item->elem = memsw_limit->elem;
5138 memsw_limit->elem = it->elem;
5139 }
5140 lxc_list_add_tail(result, item);
5141 }
5142
5143 return result;
5144 }
5145
5146 int lxc_unpriv_delete_nic(const char *lxcpath, char *lxcname, char *type,
5147 struct lxc_netdev *netdev, pid_t pid)
5148 {
5149 pid_t child;
5150 int bytes, pipefd[2];
5151 char netdev_link[IFNAMSIZ + 1];
5152 char buffer[MAX_BUFFER_SIZE] = {0};
5153
5154 if (netdev->type != LXC_NET_VETH) {
5155 ERROR("nic type %d not support for unprivileged use",
5156 netdev->type);
5157 return -1;
5158 }
5159
5160 if (pipe(pipefd) < 0) {
5161 SYSERROR("pipe failed");
5162 return -1;
5163 }
5164
5165 child = fork();
5166 if (child < 0) {
5167 SYSERROR("fork");
5168 close(pipefd[0]);
5169 close(pipefd[1]);
5170 return -1;
5171 }
5172
5173 if (child == 0) { /* child */
5174 /* Call lxc-user-nic pid type bridge. */
5175 int ret;
5176 char pidstr[LXC_NUMSTRLEN64];
5177
5178 close(pipefd[0]); /* Close the read-end of the pipe. */
5179
5180 /* Redirect stdout to write-end of the pipe. */
5181 ret = dup2(pipefd[1], STDOUT_FILENO);
5182 if (ret >= 0)
5183 ret = dup2(pipefd[1], STDERR_FILENO);
5184 close(pipefd[1]); /* Close the write-end of the pipe. */
5185 if (ret < 0) {
5186 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
5187 exit(EXIT_FAILURE);
5188 }
5189
5190 if (netdev->link)
5191 strncpy(netdev_link, netdev->link, IFNAMSIZ);
5192 else
5193 strncpy(netdev_link, "none", IFNAMSIZ);
5194
5195 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
5196 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
5197 exit(EXIT_FAILURE);
5198 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
5199
5200 INFO("Execing lxc-user-nic delete %s %s %s ovs %s %s", lxcpath,
5201 lxcname, pidstr, netdev_link, netdev->priv.veth_attr.pair);
5202 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, "delete", lxcpath,
5203 lxcname, pidstr, "ovs", netdev_link,
5204 netdev->priv.veth_attr.pair, (char *)NULL);
5205 SYSERROR("Failed to exec lxc-user-nic.");
5206 exit(EXIT_FAILURE);
5207 }
5208
5209 /* close the write-end of the pipe */
5210 close(pipefd[1]);
5211
5212 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
5213 if (bytes < 0) {
5214 SYSERROR("Failed to read from pipe file descriptor.");
5215 close(pipefd[0]);
5216 return -1;
5217 }
5218 buffer[bytes - 1] = '\0';
5219
5220 if (wait_for_pid(child) != 0) {
5221 ERROR("lxc-user-nic failed to delete requested network: %s",
5222 buffer[0] != '\0' ? buffer : "(null)");
5223 close(pipefd[0]);
5224 return -1;
5225 }
5226 TRACE("Received output \"%s\" from lxc-user-nic", buffer);
5227
5228 /* close the read-end of the pipe */
5229 close(pipefd[0]);
5230
5231 return 0;
5232 }