]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
network: preserve backwards compatibility
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "bdev.h"
77 #include "caps.h" /* for lxc_caps_last_cap() */
78 #include "cgroup.h"
79 #include "conf.h"
80 #include "confile_utils.h"
81 #include "error.h"
82 #include "log.h"
83 #include "lxcaufs.h"
84 #include "lxclock.h"
85 #include "lxcoverlay.h"
86 #include "lxcseccomp.h"
87 #include "namespace.h"
88 #include "network.h"
89 #include "parse.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
/* Set up the logging category used by the ERROR/WARN/INFO/DEBUG macros in
 * this file (presumably "lxc_conf" nested under "lxc" -- confirm in log.h). */
lxc_log_define(lxc_conf, lxc);
111
/* Fallback capability numbers, used only when the system headers that were
 * included above do not already define them. */
#if HAVE_LIBCAP
#ifndef CAP_SETFCAP
#define CAP_SETFCAP 31
#endif

#ifndef CAP_MAC_OVERRIDE
#define CAP_MAC_OVERRIDE 32
#endif

#ifndef CAP_MAC_ADMIN
#define CAP_MAC_ADMIN 33
#endif
#endif

/* Fallback for the prctl() option when the headers lack it. */
#ifndef PR_CAPBSET_DROP
#define PR_CAPBSET_DROP 24
#endif

/* Fallback for the loop-device flag when <linux/loop.h> lacks it. */
#ifndef LO_FLAGS_AUTOCLEAR
#define LO_FLAGS_AUTOCLEAR 4
#endif

/* These are defined unconditionally (not only with libcap) when missing. */
#ifndef CAP_SETUID
#define CAP_SETUID 7
#endif

#ifndef CAP_SETGID
#define CAP_SETGID 6
#endif

/* needed for cgroup automount checks, regardless of whether we
 * have included linux/capability.h or not */
#ifndef CAP_SYS_ADMIN
#define CAP_SYS_ADMIN 21
#endif
147
/*
 * pivot_root() shim: when the C library does not provide the function,
 * issue the raw system call ourselves; otherwise just declare the libc one.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/*
 * sethostname() shim: only compiled when the C library lacks the function.
 * Falls back to the raw system call, or fails with ENOSYS when no syscall
 * number is known.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char * name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
/* Fallback mount flag values for headers that do not define them. */
#ifndef MS_PRIVATE
#define MS_PRIVATE (1<<18)
#endif

#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1<<25)
#endif

/* memfd_create() flag fallbacks, used when <linux/memfd.h> is unavailable
 * or does not define them. */
#ifndef MFD_CLOEXEC
#define MFD_CLOEXEC 0x0001U
#endif

#ifndef MFD_ALLOW_SEALING
#define MFD_ALLOW_SEALING 0x0002U
#endif
192
/*
 * memfd_create() shim, compiled only when the C library has no wrapper.
 * If the kernel headers do not supply __NR_memfd_create, a hard-coded
 * per-architecture syscall number is used; on architectures not listed
 * below the call fails with errno set to ENOSYS.
 */
#ifndef HAVE_MEMFD_CREATE
static int memfd_create(const char *name, unsigned int flags) {
	/* Supply the syscall number ourselves if the headers did not. */
	#ifndef __NR_memfd_create
		#if defined __i386__
			#define __NR_memfd_create 356
		#elif defined __x86_64__
			#define __NR_memfd_create 319
		#elif defined __arm__
			#define __NR_memfd_create 385
		#elif defined __aarch64__
			#define __NR_memfd_create 279
		#elif defined __s390__
			#define __NR_memfd_create 350
		#elif defined __powerpc__
			#define __NR_memfd_create 360
		#elif defined __sparc__
			#define __NR_memfd_create 348
		#elif defined __blackfin__
			#define __NR_memfd_create 390
		#elif defined __ia64__
			#define __NR_memfd_create 1340
		#elif defined _MIPS_SIM
			/* MIPS: the number depends on the ABI in use. */
			#if _MIPS_SIM == _MIPS_SIM_ABI32
				#define __NR_memfd_create 4354
			#endif
			#if _MIPS_SIM == _MIPS_SIM_NABI32
				#define __NR_memfd_create 6318
			#endif
			#if _MIPS_SIM == _MIPS_SIM_ABI64
				#define __NR_memfd_create 5314
			#endif
		#endif
	#endif
	#ifdef __NR_memfd_create
	return syscall(__NR_memfd_create, name, flags);
	#else
	/* Unknown architecture: no way to issue the call. */
	errno = ENOSYS;
	return -1;
	#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
236
/* Names of the container hooks; the array holds NUM_LXC_HOOKS entries
 * (presumably indexed by the hook enum declared in conf.h -- confirm). */
char *lxchook_names[NUM_LXC_HOOKS] = {
	"pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
239
/* Signature shared by the per-network-type callbacks stored in netdev_conf[]
 * and netdev_deconf[] below. */
typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
241
/* One entry of the mount option table (see mount_opt[] below). */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro" or "nosuid" */
	int clear;	/* presumably non-zero when the keyword clears `flag` instead of setting it -- confirm at the parser */
	int flag;	/* MS_* mount flag the keyword refers to */
};
247
/* Maps a capability keyword to its CAP_* number (see caps_opt[] below). */
struct caps_opt {
	char *name;	/* lowercase keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
252
/* Maps a resource limit keyword to its RLIMIT_* id (see limit_opt[] below). */
struct limit_opt {
	char *name;	/* lowercase keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
257
/*
 * The lxc_conf of the container currently being worked on in an
 * API call.
 * This is used in the error calls.
 * It is thread-local when the toolchain supports TLS (HAVE_TLS) and a
 * plain global otherwise.
 */
#ifdef HAVE_TLS
__thread struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
268
/* Forward declaration: lxc_mount_auto_mounts() below needs in_caplist()
 * before its definition appears. Declared here, since we don't want to
 * reshuffle the whole file. */
static int in_caplist(int cap, struct lxc_list *caps);
271
/* Per-type network creation callbacks (defined elsewhere in this file). */
static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);

/* Dispatch table: network type (LXC_NET_*) -> creation callback. */
static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instantiate_veth,
	[LXC_NET_MACVLAN] = instantiate_macvlan,
	[LXC_NET_VLAN] = instantiate_vlan,
	[LXC_NET_PHYS] = instantiate_phys,
	[LXC_NET_EMPTY] = instantiate_empty,
	[LXC_NET_NONE] = instantiate_none,
};
287
/* Per-type network teardown callbacks (defined elsewhere in this file). */
static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);

/* Dispatch table: network type (LXC_NET_*) -> teardown callback. It reuses
 * the instantiate_cb signature. */
static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
303
/*
 * Table of recognised mount option keywords. Each row is
 * { keyword, clear, MS_* flag }; keywords whose `clear` column is 1 are the
 * negated form of a flag (e.g. "rw" vs MS_RDONLY). The list is terminated
 * by an entry whose name is NULL.
 */
static struct mount_opt mount_opt[] = {
	{ "async", 1, MS_SYNCHRONOUS },
	{ "atime", 1, MS_NOATIME },
	{ "bind", 0, MS_BIND },
	{ "defaults", 0, 0 },
	{ "dev", 1, MS_NODEV },
	{ "diratime", 1, MS_NODIRATIME },
	{ "dirsync", 0, MS_DIRSYNC },
	{ "exec", 1, MS_NOEXEC },
	{ "lazytime", 0, MS_LAZYTIME },
	{ "mand", 0, MS_MANDLOCK },
	{ "noatime", 0, MS_NOATIME },
	{ "nodev", 0, MS_NODEV },
	{ "nodiratime", 0, MS_NODIRATIME },
	{ "noexec", 0, MS_NOEXEC },
	{ "nomand", 1, MS_MANDLOCK },
	{ "norelatime", 1, MS_RELATIME },
	{ "nostrictatime", 1, MS_STRICTATIME },
	{ "nosuid", 0, MS_NOSUID },
	{ "rbind", 0, MS_BIND|MS_REC },
	{ "relatime", 0, MS_RELATIME },
	{ "remount", 0, MS_REMOUNT },
	{ "ro", 0, MS_RDONLY },
	{ "rw", 1, MS_RDONLY },
	{ "strictatime", 0, MS_STRICTATIME },
	{ "suid", 1, MS_NOSUID },
	{ "sync", 0, MS_SYNCHRONOUS },
	{ NULL, 0, 0 },	/* sentinel */
};
333
/*
 * Table mapping capability keywords to CAP_* numbers. Capabilities that
 * not every header set defines are guarded individually. Without libcap
 * the table is empty.
 */
#if HAVE_LIBCAP
static struct caps_opt caps_opt[] = {
	{ "chown", CAP_CHOWN },
	{ "dac_override", CAP_DAC_OVERRIDE },
	{ "dac_read_search", CAP_DAC_READ_SEARCH },
	{ "fowner", CAP_FOWNER },
	{ "fsetid", CAP_FSETID },
	{ "kill", CAP_KILL },
	{ "setgid", CAP_SETGID },
	{ "setuid", CAP_SETUID },
	{ "setpcap", CAP_SETPCAP },
	{ "linux_immutable", CAP_LINUX_IMMUTABLE },
	{ "net_bind_service", CAP_NET_BIND_SERVICE },
	{ "net_broadcast", CAP_NET_BROADCAST },
	{ "net_admin", CAP_NET_ADMIN },
	{ "net_raw", CAP_NET_RAW },
	{ "ipc_lock", CAP_IPC_LOCK },
	{ "ipc_owner", CAP_IPC_OWNER },
	{ "sys_module", CAP_SYS_MODULE },
	{ "sys_rawio", CAP_SYS_RAWIO },
	{ "sys_chroot", CAP_SYS_CHROOT },
	{ "sys_ptrace", CAP_SYS_PTRACE },
	{ "sys_pacct", CAP_SYS_PACCT },
	{ "sys_admin", CAP_SYS_ADMIN },
	{ "sys_boot", CAP_SYS_BOOT },
	{ "sys_nice", CAP_SYS_NICE },
	{ "sys_resource", CAP_SYS_RESOURCE },
	{ "sys_time", CAP_SYS_TIME },
	{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
	{ "mknod", CAP_MKNOD },
	{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_READ
	{ "audit_read", CAP_AUDIT_READ },
#endif
#ifdef CAP_AUDIT_WRITE
	{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
	{ "audit_control", CAP_AUDIT_CONTROL },
#endif
	{ "setfcap", CAP_SETFCAP },
	{ "mac_override", CAP_MAC_OVERRIDE },
	{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
	{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
	{ "wake_alarm", CAP_WAKE_ALARM },
#endif
#ifdef CAP_BLOCK_SUSPEND
	{ "block_suspend", CAP_BLOCK_SUSPEND },
#endif
};
#else
/* No libcap: no capability keywords are known. */
static struct caps_opt caps_opt[] = {};
#endif
390
/*
 * Table mapping resource limit keywords to RLIMIT_* ids. Each entry is
 * compiled in only when the platform headers define that resource.
 */
static struct limit_opt limit_opt[] = {
#ifdef RLIMIT_AS
	{ "as", RLIMIT_AS },
#endif
#ifdef RLIMIT_CORE
	{ "core", RLIMIT_CORE },
#endif
#ifdef RLIMIT_CPU
	{ "cpu", RLIMIT_CPU },
#endif
#ifdef RLIMIT_DATA
	{ "data", RLIMIT_DATA },
#endif
#ifdef RLIMIT_FSIZE
	{ "fsize", RLIMIT_FSIZE },
#endif
#ifdef RLIMIT_LOCKS
	{ "locks", RLIMIT_LOCKS },
#endif
#ifdef RLIMIT_MEMLOCK
	{ "memlock", RLIMIT_MEMLOCK },
#endif
#ifdef RLIMIT_MSGQUEUE
	{ "msgqueue", RLIMIT_MSGQUEUE },
#endif
#ifdef RLIMIT_NICE
	{ "nice", RLIMIT_NICE },
#endif
#ifdef RLIMIT_NOFILE
	{ "nofile", RLIMIT_NOFILE },
#endif
#ifdef RLIMIT_NPROC
	{ "nproc", RLIMIT_NPROC },
#endif
#ifdef RLIMIT_RSS
	{ "rss", RLIMIT_RSS },
#endif
#ifdef RLIMIT_RTPRIO
	{ "rtprio", RLIMIT_RTPRIO },
#endif
#ifdef RLIMIT_RTTIME
	{ "rttime", RLIMIT_RTTIME },
#endif
#ifdef RLIMIT_SIGPENDING
	{ "sigpending", RLIMIT_SIGPENDING },
#endif
#ifdef RLIMIT_STACK
	{ "stack", RLIMIT_STACK },
#endif
};
441
442 static int run_buffer(char *buffer)
443 {
444 struct lxc_popen_FILE *f;
445 char *output;
446 int ret;
447
448 f = lxc_popen(buffer);
449 if (!f) {
450 SYSERROR("Failed to popen() %s.", buffer);
451 return -1;
452 }
453
454 output = malloc(LXC_LOG_BUFFER_SIZE);
455 if (!output) {
456 ERROR("Failed to allocate memory for %s.", buffer);
457 lxc_pclose(f);
458 return -1;
459 }
460
461 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
462 DEBUG("Script %s with output: %s.", buffer, output);
463
464 free(output);
465
466 ret = lxc_pclose(f);
467 if (ret == -1) {
468 SYSERROR("Script exited with error.");
469 return -1;
470 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
471 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
472 return -1;
473 } else if (WIFSIGNALED(ret)) {
474 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
475 return -1;
476 }
477
478 return 0;
479 }
480
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it via run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  path of the script to run
 * @hook:    hook name, passed as third argument
 * @lxcpath: unused here
 * @argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line is built in a heap buffer: its length depends on
 * caller-supplied strings and is only bounded by INT_MAX, which is far too
 * large for a stack allocation.
 *
 * Returns the result of run_buffer() (0 on success), or -1 if the command
 * line could not be built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Each extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* Offsets into the buffer are tracked as int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret =
	    snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
531
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it via run_buffer().
 *
 * @name:    container name, passed as first argument to the script
 * @section: config section, passed as second argument
 * @script:  path of the script to run
 * @...:     extra string arguments; the list MUST be terminated by a NULL
 *           pointer
 *
 * The command line is built in a heap buffer because its length depends on
 * caller-supplied strings. Every va_start() is paired with a va_end() on
 * all paths, including the error returns.
 *
 * Returns the result of run_buffer() (0 on success), or -1 if the command
 * line could not be built.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: each extra argument needs its length plus one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separating spaces plus the terminating NUL byte. */
	size += 3;

	/* Offsets into the buffer are tracked as int below. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
583
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking
 * the underlying fs readonly on shutdown. The file is unlinked immediately
 * so that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, path cannot
 * be resolved, or it is not a directory).
 * Returns an open fd (>= 0) if we pinned it; the caller owns the fd.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* Treat both an output error and truncation as failure. */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return -1;

	/* The open fd keeps the fs busy; the name itself is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
626
/*
 * When a remount is requested, carry over the NOSUID/NODEV/RDONLY/NOEXEC
 * restrictions already present on the mount so they stay honored.
 *
 * @s:     mount source, may be NULL
 * @d:     mount destination, consulted when @s is NULL
 * @flags: requested mount flags
 *
 * Returns @flags, possibly with the restrictions above OR-ed in. @flags is
 * returned untouched when it lacks MS_REMOUNT, when no path is available,
 * when statvfs() fails, or when built without statvfs support.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long inherited[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target;
	struct statvfs vfs;
	unsigned long extra = 0;
	size_t k;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	target = s ? s : d;
	if (target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	for (k = 0; k < sizeof(inherited) / sizeof(inherited[0]); k++) {
		if (vfs.f_flag & inherited[k])
			extra |= inherited[k];
	}

	return flags | extra;
#else
	return flags;
#endif
}
663
/*
 * Perform the automatic /proc, /sys and cgroup mounts selected by @flags.
 *
 * Every row of default_mounts[] whose (flags & match_mask) equals match_flag
 * is mounted in table order; "%r" in source/destination is replaced by
 * conf->rootfs.mount (or by the empty string when the container has no
 * rootfs path). A mount that fails with ENOENT is logged and skipped; any
 * other mount failure aborts. Afterwards, if any LXC_AUTO_CGROUP_MASK bit is
 * set, the cgroup hierarchy is mounted through cgroup_mount().
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
{
	int r;
	int i;
	static struct {
		int match_mask;
		int match_flag;
		const char *source;
		const char *destination;
		const char *fstype;
		unsigned long flags;
		const char *options;
	} default_mounts[] = {
		/* Read-only bind-mounting... In older kernels, doing that required
		 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
		 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
		 * kernel 2.6.26 onwards. However, this apparently does not work on
		 * kernel 3.8. Unfortunately, on that very same kernel, doing the
		 * same trick as above doesn't seem to work either, there one needs
		 * to ALSO specify MS_BIND for the remount, otherwise the entire
		 * fs is remounted read-only or the mount fails because it's busy...
		 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
		 * 2.6.32...
		 */
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
		/* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
		/* sentinel: a zero match_mask ends the table */
		{ 0, 0, NULL, NULL, NULL, 0, NULL }
	};

	for (i = 0; default_mounts[i].match_mask; i++) {
		/* Only rows belonging to the requested proc/sys mode apply. */
		if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
			char *source = NULL;
			char *destination = NULL;
			int saved_errno;
			unsigned long mflags;

			if (default_mounts[i].source) {
				/* will act like strdup if %r is not present */
				source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
				if (!source) {
					SYSERROR("memory allocation error");
					return -1;
				}
			}
			if (!default_mounts[i].destination) {
				ERROR("BUG: auto mounts destination %d was NULL", i);
				free(source);
				return -1;
			}
			/* will act like strdup if %r is not present */
			destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
			if (!destination) {
				/* keep the allocation errno across logging and free() */
				saved_errno = errno;
				SYSERROR("memory allocation error");
				free(source);
				errno = saved_errno;
				return -1;
			}
			/* For remounts, keep restrictions already on the mount. */
			mflags = add_required_remount_flags(source, destination,
							    default_mounts[i].flags);
			r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
			saved_errno = errno;
			if (r < 0 && errno == ENOENT) {
				/* A missing source or target is not an error. */
				INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
				r = 0;
			}
			else if (r < 0)
				SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);

			free(source);
			free(destination);
			if (r < 0) {
				errno = saved_errno;
				return -1;
			}
		}
	}

	if (flags & LXC_AUTO_CGROUP_MASK) {
		int cg_flags;

		cg_flags = flags & LXC_AUTO_CGROUP_MASK;
		/* If the type of cgroup mount was not specified, it depends on the
		 * container's capabilities as to what makes sense: if we have
		 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
		 * anyway, so we may as well default to read-write; then the admin
		 * will not be given a false sense of security. (And if they really
		 * want mixed r/o r/w, then they can explicitly specify :mixed.)
		 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
		 * :mixed, because then the container can't remount it read-write. */
		if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
			int has_sys_admin = 0;

			/* A non-empty keep list decides; otherwise the drop list does. */
			if (!lxc_list_empty(&conf->keepcaps))
				has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
			else
				has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);

			if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
				cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
			else
				cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
		}

		if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
			SYSERROR("error mounting /sys/fs/cgroup");
			return -1;
		}
	}

	return 0;
}
791
/*
 * Apply the configured hostname, if any.
 *
 * @utsname: configured uts data; NULL means there is nothing to set.
 *
 * Returns 0 on success (or when nothing had to be done), -1 if
 * sethostname() failed.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);
	return 0;
}
806
/* Describes one symlink that setup_dev_symlinks() creates under /dev. */
struct dev_symlinks {
	const char *oldpath;	/* symlink target */
	const char *name;	/* link name, relative to the container's /dev */
};
811
/* Standard /dev symlinks pointing at the process's own file descriptors. */
static const struct dev_symlinks dev_symlinks[] = {
	{"/proc/self/fd", "fd"},
	{"/proc/self/fd/0", "stdin"},
	{"/proc/self/fd/1", "stdout"},
	{"/proc/self/fd/2", "stderr"},
};
818
819 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
820 {
821 char path[MAXPATHLEN];
822 int ret,i;
823 struct stat s;
824
825
826 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
827 const struct dev_symlinks *d = &dev_symlinks[i];
828 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
829 if (ret < 0 || ret >= MAXPATHLEN)
830 return -1;
831
832 /*
833 * Stat the path first. If we don't get an error
834 * accept it as is and don't try to create it
835 */
836 if (!stat(path, &s)) {
837 continue;
838 }
839
840 ret = symlink(d->oldpath, path);
841
842 if (ret && errno != EEXIST) {
843 if ( errno == EROFS ) {
844 WARN("Warning: Read Only file system while creating %s", path);
845 } else {
846 SYSERROR("Error creating %s", path);
847 return -1;
848 }
849 }
850 }
851 return 0;
852 }
853
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * @pp:   address of the heap string being built. When *pp is NULL a new
 *        string "container_ttys=<name>" is allocated; otherwise " <name>"
 *        is appended to the existing string.
 * @name: pty name to add
 *
 * Returns true on success, false if memory allocation fails (in which case
 * *pp is left unchanged).
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t name_len, cur_len;
	char *buf;

	if (*pp == NULL) {
		name_len = strlen(name);
		buf = malloc(name_len + prefix_len + 1);
		if (buf == NULL)
			return false;

		memcpy(buf, prefix, prefix_len);
		/* Copies the terminating NUL byte as well. */
		memcpy(buf + prefix_len, name, name_len + 1);
		*pp = buf;
		return true;
	}

	cur_len = strlen(*pp);
	name_len = strlen(name);
	/* Room for the separating space and the terminating NUL byte. */
	buf = realloc(*pp, cur_len + name_len + 2);
	if (buf == NULL)
		return false;

	buf[cur_len] = ' ';
	memcpy(buf + cur_len + 1, name, name_len + 1);
	*pp = buf;
	return true;
}
876
877 static int lxc_setup_tty(struct lxc_conf *conf)
878 {
879 int i, ret;
880 const struct lxc_tty_info *tty_info = &conf->tty_info;
881 char *ttydir = conf->ttydir;
882 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
883
884 if (!conf->rootfs.path)
885 return 0;
886
887 for (i = 0; i < tty_info->nbtty; i++) {
888 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
889
890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
891 if (ret < 0 || (size_t)ret >= sizeof(path)) {
892 ERROR("pathname too long for ttys");
893 return -1;
894 }
895
896 if (ttydir) {
897 /* create dev/lxc/tty%d" */
898 ret = snprintf(lxcpath, sizeof(lxcpath),
899 "/dev/%s/tty%d", ttydir, i + 1);
900 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
901 ERROR("pathname too long for ttys");
902 return -1;
903 }
904
905 ret = creat(lxcpath, 0660);
906 if (ret < 0 && errno != EEXIST) {
907 SYSERROR("failed to create \"%s\"", lxcpath);
908 return -1;
909 }
910 if (ret >= 0)
911 close(ret);
912
913 ret = unlink(path);
914 if (ret < 0 && errno != ENOENT) {
915 SYSERROR("failed to unlink \"%s\"", path);
916 return -1;
917 }
918
919 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
920 if (ret < 0) {
921 WARN("failed to bind mount \"%s\" onto \"%s\"",
922 pty_info->name, path);
923 continue;
924 }
925 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
926 path);
927
928 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
929 ttydir, i + 1);
930 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
931 ERROR("tty pathname too long");
932 return -1;
933 }
934
935 ret = symlink(lxcpath, path);
936 if (ret < 0) {
937 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
938 path, lxcpath);
939 return -1;
940 }
941 } else {
942 /* If we populated /dev, then we need to create
943 * /dev/ttyN
944 */
945 ret = access(path, F_OK);
946 if (ret < 0) {
947 ret = creat(path, 0660);
948 if (ret < 0) {
949 SYSERROR("failed to create \"%s\"", path);
950 /* this isn't fatal, continue */
951 } else {
952 close(ret);
953 }
954 }
955
956 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
957 if (ret < 0) {
958 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
959 continue;
960 }
961
962 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
963 path);
964 }
965
966 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
967 ERROR("Error setting up container_ttys string");
968 return -1;
969 }
970 }
971
972 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
973 return 0;
974 }
975
/*
 * Switch the root filesystem to @rootfs using pivot_root(".", ".").
 *
 * Directory fds for the old and the new root are opened first so that both
 * can be re-entered with fchdir() after the pivot: the old root ends up
 * stacked on top of the new one and is lazily unmounted from within.
 *
 * Returns 0 on success, -1 on failure. Both fds are closed in either case.
 */
static int setup_rootfs_pivot_root(const char *rootfs)
{
	int old_fd;
	int new_fd = -1;

	old_fd = open("/", O_DIRECTORY | O_RDONLY);
	if (old_fd < 0) {
		SYSERROR("Error opening old-/ for fchdir");
		return -1;
	}

	new_fd = open(rootfs, O_DIRECTORY | O_RDONLY);
	if (new_fd < 0) {
		SYSERROR("Error opening new-/ for fchdir");
		goto on_error;
	}

	/* Step into the new root filesystem. */
	if (fchdir(new_fd) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		goto on_error;
	}

	/* Make the current directory the new root. */
	if (pivot_root(".", ".") != 0) {
		SYSERROR("pivot_root syscall failed");
		goto on_error;
	}

	/*
	 * The old root is now mounted over the new root. It cannot be
	 * unmounted while we sit in the new root underneath it, so go back
	 * into the old root first.
	 */
	if (fchdir(old_fd) < 0) {
		SYSERROR("Error entering oldroot");
		goto on_error;
	}

	if (umount2(".", MNT_DETACH) < 0) {
		SYSERROR("Error detaching old root");
		goto on_error;
	}

	if (fchdir(new_fd) < 0) {
		SYSERROR("Error re-entering newroot");
		goto on_error;
	}

	close(old_fd);
	close(new_fd);

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	return 0;

on_error:
	/* old_fd is always valid here; new_fd only if its open() succeeded. */
	close(old_fd);
	if (new_fd != -1)
		close(new_fd);
	return -1;
}
1036
1037 /*
1038 * Just create a path for /dev under $lxcpath/$name and in rootfs
1039 * If we hit an error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
1042 {
1043 int ret;
1044 size_t clen;
1045 char *path;
1046
1047 INFO("Mounting container /dev");
1048
1049 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1050 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1051 path = alloca(clen);
1052
1053 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1054 if (ret < 0 || ret >= clen)
1055 return -1;
1056
1057 if (!dir_exists(path)) {
1058 WARN("No /dev in container.");
1059 WARN("Proceeding without autodev setup");
1060 return 0;
1061 }
1062
1063 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1064 rootfs->path ? rootfs->mount : NULL);
1065 if (ret != 0) {
1066 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1067 return -1;
1068 }
1069
1070 INFO("Mounted tmpfs onto %s", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || ret >= clen)
1074 return -1;
1075
1076 /*
1077 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1078 * If not, then create it and exit if that fails...
1079 */
1080 if (!dir_exists(path)) {
1081 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1082 if (ret) {
1083 SYSERROR("Failed to create /dev/pts in container");
1084 return -1;
1085 }
1086 }
1087
1088 INFO("Mounted container /dev");
1089 return 0;
1090 }
1091
/* Describes one device node that lxc_fill_autodev() creates under /dev. */
struct lxc_devs {
	const char *name;	/* node name, relative to /dev */
	mode_t mode;		/* file type and permission bits passed to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};
1098
/* Character devices created in an autodev /dev: { name, mode, major, minor }.
 * lxc_fill_autodev() masks out the execute bits via umask() before creating
 * them. */
static const struct lxc_devs lxc_devs[] = {
	{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
	{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
};
1107
/*
 * Populate the container's /dev with the device nodes listed in lxc_devs.
 *
 * Each node is first created with mknod(). If that fails for a reason other
 * than the node already existing, an empty file is created at the node's
 * path and the matching host device from /dev is bind mounted onto it.
 *
 * Returns 0 on success, or when the container has no /dev directory, and -1
 * on error. The process umask is changed while the nodes are created and is
 * restored before returning, on failure as well as on success.
 */
static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
{
	int ret;
	int fret = -1;
	char path[MAXPATHLEN];
	size_t i;
	mode_t cmask;

	ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
	if (ret < 0 || ret >= MAXPATHLEN) {
		ERROR("Error calculating container /dev location");
		return -1;
	}

	/* ignore, just don't try to fill in */
	if (!dir_exists(path))
		return 0;

	INFO("populating container /dev");
	cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
	for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
		const struct lxc_devs *d = &lxc_devs[i];
		char hostpath[MAXPATHLEN];
		FILE *pathfile;

		ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
		if (ret < 0 || ret >= MAXPATHLEN)
			goto out;

		ret = mknod(path, d->mode, makedev(d->maj, d->min));
		if (ret == 0) {
			DEBUG("created device node \"%s\"", path);
			continue;
		}

		if (errno == EEXIST) {
			DEBUG("\"%s\" device already existed", path);
			continue;
		}

		/* Unprivileged containers cannot create devices, so
		 * bind mount the device from the host.
		 */
		ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
		if (ret < 0 || ret >= MAXPATHLEN)
			goto out;

		/* The bind mount needs an existing file as its target. */
		pathfile = fopen(path, "wb");
		if (!pathfile) {
			SYSERROR("Failed to create device mount target '%s'", path);
			goto out;
		}
		fclose(pathfile);

		if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
			SYSERROR("Failed bind mounting device %s from host into container", d->name);
			goto out;
		}
		DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
	}

	fret = 0;
	INFO("populated container /dev");

out:
	/* Always hand the caller back the umask it came in with. */
	umask(cmask);
	return fret;
}
1170
/*
 * Mount the root filesystem described by conf->rootfs.
 *
 * When no rootfs path is configured, "/" is remounted recursively as a slave
 * mount and nothing else is done. Otherwise rootfs->mount must be
 * accessible; a bdev object is obtained from bdev_init() and its mount
 * operation is invoked.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_rootfs(struct lxc_conf *conf)
{
	const struct lxc_rootfs *rootfs = &conf->rootfs;
	const char *shown_opts;
	struct bdev *bdev;
	int mounted = -1;

	if (!rootfs->path) {
		if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0) == 0)
			return 0;

		SYSERROR("Failed to make / rslave.");
		return -1;
	}

	if (access(rootfs->mount, F_OK) != 0) {
		SYSERROR("Failed to access to \"%s\". Check it is present.",
			 rootfs->mount);
		return -1;
	}

	/* Only used for log output. */
	shown_opts = rootfs->options ? rootfs->options : "(null)";

	bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
	if (bdev) {
		mounted = bdev->ops->mount(bdev);
		bdev_put(bdev);
	}

	/* Covers both a failed bdev_init() and a failed mount operation. */
	if (mounted < 0) {
		ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
		      rootfs->path, rootfs->mount, shown_opts);
		return -1;
	}

	DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
	      rootfs->path, rootfs->mount, shown_opts);

	return 0;
}
1215
/*
 * Switch into @root without pivot_root(): bind mount @root over "/", detach
 * the mounts listed in ./proc/self/mountinfo, then chdir("..") and
 * chroot(".").
 *
 * Returns 0 on success. The early failures (realpath, chdir, the two mount
 * calls) return -errno; later failures return -1.
 */
int prepare_ramfs_root(char *root)
{
	char resolved[PATH_MAX];
	char line[LXC_LINELEN];

	/* The resolved path itself is not used; this only checks @root. */
	if (!realpath(root, resolved))
		return -errno;

	if (chdir("/") == -1)
		return -errno;

	/*
	 * We could use here MS_MOVE, but in userns this mount is
	 * locked and can't be moved.
	 */
	if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
		SYSERROR("Failed to move %s into /", root);
		return -errno;
	}

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
		SYSERROR("Failed to make . rprivate");
		return -errno;
	}

	/*
	 * The following code cleans up inherited mounts which are not
	 * required for the container.
	 *
	 * The mountinfo file does not show all mounts if some of them were
	 * unmounted between two reads, so it is read again and again until a
	 * full pass detaches nothing.
	 *
	 * This loop can be skipped if a container uses a user namespace,
	 * because all inherited mounts are locked and we have to live with
	 * them.
	 */
	for (;;) {
		FILE *mounts;
		int detached = 0;

		mounts = fopen("./proc/self/mountinfo", "r");
		if (!mounts) {
			SYSERROR("Unable to open /proc/self/mountinfo");
			return -1;
		}

		while (fgets(line, LXC_LINELEN, mounts)) {
			char *mnt = line;
			char *end;
			int field;

			/* Find the space in front of the fifth field. */
			for (field = 0; mnt && field < 4; field++)
				mnt = strchr(mnt + 1, ' ');
			if (!mnt)
				continue;

			end = strchr(mnt + 1, ' ');
			if (!end)
				continue;

			/* Turn " /path ..." into the relative path "./path". */
			*end = '\0';
			*mnt = '.';

			if (!strcmp(mnt + 1, "/") || !strcmp(mnt + 1, "/proc"))
				continue;

			if (umount2(mnt, MNT_DETACH) == 0)
				detached++;
		}
		fclose(mounts);

		if (detached == 0)
			break;
	}

	/* This also can be skipped if a container uses a user namespace */
	umount2("./proc", MNT_DETACH);

	/* It is weird, but chdir("..") moves us in a new root */
	if (chdir("..") == -1) {
		SYSERROR("Unable to change working directory");
		return -1;
	}

	if (chroot(".") == -1) {
		SYSERROR("Unable to chroot");
		return -1;
	}

	return 0;
}
1304
/*
 * Make rootfs->mount the root of the container.
 *
 * Nothing is done when no rootfs path is configured. When
 * detect_ramfs_rootfs() reports true, prepare_ramfs_root() is used;
 * otherwise setup_rootfs_pivot_root() is.
 *
 * Returns 0 on success and -1 on failure.
 */
static int setup_pivot_root(const struct lxc_rootfs *rootfs)
{
	if (!rootfs->path) {
		DEBUG("container does not have a rootfs, so not doing pivot root");
		return 0;
	}

	if (!detect_ramfs_rootfs()) {
		if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
			ERROR("failed to pivot root");
			return -1;
		}

		DEBUG("finished pivot root");
		return 0;
	}

	DEBUG("detected that container is on ramfs");
	if (prepare_ramfs_root(rootfs->mount) != 0) {
		ERROR("failed to prepare minimal ramfs root");
		return -1;
	}

	DEBUG("prepared ramfs root for container");
	return 0;
}
1331
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node, by bind mount if possible and by symlink otherwise.
 *
 * Nothing is done when @num_pts is 0.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* A /dev/ptmx left over from before has to go. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty /dev/ptmx file serves as target for the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal, the symlink fallback is tried next. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file created above is in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1416
/*
 * Apply @persona with personality().
 *
 * A value of -1 means there is nothing to set. When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) this is a no-op.
 *
 * Returns 0 on success and -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1433
/*
 * Bind mount console->name onto <rootfs->mount>/dev/console.
 *
 * Nothing is done when console->path is "none". An existing /dev/console in
 * the rootfs has its mounts removed and is unlinked before a fresh mount
 * target is created.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
				 const struct lxc_console *console)
{
	char devconsole[MAXPATHLEN];
	int ret, fd;

	if (console->path && strcmp(console->path, "none") == 0)
		return 0;

	ret = snprintf(devconsole, sizeof(devconsole), "%s/dev/console", rootfs->mount);
	if (ret < 0 || (size_t)ret >= sizeof(devconsole))
		return -1;

	/* When we are asked to setup a console we remove any previous
	 * /dev/console bind-mounts.
	 */
	if (file_exists(devconsole)) {
		ret = lxc_unstack_mountpoint(devconsole, false);
		if (ret < 0) {
			ERROR("failed to unmount \"%s\": %s", devconsole, strerror(errno));
			return -ret;
		}
		DEBUG("cleared all (%d) mounts from \"%s\"", ret, devconsole);

		if (unlink(devconsole) < 0) {
			SYSERROR("error unlinking %s", devconsole);
			return -errno;
		}
	}

	/* For unprivileged containers autodev or automounts will already have
	 * taken care of creating /dev/console, so EEXIST is not an error.
	 */
	fd = open(devconsole, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
	if (fd >= 0) {
		close(fd);
	} else if (errno != EEXIST) {
		SYSERROR("failed to create console");
		return -errno;
	}

	if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH) != 0) {
		SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
		return -errno;
	}

	if (safe_mount(console->name, devconsole, "none", MS_BIND, 0, rootfs->mount) < 0) {
		ERROR("failed to mount '%s' on '%s'", console->name, devconsole);
		return -1;
	}

	DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, devconsole);
	return 0;
}
1491
/*
 * Set up the console below <rootfs->mount>/dev/<ttydir> and make
 * <rootfs->mount>/dev/console a symlink to "<ttydir>/console".
 *
 * With console->path set to "none", an existing /dev/console that is a
 * character device 5:1 is moved (or renamed) to <ttydir>/console; if
 * /dev/console does not exist or is something else, the function returns 0
 * without creating the symlink. Otherwise console->name is bind mounted
 * onto <ttydir>/console.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
				    const struct lxc_console *console,
				    char *ttydir)
{
	int ret;
	char path[MAXPATHLEN], lxcpath[MAXPATHLEN];

	/* create rootfs/dev/<ttydir> directory */
	ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -1;

	ret = mkdir(path, 0755);
	if (ret && errno != EEXIST) {
		SYSERROR("failed with errno %d to create %s", errno, path);
		return -errno;
	}
	DEBUG("created directory for console and tty devices at \"%s\"", path);

	ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
	if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
		return -1;

	/* Create the file that serves as mount target; it may already exist. */
	ret = creat(lxcpath, 0660);
	if (ret == -1 && errno != EEXIST) {
		SYSERROR("error %d creating %s", errno, lxcpath);
		return -errno;
	}
	if (ret >= 0)
		close(ret);

	/* From here on path is rootfs/dev/console. */
	ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -1;

	/* When we are asked to setup a console we remove any previous
	 * /dev/console bind-mounts.
	 */
	if (console->path && !strcmp(console->path, "none")) {
		struct stat st;
		ret = stat(path, &st);
		if (ret < 0) {
			if (errno == ENOENT)
				return 0;
			SYSERROR("failed stat() \"%s\"", path);
			return -errno;
		}

		/* /dev/console must be character device with major number 5 and
		 * minor number 1. If not, give benefit of the doubt and assume
		 * the user has mounted something else right there on purpose.
		 */
		if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
			return 0;

		/* In case the user requested a bind-mount for /dev/console and
		 * requests a ttydir we move the mount to the
		 * /dev/<ttydir>/console.
		 * Note, we only move the uppermost mount and clear all other
		 * mounts underneath for safety.
		 * If it is a character device created via mknod() we simply
		 * rename it.
		 */
		ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
		if (ret < 0) {
			if (errno != EINVAL) {
				ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
				return -errno;
			}
			/* path was not a mountpoint */
			ret = rename(path, lxcpath);
			if (ret < 0) {
				ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
				return -errno;
			}
			DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
		} else {
			DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
		}

		/* Clear all remaining bind-mounts. */
		ret = lxc_unstack_mountpoint(path, false);
		if (ret < 0) {
			ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
			return -ret;
		} else {
			DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
		}
	} else {
		if (file_exists(path)) {
			ret = lxc_unstack_mountpoint(path, false);
			if (ret < 0) {
				ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
				return -ret;
			} else {
				DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
			}
		}

		if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
			ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
			return -1;
		}
		DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
	}

	/* create symlink from rootfs /dev/console to '<ttydir>/console' */
	ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
	if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
		return -1;

	ret = unlink(path);
	if (ret && errno != ENOENT) {
		SYSERROR("error unlinking %s", path);
		return -errno;
	}

	ret = symlink(lxcpath, path);
	if (ret < 0) {
		SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
		return -1;
	}

	DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
	return 0;
}
1618
/*
 * Set up /dev/console for the container.
 *
 * Without a rootfs nothing is done. With a @ttydir the console is placed
 * below it via lxc_setup_ttydir_console(), otherwise
 * lxc_setup_dev_console() is used.
 *
 * Returns whatever the selected helper returns, or 0 without a rootfs.
 */
static int lxc_setup_console(const struct lxc_rootfs *rootfs,
			     const struct lxc_console *console, char *ttydir)
{
	/* We don't have a rootfs, /dev/console will be shared. */
	if (!rootfs->path) {
		DEBUG("/dev/console will be shared with the host");
		return 0;
	}

	return ttydir ? lxc_setup_ttydir_console(rootfs, console, ttydir)
		      : lxc_setup_dev_console(rootfs, console);
}
1633
/*
 * Replace <rootfs->mount>/dev/kmsg with a symlink to "console".
 *
 * Nothing is done without a rootfs path. @console is not used.
 *
 * Returns 0 on success and -1 on failure.
 */
static int setup_kmsg(const struct lxc_rootfs *rootfs,
		      const struct lxc_console *console)
{
	char kmsg_link[MAXPATHLEN];
	int len;

	if (!rootfs->path)
		return 0;

	len = snprintf(kmsg_link, sizeof(kmsg_link), "%s/dev/kmsg", rootfs->mount);
	if (len < 0 || (size_t)len >= sizeof(kmsg_link))
		return -1;

	/* A missing /dev/kmsg is fine, anything else is an error. */
	if (unlink(kmsg_link) != 0 && errno != ENOENT) {
		SYSERROR("error unlinking %s", kmsg_link);
		return -1;
	}

	if (symlink("console", kmsg_link) != 0) {
		SYSERROR("failed to create symlink for kmsg");
		return -1;
	}

	return 0;
}
1660
/*
 * Handle a single mount option.
 *
 * If @opt matches an entry of the mount_opt table, the entry's flag is set
 * in or cleared from *@flags. Otherwise @opt is appended to the string
 * *@data, separated by a comma from what is already there. The caller must
 * provide enough room in *@data.
 *
 * NOTE(review): the comparison only covers strlen(entry name) characters,
 * so an option that merely starts with a table entry's name is treated as
 * that entry - confirm this is intended.
 */
static void parse_mntopt(char *opt, unsigned long *flags, char **data)
{
	struct mount_opt *entry = mount_opt;

	while (entry->name) {
		if (strncmp(opt, entry->name, strlen(entry->name)) == 0) {
			if (entry->clear)
				*flags &= ~entry->flag;
			else
				*flags |= entry->flag;
			return;
		}
		entry++;
	}

	/* Not a flag: pass it on as filesystem specific data. */
	if (**data != '\0')
		strcat(*data, ",");
	strcat(*data, opt);
}
1682
/*
 * Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags. *@mntdata receives a newly allocated
 * string holding the options parse_mntopt() did not turn into flags, or
 * NULL if there are none; the caller has to free() it.
 *
 * Returns 0 on success (including @mntopts == NULL) and -1 if memory could
 * not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;
	free(copy);

	return 0;
}
1721
/*
 * Terminate @word at its first space or tab. A string containing neither is
 * left as it is.
 */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1728
/*
 * Skip @nfields fields in @src.
 *
 * A field ends at a single space or tab. Returns a pointer to the character
 * following the last skipped separator, or to the terminating NUL byte if
 * @src has fewer separators than requested.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
1746
/*
 * Mount @fsname on @target through safe_mount() and, for bind mounts and
 * remount requests, follow up with a MS_REMOUNT so that the requested flags
 * take effect.
 *
 * With statvfs() available, the nosuid/nodev/ro/noexec flags reported for
 * @fsname are added to the remount flags (nodev only if @dev is 0). A plain
 * bind mount skips the remount when this adds nothing to @mountflags and
 * read-only was not requested.
 *
 * Returns 0 on success or when an @optional mount fails, -1 otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	/* The initial mount never carries MS_REMOUNT. */
	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs) != 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	if (mountflags & (MS_REMOUNT | MS_BIND)) {
		unsigned long rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname ? fsname : "(none)", target ? target : "(none)");

#ifdef HAVE_STATVFS
		if (statvfs(fsname, &sb) == 0) {
			unsigned long required_flags = rqd_flags;

			/* Carry over the restrictions of the source mount. */
			required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
			if (!dev)
				required_flags |= sb.f_flag & MS_NODEV;

			DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

			/*
			 * If this was a bind mount request, and required_flags
			 * does not have any flags which are not already in
			 * mountflags, then skip the remount
			 */
			if (!(mountflags & MS_REMOUNT) &&
			    !(required_flags & ~mountflags) && rqd_flags == 0) {
				DEBUG("mountflags already was %lu, skipping remount",
				      mountflags);
				goto skipremount;
			}

			mountflags |= required_flags;
		}
#endif

		if (mount(fsname, target, fstype,
			  mountflags | MS_REMOUNT, data) < 0) {
			if (!optional) {
				SYSERROR("failed to mount '%s' on '%s'",
					 fsname, target);
				return -1;
			}

			INFO("failed to mount '%s' on '%s' (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}
	}

#ifdef HAVE_STATVFS
skipremount:
#endif
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1823
/*
 * Remove 'create=dir', 'create=file' and 'optional' from mntent->mnt_opts.
 *
 * The option string is edited in place. For each of the three names only
 * the first occurrence is removed; if it is the last option, the string is
 * cut off right where it starts.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t i;

	for (i = 0; i < sizeof(culled) / sizeof(culled[0]); i++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[i]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* pull the following options over the removed one */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1848
/*
 * Create the mount target @path for @mntent if its options ask for it.
 *
 * For mount types starting with "overlay" or "aufs", ovl_mkdir() or
 * aufs_mkdir() is called first. With "create=dir" the directory @path is
 * created. With "create=file", and only if @path is not accessible yet, the
 * parent directory of @path and then an empty file at @path are created.
 *
 * Returns 0 on success and -1 if a requested target could not be created.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy;
		FILE *pathfile;

		pathcopy = strdup(path);
		if (!pathcopy) {
			WARN("Failed to allocate memory for mount target '%s'", path);
			return -1;
		}

		/* dirname() may modify its argument and may return a pointer
		 * other than the one passed in, so the copy is what gets
		 * freed, never the result.
		 */
		if (mkdir_p(dirname(pathcopy), 0755) < 0) {
			WARN("Failed to create target directory");
		}
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1889
/*
 * Mount @mntent on @path: create the mount target if requested, strip the
 * lxc specific options, translate the remaining ones and call mount_entry().
 *
 * rootfs, lxc_name, and lxc_path can be NULL when the container is created
 * without a rootfs.
 *
 * Returns 0 on success, and also when creating the target failed for an
 * entry marked "optional"; -1 otherwise.
 */
static inline int mount_entry_on_generic(struct mntent *mntent,
		 const char* path, const struct lxc_rootfs *rootfs,
		 const char *lxc_name, const char *lxc_path)
{
	unsigned long mntflags;
	char *mntdata;
	char *rootfs_path;
	bool optional, dev;
	int ret;

	/* These must be read before cull_mntent_opt() edits the options. */
	optional = hasmntopt(mntent, "optional") != NULL;
	dev = hasmntopt(mntent, "dev") != NULL;

	rootfs_path = (rootfs && rootfs->path) ? rootfs->mount : NULL;

	if (mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path) < 0)
		return optional ? 0 : -1;

	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		ret = -1;
	else
		ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
				  mntflags, mntdata, optional, dev, rootfs_path);

	free(mntdata);
	return ret;
}
1924
/*
 * Mount @mntent for a container that has no rootfs of its own.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to
 * mnt_dir when it is missing.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the path does
 * not fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *dir = mntent->mnt_dir;
	int len;

	if (dir[0] == '/')
		len = snprintf(path, sizeof(path), "%s", dir);
	else
		len = snprintf(path, sizeof(path), "/%s", dir);

	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1944
/*
 * Mount @mntent whose mnt_dir is an absolute path.
 *
 * mnt_dir is searched for "<lxc.lxcpath>/<lxc_name>/rootfs" first and for
 * rootfs->path second. Whatever follows the match is appended to
 * rootfs->mount to form the real mount target. An entry matching neither is
 * ignored with a warning.
 *
 * Returns 0 for ignored entries, -1 on error, and the result of
 * mount_entry_on_generic() otherwise.
 */
static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
					  const struct lxc_rootfs *rootfs,
					  const char *lxc_name,
					  const char *lxc_path)
{
	char path[MAXPATHLEN];
	const char *lxcpath;
	char *aux = NULL;
	int len, offset;

	lxcpath = lxc_global_config_value("lxc.lxcpath");
	if (!lxcpath) {
		ERROR("Out of memory");
		return -1;
	}

	/* if rootfs->path is a blockdev path, allow container fstab to
	 * use $lxcpath/CN/rootfs as the target prefix */
	len = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
	if (len >= 0 && len < MAXPATHLEN) {
		aux = strstr(mntent->mnt_dir, path);
		if (aux)
			offset = strlen(path);
	}

	/* Otherwise the entry has to refer to rootfs->path. */
	if (!aux) {
		aux = strstr(mntent->mnt_dir, rootfs->path);
		if (!aux) {
			WARN("ignoring mount point '%s'", mntent->mnt_dir);
			return 0;
		}
		offset = strlen(rootfs->path);
	}

	len = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
		       aux + offset);
	if (len < 0 || len >= MAXPATHLEN) {
		WARN("pathnme too long for '%s'", mntent->mnt_dir);
		return -1;
	}

	return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
}
1992
/*
 * Mount @mntent whose mnt_dir is relative: the target is mnt_dir below
 * rootfs->mount.
 *
 * Returns the result of mount_entry_on_generic(), or -1 if the target path
 * does not fit into the buffer.
 */
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
					  const struct lxc_rootfs *rootfs,
					  const char *lxc_name,
					  const char *lxc_path)
{
	char target[MAXPATHLEN];
	int len;

	len = snprintf(target, sizeof(target), "%s/%s", rootfs->mount, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(target)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, target, rootfs, lxc_name, lxc_path);
}
2010
/*
 * Mount every entry that getmntent_r() reads from @file.
 *
 * Without a rootfs path entries are handled by mount_entry_on_systemfs();
 * otherwise relative mount directories go to
 * mount_entry_on_relative_rootfs() and absolute ones to
 * mount_entry_on_absolute_rootfs().
 *
 * Returns 0 once all entries were handled, -1 as soon as one of them fails.
 */
static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
			      const char *lxc_name, const char *lxc_path)
{
	struct mntent mntent;
	char buf[4096];

	while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
		int err;

		if (!rootfs->path)
			err = mount_entry_on_systemfs(&mntent);
		else if (mntent.mnt_dir[0] != '/')
			/* We have a separate root, mounts are relative to it */
			err = mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path);
		else
			err = mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path);

		if (err)
			return -1;
	}

	INFO("mount points have been setup");
	return 0;
}
2043
/*
 * Mount the entries of the fstab file @fstab.
 *
 * Returns 0 when @fstab is NULL, -1 when the file cannot be opened, and the
 * result of mount_file_entries() otherwise.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int ret;

	if (!fstab)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	endmntent(entries);

	return ret;
}
2064
/*
 * Write the mount entries of the list @mount, one per line, into an
 * anonymous file and return it positioned at its start.
 *
 * The file is created with memfd_create(); tmpfile() serves as fallback when
 * memfd_create() fails with ENOSYS. An entry that could not be written only
 * causes a warning.
 *
 * Returns the open FILE, which the caller has to fclose(), or NULL on
 * failure.
 */
FILE *make_anonymous_mount_file(struct lxc_list *mount)
{
	int ret;
	char *mount_entry;
	struct lxc_list *iterator;
	FILE *file;
	int fd = -1;

	fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
	if (fd < 0) {
		if (errno != ENOSYS)
			return NULL;
		file = tmpfile();
	} else {
		file = fdopen(fd, "r+");
	}

	if (!file) {
		int saved_errno = errno;
		if (fd != -1)
			close(fd);
		ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
		return NULL;
	}

	lxc_list_for_each(iterator, mount) {
		mount_entry = iterator->elem;
		ret = fprintf(file, "%s\n", mount_entry);
		/* fprintf() reports errors with a negative value; test for it
		 * before the comparison against the unsigned string length.
		 */
		if (ret < 0 || (size_t)ret < strlen(mount_entry))
			WARN("Could not write mount entry to anonymous mount file.");
	}

	if (fseek(file, 0, SEEK_SET) < 0) {
		fclose(file);
		return NULL;
	}

	return file;
}
2104
/*
 * Mount the entries of the list @mount by writing them to an anonymous file
 * and handing that file to mount_file_entries().
 *
 * Returns -1 if the file cannot be created, otherwise the result of
 * mount_file_entries().
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	FILE *entries;
	int ret;

	entries = make_anonymous_mount_file(mount);
	if (entries == NULL)
		return -1;

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return ret;
}
2121
/*
 * Translate the capability name @cap into its number.
 *
 * The name is looked up in the caps_opt table. A name not found there may
 * be a decimal number, so that capabilities the table lacks can still be
 * given; the number has to lie between 0 and lxc_caps_last_cap().
 *
 * Returns the capability number, -2 for the special name "none" and -1 for
 * anything that is not a valid capability.
 */
static int parse_cap(const char *cap)
{
	char *ptr = NULL;
	size_t i;
	int capid = -1;
	long num;

	if (!strcmp(cap, "none"))
		return -2;

	for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {

		if (strcmp(cap, caps_opt[i].name))
			continue;

		capid = caps_opt[i].value;
		break;
	}

	if (capid >= 0)
		return capid;

	/* try to see if it's numeric, so the user may specify
	 * capabilities that the running kernel knows about but
	 * we don't */
	errno = 0;
	num = strtol(cap, &ptr, 10);

	/* ptr == cap means no digits were consumed at all, e.g. for "". */
	if (!ptr || ptr == cap || *ptr != '\0' || errno != 0)
		/* not a valid number */
		return -1;

	/* Range check on the long value, before it is narrowed to int.
	 * Negative numbers are rejected so that they cannot be mistaken
	 * for the -2 that stands for "none".
	 */
	if (num < 0 || num > lxc_caps_last_cap())
		/* we have a number but it's not a valid
		 * capability */
		return -1;

	return (int)num;
}
2157
/*
 * Check whether one of the capability names in the list @caps translates to
 * the capability number @cap.
 *
 * Returns 1 if so and 0 if not.
 */
int in_caplist(int cap, struct lxc_list *caps)
{
	struct lxc_list *cur;

	lxc_list_for_each(cur, caps) {
		if (parse_cap(cur->elem) == cap)
			return 1;
	}

	return 0;
}
2171
/*
 * Drop every capability named in the list @caps from the capability
 * bounding set.
 *
 * Returns 0 on success and -1 if a name is not a valid capability or the
 * capability cannot be dropped.
 */
static int setup_caps(struct lxc_list *caps)
{
	struct lxc_list *cur;

	lxc_list_for_each(cur, caps) {
		char *name = cur->elem;
		int id = parse_cap(name);

		if (id < 0) {
			ERROR("unknown capability %s", name);
			return -1;
		}

		DEBUG("drop capability '%s' (%d)", name, id);

		if (prctl(PR_CAPBSET_DROP, id, 0, 0, 0) != 0) {
			SYSERROR("failed to remove %s capability", name);
			return -1;
		}
	}

	DEBUG("capabilities have been setup");

	return 0;
}
2202
/*
 * Drop all capabilities from the capability bounding set except the ones
 * named in the list @caps.
 *
 * The name "none" is skipped. A capability whose number is above
 * lxc_caps_last_cap() is ignored with a warning; it has no slot in the
 * table of capabilities to keep.
 *
 * Returns 0 on success and -1 on failure.
 */
static int dropcaps_except(struct lxc_list *caps)
{
	struct lxc_list *iterator;
	char *keep_entry;
	int i, capid;
	int numcaps = lxc_caps_last_cap() + 1;
	INFO("found %d capabilities", numcaps);

	if (numcaps <= 0 || numcaps > 200)
		return -1;

	// caplist[i] is 1 if we keep capability i
	int *caplist = alloca(numcaps * sizeof(int));
	memset(caplist, 0, numcaps * sizeof(int));

	lxc_list_for_each(iterator, caps) {

		keep_entry = iterator->elem;

		capid = parse_cap(keep_entry);

		if (capid == -2)
			continue;

		if (capid < 0) {
			ERROR("unknown capability %s", keep_entry);
			return -1;
		}

		/* Never index past the end of caplist. */
		if (capid >= numcaps) {
			WARN("capability '%s' (%d) is above the last known capability (%d), ignoring it",
			     keep_entry, capid, numcaps - 1);
			continue;
		}

		DEBUG("keep capability '%s' (%d)", keep_entry, capid);

		caplist[capid] = 1;
	}
	for (i=0; i<numcaps; i++) {
		if (caplist[i])
			continue;
		if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
			SYSERROR("failed to remove capability %d", i);
			return -1;
		}
	}

	DEBUG("capabilities have been setup");

	return 0;
}
2249
2250 static int setup_hw_addr(char *hwaddr, const char *ifname)
2251 {
2252 struct sockaddr sockaddr;
2253 struct ifreq ifr;
2254 int ret, fd, saved_errno;
2255
2256 ret = lxc_convert_mac(hwaddr, &sockaddr);
2257 if (ret) {
2258 ERROR("mac address '%s' conversion failed : %s",
2259 hwaddr, strerror(-ret));
2260 return -1;
2261 }
2262
2263 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2264 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2265 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2266
2267 fd = socket(AF_INET, SOCK_DGRAM, 0);
2268 if (fd < 0) {
2269 ERROR("socket failure : %s", strerror(errno));
2270 return -1;
2271 }
2272
2273 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2274 saved_errno = errno;
2275 close(fd);
2276 if (ret)
2277 ERROR("ioctl failure : %s", strerror(saved_errno));
2278
2279 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2280
2281 return ret;
2282 }
2283
2284 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2285 {
2286 struct lxc_list *iterator;
2287 struct lxc_inetdev *inetdev;
2288 int err;
2289
2290 lxc_list_for_each(iterator, ip) {
2291
2292 inetdev = iterator->elem;
2293
2294 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2295 &inetdev->bcast, inetdev->prefix);
2296 if (err) {
2297 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2298 ifindex, strerror(-err));
2299 return -1;
2300 }
2301 }
2302
2303 return 0;
2304 }
2305
/*
 * Add every IPv6 address of the list @ip to the interface with index
 * @ifindex.
 *
 * Returns 0 on success and -1 as soon as one address cannot be added.
 */
static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
{
	struct lxc_list *cur;

	lxc_list_for_each(cur, ip) {
		struct lxc_inet6dev *dev = cur->elem;
		int err;

		err = lxc_ipv6_addr_add(ifindex, &dev->addr,
					&dev->mcast, &dev->acast,
					dev->prefix);
		if (!err)
			continue;

		ERROR("failed to setup_ipv6_addr ifindex %d : %s",
		      ifindex, strerror(-err));
		return -1;
	}

	return 0;
}
2328
2329 static int setup_netdev(struct lxc_netdev *netdev)
2330 {
2331 char ifname[IFNAMSIZ];
2332 char *current_ifname = ifname;
2333 int err;
2334
2335 /* empty network namespace */
2336 if (!netdev->ifindex) {
2337 if (netdev->flags & IFF_UP) {
2338 err = lxc_netdev_up("lo");
2339 if (err) {
2340 ERROR("failed to set the loopback up : %s",
2341 strerror(-err));
2342 return -1;
2343 }
2344 }
2345 if (netdev->type != LXC_NET_VETH)
2346 return 0;
2347 netdev->ifindex = if_nametoindex(netdev->name);
2348 }
2349
2350 /* get the new ifindex in case of physical netdev */
2351 if (netdev->type == LXC_NET_PHYS) {
2352 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2353 ERROR("failed to get ifindex for %s",
2354 netdev->link);
2355 return -1;
2356 }
2357 }
2358
2359 /* retrieve the name of the interface */
2360 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2361 ERROR("no interface corresponding to index '%d'",
2362 netdev->ifindex);
2363 return -1;
2364 }
2365
2366 /* default: let the system to choose one interface name */
2367 if (!netdev->name)
2368 netdev->name = netdev->type == LXC_NET_PHYS ?
2369 netdev->link : "eth%d";
2370
2371 /* rename the interface name */
2372 if (strcmp(ifname, netdev->name) != 0) {
2373 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2374 if (err) {
2375 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2376 strerror(-err));
2377 return -1;
2378 }
2379 }
2380
2381 /* Re-read the name of the interface because its name has changed
2382 * and would be automatically allocated by the system
2383 */
2384 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2385 ERROR("no interface corresponding to index '%d'",
2386 netdev->ifindex);
2387 return -1;
2388 }
2389
2390 /* set a mac address */
2391 if (netdev->hwaddr) {
2392 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2393 ERROR("failed to setup hw address for '%s'",
2394 current_ifname);
2395 return -1;
2396 }
2397 }
2398
2399 /* setup ipv4 addresses on the interface */
2400 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2401 ERROR("failed to setup ip addresses for '%s'",
2402 ifname);
2403 return -1;
2404 }
2405
2406 /* setup ipv6 addresses on the interface */
2407 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2408 ERROR("failed to setup ipv6 addresses for '%s'",
2409 ifname);
2410 return -1;
2411 }
2412
2413 /* set the network device up */
2414 if (netdev->flags & IFF_UP) {
2415 int err;
2416
2417 err = lxc_netdev_up(current_ifname);
2418 if (err) {
2419 ERROR("failed to set '%s' up : %s", current_ifname,
2420 strerror(-err));
2421 return -1;
2422 }
2423
2424 /* the network is up, make the loopback up too */
2425 err = lxc_netdev_up("lo");
2426 if (err) {
2427 ERROR("failed to set the loopback up : %s",
2428 strerror(-err));
2429 return -1;
2430 }
2431 }
2432
2433 /* We can only set up the default routes after bringing
2434 * up the interface, sine bringing up the interface adds
2435 * the link-local routes and we can't add a default
2436 * route if the gateway is not reachable. */
2437
2438 /* setup ipv4 gateway on the interface */
2439 if (netdev->ipv4_gateway) {
2440 if (!(netdev->flags & IFF_UP)) {
2441 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2442 return -1;
2443 }
2444
2445 if (lxc_list_empty(&netdev->ipv4)) {
2446 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2447 return -1;
2448 }
2449
2450 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2451 if (err) {
2452 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2453 if (err) {
2454 ERROR("failed to add ipv4 dest for '%s': %s",
2455 ifname, strerror(-err));
2456 }
2457
2458 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2459 if (err) {
2460 ERROR("failed to setup ipv4 gateway for '%s': %s",
2461 ifname, strerror(-err));
2462 if (netdev->ipv4_gateway_auto) {
2463 char buf[INET_ADDRSTRLEN];
2464 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2465 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2466 }
2467 return -1;
2468 }
2469 }
2470 }
2471
2472 /* setup ipv6 gateway on the interface */
2473 if (netdev->ipv6_gateway) {
2474 if (!(netdev->flags & IFF_UP)) {
2475 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2476 return -1;
2477 }
2478
2479 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2480 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2481 return -1;
2482 }
2483
2484 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2485 if (err) {
2486 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2487 if (err) {
2488 ERROR("failed to add ipv6 dest for '%s': %s",
2489 ifname, strerror(-err));
2490 }
2491
2492 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2493 if (err) {
2494 ERROR("failed to setup ipv6 gateway for '%s': %s",
2495 ifname, strerror(-err));
2496 if (netdev->ipv6_gateway_auto) {
2497 char buf[INET6_ADDRSTRLEN];
2498 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2499 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2500 }
2501 return -1;
2502 }
2503 }
2504 }
2505
2506 DEBUG("'%s' has been setup", current_ifname);
2507
2508 return 0;
2509 }
2510
2511 static int setup_network(const struct lxc_conf *conf, struct lxc_list *network)
2512 {
2513 struct lxc_list *iterator;
2514 struct lxc_netdev *netdev;
2515
2516 lxc_log_configured_netdevs(conf);
2517
2518 lxc_list_for_each(iterator, network) {
2519
2520 netdev = iterator->elem;
2521
2522 if (setup_netdev(netdev)) {
2523 ERROR("failed to setup netdev");
2524 return -1;
2525 }
2526 }
2527
2528 if (!lxc_list_empty(network))
2529 INFO("network has been setup");
2530
2531 return 0;
2532 }
2533
2534 static int parse_resource(const char *res) {
2535 size_t i;
2536 int resid = -1;
2537
2538 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2539 if (strcmp(res, limit_opt[i].name) == 0)
2540 return limit_opt[i].value;
2541 }
2542
2543 /* try to see if it's numeric, so the user may specify
2544 * resources that the running kernel knows about but
2545 * we don't */
2546 if (lxc_safe_int(res, &resid) == 0)
2547 return resid;
2548 return -1;
2549 }
2550
2551 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2552 struct lxc_list *it;
2553 struct lxc_limit *lim;
2554 int resid;
2555
2556 lxc_list_for_each(it, limits) {
2557 lim = it->elem;
2558
2559 resid = parse_resource(lim->resource);
2560 if (resid < 0) {
2561 ERROR("unknown resource %s", lim->resource);
2562 return -1;
2563 }
2564
2565 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2566 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2567 return -1;
2568 }
2569 }
2570 return 0;
2571 }
2572
2573 /* try to move physical nics to the init netns */
2574 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2575 {
2576 int i, oldfd;
2577 char ifname[IFNAMSIZ];
2578
2579 if (netnsfd < 0 || conf->num_savednics == 0)
2580 return;
2581
2582 INFO("Running to reset %d nic names.", conf->num_savednics);
2583
2584 oldfd = lxc_preserve_ns(getpid(), "net");
2585 if (oldfd < 0) {
2586 SYSERROR("Failed to open monitor netns fd.");
2587 return;
2588 }
2589
2590 if (setns(netnsfd, 0) != 0) {
2591 SYSERROR("Failed to enter container netns to reset nics");
2592 close(oldfd);
2593 return;
2594 }
2595 for (i=0; i<conf->num_savednics; i++) {
2596 struct saved_nic *s = &conf->saved_nics[i];
2597 /* retrieve the name of the interface */
2598 if (!if_indextoname(s->ifindex, ifname)) {
2599 WARN("no interface corresponding to index '%d'", s->ifindex);
2600 continue;
2601 }
2602 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2603 WARN("Error moving nic name:%s back to host netns", ifname);
2604 free(s->orig_name);
2605 }
2606 conf->num_savednics = 0;
2607
2608 if (setns(oldfd, 0) != 0)
2609 SYSERROR("Failed to re-enter monitor's netns");
2610 close(oldfd);
2611 }
2612
2613 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2614
2615 struct lxc_conf *lxc_conf_init(void)
2616 {
2617 struct lxc_conf *new;
2618 int i;
2619
2620 new = malloc(sizeof(*new));
2621 if (!new) {
2622 ERROR("lxc_conf_init : %m");
2623 return NULL;
2624 }
2625 memset(new, 0, sizeof(*new));
2626
2627 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2628 new->personality = -1;
2629 new->autodev = 1;
2630 new->console.log_path = NULL;
2631 new->console.log_fd = -1;
2632 new->console.path = NULL;
2633 new->console.peer = -1;
2634 new->console.peerpty.busy = -1;
2635 new->console.peerpty.master = -1;
2636 new->console.peerpty.slave = -1;
2637 new->console.master = -1;
2638 new->console.slave = -1;
2639 new->console.name[0] = '\0';
2640 new->maincmd_fd = -1;
2641 new->nbd_idx = -1;
2642 new->rootfs.mount = strdup(default_rootfs_mount);
2643 if (!new->rootfs.mount) {
2644 ERROR("lxc_conf_init : %m");
2645 free(new);
2646 return NULL;
2647 }
2648 new->kmsg = 0;
2649 new->logfd = -1;
2650 lxc_list_init(&new->cgroup);
2651 lxc_list_init(&new->network);
2652 lxc_list_init(&new->mount_list);
2653 lxc_list_init(&new->caps);
2654 lxc_list_init(&new->keepcaps);
2655 lxc_list_init(&new->id_map);
2656 lxc_list_init(&new->includes);
2657 lxc_list_init(&new->aliens);
2658 lxc_list_init(&new->environment);
2659 lxc_list_init(&new->limits);
2660 for (i=0; i<NUM_LXC_HOOKS; i++)
2661 lxc_list_init(&new->hooks[i]);
2662 lxc_list_init(&new->groups);
2663 new->lsm_aa_profile = NULL;
2664 new->lsm_se_context = NULL;
2665 new->tmp_umount_proc = 0;
2666
2667 for (i = 0; i < LXC_NS_MAX; i++)
2668 new->inherit_ns_fd[i] = -1;
2669
2670 /* if running in a new user namespace, init and COMMAND
2671 * default to running as UID/GID 0 when using lxc-execute */
2672 new->init_uid = 0;
2673 new->init_gid = 0;
2674
2675 return new;
2676 }
2677
2678 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2679 {
2680 char *veth1, *veth2;
2681 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2682 int bridge_index, err;
2683 unsigned int mtu = 0;
2684
2685 if (netdev->priv.veth_attr.pair) {
2686 veth1 = netdev->priv.veth_attr.pair;
2687 if (handler->conf->reboot)
2688 lxc_netdev_delete_by_name(veth1);
2689 } else {
2690 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2691 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2692 ERROR("veth1 name too long");
2693 return -1;
2694 }
2695 veth1 = lxc_mkifname(veth1buf);
2696 if (!veth1) {
2697 ERROR("failed to allocate a temporary name");
2698 return -1;
2699 }
2700 /* store away for deconf */
2701 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2702 }
2703
2704 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2705 veth2 = lxc_mkifname(veth2buf);
2706 if (!veth2) {
2707 ERROR("failed to allocate a temporary name");
2708 goto out_delete;
2709 }
2710
2711 err = lxc_veth_create(veth1, veth2);
2712 if (err) {
2713 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2714 veth2, strerror(-err));
2715 goto out_delete;
2716 }
2717
2718 /* changing the high byte of the mac address to 0xfe, the bridge interface
2719 * will always keep the host's mac address and not take the mac address
2720 * of a container */
2721 err = setup_private_host_hw_addr(veth1);
2722 if (err) {
2723 ERROR("failed to change mac address of host interface \"%s\": %s",
2724 veth1, strerror(-err));
2725 goto out_delete;
2726 }
2727
2728 netdev->ifindex = if_nametoindex(veth2);
2729 if (!netdev->ifindex) {
2730 ERROR("failed to retrieve the index for \"%s\"", veth2);
2731 goto out_delete;
2732 }
2733
2734 if (netdev->mtu) {
2735 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2736 WARN("failed to parse mtu from");
2737 else
2738 INFO("retrieved mtu %d", mtu);
2739 } else if (netdev->link) {
2740 bridge_index = if_nametoindex(netdev->link);
2741 if (bridge_index) {
2742 mtu = netdev_get_mtu(bridge_index);
2743 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2744 } else {
2745 mtu = netdev_get_mtu(netdev->ifindex);
2746 INFO("retrieved mtu %d from %s", mtu, veth2);
2747 }
2748 }
2749
2750 if (mtu) {
2751 err = lxc_netdev_set_mtu(veth1, mtu);
2752 if (!err)
2753 err = lxc_netdev_set_mtu(veth2, mtu);
2754 if (err) {
2755 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2756 "and \"%s\": %s",
2757 mtu, veth1, veth2, strerror(-err));
2758 goto out_delete;
2759 }
2760 }
2761
2762 if (netdev->link) {
2763 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
2764 if (err) {
2765 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2766 veth1, netdev->link, strerror(-err));
2767 goto out_delete;
2768 }
2769 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2770 }
2771
2772 err = lxc_netdev_up(veth1);
2773 if (err) {
2774 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2775 goto out_delete;
2776 }
2777
2778 if (netdev->upscript) {
2779 err = run_script(handler->name, "net", netdev->upscript, "up",
2780 "veth", veth1, (char*) NULL);
2781 if (err)
2782 goto out_delete;
2783 }
2784
2785 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2786 netdev->ifindex);
2787
2788 return 0;
2789
2790 out_delete:
2791 if (netdev->ifindex != 0)
2792 lxc_netdev_delete_by_name(veth1);
2793 if (!netdev->priv.veth_attr.pair)
2794 free(veth1);
2795 free(veth2);
2796 return -1;
2797 }
2798
2799 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2800 {
2801 char *veth1;
2802 int err;
2803
2804 if (netdev->priv.veth_attr.pair)
2805 veth1 = netdev->priv.veth_attr.pair;
2806 else
2807 veth1 = netdev->priv.veth_attr.veth1;
2808
2809 if (netdev->downscript) {
2810 err = run_script(handler->name, "net", netdev->downscript,
2811 "down", "veth", veth1, (char*) NULL);
2812 if (err)
2813 return -1;
2814 }
2815 return 0;
2816 }
2817
2818 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2819 {
2820 char peerbuf[IFNAMSIZ], *peer;
2821 int err;
2822
2823 if (!netdev->link) {
2824 ERROR("no link specified for macvlan netdev");
2825 return -1;
2826 }
2827
2828 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2829 if (err >= sizeof(peerbuf))
2830 return -1;
2831
2832 peer = lxc_mkifname(peerbuf);
2833 if (!peer) {
2834 ERROR("failed to make a temporary name");
2835 return -1;
2836 }
2837
2838 err = lxc_macvlan_create(netdev->link, peer,
2839 netdev->priv.macvlan_attr.mode);
2840 if (err) {
2841 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2842 peer, netdev->link, strerror(-err));
2843 goto out;
2844 }
2845
2846 netdev->ifindex = if_nametoindex(peer);
2847 if (!netdev->ifindex) {
2848 ERROR("failed to retrieve the index for %s", peer);
2849 goto out;
2850 }
2851
2852 if (netdev->upscript) {
2853 err = run_script(handler->name, "net", netdev->upscript, "up",
2854 "macvlan", netdev->link, (char*) NULL);
2855 if (err)
2856 goto out;
2857 }
2858
2859 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2860 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2861
2862 return 0;
2863 out:
2864 lxc_netdev_delete_by_name(peer);
2865 free(peer);
2866 return -1;
2867 }
2868
2869 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2870 {
2871 int err;
2872
2873 if (netdev->downscript) {
2874 err = run_script(handler->name, "net", netdev->downscript,
2875 "down", "macvlan", netdev->link,
2876 (char*) NULL);
2877 if (err)
2878 return -1;
2879 }
2880 return 0;
2881 }
2882
2883 /* XXX: merge with instantiate_macvlan */
2884 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2885 {
2886 char peer[IFNAMSIZ];
2887 int err;
2888 static uint16_t vlan_cntr = 0;
2889 unsigned int mtu = 0;
2890
2891 if (!netdev->link) {
2892 ERROR("no link specified for vlan netdev");
2893 return -1;
2894 }
2895
2896 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2897 if (err >= sizeof(peer)) {
2898 ERROR("peer name too long");
2899 return -1;
2900 }
2901
2902 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2903 if (err) {
2904 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2905 peer, netdev->link, strerror(-err));
2906 return -1;
2907 }
2908
2909 netdev->ifindex = if_nametoindex(peer);
2910 if (!netdev->ifindex) {
2911 ERROR("failed to retrieve the ifindex for %s", peer);
2912 lxc_netdev_delete_by_name(peer);
2913 return -1;
2914 }
2915
2916 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
2917 netdev->ifindex);
2918 if (netdev->mtu) {
2919 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2920 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2921 netdev->ifindex, netdev->name);
2922 return -1;
2923 }
2924 err = lxc_netdev_set_mtu(peer, mtu);
2925 if (err) {
2926 ERROR("failed to set mtu '%s' for %s : %s",
2927 netdev->mtu, peer, strerror(-err));
2928 lxc_netdev_delete_by_name(peer);
2929 return -1;
2930 }
2931 }
2932
2933 return 0;
2934 }
2935
/*
 * Teardown handler for vlan devices. Nothing is done here; it always
 * returns 0.
 */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2940
2941 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2942 {
2943 if (!netdev->link) {
2944 ERROR("no link specified for the physical interface");
2945 return -1;
2946 }
2947
2948 netdev->ifindex = if_nametoindex(netdev->link);
2949 if (!netdev->ifindex) {
2950 ERROR("failed to retrieve the index for %s", netdev->link);
2951 return -1;
2952 }
2953
2954 if (netdev->upscript) {
2955 int err;
2956 err = run_script(handler->name, "net", netdev->upscript,
2957 "up", "phys", netdev->link, (char*) NULL);
2958 if (err)
2959 return -1;
2960 }
2961
2962 return 0;
2963 }
2964
2965 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2966 {
2967 int err;
2968
2969 if (netdev->downscript) {
2970 err = run_script(handler->name, "net", netdev->downscript,
2971 "down", "phys", netdev->link, (char*) NULL);
2972 if (err)
2973 return -1;
2974 }
2975 return 0;
2976 }
2977
2978 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
2979 {
2980 netdev->ifindex = 0;
2981 return 0;
2982 }
2983
2984 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2985 {
2986 netdev->ifindex = 0;
2987 if (netdev->upscript) {
2988 int err;
2989 err = run_script(handler->name, "net", netdev->upscript,
2990 "up", "empty", (char*) NULL);
2991 if (err)
2992 return -1;
2993 }
2994 return 0;
2995 }
2996
2997 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2998 {
2999 int err;
3000
3001 if (netdev->downscript) {
3002 err = run_script(handler->name, "net", netdev->downscript,
3003 "down", "empty", (char*) NULL);
3004 if (err)
3005 return -1;
3006 }
3007 return 0;
3008 }
3009
/*
 * Teardown handler for the "none" network type. Nothing is done here; it
 * always returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3014
3015 int lxc_requests_empty_network(struct lxc_handler *handler)
3016 {
3017 struct lxc_list *network = &handler->conf->network;
3018 struct lxc_list *iterator;
3019 struct lxc_netdev *netdev;
3020 bool found_none = false, found_nic = false;
3021
3022 if (lxc_list_empty(network))
3023 return 0;
3024
3025 lxc_list_for_each(iterator, network) {
3026
3027 netdev = iterator->elem;
3028
3029 if (netdev->type == LXC_NET_NONE)
3030 found_none = true;
3031 else
3032 found_nic = true;
3033 }
3034 if (found_none && !found_nic)
3035 return 1;
3036 return 0;
3037 }
3038
3039 int lxc_create_network(struct lxc_handler *handler)
3040 {
3041 struct lxc_list *network = &handler->conf->network;
3042 struct lxc_list *iterator;
3043 struct lxc_netdev *netdev;
3044 int am_root = (getuid() == 0);
3045
3046 if (!am_root)
3047 return 0;
3048
3049 lxc_list_for_each(iterator, network) {
3050
3051 netdev = iterator->elem;
3052
3053 if (netdev->type != LXC_NET_MACVLAN && netdev->priv.macvlan_attr.mode) {
3054 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
3055 return -1;
3056 }
3057
3058 if (netdev->type != LXC_NET_VETH && netdev->priv.veth_attr.pair) {
3059 ERROR("Invalid veth pair for a non-veth netdev");
3060 return -1;
3061 }
3062
3063 if (netdev->type != LXC_NET_VLAN && netdev->priv.vlan_attr.vid > 0) {
3064 ERROR("Invalid vlan.id for a non-macvlan netdev");
3065 return -1;
3066 }
3067
3068 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3069 ERROR("invalid network configuration type '%d'",
3070 netdev->type);
3071 return -1;
3072 }
3073
3074 if (netdev_conf[netdev->type](handler, netdev)) {
3075 ERROR("failed to create netdev");
3076 return -1;
3077 }
3078
3079 }
3080
3081 return 0;
3082 }
3083
3084 bool lxc_delete_network(struct lxc_handler *handler)
3085 {
3086 int ret;
3087 struct lxc_list *network = &handler->conf->network;
3088 struct lxc_list *iterator;
3089 struct lxc_netdev *netdev;
3090 bool deleted_all = true;
3091
3092 lxc_list_for_each(iterator, network) {
3093 netdev = iterator->elem;
3094
3095 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3096 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3097 WARN("Failed to rename interface with index %d "
3098 "to its initial name \"%s\".",
3099 netdev->ifindex, netdev->link);
3100 continue;
3101 }
3102
3103 if (netdev_deconf[netdev->type](handler, netdev)) {
3104 WARN("Failed to destroy netdev");
3105 }
3106
3107 /* Recent kernel remove the virtual interfaces when the network
3108 * namespace is destroyed but in case we did not moved the
3109 * interface to the network namespace, we have to destroy it
3110 */
3111 if (netdev->ifindex != 0) {
3112 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3113 if (-ret == ENODEV) {
3114 INFO("Interface \"%s\" with index %d already "
3115 "deleted or existing in different network "
3116 "namespace.",
3117 netdev->name ? netdev->name : "(null)",
3118 netdev->ifindex);
3119 } else if (ret < 0) {
3120 deleted_all = false;
3121 WARN("Failed to remove interface \"%s\" with "
3122 "index %d: %s.",
3123 netdev->name ? netdev->name : "(null)",
3124 netdev->ifindex, strerror(-ret));
3125 } else {
3126 INFO("Removed interface \"%s\" with index %d.",
3127 netdev->name ? netdev->name : "(null)",
3128 netdev->ifindex);
3129 }
3130 }
3131
3132 /* Explicitly delete host veth device to prevent lingering
3133 * devices. We had issues in LXD around this.
3134 */
3135 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
3136 char *hostveth;
3137 if (netdev->priv.veth_attr.pair) {
3138 hostveth = netdev->priv.veth_attr.pair;
3139 ret = lxc_netdev_delete_by_name(hostveth);
3140 if (ret < 0) {
3141 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3142 } else {
3143 INFO("Removed interface \"%s\" from host.", hostveth);
3144 }
3145 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
3146 hostveth = netdev->priv.veth_attr.veth1;
3147 ret = lxc_netdev_delete_by_name(hostveth);
3148 if (ret < 0) {
3149 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3150 } else {
3151 INFO("Removed interface \"%s\" from host.", hostveth);
3152 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3153 }
3154 }
3155 }
3156 }
3157
3158 return deleted_all;
3159 }
3160
3161 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3162
3163 /* lxc-user-nic returns "interface_name:interface_name\n" */
3164 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3165 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3166 struct lxc_netdev *netdev, pid_t pid)
3167 {
3168 pid_t child;
3169 int bytes, pipefd[2];
3170 char *token, *saveptr = NULL;
3171 char buffer[MAX_BUFFER_SIZE];
3172 char netdev_link[IFNAMSIZ + 1];
3173
3174 if (netdev->type != LXC_NET_VETH) {
3175 ERROR("nic type %d not support for unprivileged use",
3176 netdev->type);
3177 return -1;
3178 }
3179
3180 if (pipe(pipefd) < 0) {
3181 SYSERROR("pipe failed");
3182 return -1;
3183 }
3184
3185 child = fork();
3186 if (child < 0) {
3187 SYSERROR("fork");
3188 close(pipefd[0]);
3189 close(pipefd[1]);
3190 return -1;
3191 }
3192
3193 if (child == 0) { // child
3194 /* Call lxc-user-nic pid type bridge. */
3195 int ret;
3196 char pidstr[LXC_NUMSTRLEN64];
3197
3198 close(pipefd[0]); /* Close the read-end of the pipe. */
3199
3200 /* Redirect stdout to write-end of the pipe. */
3201 ret = dup2(pipefd[1], STDOUT_FILENO);
3202 close(pipefd[1]); /* Close the write-end of the pipe. */
3203 if (ret < 0) {
3204 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3205 exit(EXIT_FAILURE);
3206 }
3207
3208 if (netdev->link)
3209 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3210 else
3211 strncpy(netdev_link, "none", IFNAMSIZ);
3212
3213 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3214 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3215 exit(EXIT_FAILURE);
3216 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3217
3218 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3219 lxcname, pidstr, netdev_link, netdev->name);
3220 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
3221 pidstr, "veth", netdev_link, netdev->name, NULL);
3222
3223 SYSERROR("Failed to exec lxc-user-nic.");
3224 exit(EXIT_FAILURE);
3225 }
3226
3227 /* close the write-end of the pipe */
3228 close(pipefd[1]);
3229
3230 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3231 if (bytes < 0)
3232 SYSERROR("Failed to read from pipe file descriptor.");
3233 buffer[bytes - 1] = '\0';
3234
3235 if (wait_for_pid(child) != 0) {
3236 close(pipefd[0]);
3237 return -1;
3238 }
3239
3240 /* close the read-end of the pipe */
3241 close(pipefd[0]);
3242
3243 /* fill netdev->name field */
3244 token = strtok_r(buffer, ":", &saveptr);
3245 if (!token)
3246 return -1;
3247
3248 netdev->name = malloc(IFNAMSIZ + 1);
3249 if (!netdev->name) {
3250 SYSERROR("Failed to allocate memory.");
3251 return -1;
3252 }
3253 memset(netdev->name, 0, IFNAMSIZ + 1);
3254 strncpy(netdev->name, token, IFNAMSIZ);
3255
3256 /* fill netdev->veth_attr.pair field */
3257 token = strtok_r(NULL, ":", &saveptr);
3258 if (!token)
3259 return -1;
3260
3261 netdev->priv.veth_attr.pair = strdup(token);
3262 if (!netdev->priv.veth_attr.pair) {
3263 ERROR("Failed to allocate memory.");
3264 return -1;
3265 }
3266
3267 return 0;
3268 }
3269
3270 int lxc_assign_network(const char *lxcpath, char *lxcname,
3271 struct lxc_list *network, pid_t pid)
3272 {
3273 struct lxc_list *iterator;
3274 struct lxc_netdev *netdev;
3275 char ifname[IFNAMSIZ];
3276 int am_root = (getuid() == 0);
3277 int err;
3278
3279 lxc_list_for_each(iterator, network) {
3280
3281 netdev = iterator->elem;
3282
3283 if (netdev->type == LXC_NET_VETH && !am_root) {
3284 if (netdev->mtu)
3285 INFO("mtu ignored due to insufficient privilege");
3286 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3287 return -1;
3288 // lxc-user-nic has moved the nic to the new ns.
3289 // unpriv_assign_nic() fills in netdev->name.
3290 // netdev->ifindex will be filed in at setup_netdev.
3291 continue;
3292 }
3293
3294 /* empty network namespace, nothing to move */
3295 if (!netdev->ifindex)
3296 continue;
3297
3298 /* retrieve the name of the interface */
3299 if (!if_indextoname(netdev->ifindex, ifname)) {
3300 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3301 return -1;
3302 }
3303
3304 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3305 if (err) {
3306 ERROR("failed to move '%s' to the container : %s",
3307 netdev->link, strerror(-err));
3308 return -1;
3309 }
3310
3311 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3312 }
3313
3314 return 0;
3315 }
3316
3317 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3318 size_t buf_size)
3319 {
3320 char path[MAXPATHLEN];
3321 int fd, ret;
3322
3323 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3324 idtype == ID_TYPE_UID ? 'u' : 'g');
3325 if (ret < 0 || ret >= MAXPATHLEN) {
3326 ERROR("failed to create path \"%s\"", path);
3327 return -E2BIG;
3328 }
3329
3330 fd = open(path, O_WRONLY);
3331 if (fd < 0) {
3332 SYSERROR("failed to open \"%s\"", path);
3333 return -1;
3334 }
3335
3336 errno = 0;
3337 ret = lxc_write_nointr(fd, buf, buf_size);
3338 if (ret != buf_size) {
3339 SYSERROR("failed to write %cid mapping to \"%s\"",
3340 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3341 close(fd);
3342 return -1;
3343 }
3344 close(fd);
3345
3346 return 0;
3347 }
3348
3349 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3350 *
3351 * @return 1 if functional binary was found
3352 * @return 0 if binary exists but is lacking privilege
3353 * @return -ENOENT if binary does not exist
3354 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3355 *
3356 */
3357 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3358 {
3359 char *path;
3360 int ret;
3361 struct stat st;
3362 int fret = 0;
3363
3364 if (cap != CAP_SETUID && cap != CAP_SETGID)
3365 return -EINVAL;
3366
3367 path = on_path(binary, NULL);
3368 if (!path)
3369 return -ENOENT;
3370
3371 ret = stat(path, &st);
3372 if (ret < 0) {
3373 fret = -errno;
3374 goto cleanup;
3375 }
3376
3377 /* Check if the binary is setuid. */
3378 if (st.st_mode & S_ISUID) {
3379 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3380 fret = 1;
3381 goto cleanup;
3382 }
3383
3384 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3385 /* Check if it has the CAP_SETUID capability. */
3386 if ((cap & CAP_SETUID) &&
3387 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3388 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3389 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3390 "and CAP_PERMITTED sets.", path);
3391 fret = 1;
3392 goto cleanup;
3393 }
3394
3395 /* Check if it has the CAP_SETGID capability. */
3396 if ((cap & CAP_SETGID) &&
3397 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3398 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3399 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3400 "and CAP_PERMITTED sets.", path);
3401 fret = 1;
3402 goto cleanup;
3403 }
3404 #else
3405 /* If we cannot check for file capabilities we need to give the benefit
3406 * of the doubt. Otherwise we might fail even though all the necessary
3407 * file capabilities are set.
3408 */
3409 DEBUG("Cannot check for file capabilites as full capability support is "
3410 "missing. Manual intervention needed.");
3411 fret = 1;
3412 #endif
3413
3414 cleanup:
3415 free(path);
3416 return fret;
3417 }
3418
/*
 * Execute the command line passed in @args through "/bin/sh -c".
 *
 * Only returns (with -1) when execl() itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	return -1;
}
3424
3425 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3426 {
3427 struct id_map *map;
3428 struct lxc_list *iterator;
3429 enum idtype type;
3430 char u_or_g;
3431 char *pos;
3432 int fill, left;
3433 char cmd_output[MAXPATHLEN];
3434 /* strlen("new@idmap") = 9
3435 * +
3436 * strlen(" ") = 1
3437 * +
3438 * LXC_NUMSTRLEN64
3439 * +
3440 * strlen(" ") = 1
3441 *
3442 * We add some additional space to make sure that we really have
3443 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3444 */
3445 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3446 int ret = 0, uidmap = 0, gidmap = 0;
3447 bool use_shadow = false, had_entry = false;
3448
3449 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3450 * ranges, then insist that root also reserve ranges in subuid. This
3451 * will protected it by preventing another user from being handed the
3452 * range by shadow.
3453 */
3454 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3455 if (uidmap == -ENOENT)
3456 WARN("newuidmap binary is missing");
3457 else if (!uidmap)
3458 WARN("newuidmap is lacking necessary privileges");
3459
3460 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3461 if (gidmap == -ENOENT)
3462 WARN("newgidmap binary is missing");
3463 else if (!gidmap)
3464 WARN("newgidmap is lacking necessary privileges");
3465
3466 if (uidmap > 0 && gidmap > 0) {
3467 DEBUG("Functional newuidmap and newgidmap binary found.");
3468 use_shadow = true;
3469 } else {
3470 /* In case unprivileged users run application containers via
3471 * execute() or a start*() there are valid cases where they may
3472 * only want to map their own {g,u}id. Let's not block them from
3473 * doing so by requiring geteuid() == 0.
3474 */
3475 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3476 "write directly with euid %d.", geteuid());
3477 }
3478
3479 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3480 type++, u_or_g = 'g') {
3481 pos = mapbuf;
3482
3483 if (use_shadow)
3484 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3485
3486 lxc_list_for_each(iterator, idmap) {
3487 /* The kernel only takes <= 4k for writes to
3488 * /proc/<nr>/[ug]id_map
3489 */
3490 map = iterator->elem;
3491 if (map->idtype != type)
3492 continue;
3493
3494 had_entry = true;
3495
3496 left = LXC_IDMAPLEN - (pos - mapbuf);
3497 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3498 use_shadow ? " " : "", map->nsid,
3499 map->hostid, map->range,
3500 use_shadow ? "" : "\n");
3501 if (fill <= 0 || fill >= left)
3502 SYSERROR("Too many {g,u}id mappings defined.");
3503
3504 pos += fill;
3505 }
3506 if (!had_entry)
3507 continue;
3508
3509 /* Try to catch the ouput of new{g,u}idmap to make debugging
3510 * easier.
3511 */
3512 if (use_shadow) {
3513 ret = run_command(cmd_output, sizeof(cmd_output),
3514 lxc_map_ids_exec_wrapper,
3515 (void *)mapbuf);
3516 if (ret < 0) {
3517 ERROR("new%cidmap failed to write mapping: %s",
3518 u_or_g, cmd_output);
3519 return -1;
3520 }
3521 } else {
3522 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3523 if (ret < 0)
3524 return -1;
3525 }
3526
3527 memset(mapbuf, 0, sizeof(mapbuf));
3528 }
3529
3530 return 0;
3531 }
3532
3533 /*
3534 * return the host uid/gid to which the container root is mapped in
3535 * *val.
3536 * Return true if id was found, false otherwise.
3537 */
3538 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3539 unsigned long *val)
3540 {
3541 struct lxc_list *it;
3542 struct id_map *map;
3543
3544 lxc_list_for_each(it, &conf->id_map) {
3545 map = it->elem;
3546 if (map->idtype != idtype)
3547 continue;
3548 if (map->nsid != 0)
3549 continue;
3550 *val = map->hostid;
3551 return true;
3552 }
3553 return false;
3554 }
3555
3556 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3557 {
3558 struct lxc_list *it;
3559 struct id_map *map;
3560 lxc_list_for_each(it, &conf->id_map) {
3561 map = it->elem;
3562 if (map->idtype != idtype)
3563 continue;
3564 if (id >= map->hostid && id < map->hostid + map->range)
3565 return (id - map->hostid) + map->nsid;
3566 }
3567 return -1;
3568 }
3569
3570 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3571 {
3572 struct lxc_list *it;
3573 struct id_map *map;
3574 unsigned int freeid = 0;
3575 again:
3576 lxc_list_for_each(it, &conf->id_map) {
3577 map = it->elem;
3578 if (map->idtype != idtype)
3579 continue;
3580 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3581 freeid = map->nsid + map->range;
3582 goto again;
3583 }
3584 }
3585 return freeid;
3586 }
3587
3588 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3589 {
3590 struct lxc_list *network = &handler->conf->network;
3591 struct lxc_list *iterator;
3592 struct lxc_netdev *netdev;
3593 int link_index;
3594
3595 lxc_list_for_each(iterator, network) {
3596 netdev = iterator->elem;
3597
3598 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3599 continue;
3600
3601 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3602 ERROR("gateway = auto only supported for "
3603 "veth and macvlan");
3604 return -1;
3605 }
3606
3607 if (!netdev->link) {
3608 ERROR("gateway = auto needs a link interface");
3609 return -1;
3610 }
3611
3612 link_index = if_nametoindex(netdev->link);
3613 if (!link_index)
3614 return -EINVAL;
3615
3616 if (netdev->ipv4_gateway_auto) {
3617 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3618 ERROR("failed to automatically find ipv4 gateway "
3619 "address from link interface '%s'", netdev->link);
3620 return -1;
3621 }
3622 }
3623
3624 if (netdev->ipv6_gateway_auto) {
3625 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3626 ERROR("failed to automatically find ipv6 gateway "
3627 "address from link interface '%s'", netdev->link);
3628 return -1;
3629 }
3630 }
3631 }
3632
3633 return 0;
3634 }
3635
3636 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3637 {
3638 struct lxc_tty_info *tty_info = &conf->tty_info;
3639 int i, ret;
3640
3641 /* no tty in the configuration */
3642 if (!conf->tty)
3643 return 0;
3644
3645 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3646 if (!tty_info->pty_info) {
3647 SYSERROR("failed to allocate struct *pty_info");
3648 return -ENOMEM;
3649 }
3650
3651 for (i = 0; i < conf->tty; i++) {
3652 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3653
3654 process_lock();
3655 ret = openpty(&pty_info->master, &pty_info->slave,
3656 pty_info->name, NULL, NULL);
3657 process_unlock();
3658 if (ret) {
3659 SYSERROR("failed to create pty device number %d", i);
3660 tty_info->nbtty = i;
3661 lxc_delete_tty(tty_info);
3662 return -ENOTTY;
3663 }
3664
3665 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3666 pty_info->name, pty_info->master, pty_info->slave);
3667
3668 /* Prevent leaking the file descriptors to the container */
3669 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3670 if (ret < 0)
3671 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3672 "pty device \"%s\": %s",
3673 pty_info->master, pty_info->name, strerror(errno));
3674
3675 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3676 if (ret < 0)
3677 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3678 "pty device \"%s\": %s",
3679 pty_info->slave, pty_info->name, strerror(errno));
3680
3681 pty_info->busy = 0;
3682 }
3683
3684 tty_info->nbtty = conf->tty;
3685
3686 INFO("finished allocating %d pts devices", conf->tty);
3687 return 0;
3688 }
3689
3690 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3691 {
3692 int i;
3693
3694 for (i = 0; i < tty_info->nbtty; i++) {
3695 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3696
3697 close(pty_info->master);
3698 close(pty_info->slave);
3699 }
3700
3701 free(tty_info->pty_info);
3702 tty_info->pty_info = NULL;
3703 tty_info->nbtty = 0;
3704 }
3705
3706
/* Exec helper handed to run_command() by chown_mapped_root(): @args is the
 * NULL-terminated argument vector for lxc-usernsexec. It only returns, with
 * -1, when execvp() itself fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char *const *argv = args;

	execvp("lxc-usernsexec", argv);
	return -1;
}
3712
3713 /*
3714 * chown_mapped_root: for an unprivileged user with uid/gid X to
3715 * chown a dir to subuid/subgid Y, he needs to run chown as root
3716 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3717 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3718 * root is privileged with respect to hostuid/hostgid X, allowing
3719 * him to do the chown.
3720 */
3721 int chown_mapped_root(char *path, struct lxc_conf *conf)
3722 {
3723 uid_t rootuid, rootgid;
3724 unsigned long val;
3725 char *chownpath = path;
3726 int hostuid, hostgid, ret;
3727 struct stat sb;
3728 char map1[100], map2[100], map3[100], map4[100], map5[100];
3729 char ugid[100];
3730 char *args1[] = {"lxc-usernsexec",
3731 "-m", map1,
3732 "-m", map2,
3733 "-m", map3,
3734 "-m", map5,
3735 "--", "chown", ugid, path,
3736 NULL};
3737 char *args2[] = {"lxc-usernsexec",
3738 "-m", map1,
3739 "-m", map2,
3740 "-m", map3,
3741 "-m", map4,
3742 "-m", map5,
3743 "--", "chown", ugid, path,
3744 NULL};
3745 char cmd_output[MAXPATHLEN];
3746
3747 hostuid = geteuid();
3748 hostgid = getegid();
3749
3750 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3751 ERROR("No uid mapping for container root");
3752 return -1;
3753 }
3754 rootuid = (uid_t)val;
3755 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3756 ERROR("No gid mapping for container root");
3757 return -1;
3758 }
3759 rootgid = (gid_t)val;
3760
3761 /*
3762 * In case of overlay, we want only the writeable layer to be chowned
3763 */
3764 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3765 chownpath = strchr(path, ':');
3766 if (!chownpath) {
3767 ERROR("Bad overlay path: %s", path);
3768 return -1;
3769 }
3770 chownpath = strchr(chownpath + 1, ':');
3771 if (!chownpath) {
3772 ERROR("Bad overlay path: %s", path);
3773 return -1;
3774 }
3775 chownpath++;
3776 }
3777 path = chownpath;
3778 if (hostuid == 0) {
3779 if (chown(path, rootuid, rootgid) < 0) {
3780 ERROR("Error chowning %s", path);
3781 return -1;
3782 }
3783 return 0;
3784 }
3785
3786 if (rootuid == hostuid) {
3787 // nothing to do
3788 INFO("%s: container root is our uid; no need to chown" ,__func__);
3789 return 0;
3790 }
3791
3792 /* save the current gid of "path" */
3793 if (stat(path, &sb) < 0) {
3794 ERROR("Error stat %s", path);
3795 return -1;
3796 }
3797
3798 /* Update the path argument in case this was overlayfs. */
3799 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3800 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3801
3802 /*
3803 * A file has to be group-owned by a gid mapped into the
3804 * container, or the container won't be privileged over it.
3805 */
3806 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3807 if (sb.st_uid == hostuid &&
3808 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3809 chown(path, -1, hostgid) < 0) {
3810 ERROR("Failed chgrping %s", path);
3811 return -1;
3812 }
3813
3814 // "u:0:rootuid:1"
3815 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3816 if (ret < 0 || ret >= 100) {
3817 ERROR("Error uid printing map string");
3818 return -1;
3819 }
3820
3821 // "u:hostuid:hostuid:1"
3822 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3823 if (ret < 0 || ret >= 100) {
3824 ERROR("Error uid printing map string");
3825 return -1;
3826 }
3827
3828 // "g:0:rootgid:1"
3829 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3830 if (ret < 0 || ret >= 100) {
3831 ERROR("Error gid printing map string");
3832 return -1;
3833 }
3834
3835 // "g:pathgid:rootgid+pathgid:1"
3836 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3837 rootgid + (gid_t)sb.st_gid);
3838 if (ret < 0 || ret >= 100) {
3839 ERROR("Error gid printing map string");
3840 return -1;
3841 }
3842
3843 // "g:hostgid:hostgid:1"
3844 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3845 if (ret < 0 || ret >= 100) {
3846 ERROR("Error gid printing map string");
3847 return -1;
3848 }
3849
3850 // "0:pathgid" (chown)
3851 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3852 if (ret < 0 || ret >= 100) {
3853 ERROR("Error owner printing format string for chown");
3854 return -1;
3855 }
3856
3857 if (hostgid == sb.st_gid)
3858 ret = run_command(cmd_output, sizeof(cmd_output),
3859 chown_mapped_root_exec_wrapper,
3860 (void *)args1);
3861 else
3862 ret = run_command(cmd_output, sizeof(cmd_output),
3863 chown_mapped_root_exec_wrapper,
3864 (void *)args2);
3865 if (ret < 0)
3866 ERROR("lxc-usernsexec failed: %s", cmd_output);
3867
3868 return ret;
3869 }
3870
3871 int lxc_ttys_shift_ids(struct lxc_conf *c)
3872 {
3873 if (lxc_list_empty(&c->id_map))
3874 return 0;
3875
3876 if (!strcmp(c->console.name, ""))
3877 return 0;
3878
3879 if (chown_mapped_root(c->console.name, c) < 0) {
3880 ERROR("failed to chown console \"%s\"", c->console.name);
3881 return -1;
3882 }
3883
3884 TRACE("chowned console \"%s\"", c->console.name);
3885
3886 return 0;
3887 }
3888
3889 /* NOTE: Must not be called from inside the container namespace! */
3890 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3891 {
3892 int mounted;
3893
3894 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3895 if (mounted == -1) {
3896 SYSERROR("failed to mount /proc in the container");
3897 /* continue only if there is no rootfs */
3898 if (conf->rootfs.path)
3899 return -1;
3900 } else if (mounted == 1) {
3901 conf->tmp_umount_proc = 1;
3902 }
3903
3904 return 0;
3905 }
3906
3907 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3908 {
3909 if (lxc_conf->tmp_umount_proc == 1) {
3910 umount("/proc");
3911 lxc_conf->tmp_umount_proc = 0;
3912 }
3913 }
3914
/*
 * Walk /proc/self/mountinfo and change every mount whose inspected field
 * contains "shared" to MS_SLAVE.
 *
 * Failures are logged but never abort container startup.
 */
void remount_all_slave(void)
{
	FILE *mountinfo;
	char *buf = NULL;
	size_t buflen = 0;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &buflen, mountinfo) != -1) {
		char *mnt, *tags;

		/* NOTE(review): presumably field 4 is the mount point and the
		 * field two further on holds the propagation tags - confirm
		 * against get_field() and the mountinfo format.
		 */
		mnt = get_field(buf, 4);
		if (!mnt)
			continue;

		tags = get_field(mnt, 2);
		if (!tags)
			continue;

		null_endofword(tags);
		if (!strstr(tags, "shared"))
			continue;

		null_endofword(mnt);
		if (mount(NULL, mnt, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", mnt);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3948
3949 void lxc_execute_bind_init(struct lxc_conf *conf)
3950 {
3951 int ret;
3952 char path[PATH_MAX], destpath[PATH_MAX], *p;
3953
3954 /* If init exists in the container, don't bind mount a static one */
3955 p = choose_init(conf->rootfs.mount);
3956 if (p) {
3957 free(p);
3958 return;
3959 }
3960
3961 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3962 if (ret < 0 || ret >= PATH_MAX) {
3963 WARN("Path name too long searching for lxc.init.static");
3964 return;
3965 }
3966
3967 if (!file_exists(path)) {
3968 INFO("%s does not exist on host", path);
3969 return;
3970 }
3971
3972 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3973 if (ret < 0 || ret >= PATH_MAX) {
3974 WARN("Path name too long for container's lxc.init.static");
3975 return;
3976 }
3977
3978 if (!file_exists(destpath)) {
3979 FILE * pathfile = fopen(destpath, "wb");
3980 if (!pathfile) {
3981 SYSERROR("Failed to create mount target '%s'", destpath);
3982 return;
3983 }
3984 fclose(pathfile);
3985 }
3986
3987 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3988 if (ret < 0)
3989 SYSERROR("Failed to bind lxc.init.static into container");
3990 INFO("lxc.init.static bound into container at %s", path);
3991 }
3992
3993 /*
3994 * This does the work of remounting / if it is shared, calling the
3995 * container pre-mount hooks, and mounting the rootfs.
3996 */
3997 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3998 {
3999 if (conf->rootfs_setup) {
4000 /*
4001 * rootfs was set up in another namespace. bind-mount it
4002 * to give us a mount in our own ns so we can pivot_root to it
4003 */
4004 const char *path = conf->rootfs.mount;
4005 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4006 ERROR("Failed to bind-mount container / onto itself");
4007 return -1;
4008 }
4009 return 0;
4010 }
4011
4012 remount_all_slave();
4013
4014 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4015 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4016 return -1;
4017 }
4018
4019 if (lxc_setup_rootfs(conf)) {
4020 ERROR("failed to setup rootfs for '%s'", name);
4021 return -1;
4022 }
4023
4024 conf->rootfs_setup = true;
4025 return 0;
4026 }
4027
4028 static bool verify_start_hooks(struct lxc_conf *conf)
4029 {
4030 struct lxc_list *it;
4031 char path[MAXPATHLEN];
4032 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4033 char *hookname = it->elem;
4034 struct stat st;
4035 int ret;
4036
4037 ret = snprintf(path, MAXPATHLEN, "%s%s",
4038 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4039 if (ret < 0 || ret >= MAXPATHLEN)
4040 return false;
4041 ret = stat(path, &st);
4042 if (ret) {
4043 SYSERROR("Start hook %s not found in container",
4044 hookname);
4045 return false;
4046 }
4047 return true;
4048 }
4049
4050 return true;
4051 }
4052
4053 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4054 {
4055 int i;
4056 int *ttyfds;
4057 struct lxc_pty_info *pty_info;
4058 struct lxc_conf *conf = handler->conf;
4059 const struct lxc_tty_info *tty_info = &conf->tty_info;
4060 int sock = handler->ttysock[0];
4061 int ret = -1;
4062 size_t num_ttyfds = (2 * conf->tty);
4063
4064 ttyfds = malloc(num_ttyfds * sizeof(int));
4065 if (!ttyfds)
4066 return -1;
4067
4068 for (i = 0; i < num_ttyfds; i++) {
4069 pty_info = &tty_info->pty_info[i / 2];
4070 ttyfds[i++] = pty_info->slave;
4071 ttyfds[i] = pty_info->master;
4072 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4073 "parent",
4074 pty_info->name, pty_info->master, pty_info->slave);
4075 }
4076
4077 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4078 if (ret < 0)
4079 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4080 strerror(errno));
4081 else
4082 TRACE("sent %d ttys to parent", conf->tty);
4083
4084 close(handler->ttysock[0]);
4085 close(handler->ttysock[1]);
4086
4087 for (i = 0; i < num_ttyfds; i++)
4088 close(ttyfds[i]);
4089
4090 free(ttyfds);
4091
4092 return ret;
4093 }
4094
4095 int lxc_setup(struct lxc_handler *handler)
4096 {
4097 const char *name = handler->name;
4098 struct lxc_conf *lxc_conf = handler->conf;
4099 const char *lxcpath = handler->lxcpath;
4100
4101 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4102 ERROR("Error setting up rootfs mount after spawn");
4103 return -1;
4104 }
4105
4106 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4107 if (setup_utsname(lxc_conf->utsname)) {
4108 ERROR("failed to setup the utsname for '%s'", name);
4109 return -1;
4110 }
4111 }
4112
4113 if (setup_network(lxc_conf, &lxc_conf->network)) {
4114 ERROR("failed to setup the network for '%s'", name);
4115 return -1;
4116 }
4117
4118 if (lxc_conf->autodev > 0) {
4119 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4120 ERROR("failed to mount /dev in the container");
4121 return -1;
4122 }
4123 }
4124
4125 /* do automatic mounts (mainly /proc and /sys), but exclude
4126 * those that need to wait until other stuff has finished
4127 */
4128 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4129 ERROR("failed to setup the automatic mounts for '%s'", name);
4130 return -1;
4131 }
4132
4133 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4134 ERROR("failed to setup the mounts for '%s'", name);
4135 return -1;
4136 }
4137
4138 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4139 ERROR("failed to setup the mount entries for '%s'", name);
4140 return -1;
4141 }
4142
4143 /* Make sure any start hooks are in the container */
4144 if (!verify_start_hooks(lxc_conf))
4145 return -1;
4146
4147 if (lxc_conf->is_execute)
4148 lxc_execute_bind_init(lxc_conf);
4149
4150 /* now mount only cgroup, if wanted;
4151 * before, /sys could not have been mounted
4152 * (is either mounted automatically or via fstab entries)
4153 */
4154 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4155 ERROR("failed to setup the automatic mounts for '%s'", name);
4156 return -1;
4157 }
4158
4159 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4160 ERROR("failed to run mount hooks for container '%s'.", name);
4161 return -1;
4162 }
4163
4164 if (lxc_conf->autodev > 0) {
4165 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4166 ERROR("failed to run autodev hooks for container '%s'.", name);
4167 return -1;
4168 }
4169 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4170 ERROR("failed to populate /dev in the container");
4171 return -1;
4172 }
4173 }
4174
4175 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4176 ERROR("failed to setup the console for '%s'", name);
4177 return -1;
4178 }
4179
4180 if (lxc_conf->kmsg) {
4181 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4182 ERROR("failed to setup kmsg for '%s'", name);
4183 }
4184
4185 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4186 ERROR("failed to setup /dev symlinks for '%s'", name);
4187 return -1;
4188 }
4189
4190 /* mount /proc if it's not already there */
4191 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4192 ERROR("failed to LSM mount proc for '%s'", name);
4193 return -1;
4194 }
4195
4196 if (setup_pivot_root(&lxc_conf->rootfs)) {
4197 ERROR("failed to set rootfs for '%s'", name);
4198 return -1;
4199 }
4200
4201 if (lxc_setup_devpts(lxc_conf->pts)) {
4202 ERROR("failed to setup the new pts instance");
4203 return -1;
4204 }
4205
4206 if (lxc_create_tty(name, lxc_conf)) {
4207 ERROR("failed to create the ttys");
4208 return -1;
4209 }
4210
4211 if (lxc_send_ttys_to_parent(handler) < 0) {
4212 ERROR("failure sending console info to parent");
4213 return -1;
4214 }
4215
4216 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4217 ERROR("failed to setup the ttys for '%s'", name);
4218 return -1;
4219 }
4220
4221 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4222 SYSERROR("failed to set environment variable for container ptys");
4223
4224
4225 if (setup_personality(lxc_conf->personality)) {
4226 ERROR("failed to setup personality");
4227 return -1;
4228 }
4229
4230 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4231 if (!lxc_list_empty(&lxc_conf->caps)) {
4232 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4233 return -1;
4234 }
4235 if (dropcaps_except(&lxc_conf->keepcaps)) {
4236 ERROR("failed to keep requested caps");
4237 return -1;
4238 }
4239 } else if (setup_caps(&lxc_conf->caps)) {
4240 ERROR("failed to drop capabilities");
4241 return -1;
4242 }
4243
4244 NOTICE("'%s' is setup.", name);
4245
4246 return 0;
4247 }
4248
4249 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4250 const char *lxcpath, char *argv[])
4251 {
4252 int which = -1;
4253 struct lxc_list *it;
4254
4255 if (strcmp(hook, "pre-start") == 0)
4256 which = LXCHOOK_PRESTART;
4257 else if (strcmp(hook, "pre-mount") == 0)
4258 which = LXCHOOK_PREMOUNT;
4259 else if (strcmp(hook, "mount") == 0)
4260 which = LXCHOOK_MOUNT;
4261 else if (strcmp(hook, "autodev") == 0)
4262 which = LXCHOOK_AUTODEV;
4263 else if (strcmp(hook, "start") == 0)
4264 which = LXCHOOK_START;
4265 else if (strcmp(hook, "stop") == 0)
4266 which = LXCHOOK_STOP;
4267 else if (strcmp(hook, "post-stop") == 0)
4268 which = LXCHOOK_POSTSTOP;
4269 else if (strcmp(hook, "clone") == 0)
4270 which = LXCHOOK_CLONE;
4271 else if (strcmp(hook, "destroy") == 0)
4272 which = LXCHOOK_DESTROY;
4273 else
4274 return -1;
4275 lxc_list_for_each(it, &conf->hooks[which]) {
4276 int ret;
4277 char *hookname = it->elem;
4278 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4279 if (ret)
4280 return ret;
4281 }
4282 return 0;
4283 }
4284
4285 int lxc_clear_config_caps(struct lxc_conf *c)
4286 {
4287 struct lxc_list *it,*next;
4288
4289 lxc_list_for_each_safe(it, &c->caps, next) {
4290 lxc_list_del(it);
4291 free(it->elem);
4292 free(it);
4293 }
4294 return 0;
4295 }
4296
4297 static int lxc_free_idmap(struct lxc_list *id_map) {
4298 struct lxc_list *it, *next;
4299
4300 lxc_list_for_each_safe(it, id_map, next) {
4301 lxc_list_del(it);
4302 free(it->elem);
4303 free(it);
4304 }
4305 return 0;
4306 }
4307
4308 int lxc_clear_idmaps(struct lxc_conf *c)
4309 {
4310 return lxc_free_idmap(&c->id_map);
4311 }
4312
4313 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4314 {
4315 struct lxc_list *it,*next;
4316
4317 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4318 lxc_list_del(it);
4319 free(it->elem);
4320 free(it);
4321 }
4322 return 0;
4323 }
4324
4325 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4326 {
4327 struct lxc_list *it,*next;
4328 bool all = false;
4329 const char *k = NULL;
4330
4331 if (strcmp(key, "lxc.cgroup") == 0)
4332 all = true;
4333 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4334 k = key + sizeof("lxc.cgroup.")-1;
4335 else
4336 return -1;
4337
4338 lxc_list_for_each_safe(it, &c->cgroup, next) {
4339 struct lxc_cgroup *cg = it->elem;
4340 if (!all && strcmp(cg->subsystem, k) != 0)
4341 continue;
4342 lxc_list_del(it);
4343 free(cg->subsystem);
4344 free(cg->value);
4345 free(cg);
4346 free(it);
4347 }
4348 return 0;
4349 }
4350
4351 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4352 {
4353 struct lxc_list *it, *next;
4354 bool all = false;
4355 const char *k = NULL;
4356
4357 if (strcmp(key, "lxc.limit") == 0)
4358 all = true;
4359 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4360 k = key + sizeof("lxc.limit.")-1;
4361 else
4362 return -1;
4363
4364 lxc_list_for_each_safe(it, &c->limits, next) {
4365 struct lxc_limit *lim = it->elem;
4366 if (!all && strcmp(lim->resource, k) != 0)
4367 continue;
4368 lxc_list_del(it);
4369 free(lim->resource);
4370 free(lim);
4371 free(it);
4372 }
4373 return 0;
4374 }
4375
4376 int lxc_clear_groups(struct lxc_conf *c)
4377 {
4378 struct lxc_list *it,*next;
4379
4380 lxc_list_for_each_safe(it, &c->groups, next) {
4381 lxc_list_del(it);
4382 free(it->elem);
4383 free(it);
4384 }
4385 return 0;
4386 }
4387
4388 int lxc_clear_environment(struct lxc_conf *c)
4389 {
4390 struct lxc_list *it,*next;
4391
4392 lxc_list_for_each_safe(it, &c->environment, next) {
4393 lxc_list_del(it);
4394 free(it->elem);
4395 free(it);
4396 }
4397 return 0;
4398 }
4399
4400
4401 int lxc_clear_mount_entries(struct lxc_conf *c)
4402 {
4403 struct lxc_list *it,*next;
4404
4405 lxc_list_for_each_safe(it, &c->mount_list, next) {
4406 lxc_list_del(it);
4407 free(it->elem);
4408 free(it);
4409 }
4410 return 0;
4411 }
4412
4413 int lxc_clear_automounts(struct lxc_conf *c)
4414 {
4415 c->auto_mounts = 0;
4416 return 0;
4417 }
4418
4419 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4420 {
4421 struct lxc_list *it,*next;
4422 bool all = false, done = false;
4423 const char *k = NULL;
4424 int i;
4425
4426 if (strcmp(key, "lxc.hook") == 0)
4427 all = true;
4428 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4429 k = key + sizeof("lxc.hook.")-1;
4430 else
4431 return -1;
4432
4433 for (i=0; i<NUM_LXC_HOOKS; i++) {
4434 if (all || strcmp(k, lxchook_names[i]) == 0) {
4435 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4436 lxc_list_del(it);
4437 free(it->elem);
4438 free(it);
4439 }
4440 done = true;
4441 }
4442 }
4443
4444 if (!done) {
4445 ERROR("Invalid hook key: %s", key);
4446 return -1;
4447 }
4448 return 0;
4449 }
4450
4451 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4452 {
4453 int i;
4454
4455 if (!conf->saved_nics)
4456 return;
4457 for (i=0; i < conf->num_savednics; i++)
4458 free(conf->saved_nics[i].orig_name);
4459 free(conf->saved_nics);
4460 }
4461
4462 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4463 {
4464 struct lxc_list *it,*next;
4465
4466 lxc_list_for_each_safe(it, &conf->aliens, next) {
4467 lxc_list_del(it);
4468 free(it->elem);
4469 free(it);
4470 }
4471 }
4472
4473 void lxc_clear_includes(struct lxc_conf *conf)
4474 {
4475 struct lxc_list *it,*next;
4476
4477 lxc_list_for_each_safe(it, &conf->includes, next) {
4478 lxc_list_del(it);
4479 free(it->elem);
4480 free(it);
4481 }
4482 }
4483
4484 void lxc_conf_free(struct lxc_conf *conf)
4485 {
4486 if (!conf)
4487 return;
4488 if (current_config == conf)
4489 current_config = NULL;
4490 free(conf->console.log_path);
4491 free(conf->console.path);
4492 free(conf->rootfs.mount);
4493 free(conf->rootfs.bdev_type);
4494 free(conf->rootfs.options);
4495 free(conf->rootfs.path);
4496 free(conf->logfile);
4497 if (conf->logfd != -1)
4498 close(conf->logfd);
4499 free(conf->utsname);
4500 free(conf->ttydir);
4501 free(conf->fstab);
4502 free(conf->rcfile);
4503 free(conf->init_cmd);
4504 free(conf->unexpanded_config);
4505 free(conf->pty_names);
4506 free(conf->syslog);
4507 lxc_free_networks(&conf->network);
4508 free(conf->lsm_aa_profile);
4509 free(conf->lsm_se_context);
4510 lxc_seccomp_free(conf);
4511 lxc_clear_config_caps(conf);
4512 lxc_clear_config_keepcaps(conf);
4513 lxc_clear_cgroups(conf, "lxc.cgroup");
4514 lxc_clear_hooks(conf, "lxc.hook");
4515 lxc_clear_mount_entries(conf);
4516 lxc_clear_saved_nics(conf);
4517 lxc_clear_idmaps(conf);
4518 lxc_clear_groups(conf);
4519 lxc_clear_includes(conf);
4520 lxc_clear_aliens(conf);
4521 lxc_clear_environment(conf);
4522 lxc_clear_limits(conf, "lxc.limit");
4523 free(conf);
4524 }
4525
/* Arguments handed to run_userns_fn(), the entry point of the child cloned
 * into a new user namespace by userns_exec_1().
 */
struct userns_fn_data {
	/* Function the child calls once it has been released by the parent. */
	int (*fn)(void *);
	/* Name of @fn, only used for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to @fn. */
	void *arg;
	/* Pipe: the child reads one byte from p[0] before running @fn and
	 * closes p[1].
	 */
	int p[2];
};
4532
4533 static int run_userns_fn(void *data)
4534 {
4535 struct userns_fn_data *d = data;
4536 char c;
4537
4538 /* Close write end of the pipe. */
4539 close(d->p[1]);
4540
4541 /* Wait for parent to finish establishing a new mapping in the user
4542 * namespace we are executing in.
4543 */
4544 if (read(d->p[0], &c, 1) != 1)
4545 return -1;
4546
4547 /* Close read end of the pipe. */
4548 close(d->p[0]);
4549
4550 if (d->fn_name)
4551 TRACE("calling function \"%s\"", d->fn_name);
4552 /* Call function to run. */
4553 return d->fn(d->arg);
4554 }
4555
4556 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4557 enum idtype idtype)
4558 {
4559 struct lxc_list *it;
4560 struct id_map *map;
4561 struct id_map *retmap = NULL;
4562
4563 lxc_list_for_each(it, &conf->id_map) {
4564 map = it->elem;
4565 if (map->idtype != idtype)
4566 continue;
4567
4568 if (id >= map->hostid && id < map->hostid + map->range) {
4569 retmap = map;
4570 break;
4571 }
4572 }
4573
4574 if (!retmap)
4575 return NULL;
4576
4577 retmap = malloc(sizeof(*retmap));
4578 if (!retmap)
4579 return NULL;
4580
4581 memcpy(retmap, map, sizeof(*retmap));
4582 return retmap;
4583 }
4584
4585 /*
4586 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4587 * existing one or establish a new one.
4588 */
4589 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4590 {
4591 int hostid_mapped;
4592 struct id_map *entry = NULL;
4593
4594 /* Reuse existing mapping. */
4595 entry = mapped_hostid_entry(conf, id, type);
4596 if (entry)
4597 return entry;
4598
4599 /* Find new mapping. */
4600 hostid_mapped = find_unmapped_nsid(conf, type);
4601 if (hostid_mapped < 0) {
4602 DEBUG("failed to find free mapping for id %d", id);
4603 return NULL;
4604 }
4605
4606 entry = malloc(sizeof(*entry));
4607 if (!entry)
4608 return NULL;
4609
4610 entry->idtype = type;
4611 entry->nsid = hostid_mapped;
4612 entry->hostid = (unsigned long)id;
4613 entry->range = 1;
4614
4615 return entry;
4616 }
4617
/* Run a function in a new user namespace.
 * The caller's euid/egid will be mapped if it is not already.
 * Afaict, userns_exec_1() is only used to operate based on privileges for the
 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
 * This means we require only to establish a mapping from:
 * - the container root {g,u}id as seen from the host > user's host {g,u}id
 * - the container root -> some sub{g,u}id
 * The former we add, if the user did not specify a mapping. The latter we
 * retrieve from the container's configured {g,u}id mappings as it must have been
 * there to start the container in the first place.
 */
4629 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4630 const char *fn_name)
4631 {
4632 pid_t pid;
4633 uid_t euid, egid;
4634 struct userns_fn_data d;
4635 int p[2];
4636 struct lxc_list *it;
4637 struct id_map *map;
4638 char c = '1';
4639 int ret = -1;
4640 struct lxc_list *idmap = NULL, *tmplist = NULL;
4641 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4642 *host_uid_map = NULL, *host_gid_map = NULL;
4643
4644 ret = pipe(p);
4645 if (ret < 0) {
4646 SYSERROR("opening pipe");
4647 return -1;
4648 }
4649 d.fn = fn;
4650 d.fn_name = fn_name;
4651 d.arg = data;
4652 d.p[0] = p[0];
4653 d.p[1] = p[1];
4654
4655 /* Clone child in new user namespace. */
4656 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4657 if (pid < 0) {
4658 ERROR("failed to clone child process in new user namespace");
4659 goto on_error;
4660 }
4661
4662 close(p[0]);
4663 p[0] = -1;
4664
4665 /* Find container root. */
4666 lxc_list_for_each(it, &conf->id_map) {
4667 map = it->elem;
4668
4669 if (map->nsid != 0)
4670 continue;
4671
4672 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4673 container_root_uid = malloc(sizeof(*container_root_uid));
4674 if (!container_root_uid)
4675 goto on_error;
4676 container_root_uid->idtype = map->idtype;
4677 container_root_uid->hostid = map->hostid;
4678 container_root_uid->nsid = 0;
4679 container_root_uid->range = map->range;
4680 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4681 container_root_gid = malloc(sizeof(*container_root_gid));
4682 if (!container_root_gid)
4683 goto on_error;
4684 container_root_gid->idtype = map->idtype;
4685 container_root_gid->hostid = map->hostid;
4686 container_root_gid->nsid = 0;
4687 container_root_gid->range = map->range;
4688 }
4689
4690 /* Found container root. */
4691 if (container_root_uid && container_root_gid)
4692 break;
4693 }
4694
4695 /* This is actually checked earlier but it can't hurt. */
4696 if (!container_root_uid || !container_root_gid) {
4697 ERROR("no mapping for container root found");
4698 goto on_error;
4699 }
4700
4701 host_uid_map = container_root_uid;
4702 host_gid_map = container_root_gid;
4703
4704 /* Check whether the {g,u}id of the user has a mapping. */
4705 euid = geteuid();
4706 egid = getegid();
4707 if (euid != container_root_uid->hostid)
4708 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4709
4710 if (egid != container_root_gid->hostid)
4711 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4712
4713 if (!host_uid_map) {
4714 DEBUG("failed to find mapping for uid %d", euid);
4715 goto on_error;
4716 }
4717
4718 if (!host_gid_map) {
4719 DEBUG("failed to find mapping for gid %d", egid);
4720 goto on_error;
4721 }
4722
4723 /* Allocate new {g,u}id map list. */
4724 idmap = malloc(sizeof(*idmap));
4725 if (!idmap)
4726 goto on_error;
4727 lxc_list_init(idmap);
4728
4729 /* Add container root to the map. */
4730 tmplist = malloc(sizeof(*tmplist));
4731 if (!tmplist)
4732 goto on_error;
4733 lxc_list_add_elem(tmplist, container_root_uid);
4734 lxc_list_add_tail(idmap, tmplist);
4735
4736 if (host_uid_map && (host_uid_map != container_root_uid)) {
4737 /* idmap will now keep track of that memory. */
4738 container_root_uid = NULL;
4739
4740 /* Add container root to the map. */
4741 tmplist = malloc(sizeof(*tmplist));
4742 if (!tmplist)
4743 goto on_error;
4744 lxc_list_add_elem(tmplist, host_uid_map);
4745 lxc_list_add_tail(idmap, tmplist);
4746 }
4747 /* idmap will now keep track of that memory. */
4748 container_root_uid = NULL;
4749 /* idmap will now keep track of that memory. */
4750 host_uid_map = NULL;
4751
4752 tmplist = malloc(sizeof(*tmplist));
4753 if (!tmplist)
4754 goto on_error;
4755 lxc_list_add_elem(tmplist, container_root_gid);
4756 lxc_list_add_tail(idmap, tmplist);
4757
4758 if (host_gid_map && (host_gid_map != container_root_gid)) {
4759 /* idmap will now keep track of that memory. */
4760 container_root_gid = NULL;
4761
4762 tmplist = malloc(sizeof(*tmplist));
4763 if (!tmplist)
4764 goto on_error;
4765 lxc_list_add_elem(tmplist, host_gid_map);
4766 lxc_list_add_tail(idmap, tmplist);
4767 }
4768 /* idmap will now keep track of that memory. */
4769 container_root_gid = NULL;
4770 /* idmap will now keep track of that memory. */
4771 host_gid_map = NULL;
4772
4773 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4774 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4775 lxc_list_for_each(it, idmap) {
4776 map = it->elem;
4777 TRACE("establishing %cid mapping for \"%d\" in new "
4778 "user namespace: nsuid %lu - hostid %lu - range "
4779 "%lu",
4780 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4781 map->nsid, map->hostid, map->range);
4782 }
4783 }
4784
4785 /* Set up {g,u}id mapping for user namespace of child process. */
4786 ret = lxc_map_ids(idmap, pid);
4787 if (ret < 0) {
4788 ERROR("error setting up {g,u}id mappings for child process "
4789 "\"%d\"",
4790 pid);
4791 goto on_error;
4792 }
4793
4794 /* Tell child to proceed. */
4795 if (write(p[1], &c, 1) != 1) {
4796 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4797 goto on_error;
4798 }
4799
4800 /* Wait for child to finish. */
4801 ret = wait_for_pid(pid);
4802
4803 on_error:
4804 if (idmap)
4805 lxc_free_idmap(idmap);
4806 if (container_root_uid)
4807 free(container_root_uid);
4808 if (container_root_gid)
4809 free(container_root_gid);
4810 if (host_uid_map && (host_uid_map != container_root_uid))
4811 free(host_uid_map);
4812 if (host_gid_map && (host_gid_map != container_root_gid))
4813 free(host_gid_map);
4814
4815 if (p[0] != -1)
4816 close(p[0]);
4817 close(p[1]);
4818
4819 return ret;
4820 }
4821
/* Return a newly allocated copy of the passwd name belonging to the effective
 * uid, or NULL if getpwuid() finds no entry or the copy cannot be allocated.
 * The caller frees the result. Not thread-safe: do not use from the api
 * without first forking.
 */
static char *getuname(void)
{
	struct passwd *pwent = getpwuid(geteuid());

	return pwent ? strdup(pwent->pw_name) : NULL;
}
4833
/* Return a newly allocated copy of the group name belonging to the effective
 * gid, or NULL if getgrgid() finds no entry or the copy cannot be allocated.
 * The caller frees the result. Not thread-safe: do not use from the api
 * without first forking.
 */
static char *getgname(void)
{
	struct group *grent = getgrgid(getegid());

	return grent ? strdup(grent->gr_name) : NULL;
}
4845
/* Scan the subordinate id file @path for "<name>:<start>:<count>" lines whose
 * name equals @uname and parse start and count into @id and @range. Every
 * matching line is parsed, so values from a later line are written after
 * those of an earlier one. @kind ("UID" or "GID") only selects the wording of
 * the parse warnings.
 *
 * Returns 0 if the file could be opened, -1 otherwise. Parse failures are
 * warned about but do not change the return value.
 */
static int suggest_idmap_read_subids(const char *path, const char *uname,
				     const char *kind, unsigned int *id,
				     unsigned int *range)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	f = fopen(path, "r");
	if (!f)
		return -1;

	while (getline(&line, &len, f) != -1) {
		char *start, *count;

		/* Skip comment lines. */
		if (*line == '#')
			continue;

		start = strchr(line, ':');
		if (!start)
			continue;
		*start++ = '\0';

		/* line now holds only the name field. */
		if (strcmp(line, uname))
			continue;

		count = strchr(start, ':');
		if (!count)
			continue;
		*count++ = '\0';

		if (!*count)
			continue;

		/* Cut the trailing newline off the count field. */
		count[strcspn(count, "\n")] = '\0';

		if (lxc_safe_uint(start, id) < 0)
			WARN("Could not parse %s.", kind);
		if (lxc_safe_uint(count, range) < 0)
			WARN("Could not parse %s range.", kind);
	}

	free(line);
	fclose(f);

	return 0;
}

/* Log a suggested default id mapping for the calling user, derived from the
 * entries for the user's name in subuidfile and subgidfile. Nothing is
 * returned; all output goes through the ERROR()/WARN() log macros.
 *
 * not thread-safe, do not use from api without first forking
 */
void suggest_default_idmap(void)
{
	unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
	char *uname, *gname;

	uname = getuname();
	if (!uname)
		return;

	/* The group name is not used for matching (both files are searched
	 * for the user name), but a failed lookup still ends the function
	 * early, as it always has.
	 */
	gname = getgname();
	if (!gname) {
		free(uname);
		return;
	}

	if (suggest_idmap_read_subids(subuidfile, uname, "UID", &uid, &urange) < 0) {
		ERROR("Your system is not configured with subuids");
		goto out;
	}

	if (suggest_idmap_read_subids(subgidfile, uname, "GID", &gid, &grange) < 0) {
		ERROR("Your system is not configured with subgids");
		goto out;
	}

	if (!urange || !grange) {
		ERROR("You do not have subuids or subgids allocated");
		ERROR("Unprivileged containers require subuids and subgids");
		goto out;
	}

	ERROR("You must either run as root, or define uid mappings");
	ERROR("To pass uid mappings to lxc-create, you could create");
	ERROR("~/.config/lxc/default.conf:");
	ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
	ERROR("lxc.id_map = u 0 %u %u", uid, urange);
	ERROR("lxc.id_map = g 0 %u %u", gid, grange);

out:
	/* Single exit so the names are released on every path. */
	free(gname);
	free(uname);
}
4951
/* Release a list produced by sort_cgroup_settings(): every node is unlinked
 * and freed, then the list head itself. The elements the nodes point to are
 * left alone.
 */
static void free_cgroup_settings(struct lxc_list *result)
{
	struct lxc_list *node;
	struct lxc_list *upcoming;

	/* The _safe variant keeps the successor so nodes can be freed mid-walk. */
	lxc_list_for_each_safe(node, result, upcoming) {
		lxc_list_del(node);
		free(node);
	}

	free(result);
}
4962
/*
 * Return the list of cgroup_settings sorted according to the following rules
 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
 *
 * The result is a newly allocated list whose nodes point at the same elements
 * as @cgroup_settings; the elements themselves are not copied. Returns NULL
 * if an allocation fails.
 */
struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
{
	struct lxc_list *result;
	struct lxc_list *memsw_limit = NULL;
	struct lxc_list *it = NULL;
	struct lxc_cgroup *cg = NULL;
	struct lxc_list *item = NULL;

	result = malloc(sizeof(*result));
	if (!result) {
		ERROR("failed to allocate memory to sort cgroup settings");
		return NULL;
	}
	lxc_list_init(result);

	/* Iterate over the cgroup settings and copy them to the output list. */
	lxc_list_for_each(it, cgroup_settings) {
		item = malloc(sizeof(*item));
		if (!item) {
			ERROR("failed to allocate memory to sort cgroup settings");
			free_cgroup_settings(result);
			return NULL;
		}
		item->elem = it->elem;
		cg = it->elem;
		if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
			/* Store the memsw_limit location */
			memsw_limit = item;
		} else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
			/* lxc.cgroup.memory.memsw.limit_in_bytes is found before
			 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
			item->elem = memsw_limit->elem;
			memsw_limit->elem = it->elem;
			/* The memsw entry now lives in item. Track it there so a
			 * further memory.limit_in_bytes entry is swapped with
			 * memsw again instead of with the previous limit entry,
			 * which would move that older limit behind memsw. */
			memsw_limit = item;
		}
		lxc_list_add_tail(result, item);
	}

	return result;
}