]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
Use strerror(errno) instead of %m
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "bdev.h"
77 #include "caps.h" /* for lxc_caps_last_cap() */
78 #include "cgroup.h"
79 #include "conf.h"
80 #include "confile_utils.h"
81 #include "error.h"
82 #include "log.h"
83 #include "lxcaufs.h"
84 #include "lxclock.h"
85 #include "lxcoverlay.h"
86 #include "lxcseccomp.h"
87 #include "namespace.h"
88 #include "network.h"
89 #include "parse.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
/*
 * Define pivot_root() if missing from the C library.
 *
 * Without HAVE_PIVOT_ROOT a file-local wrapper is provided. It issues the raw
 * system call when __NR_pivot_root is known at compile time; otherwise it
 * fails with -1 and errno set to ENOSYS. With HAVE_PIVOT_ROOT only an extern
 * declaration is emitted.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
162
/*
 * Define sethostname() if missing from the C library.
 *
 * The fallback issues the raw system call when __NR_sethostname is known at
 * compile time; otherwise it fails with -1 and errno set to ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char * name, size_t len)
{
#ifdef __NR_sethostname
	return syscall(__NR_sethostname, name, len);
#else
	/* No syscall number available on this platform. */
	errno = ENOSYS;
	return -1;
#endif
}
#endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
/*
 * Fallback memfd_create() for C libraries that do not provide it.
 *
 * If the system headers did not define __NR_memfd_create, a per-architecture
 * syscall number is supplied below. On an architecture that is not listed the
 * macro stays undefined and the wrapper fails with -1 / ENOSYS.
 */
#ifndef HAVE_MEMFD_CREATE
static int memfd_create(const char *name, unsigned int flags) {
	#ifndef __NR_memfd_create
		#if defined __i386__
			#define __NR_memfd_create 356
		#elif defined __x86_64__
			#define __NR_memfd_create 319
		#elif defined __arm__
			#define __NR_memfd_create 385
		#elif defined __aarch64__
			#define __NR_memfd_create 279
		#elif defined __s390__
			#define __NR_memfd_create 350
		#elif defined __powerpc__
			#define __NR_memfd_create 360
		#elif defined __sparc__
			#define __NR_memfd_create 348
		#elif defined __blackfin__
			#define __NR_memfd_create 390
		#elif defined __ia64__
			#define __NR_memfd_create 1340
		#elif defined _MIPS_SIM
			/* MIPS uses a different number for each ABI. */
			#if _MIPS_SIM == _MIPS_SIM_ABI32
				#define __NR_memfd_create 4354
			#endif
			#if _MIPS_SIM == _MIPS_SIM_NABI32
				#define __NR_memfd_create 6318
			#endif
			#if _MIPS_SIM == _MIPS_SIM_ABI64
				#define __NR_memfd_create 5314
			#endif
		#endif
	#endif
	#ifdef __NR_memfd_create
	return syscall(__NR_memfd_create, name, flags);
	#else
	/* Unknown architecture and no number from the headers. */
	errno = ENOSYS;
	return -1;
	#endif
}
#else
extern int memfd_create(const char *name, unsigned int flags);
#endif
236
/*
 * Names of the container hooks, one string per hook slot.
 * NOTE(review): the order presumably has to match the hook index enumeration
 * that defines NUM_LXC_HOOKS (declared outside this file) - confirm.
 */
char *lxchook_names[NUM_LXC_HOOKS] = {
	"pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
239
/*
 * Signature shared by the per-network-type callbacks stored in the
 * netdev_conf and netdev_deconf tables.
 */
typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
241
/* One entry of the mount option name table. */
struct mount_opt {
	char *name;	/* option string, e.g. "ro" or "nosuid" */
	int clear;	/* NOTE(review): presumably non-zero means the flag is removed rather than set - confirm against the consumer */
	int flag;	/* MS_* flag bit(s) associated with the option */
};
247
/* One entry of the capability name table. */
struct caps_opt {
	char *name;	/* lower-case capability name, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
252
/* One entry of the resource limit name table. */
struct limit_opt {
	char *name;	/* lower-case limit name, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
257
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * This is used in the error calls.
 *
 * The variable is thread-local when HAVE_TLS is defined and a plain global
 * otherwise.
 */
#ifdef HAVE_TLS
__thread struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
268
269 /* Declare this here, since we don't want to reshuffle the whole file. */
270 static int in_caplist(int cap, struct lxc_list *caps);
271
272 static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
273 static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
274 static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
275 static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
276 static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
277 static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
278
/* Per-network-type setup callbacks, indexed by the LXC_NET_* type constant. */
static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instantiate_veth,
	[LXC_NET_MACVLAN] = instantiate_macvlan,
	[LXC_NET_VLAN] = instantiate_vlan,
	[LXC_NET_PHYS] = instantiate_phys,
	[LXC_NET_EMPTY] = instantiate_empty,
	[LXC_NET_NONE] = instantiate_none,
};
287
288 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
289 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
290 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
291 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
292 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
293 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
294
/*
 * Per-network-type shutdown callbacks, indexed by the LXC_NET_* type
 * constant. They share the instantiate_cb signature with the setup callbacks.
 */
static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
303
/*
 * Table of recognised mount option names, terminated by an entry whose name
 * is NULL. Each row is { name, clear, flag }.
 * NOTE(review): rows with clear == 1 (for example "rw" paired with MS_RDONLY)
 * presumably ask for the flag to be removed instead of set; the code that
 * consumes this table is outside this view - confirm.
 */
static struct mount_opt mount_opt[] = {
	{ "async", 1, MS_SYNCHRONOUS },
	{ "atime", 1, MS_NOATIME },
	{ "bind", 0, MS_BIND },
	{ "defaults", 0, 0 },
	{ "dev", 1, MS_NODEV },
	{ "diratime", 1, MS_NODIRATIME },
	{ "dirsync", 0, MS_DIRSYNC },
	{ "exec", 1, MS_NOEXEC },
	{ "lazytime", 0, MS_LAZYTIME },
	{ "mand", 0, MS_MANDLOCK },
	{ "noatime", 0, MS_NOATIME },
	{ "nodev", 0, MS_NODEV },
	{ "nodiratime", 0, MS_NODIRATIME },
	{ "noexec", 0, MS_NOEXEC },
	{ "nomand", 1, MS_MANDLOCK },
	{ "norelatime", 1, MS_RELATIME },
	{ "nostrictatime", 1, MS_STRICTATIME },
	{ "nosuid", 0, MS_NOSUID },
	{ "rbind", 0, MS_BIND|MS_REC },
	{ "relatime", 0, MS_RELATIME },
	{ "remount", 0, MS_REMOUNT },
	{ "ro", 0, MS_RDONLY },
	{ "rw", 1, MS_RDONLY },
	{ "strictatime", 0, MS_STRICTATIME },
	{ "suid", 1, MS_NOSUID },
	{ "sync", 0, MS_SYNCHRONOUS },
	{ NULL, 0, 0 },
};
333
/*
 * Table mapping lower-case capability names to their CAP_* constants.
 * Rows wrapped in #ifdef are only present when the constant is defined at
 * build time. Without libcap support (HAVE_LIBCAP false) the table is empty.
 */
#if HAVE_LIBCAP
static struct caps_opt caps_opt[] = {
	{ "chown", CAP_CHOWN },
	{ "dac_override", CAP_DAC_OVERRIDE },
	{ "dac_read_search", CAP_DAC_READ_SEARCH },
	{ "fowner", CAP_FOWNER },
	{ "fsetid", CAP_FSETID },
	{ "kill", CAP_KILL },
	{ "setgid", CAP_SETGID },
	{ "setuid", CAP_SETUID },
	{ "setpcap", CAP_SETPCAP },
	{ "linux_immutable", CAP_LINUX_IMMUTABLE },
	{ "net_bind_service", CAP_NET_BIND_SERVICE },
	{ "net_broadcast", CAP_NET_BROADCAST },
	{ "net_admin", CAP_NET_ADMIN },
	{ "net_raw", CAP_NET_RAW },
	{ "ipc_lock", CAP_IPC_LOCK },
	{ "ipc_owner", CAP_IPC_OWNER },
	{ "sys_module", CAP_SYS_MODULE },
	{ "sys_rawio", CAP_SYS_RAWIO },
	{ "sys_chroot", CAP_SYS_CHROOT },
	{ "sys_ptrace", CAP_SYS_PTRACE },
	{ "sys_pacct", CAP_SYS_PACCT },
	{ "sys_admin", CAP_SYS_ADMIN },
	{ "sys_boot", CAP_SYS_BOOT },
	{ "sys_nice", CAP_SYS_NICE },
	{ "sys_resource", CAP_SYS_RESOURCE },
	{ "sys_time", CAP_SYS_TIME },
	{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
	{ "mknod", CAP_MKNOD },
	{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_READ
	{ "audit_read", CAP_AUDIT_READ },
#endif
#ifdef CAP_AUDIT_WRITE
	{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
	{ "audit_control", CAP_AUDIT_CONTROL },
#endif
	{ "setfcap", CAP_SETFCAP },
	{ "mac_override", CAP_MAC_OVERRIDE },
	{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
	{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
	{ "wake_alarm", CAP_WAKE_ALARM },
#endif
#ifdef CAP_BLOCK_SUSPEND
	{ "block_suspend", CAP_BLOCK_SUSPEND },
#endif
};
#else
static struct caps_opt caps_opt[] = {};
#endif
390
/*
 * Table mapping lower-case resource limit names to their RLIMIT_* constants.
 * Each row is present only when the platform defines the constant.
 */
static struct limit_opt limit_opt[] = {
#ifdef RLIMIT_AS
	{ "as", RLIMIT_AS },
#endif
#ifdef RLIMIT_CORE
	{ "core", RLIMIT_CORE },
#endif
#ifdef RLIMIT_CPU
	{ "cpu", RLIMIT_CPU },
#endif
#ifdef RLIMIT_DATA
	{ "data", RLIMIT_DATA },
#endif
#ifdef RLIMIT_FSIZE
	{ "fsize", RLIMIT_FSIZE },
#endif
#ifdef RLIMIT_LOCKS
	{ "locks", RLIMIT_LOCKS },
#endif
#ifdef RLIMIT_MEMLOCK
	{ "memlock", RLIMIT_MEMLOCK },
#endif
#ifdef RLIMIT_MSGQUEUE
	{ "msgqueue", RLIMIT_MSGQUEUE },
#endif
#ifdef RLIMIT_NICE
	{ "nice", RLIMIT_NICE },
#endif
#ifdef RLIMIT_NOFILE
	{ "nofile", RLIMIT_NOFILE },
#endif
#ifdef RLIMIT_NPROC
	{ "nproc", RLIMIT_NPROC },
#endif
#ifdef RLIMIT_RSS
	{ "rss", RLIMIT_RSS },
#endif
#ifdef RLIMIT_RTPRIO
	{ "rtprio", RLIMIT_RTPRIO },
#endif
#ifdef RLIMIT_RTTIME
	{ "rttime", RLIMIT_RTTIME },
#endif
#ifdef RLIMIT_SIGPENDING
	{ "sigpending", RLIMIT_SIGPENDING },
#endif
#ifdef RLIMIT_STACK
	{ "stack", RLIMIT_STACK },
#endif
};
441
442 static int run_buffer(char *buffer)
443 {
444 struct lxc_popen_FILE *f;
445 char *output;
446 int ret;
447
448 f = lxc_popen(buffer);
449 if (!f) {
450 SYSERROR("Failed to popen() %s.", buffer);
451 return -1;
452 }
453
454 output = malloc(LXC_LOG_BUFFER_SIZE);
455 if (!output) {
456 ERROR("Failed to allocate memory for %s.", buffer);
457 lxc_pclose(f);
458 return -1;
459 }
460
461 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
462 DEBUG("Script %s with output: %s.", buffer, output);
463
464 free(output);
465
466 ret = lxc_pclose(f);
467 if (ret == -1) {
468 SYSERROR("Script exited with error.");
469 return -1;
470 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
471 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
472 return -1;
473 } else if (WIFSIGNALED(ret)) {
474 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
475 return -1;
476 }
477
478 return 0;
479 }
480
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and run
 * it through run_buffer().
 *
 * @name     container name, passed to the script as first argument
 * @section  configuration section, passed as second argument
 * @script   path of the script to execute
 * @hook     hook name, passed as third argument
 * @lxcpath  unused by this function
 * @argsin   optional NULL-terminated array of extra arguments (may be NULL)
 *
 * Returns the result of run_buffer(), or -1 when the command line would be
 * longer than INT_MAX, memory cannot be allocated, or formatting fails.
 *
 * The command line is heap-allocated: its length is derived from
 * configuration-supplied strings, so placing it on the stack with alloca()
 * could overflow the stack, and alloca() has no way to report failure.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* Each extra argument is preceded by one space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two more separators plus the terminating NUL byte. */
	size += 3;

	/* snprintf() reports lengths as int, so keep the total in range. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret =
	    snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
531
/*
 * Build the command line "<script> <name> <section> [args...]" and run it
 * through run_buffer().
 *
 * @name     container name, passed to the script as first argument
 * @section  configuration section, passed as second argument
 * @script   path of the script to execute
 * @...      extra arguments as char pointers; the list MUST end with a NULL
 *           pointer, which is how the end of the list is detected
 *
 * Returns the result of run_buffer(), or -1 when the command line would be
 * longer than INT_MAX, memory cannot be allocated, or formatting fails.
 *
 * The command line is heap-allocated instead of using alloca(), because its
 * length comes from caller-supplied strings. Every va_start() is paired with
 * a va_end() on all paths, including the error return inside the second pass.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass: each extra argument needs its length plus one space. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* Two separators plus the terminating NUL byte. */
	size += 3;

	/* snprintf() reports lengths as int, so keep the total in range. */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the extra arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;
}
583
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (no rootfs given, the path cannot
 *           be resolved, or it is not a directory).
 * return an open fd (>=0) if we pinned it; the caller owns the descriptor.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* Reject both an output error and a truncated path. */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;
	/* The open descriptor is what pins the fs; the name is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
626
/*
 * When a remount is requested, carry over the restrictive flags (nosuid,
 * nodev, read-only, noexec) that the existing mount already has, so that the
 * remount does not silently drop them.
 *
 * @s      mount source; when NULL, @d is inspected instead
 * @d      mount destination
 * @flags  requested mount flags
 *
 * Returns @flags unchanged if MS_REMOUNT is not set, if neither path is
 * given, if statvfs() fails, or if the build has no statvfs support;
 * otherwise @flags with the inherited bits added.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifdef HAVE_STATVFS
	struct statvfs vfs;
	unsigned long inherited = 0;
	const char *target = s ? s : d;

	if ((flags & MS_REMOUNT) == 0 || target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	/* The statvfs flag bits are tested directly against the MS_* masks. */
	inherited |= (vfs.f_flag & MS_NOSUID) ? MS_NOSUID : 0;
	inherited |= (vfs.f_flag & MS_NODEV) ? MS_NODEV : 0;
	inherited |= (vfs.f_flag & MS_RDONLY) ? MS_RDONLY : 0;
	inherited |= (vfs.f_flag & MS_NOEXEC) ? MS_NOEXEC : 0;

	return flags | inherited;
#else
	return flags;
#endif
}
663
664 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
665 {
666 int r;
667 int i;
668 static struct {
669 int match_mask;
670 int match_flag;
671 const char *source;
672 const char *destination;
673 const char *fstype;
674 unsigned long flags;
675 const char *options;
676 } default_mounts[] = {
677 /* Read-only bind-mounting... In older kernels, doing that required
678 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
679 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
680 * kernel 2.6.26 onwards. However, this apparently does not work on
681 * kernel 3.8. Unfortunately, on that very same kernel, doing the
682 * same trick as above doesn't seem to work either, there one needs
683 * to ALSO specify MS_BIND for the remount, otherwise the entire
684 * fs is remounted read-only or the mount fails because it's busy...
685 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
686 * 2.6.32...
687 */
688 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
689 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
705 { 0, 0, NULL, NULL, NULL, 0, NULL }
706 };
707
708 for (i = 0; default_mounts[i].match_mask; i++) {
709 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
710 char *source = NULL;
711 char *destination = NULL;
712 int saved_errno;
713 unsigned long mflags;
714
715 if (default_mounts[i].source) {
716 /* will act like strdup if %r is not present */
717 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
718 if (!source) {
719 SYSERROR("memory allocation error");
720 return -1;
721 }
722 }
723 if (!default_mounts[i].destination) {
724 ERROR("BUG: auto mounts destination %d was NULL", i);
725 free(source);
726 return -1;
727 }
728 /* will act like strdup if %r is not present */
729 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
730 if (!destination) {
731 saved_errno = errno;
732 SYSERROR("memory allocation error");
733 free(source);
734 errno = saved_errno;
735 return -1;
736 }
737 mflags = add_required_remount_flags(source, destination,
738 default_mounts[i].flags);
739 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
740 saved_errno = errno;
741 if (r < 0 && errno == ENOENT) {
742 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
743 r = 0;
744 }
745 else if (r < 0)
746 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
747
748 free(source);
749 free(destination);
750 if (r < 0) {
751 errno = saved_errno;
752 return -1;
753 }
754 }
755 }
756
757 if (flags & LXC_AUTO_CGROUP_MASK) {
758 int cg_flags;
759
760 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
761 /* If the type of cgroup mount was not specified, it depends on the
762 * container's capabilities as to what makes sense: if we have
763 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
764 * anyway, so we may as well default to read-write; then the admin
765 * will not be given a false sense of security. (And if they really
766 * want mixed r/o r/w, then they can explicitly specify :mixed.)
767 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
768 * :mixed, because then the container can't remount it read-write. */
769 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
770 int has_sys_admin = 0;
771
772 if (!lxc_list_empty(&conf->keepcaps))
773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
774 else
775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
776
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
779 else
780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
781 }
782
783 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
784 SYSERROR("error mounting /sys/fs/cgroup");
785 return -1;
786 }
787 }
788
789 return 0;
790 }
791
/*
 * Set the hostname to the nodename stored in @utsname.
 *
 * A NULL @utsname means there is nothing to configure and counts as success.
 * Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);
	return 0;
}
806
/* Description of one symlink to create under the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* what the symlink points to */
	const char *name;	/* link name, relative to the container's /dev */
};
811
/* Symlinks that setup_dev_symlinks() creates under the container's /dev. */
static const struct dev_symlinks dev_symlinks[] = {
	{"/proc/self/fd", "fd"},
	{"/proc/self/fd/0", "stdin"},
	{"/proc/self/fd/1", "stdout"},
	{"/proc/self/fd/2", "stderr"},
};
818
819 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
820 {
821 char path[MAXPATHLEN];
822 int ret,i;
823 struct stat s;
824
825
826 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
827 const struct dev_symlinks *d = &dev_symlinks[i];
828 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
829 if (ret < 0 || ret >= MAXPATHLEN)
830 return -1;
831
832 /*
833 * Stat the path first. If we don't get an error
834 * accept it as is and don't try to create it
835 */
836 if (!stat(path, &s)) {
837 continue;
838 }
839
840 ret = symlink(d->oldpath, path);
841
842 if (ret && errno != EEXIST) {
843 if ( errno == EROFS ) {
844 WARN("Warning: Read Only file system while creating %s", path);
845 } else {
846 SYSERROR("Error creating %s", path);
847 return -1;
848 }
849 }
850 }
851 return 0;
852 }
853
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * On the first call (*pp == NULL) a new string "container_ttys=<name>" is
 * allocated. Later calls grow the string and append " <name>".
 *
 * Returns true on success. Returns false if memory could not be obtained, in
 * which case *pp is left as it was.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t used;
	char *list;

	if (*pp == NULL) {
		list = malloc(name_len + prefix_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		/* Copies the terminating NUL byte of name as well. */
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	used = strlen(*pp);
	/* Room for the separating space and the terminating NUL byte. */
	list = realloc(*pp, used + name_len + 2);
	if (list == NULL)
		return false;

	list[used] = ' ';
	memcpy(list + used + 1, name, name_len + 1);
	*pp = list;
	return true;
}
876
877 static int lxc_setup_tty(struct lxc_conf *conf)
878 {
879 int i, ret;
880 const struct lxc_tty_info *tty_info = &conf->tty_info;
881 char *ttydir = conf->ttydir;
882 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
883
884 if (!conf->rootfs.path)
885 return 0;
886
887 for (i = 0; i < tty_info->nbtty; i++) {
888 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
889
890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
891 if (ret < 0 || (size_t)ret >= sizeof(path)) {
892 ERROR("pathname too long for ttys");
893 return -1;
894 }
895
896 if (ttydir) {
897 /* create dev/lxc/tty%d" */
898 ret = snprintf(lxcpath, sizeof(lxcpath),
899 "/dev/%s/tty%d", ttydir, i + 1);
900 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
901 ERROR("pathname too long for ttys");
902 return -1;
903 }
904
905 ret = creat(lxcpath, 0660);
906 if (ret < 0 && errno != EEXIST) {
907 SYSERROR("failed to create \"%s\"", lxcpath);
908 return -1;
909 }
910 if (ret >= 0)
911 close(ret);
912
913 ret = unlink(path);
914 if (ret < 0 && errno != ENOENT) {
915 SYSERROR("failed to unlink \"%s\"", path);
916 return -1;
917 }
918
919 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
920 if (ret < 0) {
921 WARN("failed to bind mount \"%s\" onto \"%s\"",
922 pty_info->name, path);
923 continue;
924 }
925 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
926 path);
927
928 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
929 ttydir, i + 1);
930 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
931 ERROR("tty pathname too long");
932 return -1;
933 }
934
935 ret = symlink(lxcpath, path);
936 if (ret < 0) {
937 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
938 path, lxcpath);
939 return -1;
940 }
941 } else {
942 /* If we populated /dev, then we need to create
943 * /dev/ttyN
944 */
945 ret = access(path, F_OK);
946 if (ret < 0) {
947 ret = creat(path, 0660);
948 if (ret < 0) {
949 SYSERROR("failed to create \"%s\"", path);
950 /* this isn't fatal, continue */
951 } else {
952 close(ret);
953 }
954 }
955
956 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
957 if (ret < 0) {
958 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
959 continue;
960 }
961
962 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
963 path);
964 }
965
966 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
967 ERROR("Error setting up container_ttys string");
968 return -1;
969 }
970 }
971
972 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
973 return 0;
974 }
975
976 static int setup_rootfs_pivot_root(const char *rootfs)
977 {
978 int oldroot = -1, newroot = -1;
979
980 oldroot = open("/", O_DIRECTORY | O_RDONLY);
981 if (oldroot < 0) {
982 SYSERROR("Error opening old-/ for fchdir");
983 return -1;
984 }
985 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
986 if (newroot < 0) {
987 SYSERROR("Error opening new-/ for fchdir");
988 goto fail;
989 }
990
991 /* change into new root fs */
992 if (fchdir(newroot)) {
993 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
994 goto fail;
995 }
996
997 /* pivot_root into our new root fs */
998 if (pivot_root(".", ".")) {
999 SYSERROR("pivot_root syscall failed");
1000 goto fail;
1001 }
1002
1003 /*
1004 * at this point the old-root is mounted on top of our new-root
1005 * To unmounted it we must not be chdir'd into it, so escape back
1006 * to old-root
1007 */
1008 if (fchdir(oldroot) < 0) {
1009 SYSERROR("Error entering oldroot");
1010 goto fail;
1011 }
1012 if (umount2(".", MNT_DETACH) < 0) {
1013 SYSERROR("Error detaching old root");
1014 goto fail;
1015 }
1016
1017 if (fchdir(newroot) < 0) {
1018 SYSERROR("Error re-entering newroot");
1019 goto fail;
1020 }
1021
1022 close(oldroot);
1023 close(newroot);
1024
1025 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1026
1027 return 0;
1028
1029 fail:
1030 if (oldroot != -1)
1031 close(oldroot);
1032 if (newroot != -1)
1033 close(newroot);
1034 return -1;
1035 }
1036
1037 /*
1038 * Just create a path for /dev under $lxcpath/$name and in rootfs
1039 * If we hit an error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
1042 {
1043 int ret;
1044 size_t clen;
1045 char *path;
1046
1047 INFO("Mounting container /dev");
1048
1049 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1050 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1051 path = alloca(clen);
1052
1053 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1054 if (ret < 0 || ret >= clen)
1055 return -1;
1056
1057 if (!dir_exists(path)) {
1058 WARN("No /dev in container.");
1059 WARN("Proceeding without autodev setup");
1060 return 0;
1061 }
1062
1063 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1064 rootfs->path ? rootfs->mount : NULL);
1065 if (ret != 0) {
1066 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1067 return -1;
1068 }
1069
1070 INFO("Mounted tmpfs onto %s", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || ret >= clen)
1074 return -1;
1075
1076 /*
1077 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1078 * If not, then create it and exit if that fails...
1079 */
1080 if (!dir_exists(path)) {
1081 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1082 if (ret) {
1083 SYSERROR("Failed to create /dev/pts in container");
1084 return -1;
1085 }
1086 }
1087
1088 INFO("Mounted container /dev");
1089 return 0;
1090 }
1091
/* Description of one device node to create in the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name below /dev */
	mode_t mode;		/* file type and permission bits passed to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};
1098
/*
 * Character devices that lxc_fill_autodev() creates in the container's /dev.
 * The modes request rwx for everyone; lxc_fill_autodev() applies a umask that
 * removes the execute bits while creating them.
 */
static const struct lxc_devs lxc_devs[] = {
	{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
	{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
};
1107
1108 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1109 {
1110 int ret;
1111 char path[MAXPATHLEN];
1112 int i;
1113 mode_t cmask;
1114
1115 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
1116 if (ret < 0 || ret >= MAXPATHLEN) {
1117 ERROR("Error calculating container /dev location");
1118 return -1;
1119 }
1120
1121 /* ignore, just don't try to fill in */
1122 if (!dir_exists(path))
1123 return 0;
1124
1125 INFO("populating container /dev");
1126 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1127 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1128 const struct lxc_devs *d = &lxc_devs[i];
1129
1130 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
1133
1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1135 if (ret < 0) {
1136 char hostpath[MAXPATHLEN];
1137 FILE *pathfile;
1138
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150 pathfile = fopen(path, "wb");
1151 if (!pathfile) {
1152 SYSERROR("Failed to create device mount target '%s'", path);
1153 return -1;
1154 }
1155 fclose(pathfile);
1156 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1157 SYSERROR("Failed bind mounting device %s from host into container", d->name);
1158 return -1;
1159 }
1160 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1161 } else {
1162 DEBUG("created device node \"%s\"", path);
1163 }
1164 }
1165 umask(cmask);
1166
1167 INFO("populated container /dev");
1168 return 0;
1169 }
1170
1171 static int lxc_setup_rootfs(struct lxc_conf *conf)
1172 {
1173 int ret;
1174 struct bdev *bdev;
1175 const struct lxc_rootfs *rootfs;
1176
1177 rootfs = &conf->rootfs;
1178 if (!rootfs->path) {
1179 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1180 SYSERROR("Failed to make / rslave.");
1181 return -1;
1182 }
1183 return 0;
1184 }
1185
1186 if (access(rootfs->mount, F_OK)) {
1187 SYSERROR("Failed to access to \"%s\". Check it is present.",
1188 rootfs->mount);
1189 return -1;
1190 }
1191
1192 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1193 if (!bdev) {
1194 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1195 rootfs->path, rootfs->mount,
1196 rootfs->options ? rootfs->options : "(null)");
1197 return -1;
1198 }
1199
1200 ret = bdev->ops->mount(bdev);
1201 bdev_put(bdev);
1202 if (ret < 0) {
1203 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1204 rootfs->path, rootfs->mount,
1205 rootfs->options ? rootfs->options : "(null)");
1206 return -1;
1207 }
1208
1209 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1210 rootfs->path, rootfs->mount,
1211 rootfs->options ? rootfs->options : "(null)");
1212
1213 return 0;
1214 }
1215
1216 int prepare_ramfs_root(char *root)
1217 {
1218 char buf[LXC_LINELEN], *p;
1219 char nroot[PATH_MAX];
1220 FILE *f;
1221 int i;
1222 char *p2;
1223
1224 if (realpath(root, nroot) == NULL)
1225 return -errno;
1226
1227 if (chdir("/") == -1)
1228 return -errno;
1229
1230 /*
1231 * We could use here MS_MOVE, but in userns this mount is
1232 * locked and can't be moved.
1233 */
1234 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1235 SYSERROR("Failed to move %s into /", root);
1236 return -errno;
1237 }
1238
1239 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1240 SYSERROR("Failed to make . rprivate");
1241 return -errno;
1242 }
1243
1244 /*
1245 * The following code cleans up inhereted mounts which are not
1246 * required for CT.
1247 *
1248 * The mountinfo file shows not all mounts, if a few points have been
1249 * unmounted between read operations from the mountinfo. So we need to
1250 * read mountinfo a few times.
1251 *
1252 * This loop can be skipped if a container uses unserns, because all
1253 * inherited mounts are locked and we should live with all this trash.
1254 */
1255 while (1) {
1256 int progress = 0;
1257
1258 f = fopen("./proc/self/mountinfo", "r");
1259 if (!f) {
1260 SYSERROR("Unable to open /proc/self/mountinfo");
1261 return -1;
1262 }
1263 while (fgets(buf, LXC_LINELEN, f)) {
1264 for (p = buf, i=0; p && i < 4; i++)
1265 p = strchr(p+1, ' ');
1266 if (!p)
1267 continue;
1268 p2 = strchr(p+1, ' ');
1269 if (!p2)
1270 continue;
1271
1272 *p2 = '\0';
1273 *p = '.';
1274
1275 if (strcmp(p + 1, "/") == 0)
1276 continue;
1277 if (strcmp(p + 1, "/proc") == 0)
1278 continue;
1279
1280 if (umount2(p, MNT_DETACH) == 0)
1281 progress++;
1282 }
1283 fclose(f);
1284 if (!progress)
1285 break;
1286 }
1287
1288 /* This also can be skipped if a container uses unserns */
1289 umount2("./proc", MNT_DETACH);
1290
1291 /* It is weird, but chdir("..") moves us in a new root */
1292 if (chdir("..") == -1) {
1293 SYSERROR("Unable to change working directory");
1294 return -1;
1295 }
1296
1297 if (chroot(".") == -1) {
1298 SYSERROR("Unable to chroot");
1299 return -1;
1300 }
1301
1302 return 0;
1303 }
1304
1305 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1306 {
1307 if (!rootfs->path) {
1308 DEBUG("container does not have a rootfs, so not doing pivot root");
1309 return 0;
1310 }
1311
1312 if (detect_ramfs_rootfs()) {
1313 DEBUG("detected that container is on ramfs");
1314 if (prepare_ramfs_root(rootfs->mount)) {
1315 ERROR("failed to prepare minimal ramfs root");
1316 return -1;
1317 }
1318
1319 DEBUG("prepared ramfs root for container");
1320 return 0;
1321 }
1322
1323 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1324 ERROR("failed to pivot root");
1325 return -1;
1326 }
1327
1328 DEBUG("finished pivot root");
1329 return 0;
1330 }
1331
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to its
 * ptmx node.
 *
 * Nothing is done when @num_pts is 0. /dev/ptmx is preferably a bind mount of
 * /dev/pts/ptmx; if the bind mount fails a symlink is created instead.
 *
 * Returns 0 on success and -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* The mountpoint may or may not exist already. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", mntopts);

	/* A pre-existing /dev/ptmx is replaced. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty file serves as the target of the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: continue with the symlink fallback. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to make room for the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1416
/*
 * Apply the execution domain @persona with personality().
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> (HAVE_SYS_PERSONALITY_H unset) this is a no-op.
 *
 * Returns 0 on success and -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1433
1434 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1435 const struct lxc_console *console)
1436 {
1437 char path[MAXPATHLEN];
1438 int ret, fd;
1439
1440 if (console->path && !strcmp(console->path, "none"))
1441 return 0;
1442
1443 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1444 if (ret < 0 || (size_t)ret >= sizeof(path))
1445 return -1;
1446
1447 /* When we are asked to setup a console we remove any previous
1448 * /dev/console bind-mounts.
1449 */
1450 if (file_exists(path)) {
1451 ret = lxc_unstack_mountpoint(path, false);
1452 if (ret < 0) {
1453 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1454 return -ret;
1455 } else {
1456 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1457 }
1458 ret = unlink(path);
1459 if (ret < 0) {
1460 SYSERROR("error unlinking %s", path);
1461 return -errno;
1462 }
1463 }
1464
1465 /* For unprivileged containers autodev or automounts will already have
1466 * taken care of creating /dev/console.
1467 */
1468 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1469 if (fd < 0) {
1470 if (errno != EEXIST) {
1471 SYSERROR("failed to create console");
1472 return -errno;
1473 }
1474 } else {
1475 close(fd);
1476 }
1477
1478 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1479 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1480 return -errno;
1481 }
1482
1483 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1484 ERROR("failed to mount '%s' on '%s'", console->name, path);
1485 return -1;
1486 }
1487
1488 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1489 return 0;
1490 }
1491
1492 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1493 const struct lxc_console *console,
1494 char *ttydir)
1495 {
1496 int ret;
1497 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1498
1499 /* create rootfs/dev/<ttydir> directory */
1500 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1501 if (ret < 0 || (size_t)ret >= sizeof(path))
1502 return -1;
1503
1504 ret = mkdir(path, 0755);
1505 if (ret && errno != EEXIST) {
1506 SYSERROR("failed with errno %d to create %s", errno, path);
1507 return -errno;
1508 }
1509 DEBUG("created directory for console and tty devices at \%s\"", path);
1510
1511 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1512 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1513 return -1;
1514
1515 ret = creat(lxcpath, 0660);
1516 if (ret == -1 && errno != EEXIST) {
1517 SYSERROR("error %d creating %s", errno, lxcpath);
1518 return -errno;
1519 }
1520 if (ret >= 0)
1521 close(ret);
1522
1523 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1524 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1525 return -1;
1526
1527 /* When we are asked to setup a console we remove any previous
1528 * /dev/console bind-mounts.
1529 */
1530 if (console->path && !strcmp(console->path, "none")) {
1531 struct stat st;
1532 ret = stat(path, &st);
1533 if (ret < 0) {
1534 if (errno == ENOENT)
1535 return 0;
1536 SYSERROR("failed stat() \"%s\"", path);
1537 return -errno;
1538 }
1539
1540 /* /dev/console must be character device with major number 5 and
1541 * minor number 1. If not, give benefit of the doubt and assume
1542 * the user has mounted something else right there on purpose.
1543 */
1544 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1545 return 0;
1546
1547 /* In case the user requested a bind-mount for /dev/console and
1548 * requests a ttydir we move the mount to the
1549 * /dev/<ttydir/console.
1550 * Note, we only move the uppermost mount and clear all other
1551 * mounts underneath for safety.
1552 * If it is a character device created via mknod() we simply
1553 * rename it.
1554 */
1555 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1556 if (ret < 0) {
1557 if (errno != EINVAL) {
1558 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1559 return -errno;
1560 }
1561 /* path was not a mountpoint */
1562 ret = rename(path, lxcpath);
1563 if (ret < 0) {
1564 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1565 return -errno;
1566 }
1567 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1568 } else {
1569 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1570 }
1571
1572 /* Clear all remaining bind-mounts. */
1573 ret = lxc_unstack_mountpoint(path, false);
1574 if (ret < 0) {
1575 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1576 return -ret;
1577 } else {
1578 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1579 }
1580 } else {
1581 if (file_exists(path)) {
1582 ret = lxc_unstack_mountpoint(path, false);
1583 if (ret < 0) {
1584 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1585 return -ret;
1586 } else {
1587 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1588 }
1589 }
1590
1591 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1592 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1593 return -1;
1594 }
1595 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1596 }
1597
1598 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1599 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1600 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1601 return -1;
1602
1603 ret = unlink(path);
1604 if (ret && errno != ENOENT) {
1605 SYSERROR("error unlinking %s", path);
1606 return -errno;
1607 }
1608
1609 ret = symlink(lxcpath, path);
1610 if (ret < 0) {
1611 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1612 return -1;
1613 }
1614
1615 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1616 return 0;
1617 }
1618
1619 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1620 const struct lxc_console *console, char *ttydir)
1621 {
1622 /* We don't have a rootfs, /dev/console will be shared. */
1623 if (!rootfs->path) {
1624 DEBUG("/dev/console will be shared with the host");
1625 return 0;
1626 }
1627
1628 if (!ttydir)
1629 return lxc_setup_dev_console(rootfs, console);
1630
1631 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1632 }
1633
1634 static int setup_kmsg(const struct lxc_rootfs *rootfs,
1635 const struct lxc_console *console)
1636 {
1637 char kpath[MAXPATHLEN];
1638 int ret;
1639
1640 if (!rootfs->path)
1641 return 0;
1642 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1643 if (ret < 0 || ret >= sizeof(kpath))
1644 return -1;
1645
1646 ret = unlink(kpath);
1647 if (ret && errno != ENOENT) {
1648 SYSERROR("error unlinking %s", kpath);
1649 return -1;
1650 }
1651
1652 ret = symlink("console", kpath);
1653 if (ret) {
1654 SYSERROR("failed to create symlink for kmsg");
1655 return -1;
1656 }
1657
1658 return 0;
1659 }
1660
1661 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1662 {
1663 struct mount_opt *mo;
1664
1665 /* If opt is found in mount_opt, set or clear flags.
1666 * Otherwise append it to data. */
1667
1668 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1669 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1670 if (mo->clear)
1671 *flags &= ~mo->flag;
1672 else
1673 *flags |= mo->flag;
1674 return;
1675 }
1676 }
1677
1678 if (strlen(*data))
1679 strcat(*data, ",");
1680 strcat(*data, opt);
1681 }
1682
/*
 * Split the comma separated option string @mntopts into mount flags and a
 * data string.
 *
 * *@mntflags receives the flags. *@mntdata receives a newly allocated string
 * with the remaining options which the caller has to free, or NULL when there
 * are none (or when @mntopts is NULL).
 *
 * Returns 0 on success and -1 when memory could not be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *token;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() modifies its input, so work on a copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the option string. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	token = strtok_r(copy, ",", &state);
	while (token != NULL) {
		parse_mntopt(token, mntflags, &extra);
		token = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1721
/* Terminate @word at its first space or tab (or leave it as is if none). */
static void null_endofword(char *word)
{
	word[strcspn(word, " \t")] = '\0';
}
1728
/*
 * Skip @nfields space or tab separated fields in @src.
 *
 * Returns a pointer to the character following the @nfields-th separator, or
 * to the terminating NUL byte when @src holds fewer separators than that.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining;

	for (remaining = nfields; remaining > 0; remaining--) {
		/* Jump to the next separator or to the end of the string. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1746
/*
 * Mount @fsname on @target and, for bind mounts and remount requests, follow
 * up with a remount so that the requested flags take effect.
 *
 * @mountflags  flags for the mount; MS_REMOUNT is masked out for the first
 *              mount call
 * @data        filesystem specific options handed through to the mount calls
 * @optional    when non-zero a failed mount is only logged and 0 is returned
 * @dev         when non-zero an MS_NODEV flag found on @fsname is not added
 *              to the flags of the remount
 * @rootfs      handed through to safe_mount()
 *
 * Returns 0 on success (or on a tolerated failure) and -1 otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif
	unsigned long rqd_flags = 0;

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and remount requests need the second step. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto done;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	if (mountflags & MS_RDONLY)
		rqd_flags |= MS_RDONLY;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		/* Carry over the restrictions found on the source. */
		if (sb.f_flag & MS_NOSUID)
			required_flags |= MS_NOSUID;
		if (sb.f_flag & MS_NODEV && !dev)
			required_flags |= MS_NODEV;
		if (sb.f_flag & MS_RDONLY)
			required_flags |= MS_RDONLY;
		if (sb.f_flag & MS_NOEXEC)
			required_flags |= MS_NOEXEC;
		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

		/*
		 * If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto done;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

done:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1823
/*
 * Remove the first occurrence of each of 'create=dir', 'create=file' and
 * 'optional' from mntent->mnt_opts. The string is edited in place.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (!hit)
			continue;

		comma = strchr(hit, ',');
		if (comma)
			/* Pull the remaining options over the removed one. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1848
/*
 * Create the target of a mount entry when the entry asks for it.
 *
 * For "overlay" and "aufs" entries ovl_mkdir() resp. aufs_mkdir() is called
 * first. With the "create=dir" option @path is created as a directory. With
 * the "create=file" option, and only if @path does not exist yet, the parent
 * directory of @path and an empty file at @path are created.
 *
 * Returns 0 on success and -1 when a requested target could not be created.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char *path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy;
		FILE *pathfile;

		/* dirname() may modify its argument, so work on a copy. */
		pathcopy = strdup(path);
		if (!pathcopy) {
			WARN("Failed to create mount target '%s'", path);
			return -1;
		}

		/* A failure here is only reported; fopen() below decides. */
		if (mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");

		/* Free the allocation itself: dirname() is allowed to return
		 * a pointer that did not come from strdup().
		 */
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1889
1890 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1891 * without a rootfs. */
1892 static inline int mount_entry_on_generic(struct mntent *mntent,
1893 const char* path, const struct lxc_rootfs *rootfs,
1894 const char *lxc_name, const char *lxc_path)
1895 {
1896 unsigned long mntflags;
1897 char *mntdata;
1898 int ret;
1899 bool optional = hasmntopt(mntent, "optional") != NULL;
1900 bool dev = hasmntopt(mntent, "dev") != NULL;
1901
1902 char *rootfs_path = NULL;
1903 if (rootfs && rootfs->path)
1904 rootfs_path = rootfs->mount;
1905
1906 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
1907
1908 if (ret < 0)
1909 return optional ? 0 : -1;
1910
1911 cull_mntent_opt(mntent);
1912
1913 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1914 free(mntdata);
1915 return -1;
1916 }
1917
1918 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1919 mntdata, optional, dev, rootfs_path);
1920
1921 free(mntdata);
1922 return ret;
1923 }
1924
/*
 * Mount an entry for a container that has no rootfs of its own.
 *
 * For containers created without a rootfs all mounts are treated as
 * absolute paths starting at / on the host, so a leading '/' is added to the
 * mount directory when it is missing.
 *
 * Returns the result of mount_entry_on_generic(), or -1 when the path does
 * not fit into the buffer.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	const char *lead = (mntent->mnt_dir[0] != '/') ? "/" : "";
	int len;

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1944
1945 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1946 const struct lxc_rootfs *rootfs,
1947 const char *lxc_name,
1948 const char *lxc_path)
1949 {
1950 char *aux;
1951 char path[MAXPATHLEN];
1952 int r, ret = 0, offset;
1953 const char *lxcpath;
1954
1955 lxcpath = lxc_global_config_value("lxc.lxcpath");
1956 if (!lxcpath) {
1957 ERROR("Out of memory");
1958 return -1;
1959 }
1960
1961 /* if rootfs->path is a blockdev path, allow container fstab to
1962 * use $lxcpath/CN/rootfs as the target prefix */
1963 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1964 if (r < 0 || r >= MAXPATHLEN)
1965 goto skipvarlib;
1966
1967 aux = strstr(mntent->mnt_dir, path);
1968 if (aux) {
1969 offset = strlen(path);
1970 goto skipabs;
1971 }
1972
1973 skipvarlib:
1974 aux = strstr(mntent->mnt_dir, rootfs->path);
1975 if (!aux) {
1976 WARN("ignoring mount point '%s'", mntent->mnt_dir);
1977 return ret;
1978 }
1979 offset = strlen(rootfs->path);
1980
1981 skipabs:
1982
1983 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
1984 aux + offset);
1985 if (r < 0 || r >= MAXPATHLEN) {
1986 WARN("pathnme too long for '%s'", mntent->mnt_dir);
1987 return -1;
1988 }
1989
1990 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1991 }
1992
1993 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
1994 const struct lxc_rootfs *rootfs,
1995 const char *lxc_name,
1996 const char *lxc_path)
1997 {
1998 char path[MAXPATHLEN];
1999 int ret;
2000
2001 /* relative to root mount point */
2002 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2003 if (ret < 0 || ret >= sizeof(path)) {
2004 ERROR("path name too long");
2005 return -1;
2006 }
2007
2008 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2009 }
2010
2011 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2012 const char *lxc_name, const char *lxc_path)
2013 {
2014 struct mntent mntent;
2015 char buf[4096];
2016 int ret = -1;
2017
2018 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2019
2020 if (!rootfs->path) {
2021 if (mount_entry_on_systemfs(&mntent))
2022 goto out;
2023 continue;
2024 }
2025
2026 /* We have a separate root, mounts are relative to it */
2027 if (mntent.mnt_dir[0] != '/') {
2028 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2029 goto out;
2030 continue;
2031 }
2032
2033 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2034 goto out;
2035 }
2036
2037 ret = 0;
2038
2039 INFO("mount points have been setup");
2040 out:
2041 return ret;
2042 }
2043
/*
 * Mount all entries of the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success and -1
 * when the file cannot be opened or an entry fails to mount.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int ret;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	endmntent(entries);

	return ret;
}
2064
2065 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2066 {
2067 int ret;
2068 char *mount_entry;
2069 struct lxc_list *iterator;
2070 FILE *file;
2071 int fd = -1;
2072
2073 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2074 if (fd < 0) {
2075 if (errno != ENOSYS)
2076 return NULL;
2077 file = tmpfile();
2078 } else {
2079 file = fdopen(fd, "r+");
2080 }
2081
2082 if (!file) {
2083 int saved_errno = errno;
2084 if (fd != -1)
2085 close(fd);
2086 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
2087 return NULL;
2088 }
2089
2090 lxc_list_for_each(iterator, mount) {
2091 mount_entry = iterator->elem;
2092 ret = fprintf(file, "%s\n", mount_entry);
2093 if (ret < strlen(mount_entry))
2094 WARN("Could not write mount entry to anonymous mount file.");
2095 }
2096
2097 if (fseek(file, 0, SEEK_SET) < 0) {
2098 fclose(file);
2099 return NULL;
2100 }
2101
2102 return file;
2103 }
2104
/*
 * Mount all entries of the list @mount.
 *
 * The entries are written to an anonymous file first so that they can be
 * processed exactly like an fstab file.
 *
 * Returns 0 on success and -1 on failure.
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return ret;
}
2121
2122 static int parse_cap(const char *cap)
2123 {
2124 char *ptr = NULL;
2125 size_t i;
2126 int capid = -1;
2127
2128 if (!strcmp(cap, "none"))
2129 return -2;
2130
2131 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2132
2133 if (strcmp(cap, caps_opt[i].name))
2134 continue;
2135
2136 capid = caps_opt[i].value;
2137 break;
2138 }
2139
2140 if (capid < 0) {
2141 /* try to see if it's numeric, so the user may specify
2142 * capabilities that the running kernel knows about but
2143 * we don't */
2144 errno = 0;
2145 capid = strtol(cap, &ptr, 10);
2146 if (!ptr || *ptr != '\0' || errno != 0)
2147 /* not a valid number */
2148 capid = -1;
2149 else if (capid > lxc_caps_last_cap())
2150 /* we have a number but it's not a valid
2151 * capability */
2152 capid = -1;
2153 }
2154
2155 return capid;
2156 }
2157
2158 int in_caplist(int cap, struct lxc_list *caps)
2159 {
2160 struct lxc_list *iterator;
2161 int capid;
2162
2163 lxc_list_for_each(iterator, caps) {
2164 capid = parse_cap(iterator->elem);
2165 if (capid == cap)
2166 return 1;
2167 }
2168
2169 return 0;
2170 }
2171
2172 static int setup_caps(struct lxc_list *caps)
2173 {
2174 struct lxc_list *iterator;
2175 char *drop_entry;
2176 int capid;
2177
2178 lxc_list_for_each(iterator, caps) {
2179
2180 drop_entry = iterator->elem;
2181
2182 capid = parse_cap(drop_entry);
2183
2184 if (capid < 0) {
2185 ERROR("unknown capability %s", drop_entry);
2186 return -1;
2187 }
2188
2189 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2190
2191 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2192 SYSERROR("failed to remove %s capability", drop_entry);
2193 return -1;
2194 }
2195
2196 }
2197
2198 DEBUG("capabilities have been setup");
2199
2200 return 0;
2201 }
2202
2203 static int dropcaps_except(struct lxc_list *caps)
2204 {
2205 struct lxc_list *iterator;
2206 char *keep_entry;
2207 int i, capid;
2208 int numcaps = lxc_caps_last_cap() + 1;
2209 INFO("found %d capabilities", numcaps);
2210
2211 if (numcaps <= 0 || numcaps > 200)
2212 return -1;
2213
2214 // caplist[i] is 1 if we keep capability i
2215 int *caplist = alloca(numcaps * sizeof(int));
2216 memset(caplist, 0, numcaps * sizeof(int));
2217
2218 lxc_list_for_each(iterator, caps) {
2219
2220 keep_entry = iterator->elem;
2221
2222 capid = parse_cap(keep_entry);
2223
2224 if (capid == -2)
2225 continue;
2226
2227 if (capid < 0) {
2228 ERROR("unknown capability %s", keep_entry);
2229 return -1;
2230 }
2231
2232 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2233
2234 caplist[capid] = 1;
2235 }
2236 for (i=0; i<numcaps; i++) {
2237 if (caplist[i])
2238 continue;
2239 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2240 SYSERROR("failed to remove capability %d", i);
2241 return -1;
2242 }
2243 }
2244
2245 DEBUG("capabilities have been setup");
2246
2247 return 0;
2248 }
2249
2250 static int setup_hw_addr(char *hwaddr, const char *ifname)
2251 {
2252 struct sockaddr sockaddr;
2253 struct ifreq ifr;
2254 int ret, fd, saved_errno;
2255
2256 ret = lxc_convert_mac(hwaddr, &sockaddr);
2257 if (ret) {
2258 ERROR("mac address '%s' conversion failed : %s",
2259 hwaddr, strerror(-ret));
2260 return -1;
2261 }
2262
2263 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2264 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2265 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2266
2267 fd = socket(AF_INET, SOCK_DGRAM, 0);
2268 if (fd < 0) {
2269 ERROR("socket failure : %s", strerror(errno));
2270 return -1;
2271 }
2272
2273 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2274 saved_errno = errno;
2275 close(fd);
2276 if (ret)
2277 ERROR("ioctl failure : %s", strerror(saved_errno));
2278
2279 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2280
2281 return ret;
2282 }
2283
2284 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2285 {
2286 struct lxc_list *iterator;
2287 struct lxc_inetdev *inetdev;
2288 int err;
2289
2290 lxc_list_for_each(iterator, ip) {
2291
2292 inetdev = iterator->elem;
2293
2294 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2295 &inetdev->bcast, inetdev->prefix);
2296 if (err) {
2297 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2298 ifindex, strerror(-err));
2299 return -1;
2300 }
2301 }
2302
2303 return 0;
2304 }
2305
2306 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2307 {
2308 struct lxc_list *iterator;
2309 struct lxc_inet6dev *inet6dev;
2310 int err;
2311
2312 lxc_list_for_each(iterator, ip) {
2313
2314 inet6dev = iterator->elem;
2315
2316 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2317 &inet6dev->mcast, &inet6dev->acast,
2318 inet6dev->prefix);
2319 if (err) {
2320 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2321 ifindex, strerror(-err));
2322 return -1;
2323 }
2324 }
2325
2326 return 0;
2327 }
2328
2329 static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
2330 {
2331 char ifname[IFNAMSIZ];
2332 int err;
2333 const char *net_type_name;
2334 char *current_ifname = ifname;
2335
2336 /* empty network namespace */
2337 if (!netdev->ifindex) {
2338 if (netdev->flags & IFF_UP) {
2339 err = lxc_netdev_up("lo");
2340 if (err) {
2341 ERROR("failed to set the loopback up : %s",
2342 strerror(-err));
2343 return -1;
2344 }
2345 }
2346
2347 if (netdev->type == LXC_NET_EMPTY)
2348 return 0;
2349
2350 if (netdev->type == LXC_NET_NONE)
2351 return 0;
2352
2353 if (netdev->type != LXC_NET_VETH) {
2354 net_type_name = lxc_net_type_to_str(netdev->type);
2355 ERROR("%s networks are not supported for containers "
2356 "not setup up by privileged users",
2357 net_type_name);
2358 return -1;
2359 }
2360
2361 netdev->ifindex = if_nametoindex(netdev->name);
2362 }
2363
2364 /* get the new ifindex in case of physical netdev */
2365 if (netdev->type == LXC_NET_PHYS) {
2366 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2367 ERROR("failed to get ifindex for %s",
2368 netdev->link);
2369 return -1;
2370 }
2371 }
2372
2373 /* retrieve the name of the interface */
2374 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2375 ERROR("no interface corresponding to index '%d'",
2376 netdev->ifindex);
2377 return -1;
2378 }
2379
2380 /* default: let the system to choose one interface name */
2381 if (!netdev->name)
2382 netdev->name = netdev->type == LXC_NET_PHYS ?
2383 netdev->link : "eth%d";
2384
2385 /* rename the interface name */
2386 if (strcmp(ifname, netdev->name) != 0) {
2387 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2388 if (err) {
2389 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2390 strerror(-err));
2391 return -1;
2392 }
2393 }
2394
2395 /* Re-read the name of the interface because its name has changed
2396 * and would be automatically allocated by the system
2397 */
2398 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2399 ERROR("no interface corresponding to index '%d'",
2400 netdev->ifindex);
2401 return -1;
2402 }
2403
2404 /* set a mac address */
2405 if (netdev->hwaddr) {
2406 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2407 ERROR("failed to setup hw address for '%s'",
2408 current_ifname);
2409 return -1;
2410 }
2411 }
2412
2413 /* setup ipv4 addresses on the interface */
2414 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2415 ERROR("failed to setup ip addresses for '%s'",
2416 ifname);
2417 return -1;
2418 }
2419
2420 /* setup ipv6 addresses on the interface */
2421 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2422 ERROR("failed to setup ipv6 addresses for '%s'",
2423 ifname);
2424 return -1;
2425 }
2426
2427 /* set the network device up */
2428 if (netdev->flags & IFF_UP) {
2429 int err;
2430
2431 err = lxc_netdev_up(current_ifname);
2432 if (err) {
2433 ERROR("failed to set '%s' up : %s", current_ifname,
2434 strerror(-err));
2435 return -1;
2436 }
2437
2438 /* the network is up, make the loopback up too */
2439 err = lxc_netdev_up("lo");
2440 if (err) {
2441 ERROR("failed to set the loopback up : %s",
2442 strerror(-err));
2443 return -1;
2444 }
2445 }
2446
2447 /* We can only set up the default routes after bringing
2448 * up the interface, sine bringing up the interface adds
2449 * the link-local routes and we can't add a default
2450 * route if the gateway is not reachable. */
2451
2452 /* setup ipv4 gateway on the interface */
2453 if (netdev->ipv4_gateway) {
2454 if (!(netdev->flags & IFF_UP)) {
2455 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2456 return -1;
2457 }
2458
2459 if (lxc_list_empty(&netdev->ipv4)) {
2460 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2461 return -1;
2462 }
2463
2464 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2465 if (err) {
2466 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2467 if (err) {
2468 ERROR("failed to add ipv4 dest for '%s': %s",
2469 ifname, strerror(-err));
2470 }
2471
2472 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2473 if (err) {
2474 ERROR("failed to setup ipv4 gateway for '%s': %s",
2475 ifname, strerror(-err));
2476 if (netdev->ipv4_gateway_auto) {
2477 char buf[INET_ADDRSTRLEN];
2478 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2479 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2480 }
2481 return -1;
2482 }
2483 }
2484 }
2485
2486 /* setup ipv6 gateway on the interface */
2487 if (netdev->ipv6_gateway) {
2488 if (!(netdev->flags & IFF_UP)) {
2489 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2490 return -1;
2491 }
2492
2493 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2494 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2495 return -1;
2496 }
2497
2498 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2499 if (err) {
2500 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2501 if (err) {
2502 ERROR("failed to add ipv6 dest for '%s': %s",
2503 ifname, strerror(-err));
2504 }
2505
2506 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2507 if (err) {
2508 ERROR("failed to setup ipv6 gateway for '%s': %s",
2509 ifname, strerror(-err));
2510 if (netdev->ipv6_gateway_auto) {
2511 char buf[INET6_ADDRSTRLEN];
2512 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2513 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2514 }
2515 return -1;
2516 }
2517 }
2518 }
2519
2520 DEBUG("'%s' has been setup", current_ifname);
2521
2522 return 0;
2523 }
2524
2525 static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2526 struct lxc_list *network)
2527 {
2528 struct lxc_list *iterator;
2529 struct lxc_netdev *netdev;
2530
2531 lxc_log_configured_netdevs(conf);
2532
2533 lxc_list_for_each(iterator, network) {
2534 netdev = iterator->elem;
2535
2536 /* REMOVE in LXC 3.0 */
2537 if (netdev->idx < 0) {
2538 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2539 "networks is DEPRECATED, please switch to using "
2540 "\"lxc.net.[i].* keys\"");
2541 }
2542
2543 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
2544 ERROR("failed to setup netdev");
2545 return -1;
2546 }
2547 }
2548
2549 if (!lxc_list_empty(network))
2550 INFO("network has been setup");
2551
2552 return 0;
2553 }
2554
2555 static int parse_resource(const char *res) {
2556 size_t i;
2557 int resid = -1;
2558
2559 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2560 if (strcmp(res, limit_opt[i].name) == 0)
2561 return limit_opt[i].value;
2562 }
2563
2564 /* try to see if it's numeric, so the user may specify
2565 * resources that the running kernel knows about but
2566 * we don't */
2567 if (lxc_safe_int(res, &resid) == 0)
2568 return resid;
2569 return -1;
2570 }
2571
2572 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2573 struct lxc_list *it;
2574 struct lxc_limit *lim;
2575 int resid;
2576
2577 lxc_list_for_each(it, limits) {
2578 lim = it->elem;
2579
2580 resid = parse_resource(lim->resource);
2581 if (resid < 0) {
2582 ERROR("unknown resource %s", lim->resource);
2583 return -1;
2584 }
2585
2586 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2587 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2588 return -1;
2589 }
2590 }
2591 return 0;
2592 }
2593
2594 /* try to move physical nics to the init netns */
2595 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2596 {
2597 int i, oldfd;
2598 char ifname[IFNAMSIZ];
2599
2600 if (netnsfd < 0 || conf->num_savednics == 0)
2601 return;
2602
2603 INFO("Running to reset %d nic names.", conf->num_savednics);
2604
2605 oldfd = lxc_preserve_ns(getpid(), "net");
2606 if (oldfd < 0) {
2607 SYSERROR("Failed to open monitor netns fd.");
2608 return;
2609 }
2610
2611 if (setns(netnsfd, 0) != 0) {
2612 SYSERROR("Failed to enter container netns to reset nics");
2613 close(oldfd);
2614 return;
2615 }
2616 for (i=0; i<conf->num_savednics; i++) {
2617 struct saved_nic *s = &conf->saved_nics[i];
2618 /* retrieve the name of the interface */
2619 if (!if_indextoname(s->ifindex, ifname)) {
2620 WARN("no interface corresponding to index '%d'", s->ifindex);
2621 continue;
2622 }
2623 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2624 WARN("Error moving nic name:%s back to host netns", ifname);
2625 free(s->orig_name);
2626 }
2627 conf->num_savednics = 0;
2628
2629 if (setns(oldfd, 0) != 0)
2630 SYSERROR("Failed to re-enter monitor's netns");
2631 close(oldfd);
2632 }
2633
2634 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2635
2636 struct lxc_conf *lxc_conf_init(void)
2637 {
2638 struct lxc_conf *new;
2639 int i;
2640
2641 new = malloc(sizeof(*new));
2642 if (!new) {
2643 ERROR("lxc_conf_init : %s", strerror(errno));
2644 return NULL;
2645 }
2646 memset(new, 0, sizeof(*new));
2647
2648 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2649 new->personality = -1;
2650 new->autodev = 1;
2651 new->console.log_path = NULL;
2652 new->console.log_fd = -1;
2653 new->console.path = NULL;
2654 new->console.peer = -1;
2655 new->console.peerpty.busy = -1;
2656 new->console.peerpty.master = -1;
2657 new->console.peerpty.slave = -1;
2658 new->console.master = -1;
2659 new->console.slave = -1;
2660 new->console.name[0] = '\0';
2661 new->maincmd_fd = -1;
2662 new->nbd_idx = -1;
2663 new->rootfs.mount = strdup(default_rootfs_mount);
2664 if (!new->rootfs.mount) {
2665 ERROR("lxc_conf_init : %s", strerror(errno));
2666 free(new);
2667 return NULL;
2668 }
2669 new->kmsg = 0;
2670 new->logfd = -1;
2671 lxc_list_init(&new->cgroup);
2672 lxc_list_init(&new->network);
2673 lxc_list_init(&new->mount_list);
2674 lxc_list_init(&new->caps);
2675 lxc_list_init(&new->keepcaps);
2676 lxc_list_init(&new->id_map);
2677 lxc_list_init(&new->includes);
2678 lxc_list_init(&new->aliens);
2679 lxc_list_init(&new->environment);
2680 lxc_list_init(&new->limits);
2681 for (i=0; i<NUM_LXC_HOOKS; i++)
2682 lxc_list_init(&new->hooks[i]);
2683 lxc_list_init(&new->groups);
2684 new->lsm_aa_profile = NULL;
2685 new->lsm_se_context = NULL;
2686 new->tmp_umount_proc = 0;
2687
2688 for (i = 0; i < LXC_NS_MAX; i++)
2689 new->inherit_ns_fd[i] = -1;
2690
2691 /* if running in a new user namespace, init and COMMAND
2692 * default to running as UID/GID 0 when using lxc-execute */
2693 new->init_uid = 0;
2694 new->init_gid = 0;
2695
2696 return new;
2697 }
2698
2699 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2700 {
2701 char *veth1, *veth2;
2702 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2703 int bridge_index, err;
2704 unsigned int mtu = 0;
2705
2706 if (netdev->priv.veth_attr.pair) {
2707 veth1 = netdev->priv.veth_attr.pair;
2708 if (handler->conf->reboot)
2709 lxc_netdev_delete_by_name(veth1);
2710 } else {
2711 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2712 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2713 ERROR("veth1 name too long");
2714 return -1;
2715 }
2716 veth1 = lxc_mkifname(veth1buf);
2717 if (!veth1) {
2718 ERROR("failed to allocate a temporary name");
2719 return -1;
2720 }
2721 /* store away for deconf */
2722 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2723 }
2724
2725 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2726 veth2 = lxc_mkifname(veth2buf);
2727 if (!veth2) {
2728 ERROR("failed to allocate a temporary name");
2729 goto out_delete;
2730 }
2731
2732 err = lxc_veth_create(veth1, veth2);
2733 if (err) {
2734 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2735 veth2, strerror(-err));
2736 goto out_delete;
2737 }
2738
2739 /* changing the high byte of the mac address to 0xfe, the bridge interface
2740 * will always keep the host's mac address and not take the mac address
2741 * of a container */
2742 err = setup_private_host_hw_addr(veth1);
2743 if (err) {
2744 ERROR("failed to change mac address of host interface \"%s\": %s",
2745 veth1, strerror(-err));
2746 goto out_delete;
2747 }
2748
2749 netdev->ifindex = if_nametoindex(veth2);
2750 if (!netdev->ifindex) {
2751 ERROR("failed to retrieve the index for \"%s\"", veth2);
2752 goto out_delete;
2753 }
2754
2755 if (netdev->mtu) {
2756 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2757 WARN("failed to parse mtu from");
2758 else
2759 INFO("retrieved mtu %d", mtu);
2760 } else if (netdev->link) {
2761 bridge_index = if_nametoindex(netdev->link);
2762 if (bridge_index) {
2763 mtu = netdev_get_mtu(bridge_index);
2764 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2765 } else {
2766 mtu = netdev_get_mtu(netdev->ifindex);
2767 INFO("retrieved mtu %d from %s", mtu, veth2);
2768 }
2769 }
2770
2771 if (mtu) {
2772 err = lxc_netdev_set_mtu(veth1, mtu);
2773 if (!err)
2774 err = lxc_netdev_set_mtu(veth2, mtu);
2775 if (err) {
2776 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2777 "and \"%s\": %s",
2778 mtu, veth1, veth2, strerror(-err));
2779 goto out_delete;
2780 }
2781 }
2782
2783 if (netdev->link) {
2784 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
2785 if (err) {
2786 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2787 veth1, netdev->link, strerror(-err));
2788 goto out_delete;
2789 }
2790 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2791 }
2792
2793 err = lxc_netdev_up(veth1);
2794 if (err) {
2795 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2796 goto out_delete;
2797 }
2798
2799 if (netdev->upscript) {
2800 err = run_script(handler->name, "net", netdev->upscript, "up",
2801 "veth", veth1, (char*) NULL);
2802 if (err)
2803 goto out_delete;
2804 }
2805
2806 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2807 netdev->ifindex);
2808
2809 return 0;
2810
2811 out_delete:
2812 if (netdev->ifindex != 0)
2813 lxc_netdev_delete_by_name(veth1);
2814 if (!netdev->priv.veth_attr.pair)
2815 free(veth1);
2816 free(veth2);
2817 return -1;
2818 }
2819
2820 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2821 {
2822 char *veth1;
2823 int err;
2824
2825 if (netdev->priv.veth_attr.pair)
2826 veth1 = netdev->priv.veth_attr.pair;
2827 else
2828 veth1 = netdev->priv.veth_attr.veth1;
2829
2830 if (netdev->downscript) {
2831 err = run_script(handler->name, "net", netdev->downscript,
2832 "down", "veth", veth1, (char*) NULL);
2833 if (err)
2834 return -1;
2835 }
2836 return 0;
2837 }
2838
2839 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2840 {
2841 char peerbuf[IFNAMSIZ], *peer;
2842 int err;
2843
2844 if (!netdev->link) {
2845 ERROR("no link specified for macvlan netdev");
2846 return -1;
2847 }
2848
2849 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2850 if (err >= sizeof(peerbuf))
2851 return -1;
2852
2853 peer = lxc_mkifname(peerbuf);
2854 if (!peer) {
2855 ERROR("failed to make a temporary name");
2856 return -1;
2857 }
2858
2859 err = lxc_macvlan_create(netdev->link, peer,
2860 netdev->priv.macvlan_attr.mode);
2861 if (err) {
2862 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2863 peer, netdev->link, strerror(-err));
2864 goto out;
2865 }
2866
2867 netdev->ifindex = if_nametoindex(peer);
2868 if (!netdev->ifindex) {
2869 ERROR("failed to retrieve the index for %s", peer);
2870 goto out;
2871 }
2872
2873 if (netdev->upscript) {
2874 err = run_script(handler->name, "net", netdev->upscript, "up",
2875 "macvlan", netdev->link, (char*) NULL);
2876 if (err)
2877 goto out;
2878 }
2879
2880 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2881 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2882
2883 return 0;
2884 out:
2885 lxc_netdev_delete_by_name(peer);
2886 free(peer);
2887 return -1;
2888 }
2889
2890 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2891 {
2892 int err;
2893
2894 if (netdev->downscript) {
2895 err = run_script(handler->name, "net", netdev->downscript,
2896 "down", "macvlan", netdev->link,
2897 (char*) NULL);
2898 if (err)
2899 return -1;
2900 }
2901 return 0;
2902 }
2903
2904 /* XXX: merge with instantiate_macvlan */
2905 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2906 {
2907 char peer[IFNAMSIZ];
2908 int err;
2909 static uint16_t vlan_cntr = 0;
2910 unsigned int mtu = 0;
2911
2912 if (!netdev->link) {
2913 ERROR("no link specified for vlan netdev");
2914 return -1;
2915 }
2916
2917 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2918 if (err >= sizeof(peer)) {
2919 ERROR("peer name too long");
2920 return -1;
2921 }
2922
2923 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2924 if (err) {
2925 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2926 peer, netdev->link, strerror(-err));
2927 return -1;
2928 }
2929
2930 netdev->ifindex = if_nametoindex(peer);
2931 if (!netdev->ifindex) {
2932 ERROR("failed to retrieve the ifindex for %s", peer);
2933 lxc_netdev_delete_by_name(peer);
2934 return -1;
2935 }
2936
2937 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
2938 netdev->ifindex);
2939 if (netdev->mtu) {
2940 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2941 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2942 netdev->ifindex, netdev->name);
2943 return -1;
2944 }
2945 err = lxc_netdev_set_mtu(peer, mtu);
2946 if (err) {
2947 ERROR("failed to set mtu '%s' for %s : %s",
2948 netdev->mtu, peer, strerror(-err));
2949 lxc_netdev_delete_by_name(peer);
2950 return -1;
2951 }
2952 }
2953
2954 return 0;
2955 }
2956
/* Tear-down counterpart for vlan devices: does nothing and returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2961
2962 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2963 {
2964 if (!netdev->link) {
2965 ERROR("no link specified for the physical interface");
2966 return -1;
2967 }
2968
2969 netdev->ifindex = if_nametoindex(netdev->link);
2970 if (!netdev->ifindex) {
2971 ERROR("failed to retrieve the index for %s", netdev->link);
2972 return -1;
2973 }
2974
2975 if (netdev->upscript) {
2976 int err;
2977 err = run_script(handler->name, "net", netdev->upscript,
2978 "up", "phys", netdev->link, (char*) NULL);
2979 if (err)
2980 return -1;
2981 }
2982
2983 return 0;
2984 }
2985
2986 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2987 {
2988 int err;
2989
2990 if (netdev->downscript) {
2991 err = run_script(handler->name, "net", netdev->downscript,
2992 "down", "phys", netdev->link, (char*) NULL);
2993 if (err)
2994 return -1;
2995 }
2996 return 0;
2997 }
2998
2999 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3000 {
3001 netdev->ifindex = 0;
3002 return 0;
3003 }
3004
3005 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3006 {
3007 netdev->ifindex = 0;
3008 if (netdev->upscript) {
3009 int err;
3010 err = run_script(handler->name, "net", netdev->upscript,
3011 "up", "empty", (char*) NULL);
3012 if (err)
3013 return -1;
3014 }
3015 return 0;
3016 }
3017
3018 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3019 {
3020 int err;
3021
3022 if (netdev->downscript) {
3023 err = run_script(handler->name, "net", netdev->downscript,
3024 "down", "empty", (char*) NULL);
3025 if (err)
3026 return -1;
3027 }
3028 return 0;
3029 }
3030
/* Tear-down counterpart for "none" networks: does nothing and returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3035
3036 int lxc_requests_empty_network(struct lxc_handler *handler)
3037 {
3038 struct lxc_list *network = &handler->conf->network;
3039 struct lxc_list *iterator;
3040 struct lxc_netdev *netdev;
3041 bool found_none = false, found_nic = false;
3042
3043 if (lxc_list_empty(network))
3044 return 0;
3045
3046 lxc_list_for_each(iterator, network) {
3047
3048 netdev = iterator->elem;
3049
3050 if (netdev->type == LXC_NET_NONE)
3051 found_none = true;
3052 else
3053 found_nic = true;
3054 }
3055 if (found_none && !found_nic)
3056 return 1;
3057 return 0;
3058 }
3059
3060 int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
3061 {
3062 bool am_root;
3063 struct lxc_netdev *netdev;
3064 struct lxc_list *iterator;
3065 struct lxc_list *network = &handler->conf->network;
3066
3067 /* We need to be root. */
3068 am_root = (getuid() == 0);
3069 if (!am_root)
3070 return 0;
3071
3072 lxc_list_for_each(iterator, network) {
3073 netdev = iterator->elem;
3074
3075 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3076 ERROR("invalid network configuration type '%d'",
3077 netdev->type);
3078 return -1;
3079 }
3080
3081 if (netdev->type != LXC_NET_MACVLAN &&
3082 netdev->priv.macvlan_attr.mode) {
3083 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
3084 return -1;
3085 }
3086
3087 if (netdev->type != LXC_NET_VETH &&
3088 netdev->priv.veth_attr.pair) {
3089 ERROR("Invalid veth pair for a non-veth netdev");
3090 return -1;
3091 }
3092
3093 if (netdev->type != LXC_NET_VLAN &&
3094 netdev->priv.vlan_attr.vid > 0) {
3095 ERROR("Invalid vlan.id for a non-macvlan netdev");
3096 return -1;
3097 }
3098
3099 if (netdev_conf[netdev->type](handler, netdev)) {
3100 ERROR("failed to create netdev");
3101 return -1;
3102 }
3103
3104 }
3105
3106 return 0;
3107 }
3108
3109 bool lxc_delete_network(struct lxc_handler *handler)
3110 {
3111 int ret;
3112 struct lxc_list *network = &handler->conf->network;
3113 struct lxc_list *iterator;
3114 struct lxc_netdev *netdev;
3115 bool deleted_all = true;
3116
3117 lxc_list_for_each(iterator, network) {
3118 netdev = iterator->elem;
3119
3120 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3121 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3122 WARN("Failed to rename interface with index %d "
3123 "to its initial name \"%s\".",
3124 netdev->ifindex, netdev->link);
3125 continue;
3126 }
3127
3128 if (netdev_deconf[netdev->type](handler, netdev)) {
3129 WARN("Failed to destroy netdev");
3130 }
3131
3132 /* Recent kernel remove the virtual interfaces when the network
3133 * namespace is destroyed but in case we did not moved the
3134 * interface to the network namespace, we have to destroy it
3135 */
3136 if (netdev->ifindex != 0) {
3137 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3138 if (-ret == ENODEV) {
3139 INFO("Interface \"%s\" with index %d already "
3140 "deleted or existing in different network "
3141 "namespace.",
3142 netdev->name ? netdev->name : "(null)",
3143 netdev->ifindex);
3144 } else if (ret < 0) {
3145 deleted_all = false;
3146 WARN("Failed to remove interface \"%s\" with "
3147 "index %d: %s.",
3148 netdev->name ? netdev->name : "(null)",
3149 netdev->ifindex, strerror(-ret));
3150 } else {
3151 INFO("Removed interface \"%s\" with index %d.",
3152 netdev->name ? netdev->name : "(null)",
3153 netdev->ifindex);
3154 }
3155 }
3156
3157 /* Explicitly delete host veth device to prevent lingering
3158 * devices. We had issues in LXD around this.
3159 */
3160 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
3161 char *hostveth;
3162 if (netdev->priv.veth_attr.pair) {
3163 hostveth = netdev->priv.veth_attr.pair;
3164 ret = lxc_netdev_delete_by_name(hostveth);
3165 if (ret < 0) {
3166 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3167 } else {
3168 INFO("Removed interface \"%s\" from host.", hostveth);
3169 }
3170 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
3171 hostveth = netdev->priv.veth_attr.veth1;
3172 ret = lxc_netdev_delete_by_name(hostveth);
3173 if (ret < 0) {
3174 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3175 } else {
3176 INFO("Removed interface \"%s\" from host.", hostveth);
3177 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3178 }
3179 }
3180 }
3181 }
3182
3183 return deleted_all;
3184 }
3185
3186 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3187
3188 /* lxc-user-nic returns "interface_name:interface_name\n" */
3189 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3190 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3191 struct lxc_netdev *netdev, pid_t pid)
3192 {
3193 pid_t child;
3194 int bytes, pipefd[2];
3195 char *token, *saveptr = NULL;
3196 char buffer[MAX_BUFFER_SIZE];
3197 char netdev_link[IFNAMSIZ + 1];
3198
3199 if (netdev->type != LXC_NET_VETH) {
3200 ERROR("nic type %d not support for unprivileged use",
3201 netdev->type);
3202 return -1;
3203 }
3204
3205 if (pipe(pipefd) < 0) {
3206 SYSERROR("pipe failed");
3207 return -1;
3208 }
3209
3210 child = fork();
3211 if (child < 0) {
3212 SYSERROR("fork");
3213 close(pipefd[0]);
3214 close(pipefd[1]);
3215 return -1;
3216 }
3217
3218 if (child == 0) { // child
3219 /* Call lxc-user-nic pid type bridge. */
3220 int ret;
3221 char pidstr[LXC_NUMSTRLEN64];
3222
3223 close(pipefd[0]); /* Close the read-end of the pipe. */
3224
3225 /* Redirect stdout to write-end of the pipe. */
3226 ret = dup2(pipefd[1], STDOUT_FILENO);
3227 close(pipefd[1]); /* Close the write-end of the pipe. */
3228 if (ret < 0) {
3229 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3230 exit(EXIT_FAILURE);
3231 }
3232
3233 if (netdev->link)
3234 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3235 else
3236 strncpy(netdev_link, "none", IFNAMSIZ);
3237
3238 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3239 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3240 exit(EXIT_FAILURE);
3241 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3242
3243 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3244 lxcname, pidstr, netdev_link, netdev->name);
3245 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
3246 pidstr, "veth", netdev_link, netdev->name, NULL);
3247
3248 SYSERROR("Failed to exec lxc-user-nic.");
3249 exit(EXIT_FAILURE);
3250 }
3251
3252 /* close the write-end of the pipe */
3253 close(pipefd[1]);
3254
3255 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3256 if (bytes < 0)
3257 SYSERROR("Failed to read from pipe file descriptor.");
3258 buffer[bytes - 1] = '\0';
3259
3260 if (wait_for_pid(child) != 0) {
3261 close(pipefd[0]);
3262 return -1;
3263 }
3264
3265 /* close the read-end of the pipe */
3266 close(pipefd[0]);
3267
3268 /* fill netdev->name field */
3269 token = strtok_r(buffer, ":", &saveptr);
3270 if (!token)
3271 return -1;
3272
3273 netdev->name = malloc(IFNAMSIZ + 1);
3274 if (!netdev->name) {
3275 SYSERROR("Failed to allocate memory.");
3276 return -1;
3277 }
3278 memset(netdev->name, 0, IFNAMSIZ + 1);
3279 strncpy(netdev->name, token, IFNAMSIZ);
3280
3281 /* fill netdev->veth_attr.pair field */
3282 token = strtok_r(NULL, ":", &saveptr);
3283 if (!token)
3284 return -1;
3285
3286 netdev->priv.veth_attr.pair = strdup(token);
3287 if (!netdev->priv.veth_attr.pair) {
3288 ERROR("Failed to allocate memory.");
3289 return -1;
3290 }
3291
3292 return 0;
3293 }
3294
3295 int lxc_assign_network(const char *lxcpath, char *lxcname,
3296 struct lxc_list *network, pid_t pid)
3297 {
3298 struct lxc_list *iterator;
3299 struct lxc_netdev *netdev;
3300 char ifname[IFNAMSIZ];
3301 int am_root = (getuid() == 0);
3302 int err;
3303
3304 lxc_list_for_each(iterator, network) {
3305
3306 netdev = iterator->elem;
3307
3308 if (netdev->type == LXC_NET_VETH && !am_root) {
3309 if (netdev->mtu)
3310 INFO("mtu ignored due to insufficient privilege");
3311 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3312 return -1;
3313 /* lxc-user-nic has moved the nic to the new ns.
3314 * unpriv_assign_nic() fills in netdev->name.
3315 * netdev->ifindex will be filed in at
3316 * lxc_setup_netdev_in_child_namespaces.
3317 */
3318 continue;
3319 }
3320
3321 /* empty network namespace, nothing to move */
3322 if (!netdev->ifindex)
3323 continue;
3324
3325 /* retrieve the name of the interface */
3326 if (!if_indextoname(netdev->ifindex, ifname)) {
3327 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3328 return -1;
3329 }
3330
3331 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3332 if (err) {
3333 ERROR("failed to move '%s' to the container : %s",
3334 netdev->link, strerror(-err));
3335 return -1;
3336 }
3337
3338 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3339 }
3340
3341 return 0;
3342 }
3343
3344 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3345 size_t buf_size)
3346 {
3347 char path[MAXPATHLEN];
3348 int fd, ret;
3349
3350 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3351 idtype == ID_TYPE_UID ? 'u' : 'g');
3352 if (ret < 0 || ret >= MAXPATHLEN) {
3353 ERROR("failed to create path \"%s\"", path);
3354 return -E2BIG;
3355 }
3356
3357 fd = open(path, O_WRONLY);
3358 if (fd < 0) {
3359 SYSERROR("failed to open \"%s\"", path);
3360 return -1;
3361 }
3362
3363 errno = 0;
3364 ret = lxc_write_nointr(fd, buf, buf_size);
3365 if (ret != buf_size) {
3366 SYSERROR("failed to write %cid mapping to \"%s\"",
3367 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3368 close(fd);
3369 return -1;
3370 }
3371 close(fd);
3372
3373 return 0;
3374 }
3375
3376 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3377 *
3378 * @return 1 if functional binary was found
3379 * @return 0 if binary exists but is lacking privilege
3380 * @return -ENOENT if binary does not exist
3381 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3382 *
3383 */
3384 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3385 {
3386 char *path;
3387 int ret;
3388 struct stat st;
3389 int fret = 0;
3390
3391 if (cap != CAP_SETUID && cap != CAP_SETGID)
3392 return -EINVAL;
3393
3394 path = on_path(binary, NULL);
3395 if (!path)
3396 return -ENOENT;
3397
3398 ret = stat(path, &st);
3399 if (ret < 0) {
3400 fret = -errno;
3401 goto cleanup;
3402 }
3403
3404 /* Check if the binary is setuid. */
3405 if (st.st_mode & S_ISUID) {
3406 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3407 fret = 1;
3408 goto cleanup;
3409 }
3410
3411 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3412 /* Check if it has the CAP_SETUID capability. */
3413 if ((cap & CAP_SETUID) &&
3414 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3415 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3416 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3417 "and CAP_PERMITTED sets.", path);
3418 fret = 1;
3419 goto cleanup;
3420 }
3421
3422 /* Check if it has the CAP_SETGID capability. */
3423 if ((cap & CAP_SETGID) &&
3424 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3425 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3426 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3427 "and CAP_PERMITTED sets.", path);
3428 fret = 1;
3429 goto cleanup;
3430 }
3431 #else
3432 /* If we cannot check for file capabilities we need to give the benefit
3433 * of the doubt. Otherwise we might fail even though all the necessary
3434 * file capabilities are set.
3435 */
3436 DEBUG("Cannot check for file capabilites as full capability support is "
3437 "missing. Manual intervention needed.");
3438 fret = 1;
3439 #endif
3440
3441 cleanup:
3442 free(path);
3443 return fret;
3444 }
3445
/*
 * Replace the current process image with "/bin/sh -c <args>", where @args is
 * the command string to run.
 *
 * Only returns (with -1) if execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *command = args;

	execl("/bin/sh", "sh", "-c", command, (char *)NULL);

	return -1;
}
3451
3452 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3453 {
3454 struct id_map *map;
3455 struct lxc_list *iterator;
3456 enum idtype type;
3457 char u_or_g;
3458 char *pos;
3459 int fill, left;
3460 char cmd_output[MAXPATHLEN];
3461 /* strlen("new@idmap") = 9
3462 * +
3463 * strlen(" ") = 1
3464 * +
3465 * LXC_NUMSTRLEN64
3466 * +
3467 * strlen(" ") = 1
3468 *
3469 * We add some additional space to make sure that we really have
3470 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3471 */
3472 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3473 int ret = 0, uidmap = 0, gidmap = 0;
3474 bool use_shadow = false, had_entry = false;
3475
3476 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3477 * ranges, then insist that root also reserve ranges in subuid. This
3478 * will protected it by preventing another user from being handed the
3479 * range by shadow.
3480 */
3481 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3482 if (uidmap == -ENOENT)
3483 WARN("newuidmap binary is missing");
3484 else if (!uidmap)
3485 WARN("newuidmap is lacking necessary privileges");
3486
3487 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3488 if (gidmap == -ENOENT)
3489 WARN("newgidmap binary is missing");
3490 else if (!gidmap)
3491 WARN("newgidmap is lacking necessary privileges");
3492
3493 if (uidmap > 0 && gidmap > 0) {
3494 DEBUG("Functional newuidmap and newgidmap binary found.");
3495 use_shadow = true;
3496 } else {
3497 /* In case unprivileged users run application containers via
3498 * execute() or a start*() there are valid cases where they may
3499 * only want to map their own {g,u}id. Let's not block them from
3500 * doing so by requiring geteuid() == 0.
3501 */
3502 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3503 "write directly with euid %d.", geteuid());
3504 }
3505
3506 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3507 type++, u_or_g = 'g') {
3508 pos = mapbuf;
3509
3510 if (use_shadow)
3511 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3512
3513 lxc_list_for_each(iterator, idmap) {
3514 /* The kernel only takes <= 4k for writes to
3515 * /proc/<nr>/[ug]id_map
3516 */
3517 map = iterator->elem;
3518 if (map->idtype != type)
3519 continue;
3520
3521 had_entry = true;
3522
3523 left = LXC_IDMAPLEN - (pos - mapbuf);
3524 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3525 use_shadow ? " " : "", map->nsid,
3526 map->hostid, map->range,
3527 use_shadow ? "" : "\n");
3528 if (fill <= 0 || fill >= left)
3529 SYSERROR("Too many {g,u}id mappings defined.");
3530
3531 pos += fill;
3532 }
3533 if (!had_entry)
3534 continue;
3535
3536 /* Try to catch the ouput of new{g,u}idmap to make debugging
3537 * easier.
3538 */
3539 if (use_shadow) {
3540 ret = run_command(cmd_output, sizeof(cmd_output),
3541 lxc_map_ids_exec_wrapper,
3542 (void *)mapbuf);
3543 if (ret < 0) {
3544 ERROR("new%cidmap failed to write mapping: %s",
3545 u_or_g, cmd_output);
3546 return -1;
3547 }
3548 } else {
3549 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3550 if (ret < 0)
3551 return -1;
3552 }
3553
3554 memset(mapbuf, 0, sizeof(mapbuf));
3555 }
3556
3557 return 0;
3558 }
3559
3560 /*
3561 * return the host uid/gid to which the container root is mapped in
3562 * *val.
3563 * Return true if id was found, false otherwise.
3564 */
3565 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3566 unsigned long *val)
3567 {
3568 struct lxc_list *it;
3569 struct id_map *map;
3570
3571 lxc_list_for_each(it, &conf->id_map) {
3572 map = it->elem;
3573 if (map->idtype != idtype)
3574 continue;
3575 if (map->nsid != 0)
3576 continue;
3577 *val = map->hostid;
3578 return true;
3579 }
3580 return false;
3581 }
3582
3583 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3584 {
3585 struct lxc_list *it;
3586 struct id_map *map;
3587 lxc_list_for_each(it, &conf->id_map) {
3588 map = it->elem;
3589 if (map->idtype != idtype)
3590 continue;
3591 if (id >= map->hostid && id < map->hostid + map->range)
3592 return (id - map->hostid) + map->nsid;
3593 }
3594 return -1;
3595 }
3596
3597 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3598 {
3599 struct lxc_list *it;
3600 struct id_map *map;
3601 unsigned int freeid = 0;
3602 again:
3603 lxc_list_for_each(it, &conf->id_map) {
3604 map = it->elem;
3605 if (map->idtype != idtype)
3606 continue;
3607 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3608 freeid = map->nsid + map->range;
3609 goto again;
3610 }
3611 }
3612 return freeid;
3613 }
3614
3615 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3616 {
3617 struct lxc_list *network = &handler->conf->network;
3618 struct lxc_list *iterator;
3619 struct lxc_netdev *netdev;
3620 int link_index;
3621
3622 lxc_list_for_each(iterator, network) {
3623 netdev = iterator->elem;
3624
3625 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3626 continue;
3627
3628 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3629 ERROR("gateway = auto only supported for "
3630 "veth and macvlan");
3631 return -1;
3632 }
3633
3634 if (!netdev->link) {
3635 ERROR("gateway = auto needs a link interface");
3636 return -1;
3637 }
3638
3639 link_index = if_nametoindex(netdev->link);
3640 if (!link_index)
3641 return -EINVAL;
3642
3643 if (netdev->ipv4_gateway_auto) {
3644 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3645 ERROR("failed to automatically find ipv4 gateway "
3646 "address from link interface '%s'", netdev->link);
3647 return -1;
3648 }
3649 }
3650
3651 if (netdev->ipv6_gateway_auto) {
3652 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3653 ERROR("failed to automatically find ipv6 gateway "
3654 "address from link interface '%s'", netdev->link);
3655 return -1;
3656 }
3657 }
3658 }
3659
3660 return 0;
3661 }
3662
3663 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3664 {
3665 struct lxc_tty_info *tty_info = &conf->tty_info;
3666 int i, ret;
3667
3668 /* no tty in the configuration */
3669 if (!conf->tty)
3670 return 0;
3671
3672 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3673 if (!tty_info->pty_info) {
3674 SYSERROR("failed to allocate struct *pty_info");
3675 return -ENOMEM;
3676 }
3677
3678 for (i = 0; i < conf->tty; i++) {
3679 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3680
3681 process_lock();
3682 ret = openpty(&pty_info->master, &pty_info->slave,
3683 pty_info->name, NULL, NULL);
3684 process_unlock();
3685 if (ret) {
3686 SYSERROR("failed to create pty device number %d", i);
3687 tty_info->nbtty = i;
3688 lxc_delete_tty(tty_info);
3689 return -ENOTTY;
3690 }
3691
3692 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3693 pty_info->name, pty_info->master, pty_info->slave);
3694
3695 /* Prevent leaking the file descriptors to the container */
3696 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3697 if (ret < 0)
3698 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3699 "pty device \"%s\": %s",
3700 pty_info->master, pty_info->name, strerror(errno));
3701
3702 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3703 if (ret < 0)
3704 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3705 "pty device \"%s\": %s",
3706 pty_info->slave, pty_info->name, strerror(errno));
3707
3708 pty_info->busy = 0;
3709 }
3710
3711 tty_info->nbtty = conf->tty;
3712
3713 INFO("finished allocating %d pts devices", conf->tty);
3714 return 0;
3715 }
3716
3717 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3718 {
3719 int i;
3720
3721 for (i = 0; i < tty_info->nbtty; i++) {
3722 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3723
3724 close(pty_info->master);
3725 close(pty_info->slave);
3726 }
3727
3728 free(tty_info->pty_info);
3729 tty_info->pty_info = NULL;
3730 tty_info->nbtty = 0;
3731 }
3732
3733
/* Exec helper: replaces the process image with "lxc-usernsexec", using @args
 * as its NULL-terminated argument vector. Returns -1 only if execvp() fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	/* Only reached when execvp() failed. */
	return -1;
}
3739
3740 /*
3741 * chown_mapped_root: for an unprivileged user with uid/gid X to
3742 * chown a dir to subuid/subgid Y, he needs to run chown as root
3743 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3744 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3745 * root is privileged with respect to hostuid/hostgid X, allowing
3746 * him to do the chown.
3747 */
3748 int chown_mapped_root(char *path, struct lxc_conf *conf)
3749 {
3750 uid_t rootuid, rootgid;
3751 unsigned long val;
3752 char *chownpath = path;
3753 int hostuid, hostgid, ret;
3754 struct stat sb;
3755 char map1[100], map2[100], map3[100], map4[100], map5[100];
3756 char ugid[100];
3757 char *args1[] = {"lxc-usernsexec",
3758 "-m", map1,
3759 "-m", map2,
3760 "-m", map3,
3761 "-m", map5,
3762 "--", "chown", ugid, path,
3763 NULL};
3764 char *args2[] = {"lxc-usernsexec",
3765 "-m", map1,
3766 "-m", map2,
3767 "-m", map3,
3768 "-m", map4,
3769 "-m", map5,
3770 "--", "chown", ugid, path,
3771 NULL};
3772 char cmd_output[MAXPATHLEN];
3773
3774 hostuid = geteuid();
3775 hostgid = getegid();
3776
3777 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3778 ERROR("No uid mapping for container root");
3779 return -1;
3780 }
3781 rootuid = (uid_t)val;
3782 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3783 ERROR("No gid mapping for container root");
3784 return -1;
3785 }
3786 rootgid = (gid_t)val;
3787
3788 /*
3789 * In case of overlay, we want only the writeable layer to be chowned
3790 */
3791 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3792 chownpath = strchr(path, ':');
3793 if (!chownpath) {
3794 ERROR("Bad overlay path: %s", path);
3795 return -1;
3796 }
3797 chownpath = strchr(chownpath + 1, ':');
3798 if (!chownpath) {
3799 ERROR("Bad overlay path: %s", path);
3800 return -1;
3801 }
3802 chownpath++;
3803 }
3804 path = chownpath;
3805 if (hostuid == 0) {
3806 if (chown(path, rootuid, rootgid) < 0) {
3807 ERROR("Error chowning %s", path);
3808 return -1;
3809 }
3810 return 0;
3811 }
3812
3813 if (rootuid == hostuid) {
3814 // nothing to do
3815 INFO("%s: container root is our uid; no need to chown" ,__func__);
3816 return 0;
3817 }
3818
3819 /* save the current gid of "path" */
3820 if (stat(path, &sb) < 0) {
3821 ERROR("Error stat %s", path);
3822 return -1;
3823 }
3824
3825 /* Update the path argument in case this was overlayfs. */
3826 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3827 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3828
3829 /*
3830 * A file has to be group-owned by a gid mapped into the
3831 * container, or the container won't be privileged over it.
3832 */
3833 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3834 if (sb.st_uid == hostuid &&
3835 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3836 chown(path, -1, hostgid) < 0) {
3837 ERROR("Failed chgrping %s", path);
3838 return -1;
3839 }
3840
3841 // "u:0:rootuid:1"
3842 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3843 if (ret < 0 || ret >= 100) {
3844 ERROR("Error uid printing map string");
3845 return -1;
3846 }
3847
3848 // "u:hostuid:hostuid:1"
3849 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3850 if (ret < 0 || ret >= 100) {
3851 ERROR("Error uid printing map string");
3852 return -1;
3853 }
3854
3855 // "g:0:rootgid:1"
3856 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3857 if (ret < 0 || ret >= 100) {
3858 ERROR("Error gid printing map string");
3859 return -1;
3860 }
3861
3862 // "g:pathgid:rootgid+pathgid:1"
3863 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3864 rootgid + (gid_t)sb.st_gid);
3865 if (ret < 0 || ret >= 100) {
3866 ERROR("Error gid printing map string");
3867 return -1;
3868 }
3869
3870 // "g:hostgid:hostgid:1"
3871 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3872 if (ret < 0 || ret >= 100) {
3873 ERROR("Error gid printing map string");
3874 return -1;
3875 }
3876
3877 // "0:pathgid" (chown)
3878 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3879 if (ret < 0 || ret >= 100) {
3880 ERROR("Error owner printing format string for chown");
3881 return -1;
3882 }
3883
3884 if (hostgid == sb.st_gid)
3885 ret = run_command(cmd_output, sizeof(cmd_output),
3886 chown_mapped_root_exec_wrapper,
3887 (void *)args1);
3888 else
3889 ret = run_command(cmd_output, sizeof(cmd_output),
3890 chown_mapped_root_exec_wrapper,
3891 (void *)args2);
3892 if (ret < 0)
3893 ERROR("lxc-usernsexec failed: %s", cmd_output);
3894
3895 return ret;
3896 }
3897
3898 int lxc_ttys_shift_ids(struct lxc_conf *c)
3899 {
3900 if (lxc_list_empty(&c->id_map))
3901 return 0;
3902
3903 if (!strcmp(c->console.name, ""))
3904 return 0;
3905
3906 if (chown_mapped_root(c->console.name, c) < 0) {
3907 ERROR("failed to chown console \"%s\"", c->console.name);
3908 return -1;
3909 }
3910
3911 TRACE("chowned console \"%s\"", c->console.name);
3912
3913 return 0;
3914 }
3915
3916 /* NOTE: Must not be called from inside the container namespace! */
3917 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3918 {
3919 int mounted;
3920
3921 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3922 if (mounted == -1) {
3923 SYSERROR("failed to mount /proc in the container");
3924 /* continue only if there is no rootfs */
3925 if (conf->rootfs.path)
3926 return -1;
3927 } else if (mounted == 1) {
3928 conf->tmp_umount_proc = 1;
3929 }
3930
3931 return 0;
3932 }
3933
3934 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3935 {
3936 if (lxc_conf->tmp_umount_proc == 1) {
3937 umount("/proc");
3938 lxc_conf->tmp_umount_proc = 0;
3939 }
3940 }
3941
/*
 * Walk /proc/self/mountinfo and turn every entry whose options field
 * contains "shared" into a slave mount. Failures are logged but never abort
 * container startup.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t cap = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	for (;;) {
		char *mntpoint, *optional;

		if (getline(&line, &cap, mountinfo) == -1)
			break;

		/* NOTE(review): presumably the mount point column - confirm
		 * against get_field().
		 */
		mntpoint = get_field(line, 4);
		if (mntpoint == NULL)
			continue;

		/* Must be looked up before mntpoint gets terminated below. */
		optional = get_field(mntpoint, 2);
		if (optional == NULL)
			continue;

		null_endofword(optional);
		if (strstr(optional, "shared") == NULL)
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(line);
}
3975
3976 void lxc_execute_bind_init(struct lxc_conf *conf)
3977 {
3978 int ret;
3979 char path[PATH_MAX], destpath[PATH_MAX], *p;
3980
3981 /* If init exists in the container, don't bind mount a static one */
3982 p = choose_init(conf->rootfs.mount);
3983 if (p) {
3984 free(p);
3985 return;
3986 }
3987
3988 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3989 if (ret < 0 || ret >= PATH_MAX) {
3990 WARN("Path name too long searching for lxc.init.static");
3991 return;
3992 }
3993
3994 if (!file_exists(path)) {
3995 INFO("%s does not exist on host", path);
3996 return;
3997 }
3998
3999 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4000 if (ret < 0 || ret >= PATH_MAX) {
4001 WARN("Path name too long for container's lxc.init.static");
4002 return;
4003 }
4004
4005 if (!file_exists(destpath)) {
4006 FILE * pathfile = fopen(destpath, "wb");
4007 if (!pathfile) {
4008 SYSERROR("Failed to create mount target '%s'", destpath);
4009 return;
4010 }
4011 fclose(pathfile);
4012 }
4013
4014 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
4015 if (ret < 0)
4016 SYSERROR("Failed to bind lxc.init.static into container");
4017 INFO("lxc.init.static bound into container at %s", path);
4018 }
4019
4020 /*
4021 * This does the work of remounting / if it is shared, calling the
4022 * container pre-mount hooks, and mounting the rootfs.
4023 */
4024 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4025 {
4026 if (conf->rootfs_setup) {
4027 /*
4028 * rootfs was set up in another namespace. bind-mount it
4029 * to give us a mount in our own ns so we can pivot_root to it
4030 */
4031 const char *path = conf->rootfs.mount;
4032 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4033 ERROR("Failed to bind-mount container / onto itself");
4034 return -1;
4035 }
4036 return 0;
4037 }
4038
4039 remount_all_slave();
4040
4041 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4042 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4043 return -1;
4044 }
4045
4046 if (lxc_setup_rootfs(conf)) {
4047 ERROR("failed to setup rootfs for '%s'", name);
4048 return -1;
4049 }
4050
4051 conf->rootfs_setup = true;
4052 return 0;
4053 }
4054
4055 static bool verify_start_hooks(struct lxc_conf *conf)
4056 {
4057 struct lxc_list *it;
4058 char path[MAXPATHLEN];
4059 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4060 char *hookname = it->elem;
4061 struct stat st;
4062 int ret;
4063
4064 ret = snprintf(path, MAXPATHLEN, "%s%s",
4065 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4066 if (ret < 0 || ret >= MAXPATHLEN)
4067 return false;
4068 ret = stat(path, &st);
4069 if (ret) {
4070 SYSERROR("Start hook %s not found in container",
4071 hookname);
4072 return false;
4073 }
4074 return true;
4075 }
4076
4077 return true;
4078 }
4079
4080 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4081 {
4082 int i;
4083 int *ttyfds;
4084 struct lxc_pty_info *pty_info;
4085 struct lxc_conf *conf = handler->conf;
4086 const struct lxc_tty_info *tty_info = &conf->tty_info;
4087 int sock = handler->ttysock[0];
4088 int ret = -1;
4089 size_t num_ttyfds = (2 * conf->tty);
4090
4091 ttyfds = malloc(num_ttyfds * sizeof(int));
4092 if (!ttyfds)
4093 return -1;
4094
4095 for (i = 0; i < num_ttyfds; i++) {
4096 pty_info = &tty_info->pty_info[i / 2];
4097 ttyfds[i++] = pty_info->slave;
4098 ttyfds[i] = pty_info->master;
4099 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4100 "parent",
4101 pty_info->name, pty_info->master, pty_info->slave);
4102 }
4103
4104 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4105 if (ret < 0)
4106 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4107 strerror(errno));
4108 else
4109 TRACE("sent %d ttys to parent", conf->tty);
4110
4111 close(handler->ttysock[0]);
4112 close(handler->ttysock[1]);
4113
4114 for (i = 0; i < num_ttyfds; i++)
4115 close(ttyfds[i]);
4116
4117 free(ttyfds);
4118
4119 return ret;
4120 }
4121
4122 int lxc_setup(struct lxc_handler *handler)
4123 {
4124 const char *name = handler->name;
4125 struct lxc_conf *lxc_conf = handler->conf;
4126 const char *lxcpath = handler->lxcpath;
4127
4128 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4129 ERROR("Error setting up rootfs mount after spawn");
4130 return -1;
4131 }
4132
4133 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4134 if (setup_utsname(lxc_conf->utsname)) {
4135 ERROR("failed to setup the utsname for '%s'", name);
4136 return -1;
4137 }
4138 }
4139
4140 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4141 &lxc_conf->network)) {
4142 ERROR("failed to setup the network for '%s'", name);
4143 return -1;
4144 }
4145
4146 if (lxc_conf->autodev > 0) {
4147 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4148 ERROR("failed to mount /dev in the container");
4149 return -1;
4150 }
4151 }
4152
4153 /* do automatic mounts (mainly /proc and /sys), but exclude
4154 * those that need to wait until other stuff has finished
4155 */
4156 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4157 ERROR("failed to setup the automatic mounts for '%s'", name);
4158 return -1;
4159 }
4160
4161 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4162 ERROR("failed to setup the mounts for '%s'", name);
4163 return -1;
4164 }
4165
4166 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4167 ERROR("failed to setup the mount entries for '%s'", name);
4168 return -1;
4169 }
4170
4171 /* Make sure any start hooks are in the container */
4172 if (!verify_start_hooks(lxc_conf))
4173 return -1;
4174
4175 if (lxc_conf->is_execute)
4176 lxc_execute_bind_init(lxc_conf);
4177
4178 /* now mount only cgroup, if wanted;
4179 * before, /sys could not have been mounted
4180 * (is either mounted automatically or via fstab entries)
4181 */
4182 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4183 ERROR("failed to setup the automatic mounts for '%s'", name);
4184 return -1;
4185 }
4186
4187 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4188 ERROR("failed to run mount hooks for container '%s'.", name);
4189 return -1;
4190 }
4191
4192 if (lxc_conf->autodev > 0) {
4193 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4194 ERROR("failed to run autodev hooks for container '%s'.", name);
4195 return -1;
4196 }
4197 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4198 ERROR("failed to populate /dev in the container");
4199 return -1;
4200 }
4201 }
4202
4203 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4204 ERROR("failed to setup the console for '%s'", name);
4205 return -1;
4206 }
4207
4208 if (lxc_conf->kmsg) {
4209 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4210 ERROR("failed to setup kmsg for '%s'", name);
4211 }
4212
4213 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4214 ERROR("failed to setup /dev symlinks for '%s'", name);
4215 return -1;
4216 }
4217
4218 /* mount /proc if it's not already there */
4219 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4220 ERROR("failed to LSM mount proc for '%s'", name);
4221 return -1;
4222 }
4223
4224 if (setup_pivot_root(&lxc_conf->rootfs)) {
4225 ERROR("failed to set rootfs for '%s'", name);
4226 return -1;
4227 }
4228
4229 if (lxc_setup_devpts(lxc_conf->pts)) {
4230 ERROR("failed to setup the new pts instance");
4231 return -1;
4232 }
4233
4234 if (lxc_create_tty(name, lxc_conf)) {
4235 ERROR("failed to create the ttys");
4236 return -1;
4237 }
4238
4239 if (lxc_send_ttys_to_parent(handler) < 0) {
4240 ERROR("failure sending console info to parent");
4241 return -1;
4242 }
4243
4244 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4245 ERROR("failed to setup the ttys for '%s'", name);
4246 return -1;
4247 }
4248
4249 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4250 SYSERROR("failed to set environment variable for container ptys");
4251
4252
4253 if (setup_personality(lxc_conf->personality)) {
4254 ERROR("failed to setup personality");
4255 return -1;
4256 }
4257
4258 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4259 if (!lxc_list_empty(&lxc_conf->caps)) {
4260 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4261 return -1;
4262 }
4263 if (dropcaps_except(&lxc_conf->keepcaps)) {
4264 ERROR("failed to keep requested caps");
4265 return -1;
4266 }
4267 } else if (setup_caps(&lxc_conf->caps)) {
4268 ERROR("failed to drop capabilities");
4269 return -1;
4270 }
4271
4272 NOTICE("'%s' is setup.", name);
4273
4274 return 0;
4275 }
4276
4277 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4278 const char *lxcpath, char *argv[])
4279 {
4280 int which = -1;
4281 struct lxc_list *it;
4282
4283 if (strcmp(hook, "pre-start") == 0)
4284 which = LXCHOOK_PRESTART;
4285 else if (strcmp(hook, "pre-mount") == 0)
4286 which = LXCHOOK_PREMOUNT;
4287 else if (strcmp(hook, "mount") == 0)
4288 which = LXCHOOK_MOUNT;
4289 else if (strcmp(hook, "autodev") == 0)
4290 which = LXCHOOK_AUTODEV;
4291 else if (strcmp(hook, "start") == 0)
4292 which = LXCHOOK_START;
4293 else if (strcmp(hook, "stop") == 0)
4294 which = LXCHOOK_STOP;
4295 else if (strcmp(hook, "post-stop") == 0)
4296 which = LXCHOOK_POSTSTOP;
4297 else if (strcmp(hook, "clone") == 0)
4298 which = LXCHOOK_CLONE;
4299 else if (strcmp(hook, "destroy") == 0)
4300 which = LXCHOOK_DESTROY;
4301 else
4302 return -1;
4303 lxc_list_for_each(it, &conf->hooks[which]) {
4304 int ret;
4305 char *hookname = it->elem;
4306 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4307 if (ret)
4308 return ret;
4309 }
4310 return 0;
4311 }
4312
4313 int lxc_clear_config_caps(struct lxc_conf *c)
4314 {
4315 struct lxc_list *it,*next;
4316
4317 lxc_list_for_each_safe(it, &c->caps, next) {
4318 lxc_list_del(it);
4319 free(it->elem);
4320 free(it);
4321 }
4322 return 0;
4323 }
4324
4325 static int lxc_free_idmap(struct lxc_list *id_map) {
4326 struct lxc_list *it, *next;
4327
4328 lxc_list_for_each_safe(it, id_map, next) {
4329 lxc_list_del(it);
4330 free(it->elem);
4331 free(it);
4332 }
4333 return 0;
4334 }
4335
4336 int lxc_clear_idmaps(struct lxc_conf *c)
4337 {
4338 return lxc_free_idmap(&c->id_map);
4339 }
4340
4341 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4342 {
4343 struct lxc_list *it,*next;
4344
4345 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4346 lxc_list_del(it);
4347 free(it->elem);
4348 free(it);
4349 }
4350 return 0;
4351 }
4352
4353 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4354 {
4355 struct lxc_list *it,*next;
4356 bool all = false;
4357 const char *k = NULL;
4358
4359 if (strcmp(key, "lxc.cgroup") == 0)
4360 all = true;
4361 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4362 k = key + sizeof("lxc.cgroup.")-1;
4363 else
4364 return -1;
4365
4366 lxc_list_for_each_safe(it, &c->cgroup, next) {
4367 struct lxc_cgroup *cg = it->elem;
4368 if (!all && strcmp(cg->subsystem, k) != 0)
4369 continue;
4370 lxc_list_del(it);
4371 free(cg->subsystem);
4372 free(cg->value);
4373 free(cg);
4374 free(it);
4375 }
4376 return 0;
4377 }
4378
4379 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4380 {
4381 struct lxc_list *it, *next;
4382 bool all = false;
4383 const char *k = NULL;
4384
4385 if (strcmp(key, "lxc.limit") == 0)
4386 all = true;
4387 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4388 k = key + sizeof("lxc.limit.")-1;
4389 else
4390 return -1;
4391
4392 lxc_list_for_each_safe(it, &c->limits, next) {
4393 struct lxc_limit *lim = it->elem;
4394 if (!all && strcmp(lim->resource, k) != 0)
4395 continue;
4396 lxc_list_del(it);
4397 free(lim->resource);
4398 free(lim);
4399 free(it);
4400 }
4401 return 0;
4402 }
4403
4404 int lxc_clear_groups(struct lxc_conf *c)
4405 {
4406 struct lxc_list *it,*next;
4407
4408 lxc_list_for_each_safe(it, &c->groups, next) {
4409 lxc_list_del(it);
4410 free(it->elem);
4411 free(it);
4412 }
4413 return 0;
4414 }
4415
4416 int lxc_clear_environment(struct lxc_conf *c)
4417 {
4418 struct lxc_list *it,*next;
4419
4420 lxc_list_for_each_safe(it, &c->environment, next) {
4421 lxc_list_del(it);
4422 free(it->elem);
4423 free(it);
4424 }
4425 return 0;
4426 }
4427
4428
4429 int lxc_clear_mount_entries(struct lxc_conf *c)
4430 {
4431 struct lxc_list *it,*next;
4432
4433 lxc_list_for_each_safe(it, &c->mount_list, next) {
4434 lxc_list_del(it);
4435 free(it->elem);
4436 free(it);
4437 }
4438 return 0;
4439 }
4440
4441 int lxc_clear_automounts(struct lxc_conf *c)
4442 {
4443 c->auto_mounts = 0;
4444 return 0;
4445 }
4446
4447 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4448 {
4449 struct lxc_list *it,*next;
4450 bool all = false, done = false;
4451 const char *k = NULL;
4452 int i;
4453
4454 if (strcmp(key, "lxc.hook") == 0)
4455 all = true;
4456 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4457 k = key + sizeof("lxc.hook.")-1;
4458 else
4459 return -1;
4460
4461 for (i=0; i<NUM_LXC_HOOKS; i++) {
4462 if (all || strcmp(k, lxchook_names[i]) == 0) {
4463 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4464 lxc_list_del(it);
4465 free(it->elem);
4466 free(it);
4467 }
4468 done = true;
4469 }
4470 }
4471
4472 if (!done) {
4473 ERROR("Invalid hook key: %s", key);
4474 return -1;
4475 }
4476 return 0;
4477 }
4478
4479 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4480 {
4481 int i;
4482
4483 if (!conf->saved_nics)
4484 return;
4485 for (i=0; i < conf->num_savednics; i++)
4486 free(conf->saved_nics[i].orig_name);
4487 free(conf->saved_nics);
4488 }
4489
4490 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4491 {
4492 struct lxc_list *it,*next;
4493
4494 lxc_list_for_each_safe(it, &conf->aliens, next) {
4495 lxc_list_del(it);
4496 free(it->elem);
4497 free(it);
4498 }
4499 }
4500
4501 void lxc_clear_includes(struct lxc_conf *conf)
4502 {
4503 struct lxc_list *it,*next;
4504
4505 lxc_list_for_each_safe(it, &conf->includes, next) {
4506 lxc_list_del(it);
4507 free(it->elem);
4508 free(it);
4509 }
4510 }
4511
4512 void lxc_conf_free(struct lxc_conf *conf)
4513 {
4514 if (!conf)
4515 return;
4516 if (current_config == conf)
4517 current_config = NULL;
4518 free(conf->console.log_path);
4519 free(conf->console.path);
4520 free(conf->rootfs.mount);
4521 free(conf->rootfs.bdev_type);
4522 free(conf->rootfs.options);
4523 free(conf->rootfs.path);
4524 free(conf->logfile);
4525 if (conf->logfd != -1)
4526 close(conf->logfd);
4527 free(conf->utsname);
4528 free(conf->ttydir);
4529 free(conf->fstab);
4530 free(conf->rcfile);
4531 free(conf->init_cmd);
4532 free(conf->unexpanded_config);
4533 free(conf->pty_names);
4534 free(conf->syslog);
4535 lxc_free_networks(&conf->network);
4536 free(conf->lsm_aa_profile);
4537 free(conf->lsm_se_context);
4538 lxc_seccomp_free(conf);
4539 lxc_clear_config_caps(conf);
4540 lxc_clear_config_keepcaps(conf);
4541 lxc_clear_cgroups(conf, "lxc.cgroup");
4542 lxc_clear_hooks(conf, "lxc.hook");
4543 lxc_clear_mount_entries(conf);
4544 lxc_clear_saved_nics(conf);
4545 lxc_clear_idmaps(conf);
4546 lxc_clear_groups(conf);
4547 lxc_clear_includes(conf);
4548 lxc_clear_aliens(conf);
4549 lxc_clear_environment(conf);
4550 lxc_clear_limits(conf, "lxc.limit");
4551 free(conf);
4552 }
4553
/* Arguments handed to the child that runs a function in a new user
 * namespace.
 */
struct userns_fn_data {
	/* Function the child calls once it has been released. */
	int (*fn)(void *);
	/* Name of @fn, used for trace output only; may be NULL. */
	const char *fn_name;
	/* Opaque argument passed to @fn. */
	void *arg;
	/* Pipe used for synchronization: the child reads one byte from p[0];
	 * p[1] is the write end.
	 */
	int p[2];
};
4560
4561 static int run_userns_fn(void *data)
4562 {
4563 struct userns_fn_data *d = data;
4564 char c;
4565
4566 /* Close write end of the pipe. */
4567 close(d->p[1]);
4568
4569 /* Wait for parent to finish establishing a new mapping in the user
4570 * namespace we are executing in.
4571 */
4572 if (read(d->p[0], &c, 1) != 1)
4573 return -1;
4574
4575 /* Close read end of the pipe. */
4576 close(d->p[0]);
4577
4578 if (d->fn_name)
4579 TRACE("calling function \"%s\"", d->fn_name);
4580 /* Call function to run. */
4581 return d->fn(d->arg);
4582 }
4583
4584 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4585 enum idtype idtype)
4586 {
4587 struct lxc_list *it;
4588 struct id_map *map;
4589 struct id_map *retmap = NULL;
4590
4591 lxc_list_for_each(it, &conf->id_map) {
4592 map = it->elem;
4593 if (map->idtype != idtype)
4594 continue;
4595
4596 if (id >= map->hostid && id < map->hostid + map->range) {
4597 retmap = map;
4598 break;
4599 }
4600 }
4601
4602 if (!retmap)
4603 return NULL;
4604
4605 retmap = malloc(sizeof(*retmap));
4606 if (!retmap)
4607 return NULL;
4608
4609 memcpy(retmap, map, sizeof(*retmap));
4610 return retmap;
4611 }
4612
4613 /*
4614 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4615 * existing one or establish a new one.
4616 */
4617 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4618 {
4619 int hostid_mapped;
4620 struct id_map *entry = NULL;
4621
4622 /* Reuse existing mapping. */
4623 entry = mapped_hostid_entry(conf, id, type);
4624 if (entry)
4625 return entry;
4626
4627 /* Find new mapping. */
4628 hostid_mapped = find_unmapped_nsid(conf, type);
4629 if (hostid_mapped < 0) {
4630 DEBUG("failed to find free mapping for id %d", id);
4631 return NULL;
4632 }
4633
4634 entry = malloc(sizeof(*entry));
4635 if (!entry)
4636 return NULL;
4637
4638 entry->idtype = type;
4639 entry->nsid = hostid_mapped;
4640 entry->hostid = (unsigned long)id;
4641 entry->range = 1;
4642
4643 return entry;
4644 }
4645
/* Run a function in a new user namespace.
 * The caller's euid/egid will be mapped if it is not already.
 * Afaict, userns_exec_1() is only used to operate based on privileges for the
 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
 * This means we only need to establish a mapping from:
 * - the container root {g,u}id as seen from the host > user's host {g,u}id
 * - the container root -> some sub{g,u}id
 * The former we add if the user did not specify a mapping. The latter we
 * retrieve from the container's configured {g,u}id mappings, as it must have
 * been there to start the container in the first place.
 */
4657 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4658 const char *fn_name)
4659 {
4660 pid_t pid;
4661 uid_t euid, egid;
4662 struct userns_fn_data d;
4663 int p[2];
4664 struct lxc_list *it;
4665 struct id_map *map;
4666 char c = '1';
4667 int ret = -1;
4668 struct lxc_list *idmap = NULL, *tmplist = NULL;
4669 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4670 *host_uid_map = NULL, *host_gid_map = NULL;
4671
4672 ret = pipe(p);
4673 if (ret < 0) {
4674 SYSERROR("opening pipe");
4675 return -1;
4676 }
4677 d.fn = fn;
4678 d.fn_name = fn_name;
4679 d.arg = data;
4680 d.p[0] = p[0];
4681 d.p[1] = p[1];
4682
4683 /* Clone child in new user namespace. */
4684 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4685 if (pid < 0) {
4686 ERROR("failed to clone child process in new user namespace");
4687 goto on_error;
4688 }
4689
4690 close(p[0]);
4691 p[0] = -1;
4692
4693 /* Find container root. */
4694 lxc_list_for_each(it, &conf->id_map) {
4695 map = it->elem;
4696
4697 if (map->nsid != 0)
4698 continue;
4699
4700 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4701 container_root_uid = malloc(sizeof(*container_root_uid));
4702 if (!container_root_uid)
4703 goto on_error;
4704 container_root_uid->idtype = map->idtype;
4705 container_root_uid->hostid = map->hostid;
4706 container_root_uid->nsid = 0;
4707 container_root_uid->range = map->range;
4708 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4709 container_root_gid = malloc(sizeof(*container_root_gid));
4710 if (!container_root_gid)
4711 goto on_error;
4712 container_root_gid->idtype = map->idtype;
4713 container_root_gid->hostid = map->hostid;
4714 container_root_gid->nsid = 0;
4715 container_root_gid->range = map->range;
4716 }
4717
4718 /* Found container root. */
4719 if (container_root_uid && container_root_gid)
4720 break;
4721 }
4722
4723 /* This is actually checked earlier but it can't hurt. */
4724 if (!container_root_uid || !container_root_gid) {
4725 ERROR("no mapping for container root found");
4726 goto on_error;
4727 }
4728
4729 host_uid_map = container_root_uid;
4730 host_gid_map = container_root_gid;
4731
4732 /* Check whether the {g,u}id of the user has a mapping. */
4733 euid = geteuid();
4734 egid = getegid();
4735 if (euid != container_root_uid->hostid)
4736 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4737
4738 if (egid != container_root_gid->hostid)
4739 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4740
4741 if (!host_uid_map) {
4742 DEBUG("failed to find mapping for uid %d", euid);
4743 goto on_error;
4744 }
4745
4746 if (!host_gid_map) {
4747 DEBUG("failed to find mapping for gid %d", egid);
4748 goto on_error;
4749 }
4750
4751 /* Allocate new {g,u}id map list. */
4752 idmap = malloc(sizeof(*idmap));
4753 if (!idmap)
4754 goto on_error;
4755 lxc_list_init(idmap);
4756
4757 /* Add container root to the map. */
4758 tmplist = malloc(sizeof(*tmplist));
4759 if (!tmplist)
4760 goto on_error;
4761 lxc_list_add_elem(tmplist, container_root_uid);
4762 lxc_list_add_tail(idmap, tmplist);
4763
4764 if (host_uid_map && (host_uid_map != container_root_uid)) {
4765 /* idmap will now keep track of that memory. */
4766 container_root_uid = NULL;
4767
4768 /* Add container root to the map. */
4769 tmplist = malloc(sizeof(*tmplist));
4770 if (!tmplist)
4771 goto on_error;
4772 lxc_list_add_elem(tmplist, host_uid_map);
4773 lxc_list_add_tail(idmap, tmplist);
4774 }
4775 /* idmap will now keep track of that memory. */
4776 container_root_uid = NULL;
4777 /* idmap will now keep track of that memory. */
4778 host_uid_map = NULL;
4779
4780 tmplist = malloc(sizeof(*tmplist));
4781 if (!tmplist)
4782 goto on_error;
4783 lxc_list_add_elem(tmplist, container_root_gid);
4784 lxc_list_add_tail(idmap, tmplist);
4785
4786 if (host_gid_map && (host_gid_map != container_root_gid)) {
4787 /* idmap will now keep track of that memory. */
4788 container_root_gid = NULL;
4789
4790 tmplist = malloc(sizeof(*tmplist));
4791 if (!tmplist)
4792 goto on_error;
4793 lxc_list_add_elem(tmplist, host_gid_map);
4794 lxc_list_add_tail(idmap, tmplist);
4795 }
4796 /* idmap will now keep track of that memory. */
4797 container_root_gid = NULL;
4798 /* idmap will now keep track of that memory. */
4799 host_gid_map = NULL;
4800
4801 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4802 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4803 lxc_list_for_each(it, idmap) {
4804 map = it->elem;
4805 TRACE("establishing %cid mapping for \"%d\" in new "
4806 "user namespace: nsuid %lu - hostid %lu - range "
4807 "%lu",
4808 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4809 map->nsid, map->hostid, map->range);
4810 }
4811 }
4812
4813 /* Set up {g,u}id mapping for user namespace of child process. */
4814 ret = lxc_map_ids(idmap, pid);
4815 if (ret < 0) {
4816 ERROR("error setting up {g,u}id mappings for child process "
4817 "\"%d\"",
4818 pid);
4819 goto on_error;
4820 }
4821
4822 /* Tell child to proceed. */
4823 if (write(p[1], &c, 1) != 1) {
4824 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4825 goto on_error;
4826 }
4827
4828 /* Wait for child to finish. */
4829 ret = wait_for_pid(pid);
4830
4831 on_error:
4832 if (idmap)
4833 lxc_free_idmap(idmap);
4834 if (container_root_uid)
4835 free(container_root_uid);
4836 if (container_root_gid)
4837 free(container_root_gid);
4838 if (host_uid_map && (host_uid_map != container_root_uid))
4839 free(host_uid_map);
4840 if (host_gid_map && (host_gid_map != container_root_gid))
4841 free(host_gid_map);
4842
4843 if (p[0] != -1)
4844 close(p[0]);
4845 close(p[1]);
4846
4847 return ret;
4848 }
4849
/*
 * Return a malloc()ed copy of the login name belonging to the effective uid,
 * or NULL if the lookup or the copy fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char* getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4861
/*
 * Return a malloc()ed copy of the group name belonging to the effective gid,
 * or NULL if the lookup or the copy fails. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4873
4874 /* not thread-safe, do not use from api without first forking */
4875 void suggest_default_idmap(void)
4876 {
4877 FILE *f;
4878 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4879 char *line = NULL;
4880 char *uname, *gname;
4881 size_t len = 0;
4882
4883 if (!(uname = getuname()))
4884 return;
4885
4886 if (!(gname = getgname())) {
4887 free(uname);
4888 return;
4889 }
4890
4891 f = fopen(subuidfile, "r");
4892 if (!f) {
4893 ERROR("Your system is not configured with subuids");
4894 free(gname);
4895 free(uname);
4896 return;
4897 }
4898 while (getline(&line, &len, f) != -1) {
4899 size_t no_newline = 0;
4900 char *p = strchr(line, ':'), *p2;
4901 if (*line == '#')
4902 continue;
4903 if (!p)
4904 continue;
4905 *p = '\0';
4906 p++;
4907 if (strcmp(line, uname))
4908 continue;
4909 p2 = strchr(p, ':');
4910 if (!p2)
4911 continue;
4912 *p2 = '\0';
4913 p2++;
4914 if (!*p2)
4915 continue;
4916 no_newline = strcspn(p2, "\n");
4917 p2[no_newline] = '\0';
4918
4919 if (lxc_safe_uint(p, &uid) < 0)
4920 WARN("Could not parse UID.");
4921 if (lxc_safe_uint(p2, &urange) < 0)
4922 WARN("Could not parse UID range.");
4923 }
4924 fclose(f);
4925
4926 f = fopen(subgidfile, "r");
4927 if (!f) {
4928 ERROR("Your system is not configured with subgids");
4929 free(gname);
4930 free(uname);
4931 return;
4932 }
4933 while (getline(&line, &len, f) != -1) {
4934 size_t no_newline = 0;
4935 char *p = strchr(line, ':'), *p2;
4936 if (*line == '#')
4937 continue;
4938 if (!p)
4939 continue;
4940 *p = '\0';
4941 p++;
4942 if (strcmp(line, uname))
4943 continue;
4944 p2 = strchr(p, ':');
4945 if (!p2)
4946 continue;
4947 *p2 = '\0';
4948 p2++;
4949 if (!*p2)
4950 continue;
4951 no_newline = strcspn(p2, "\n");
4952 p2[no_newline] = '\0';
4953
4954 if (lxc_safe_uint(p, &gid) < 0)
4955 WARN("Could not parse GID.");
4956 if (lxc_safe_uint(p2, &grange) < 0)
4957 WARN("Could not parse GID range.");
4958 }
4959 fclose(f);
4960
4961 free(line);
4962
4963 if (!urange || !grange) {
4964 ERROR("You do not have subuids or subgids allocated");
4965 ERROR("Unprivileged containers require subuids and subgids");
4966 return;
4967 }
4968
4969 ERROR("You must either run as root, or define uid mappings");
4970 ERROR("To pass uid mappings to lxc-create, you could create");
4971 ERROR("~/.config/lxc/default.conf:");
4972 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4973 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4974 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4975
4976 free(gname);
4977 free(uname);
4978 }
4979
4980 static void free_cgroup_settings(struct lxc_list *result)
4981 {
4982 struct lxc_list *iterator, *next;
4983
4984 lxc_list_for_each_safe(iterator, result, next) {
4985 lxc_list_del(iterator);
4986 free(iterator);
4987 }
4988 free(result);
4989 }
4990
4991 /*
4992 * Return the list of cgroup_settings sorted according to the following rules
4993 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4994 */
4995 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4996 {
4997 struct lxc_list *result;
4998 struct lxc_list *memsw_limit = NULL;
4999 struct lxc_list *it = NULL;
5000 struct lxc_cgroup *cg = NULL;
5001 struct lxc_list *item = NULL;
5002
5003 result = malloc(sizeof(*result));
5004 if (!result) {
5005 ERROR("failed to allocate memory to sort cgroup settings");
5006 return NULL;
5007 }
5008 lxc_list_init(result);
5009
5010 /*Iterate over the cgroup settings and copy them to the output list*/
5011 lxc_list_for_each(it, cgroup_settings) {
5012 item = malloc(sizeof(*item));
5013 if (!item) {
5014 ERROR("failed to allocate memory to sort cgroup settings");
5015 free_cgroup_settings(result);
5016 return NULL;
5017 }
5018 item->elem = it->elem;
5019 cg = it->elem;
5020 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5021 /* Store the memsw_limit location */
5022 memsw_limit = item;
5023 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5024 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5025 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5026 item->elem = memsw_limit->elem;
5027 memsw_limit->elem = it->elem;
5028 }
5029 lxc_list_add_tail(result, item);
5030 }
5031
5032 return result;
5033 }