]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
tree-wide: log function called in userns_exec_1()
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "bdev.h"
77 #include "caps.h" /* for lxc_caps_last_cap() */
78 #include "cgroup.h"
79 #include "conf.h"
80 #include "error.h"
81 #include "log.h"
82 #include "lxcaufs.h"
83 #include "lxclock.h"
84 #include "lxcoverlay.h"
85 #include "lxcseccomp.h"
86 #include "namespace.h"
87 #include "network.h"
88 #include "parse.h"
89 #include "utils.h"
90 #include "lsm/lsm.h"
91
92 #if HAVE_LIBCAP
93 #include <sys/capability.h>
94 #endif
95
96 #if HAVE_SYS_PERSONALITY_H
97 #include <sys/personality.h>
98 #endif
99
100 #if IS_BIONIC
101 #include <../include/lxcmntent.h>
102 #ifndef HAVE_PRLIMIT
103 #include <../include/prlimit.h>
104 #endif
105 #else
106 #include <mntent.h>
107 #endif
108
/*
 * Logging setup for this file through the lxc_log_define() macro.
 * NOTE(review): presumably supplied by log.h and backing the
 * ERROR/WARN/INFO/DEBUG calls below under the "lxc_conf" name - confirm.
 */
lxc_log_define(lxc_conf, lxc);
110
/*
 * Fallback values for constants that the system headers in use may not
 * provide. Every value is only defined when the headers left it undefined.
 */
#if HAVE_LIBCAP

#if !defined(CAP_SETFCAP)
#define CAP_SETFCAP 31
#endif

#if !defined(CAP_MAC_OVERRIDE)
#define CAP_MAC_OVERRIDE 32
#endif

#if !defined(CAP_MAC_ADMIN)
#define CAP_MAC_ADMIN 33
#endif

#endif /* HAVE_LIBCAP */

#if !defined(PR_CAPBSET_DROP)
#define PR_CAPBSET_DROP 24
#endif

#if !defined(LO_FLAGS_AUTOCLEAR)
#define LO_FLAGS_AUTOCLEAR 4
#endif

#if !defined(CAP_SETUID)
#define CAP_SETUID 7
#endif

#if !defined(CAP_SETGID)
#define CAP_SETGID 6
#endif

/*
 * Needed for the cgroup automount checks, regardless of whether
 * linux/capability.h has been included or not.
 */
#if !defined(CAP_SYS_ADMIN)
#define CAP_SYS_ADMIN 21
#endif
146
/*
 * pivot_root(): use the C library's version when it has one, otherwise
 * provide a thin wrapper around the raw system call. When not even the
 * syscall number is known, the wrapper fails with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
161
/*
 * sethostname(): only defined here when the C library lacks it. The
 * replacement forwards to the raw system call, or fails with ENOSYS when
 * the syscall number is unknown.
 */
#if !defined(HAVE_SETHOSTNAME)
static int sethostname(const char *name, size_t len)
{
#if defined(__NR_sethostname)
	return syscall(__NR_sethostname, name, len);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#endif /* !HAVE_SETHOSTNAME */
174
/* File type test helper, for C libraries that do not ship __S_ISTYPE. */
#if !defined(__S_ISTYPE)
#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
#endif

/* Mount flags that the system headers in use may not define. */
#if !defined(MS_PRIVATE)
#define MS_PRIVATE (1<<18)
#endif

#if !defined(MS_LAZYTIME)
#define MS_LAZYTIME (1<<25)
#endif

/* Flags for memfd_create(). */
#if !defined(MFD_CLOEXEC)
#define MFD_CLOEXEC 0x0001U
#endif

#if !defined(MFD_ALLOW_SEALING)
#define MFD_ALLOW_SEALING 0x0002U
#endif
196
/*
 * memfd_create(): use the C library's version when available. Otherwise
 * wrap the raw system call, supplying the per-architecture syscall number
 * ourselves when the headers do not define __NR_memfd_create.
 */
#ifdef HAVE_MEMFD_CREATE
extern int memfd_create(const char *name, unsigned int flags);
#else

#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		/* MIPS: the number depends on the ABI in use. */
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif /* !__NR_memfd_create */

static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	/* No syscall number known for this architecture. */
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}

#endif /* HAVE_MEMFD_CREATE */
240
241 char *lxchook_names[NUM_LXC_HOOKS] = {
242 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
243
/*
 * Signature shared by the per-network-type callbacks stored in the
 * netdev_conf[] and netdev_deconf[] tables below.
 */
typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);

/* One entry of the mount option table (see mount_opt[] below). */
struct mount_opt {
	/* option name as written in a mount option string, e.g. "ro" */
	char *name;
	/*
	 * Non-zero for names that negate a flag (e.g. "rw" vs. MS_RDONLY).
	 * NOTE(review): presumably the parser clears 'flag' instead of
	 * setting it for these - confirm against the option parser.
	 */
	int clear;
	/* MS_* flag(s) associated with the option */
	int flag;
};

/* One entry of the capability name table (see caps_opt[] below). */
struct caps_opt {
	/* capability name without the "CAP_" prefix, lower case */
	char *name;
	/* matching CAP_* constant */
	int value;
};

/* One entry of the resource limit name table (see limit_opt[] below). */
struct limit_opt {
	/* limit name without the "RLIMIT_" prefix, lower case */
	char *name;
	/* matching RLIMIT_* constant */
	int value;
};
261
/*
 * The lxc_conf of the container that the current API call is working on.
 * It is used by the error calls. The variable is thread-local when the
 * toolchain supports TLS, and a plain global otherwise.
 */
#ifndef HAVE_TLS
struct lxc_conf *current_config;
#else
__thread struct lxc_conf *current_config;
#endif
272
273 /* Declare this here, since we don't want to reshuffle the whole file. */
274 static int in_caplist(int cap, struct lxc_list *caps);
275
276 static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
277 static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
278 static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
279 static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
280 static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
281 static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);
282
283 static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
284 [LXC_NET_VETH] = instantiate_veth,
285 [LXC_NET_MACVLAN] = instantiate_macvlan,
286 [LXC_NET_VLAN] = instantiate_vlan,
287 [LXC_NET_PHYS] = instantiate_phys,
288 [LXC_NET_EMPTY] = instantiate_empty,
289 [LXC_NET_NONE] = instantiate_none,
290 };
291
292 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
293 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
294 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
295 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
296 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
297 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
298
299 static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
300 [LXC_NET_VETH] = shutdown_veth,
301 [LXC_NET_MACVLAN] = shutdown_macvlan,
302 [LXC_NET_VLAN] = shutdown_vlan,
303 [LXC_NET_PHYS] = shutdown_phys,
304 [LXC_NET_EMPTY] = shutdown_empty,
305 [LXC_NET_NONE] = shutdown_none,
306 };
307
308 static struct mount_opt mount_opt[] = {
309 { "async", 1, MS_SYNCHRONOUS },
310 { "atime", 1, MS_NOATIME },
311 { "bind", 0, MS_BIND },
312 { "defaults", 0, 0 },
313 { "dev", 1, MS_NODEV },
314 { "diratime", 1, MS_NODIRATIME },
315 { "dirsync", 0, MS_DIRSYNC },
316 { "exec", 1, MS_NOEXEC },
317 { "lazytime", 0, MS_LAZYTIME },
318 { "mand", 0, MS_MANDLOCK },
319 { "noatime", 0, MS_NOATIME },
320 { "nodev", 0, MS_NODEV },
321 { "nodiratime", 0, MS_NODIRATIME },
322 { "noexec", 0, MS_NOEXEC },
323 { "nomand", 1, MS_MANDLOCK },
324 { "norelatime", 1, MS_RELATIME },
325 { "nostrictatime", 1, MS_STRICTATIME },
326 { "nosuid", 0, MS_NOSUID },
327 { "rbind", 0, MS_BIND|MS_REC },
328 { "relatime", 0, MS_RELATIME },
329 { "remount", 0, MS_REMOUNT },
330 { "ro", 0, MS_RDONLY },
331 { "rw", 1, MS_RDONLY },
332 { "strictatime", 0, MS_STRICTATIME },
333 { "suid", 1, MS_NOSUID },
334 { "sync", 0, MS_SYNCHRONOUS },
335 { NULL, 0, 0 },
336 };
337
338 #if HAVE_LIBCAP
339 static struct caps_opt caps_opt[] = {
340 { "chown", CAP_CHOWN },
341 { "dac_override", CAP_DAC_OVERRIDE },
342 { "dac_read_search", CAP_DAC_READ_SEARCH },
343 { "fowner", CAP_FOWNER },
344 { "fsetid", CAP_FSETID },
345 { "kill", CAP_KILL },
346 { "setgid", CAP_SETGID },
347 { "setuid", CAP_SETUID },
348 { "setpcap", CAP_SETPCAP },
349 { "linux_immutable", CAP_LINUX_IMMUTABLE },
350 { "net_bind_service", CAP_NET_BIND_SERVICE },
351 { "net_broadcast", CAP_NET_BROADCAST },
352 { "net_admin", CAP_NET_ADMIN },
353 { "net_raw", CAP_NET_RAW },
354 { "ipc_lock", CAP_IPC_LOCK },
355 { "ipc_owner", CAP_IPC_OWNER },
356 { "sys_module", CAP_SYS_MODULE },
357 { "sys_rawio", CAP_SYS_RAWIO },
358 { "sys_chroot", CAP_SYS_CHROOT },
359 { "sys_ptrace", CAP_SYS_PTRACE },
360 { "sys_pacct", CAP_SYS_PACCT },
361 { "sys_admin", CAP_SYS_ADMIN },
362 { "sys_boot", CAP_SYS_BOOT },
363 { "sys_nice", CAP_SYS_NICE },
364 { "sys_resource", CAP_SYS_RESOURCE },
365 { "sys_time", CAP_SYS_TIME },
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
367 { "mknod", CAP_MKNOD },
368 { "lease", CAP_LEASE },
369 #ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ },
371 #endif
372 #ifdef CAP_AUDIT_WRITE
373 { "audit_write", CAP_AUDIT_WRITE },
374 #endif
375 #ifdef CAP_AUDIT_CONTROL
376 { "audit_control", CAP_AUDIT_CONTROL },
377 #endif
378 { "setfcap", CAP_SETFCAP },
379 { "mac_override", CAP_MAC_OVERRIDE },
380 { "mac_admin", CAP_MAC_ADMIN },
381 #ifdef CAP_SYSLOG
382 { "syslog", CAP_SYSLOG },
383 #endif
384 #ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM },
386 #endif
387 #ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND },
389 #endif
390 };
391 #else
392 static struct caps_opt caps_opt[] = {};
393 #endif
394
395 static struct limit_opt limit_opt[] = {
396 #ifdef RLIMIT_AS
397 { "as", RLIMIT_AS },
398 #endif
399 #ifdef RLIMIT_CORE
400 { "core", RLIMIT_CORE },
401 #endif
402 #ifdef RLIMIT_CPU
403 { "cpu", RLIMIT_CPU },
404 #endif
405 #ifdef RLIMIT_DATA
406 { "data", RLIMIT_DATA },
407 #endif
408 #ifdef RLIMIT_FSIZE
409 { "fsize", RLIMIT_FSIZE },
410 #endif
411 #ifdef RLIMIT_LOCKS
412 { "locks", RLIMIT_LOCKS },
413 #endif
414 #ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK },
416 #endif
417 #ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE },
419 #endif
420 #ifdef RLIMIT_NICE
421 { "nice", RLIMIT_NICE },
422 #endif
423 #ifdef RLIMIT_NOFILE
424 { "nofile", RLIMIT_NOFILE },
425 #endif
426 #ifdef RLIMIT_NPROC
427 { "nproc", RLIMIT_NPROC },
428 #endif
429 #ifdef RLIMIT_RSS
430 { "rss", RLIMIT_RSS },
431 #endif
432 #ifdef RLIMIT_RTPRIO
433 { "rtprio", RLIMIT_RTPRIO },
434 #endif
435 #ifdef RLIMIT_RTTIME
436 { "rttime", RLIMIT_RTTIME },
437 #endif
438 #ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING },
440 #endif
441 #ifdef RLIMIT_STACK
442 { "stack", RLIMIT_STACK },
443 #endif
444 };
445
446 static int run_buffer(char *buffer)
447 {
448 struct lxc_popen_FILE *f;
449 char *output;
450 int ret;
451
452 f = lxc_popen(buffer);
453 if (!f) {
454 SYSERROR("Failed to popen() %s.", buffer);
455 return -1;
456 }
457
458 output = malloc(LXC_LOG_BUFFER_SIZE);
459 if (!output) {
460 ERROR("Failed to allocate memory for %s.", buffer);
461 lxc_pclose(f);
462 return -1;
463 }
464
465 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
466 DEBUG("Script %s with output: %s.", buffer, output);
467
468 free(output);
469
470 ret = lxc_pclose(f);
471 if (ret == -1) {
472 SYSERROR("Script exited with error.");
473 return -1;
474 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
475 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
476 return -1;
477 } else if (WIFSIGNALED(ret)) {
478 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
479 return -1;
480 }
481
482 return 0;
483 }
484
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it with run_buffer().
 *
 * @name:    container name (2nd word of the command line)
 * @section: config section (3rd word of the command line)
 * @script:  script to run (1st word of the command line)
 * @hook:    hook name (4th word of the command line)
 * @lxcpath: unused by this function
 * @argsin:  optional NULL-terminated array of extra arguments, may be NULL
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* One extra byte per argument for the separating space. */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two more separating spaces plus the terminating '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/*
	 * The length is derived from caller supplied strings and only bounded
	 * by INT_MAX, so use the heap: alloca() cannot report failure and would
	 * simply overflow the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			free(buffer);
			return -1;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);

	return ret;
}
535
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it with run_buffer().
 *
 * @name:    container name (2nd word of the command line)
 * @section: config section (3rd word of the command line)
 * @script:  script to run (1st word of the command line)
 * @...:     extra arguments as char pointers; the list MUST be terminated
 *           by a NULL pointer
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be built.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* First pass over the arguments: compute the required length. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two separating spaces plus the terminating '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	/*
	 * The length is derived from caller supplied strings and only bounded
	 * by INT_MAX, so use the heap: alloca() cannot report failure and would
	 * simply overflow the stack.
	 */
	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		free(buffer);
		return -1;
	}

	/* Second pass: append the arguments. */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			/* every va_start() needs its va_end(), also on error */
			va_end(ap);
			free(buffer);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);

	return ret;
}
587
/*
 * Recursively bind-mount the directory @rootfs onto @target, applying the
 * flags and data parsed from the mount option string @options.
 *
 * Returns the result of mount(2), or -1 if the options could not be parsed.
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	/*
	 * Start from known values: mntdata is handed to free() on the error
	 * path as well, so it must never be left uninitialized in case
	 * parse_mntopts() fails before storing anything.
	 */
	unsigned long mntflags = 0;
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
605
606 static int lxc_mount_rootfs_file(const char *rootfs, const char *target,
607 const char *options)
608 {
609 int ret, loopfd;
610 char path[MAXPATHLEN];
611
612 loopfd = lxc_prepare_loop_dev(rootfs, path, LO_FLAGS_AUTOCLEAR);
613 if (loopfd < 0)
614 return -1;
615 DEBUG("prepared loop device \"%s\"", path);
616
617 ret = mount_unknown_fs(path, target, options);
618 close(loopfd);
619
620 DEBUG("mounted rootfs \"%s\" on loop device \"%s\" via loop device \"%s\"", rootfs, target, path);
621
622 return ret;
623 }
624
/*
 * Mount the block device @rootfs onto @target by delegating to
 * mount_unknown_fs(); its result is returned unchanged.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int rc;

	rc = mount_unknown_fs(rootfs, target, options);

	return rc;
}
630
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep it
 * open for the duration of the container run, to prevent the container from
 * marking the underlying fs read-only on shutdown. The file is unlinked
 * immediately so that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, its path does
 * not resolve, or it is not a directory).
 * Returns an open fd (>= 0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || rootfs[0] == '\0')
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	/* snprintf() may fail (negative) as well as truncate (>= size). */
	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* The open fd keeps the fs busy; the name itself is not needed. */
	(void)unlink(absrootfspin);
	return fd;
}
673
/*
 * When a remount is requested, make sure the restrictive flags (nosuid,
 * nodev, ro, noexec) already in effect on the filesystem are carried over.
 *
 * @s:     mount source, may be NULL
 * @d:     mount destination, consulted when @s is NULL
 * @flags: requested mount flags
 *
 * Returns @flags, extended by the flags found on the filesystem when
 * MS_REMOUNT is set and statvfs() succeeds. Without statvfs() support
 * @flags is returned unchanged.
 */
static unsigned long add_required_remount_flags(const char *s, const char *d,
						unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long carried_over[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	const char *target = s ? s : d;
	struct statvfs vfs;
	size_t idx;

	if ((flags & MS_REMOUNT) == 0 || target == NULL)
		return flags;

	if (statvfs(target, &vfs) < 0)
		return flags;

	for (idx = 0; idx < sizeof(carried_over) / sizeof(carried_over[0]); idx++) {
		if (vfs.f_flag & carried_over[idx])
			flags |= carried_over[idx];
	}

	return flags;
#else
	return flags;
#endif
}
710
711 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
712 {
713 int r;
714 int i;
715 static struct {
716 int match_mask;
717 int match_flag;
718 const char *source;
719 const char *destination;
720 const char *fstype;
721 unsigned long flags;
722 const char *options;
723 } default_mounts[] = {
724 /* Read-only bind-mounting... In older kernels, doing that required
725 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
726 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
727 * kernel 2.6.26 onwards. However, this apparently does not work on
728 * kernel 3.8. Unfortunately, on that very same kernel, doing the
729 * same trick as above doesn't seem to work either, there one needs
730 * to ALSO specify MS_BIND for the remount, otherwise the entire
731 * fs is remounted read-only or the mount fails because it's busy...
732 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
733 * 2.6.32...
734 */
735 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
736 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
737 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
738 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
739 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
740 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
741 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
742 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
743 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
744 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
745 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
746 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
747 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
748 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
749 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
750 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
751 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
752 { 0, 0, NULL, NULL, NULL, 0, NULL }
753 };
754
755 for (i = 0; default_mounts[i].match_mask; i++) {
756 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
757 char *source = NULL;
758 char *destination = NULL;
759 int saved_errno;
760 unsigned long mflags;
761
762 if (default_mounts[i].source) {
763 /* will act like strdup if %r is not present */
764 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
765 if (!source) {
766 SYSERROR("memory allocation error");
767 return -1;
768 }
769 }
770 if (!default_mounts[i].destination) {
771 ERROR("BUG: auto mounts destination %d was NULL", i);
772 free(source);
773 return -1;
774 }
775 /* will act like strdup if %r is not present */
776 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
777 if (!destination) {
778 saved_errno = errno;
779 SYSERROR("memory allocation error");
780 free(source);
781 errno = saved_errno;
782 return -1;
783 }
784 mflags = add_required_remount_flags(source, destination,
785 default_mounts[i].flags);
786 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
787 saved_errno = errno;
788 if (r < 0 && errno == ENOENT) {
789 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
790 r = 0;
791 }
792 else if (r < 0)
793 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
794
795 free(source);
796 free(destination);
797 if (r < 0) {
798 errno = saved_errno;
799 return -1;
800 }
801 }
802 }
803
804 if (flags & LXC_AUTO_CGROUP_MASK) {
805 int cg_flags;
806
807 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
808 /* If the type of cgroup mount was not specified, it depends on the
809 * container's capabilities as to what makes sense: if we have
810 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
811 * anyway, so we may as well default to read-write; then the admin
812 * will not be given a false sense of security. (And if they really
813 * want mixed r/o r/w, then they can explicitly specify :mixed.)
814 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
815 * :mixed, because then the container can't remount it read-write. */
816 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
817 int has_sys_admin = 0;
818
819 if (!lxc_list_empty(&conf->keepcaps))
820 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
821 else
822 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
823
824 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
825 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
826 else
827 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
828 }
829
830 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
831 SYSERROR("error mounting /sys/fs/cgroup");
832 return -1;
833 }
834 }
835
836 return 0;
837 }
838
/*
 * Mount @rootfs onto @target with the mount options @options. The path is
 * resolved first; a directory is bind-mounted, a block device is mounted
 * directly and a regular file is mounted through a loop device.
 *
 * Returns the result of the type specific mount helper, or -1 when the
 * path cannot be resolved, accessed or stat'ed, or has an unsupported type.
 */
static int mount_rootfs(const char *rootfs, const char *target, const char *options)
{
	char resolved[MAXPATHLEN];
	struct stat sb;

	if (realpath(rootfs, resolved) == NULL) {
		SYSERROR("Failed to get real path for \"%s\".", rootfs);
		return -1;
	}

	if (access(resolved, F_OK) != 0) {
		SYSERROR("The rootfs \"%s\" is not accessible.", resolved);
		return -1;
	}

	if (stat(resolved, &sb) != 0) {
		SYSERROR("Failed to stat the rootfs \"%s\".", resolved);
		return -1;
	}

	/* Dispatch on the file type bits of the resolved path. */
	switch (sb.st_mode & S_IFMT) {
	case S_IFDIR:
		return mount_rootfs_dir(resolved, target, options);
	case S_IFBLK:
		return mount_rootfs_block(resolved, target, options);
	case S_IFREG:
		return lxc_mount_rootfs_file(resolved, target, options);
	default:
		break;
	}

	ERROR("Unsupported rootfs type for rootfs \"%s\".", resolved);
	return -1;
}
881
/*
 * Set the hostname to the nodename stored in @utsname.
 *
 * Returns 0 on success or when @utsname is NULL (nothing to do), -1 when
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", utsname->nodename);
		return -1;
	}

	INFO("'%s' hostname has been setup", utsname->nodename);

	return 0;
}
896
/* A symlink to create below the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* what the symlink points to */
	const char *name;	/* name of the symlink inside /dev */
};

/* The standard /dev entries that are symlinks into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
908
909 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
910 {
911 char path[MAXPATHLEN];
912 int ret,i;
913 struct stat s;
914
915
916 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
917 const struct dev_symlinks *d = &dev_symlinks[i];
918 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
919 if (ret < 0 || ret >= MAXPATHLEN)
920 return -1;
921
922 /*
923 * Stat the path first. If we don't get an error
924 * accept it as is and don't try to create it
925 */
926 if (!stat(path, &s)) {
927 continue;
928 }
929
930 ret = symlink(d->oldpath, path);
931
932 if (ret && errno != EEXIST) {
933 if ( errno == EROFS ) {
934 WARN("Warning: Read Only file system while creating %s", path);
935 } else {
936 SYSERROR("Error creating %s", path);
937 return -1;
938 }
939 }
940 }
941 return 0;
942 }
943
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * @pp:   address of the list. When *pp is NULL a new string
 *        "container_ttys=<name>" is allocated, otherwise " <name>" is
 *        appended to the existing string (which gets reallocated).
 * @name: pty name to add
 *
 * Returns true on success, false when memory could not be allocated; in
 * that case *pp keeps its previous value.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t old_len;
	char *list;

	if (*pp == NULL) {
		/* first entry: prefix + name + '\0' */
		list = malloc(prefix_len + name_len + 1);
		if (list == NULL)
			return false;

		memcpy(list, prefix, prefix_len);
		memcpy(list + prefix_len, name, name_len + 1);
		*pp = list;
		return true;
	}

	/* further entries: old list + ' ' + name + '\0' */
	old_len = strlen(*pp);
	list = realloc(*pp, old_len + name_len + 2);
	if (list == NULL)
		return false;

	list[old_len] = ' ';
	memcpy(list + old_len + 1, name, name_len + 1);
	*pp = list;
	return true;
}
966
967 static int lxc_setup_tty(struct lxc_conf *conf)
968 {
969 int i, ret;
970 const struct lxc_tty_info *tty_info = &conf->tty_info;
971 char *ttydir = conf->ttydir;
972 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
973
974 if (!conf->rootfs.path)
975 return 0;
976
977 for (i = 0; i < tty_info->nbtty; i++) {
978 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
979
980 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
981 if (ret < 0 || (size_t)ret >= sizeof(path)) {
982 ERROR("pathname too long for ttys");
983 return -1;
984 }
985
986 if (ttydir) {
987 /* create dev/lxc/tty%d" */
988 ret = snprintf(lxcpath, sizeof(lxcpath),
989 "/dev/%s/tty%d", ttydir, i + 1);
990 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
991 ERROR("pathname too long for ttys");
992 return -1;
993 }
994
995 ret = creat(lxcpath, 0660);
996 if (ret < 0 && errno != EEXIST) {
997 SYSERROR("failed to create \"%s\"", lxcpath);
998 return -1;
999 }
1000 if (ret >= 0)
1001 close(ret);
1002
1003 ret = unlink(path);
1004 if (ret < 0 && errno != ENOENT) {
1005 SYSERROR("failed to unlink \"%s\"", path);
1006 return -1;
1007 }
1008
1009 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
1010 if (ret < 0) {
1011 WARN("failed to bind mount \"%s\" onto \"%s\"",
1012 pty_info->name, path);
1013 continue;
1014 }
1015 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1016 path);
1017
1018 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
1019 ttydir, i + 1);
1020 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
1021 ERROR("tty pathname too long");
1022 return -1;
1023 }
1024
1025 ret = symlink(lxcpath, path);
1026 if (ret < 0) {
1027 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
1028 path, lxcpath);
1029 return -1;
1030 }
1031 } else {
1032 /* If we populated /dev, then we need to create
1033 * /dev/ttyN
1034 */
1035 ret = access(path, F_OK);
1036 if (ret < 0) {
1037 ret = creat(path, 0660);
1038 if (ret < 0) {
1039 SYSERROR("failed to create \"%s\"", path);
1040 /* this isn't fatal, continue */
1041 } else {
1042 close(ret);
1043 }
1044 }
1045
1046 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
1047 if (ret < 0) {
1048 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
1049 continue;
1050 }
1051
1052 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
1053 path);
1054 }
1055
1056 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
1057 ERROR("Error setting up container_ttys string");
1058 return -1;
1059 }
1060 }
1061
1062 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
1063 return 0;
1064 }
1065
/*
 * Switch the root filesystem to @rootfs with pivot_root(".", "."), then
 * lazily unmount the old root and end up chdir'd into the new one.
 *
 * Returns 0 on success, -1 on error. Both directory fds opened here are
 * closed again on every path.
 */
static int setup_rootfs_pivot_root(const char *rootfs)
{
	int old_fd, new_fd = -1;
	int rc = -1;

	old_fd = open("/", O_DIRECTORY | O_RDONLY);
	if (old_fd < 0) {
		SYSERROR("Error opening old-/ for fchdir");
		return -1;
	}

	new_fd = open(rootfs, O_DIRECTORY | O_RDONLY);
	if (new_fd < 0) {
		SYSERROR("Error opening new-/ for fchdir");
		goto out_close;
	}

	/* change into new root fs */
	if (fchdir(new_fd) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		goto out_close;
	}

	/* pivot_root into our new root fs */
	if (pivot_root(".", ".") != 0) {
		SYSERROR("pivot_root syscall failed");
		goto out_close;
	}

	/*
	 * At this point the old root is mounted on top of our new root.
	 * To unmount it we must not be chdir'd into the new root, so
	 * escape back to the old root first.
	 */
	if (fchdir(old_fd) < 0) {
		SYSERROR("Error entering oldroot");
		goto out_close;
	}

	if (umount2(".", MNT_DETACH) < 0) {
		SYSERROR("Error detaching old root");
		goto out_close;
	}

	if (fchdir(new_fd) < 0) {
		SYSERROR("Error re-entering newroot");
		goto out_close;
	}

	rc = 0;

out_close:
	close(old_fd);
	if (new_fd != -1)
		close(new_fd);

	if (rc == 0)
		DEBUG("pivot_root syscall to '%s' successful", rootfs);

	return rc;
}
1126
1127 /*
1128 * Just create a path for /dev under $lxcpath/$name and in rootfs
1129 * If we hit an error, log it but don't fail yet.
1130 */
1131 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
1132 {
1133 int ret;
1134 size_t clen;
1135 char *path;
1136
1137 INFO("Mounting container /dev");
1138
1139 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1140 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1141 path = alloca(clen);
1142
1143 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1144 if (ret < 0 || ret >= clen)
1145 return -1;
1146
1147 if (!dir_exists(path)) {
1148 WARN("No /dev in container.");
1149 WARN("Proceeding without autodev setup");
1150 return 0;
1151 }
1152
1153 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1154 rootfs->path ? rootfs->mount : NULL);
1155 if (ret != 0) {
1156 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1157 return -1;
1158 }
1159
1160 INFO("Mounted tmpfs onto %s", path);
1161
1162 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1163 if (ret < 0 || ret >= clen)
1164 return -1;
1165
1166 /*
1167 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1168 * If not, then create it and exit if that fails...
1169 */
1170 if (!dir_exists(path)) {
1171 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1172 if (ret) {
1173 SYSERROR("Failed to create /dev/pts in container");
1174 return -1;
1175 }
1176 }
1177
1178 INFO("Mounted container /dev");
1179 return 0;
1180 }
1181
/* One device node that lxc_fill_autodev() places in the container's /dev. */
1182 struct lxc_devs {
/* Node name; formatted into "<rootfs>/dev/<name>" and "/dev/<name>". */
1183 const char *name;
/* File type and permission bits passed to mknod(). */
1184 mode_t mode;
/* Major and minor device numbers passed to makedev(). */
1185 int maj;
1186 int min;
1187 };
1188
/*
 * Device nodes created by lxc_fill_autodev(). Each entry is a character
 * device (S_IFCHR) requested with rwx permission bits for user, group and
 * others; the columns are name, mode, major, minor.
 */
1189 static const struct lxc_devs lxc_devs[] = {
1190 { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
1191 { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
1192 { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
1193 { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
1194 { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
1195 { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
1196 };
1197
1198 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1199 {
1200 int ret;
1201 char path[MAXPATHLEN];
1202 int i;
1203 mode_t cmask;
1204
1205 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
1206 if (ret < 0 || ret >= MAXPATHLEN) {
1207 ERROR("Error calculating container /dev location");
1208 return -1;
1209 }
1210
1211 /* ignore, just don't try to fill in */
1212 if (!dir_exists(path))
1213 return 0;
1214
1215 INFO("populating container /dev");
1216 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1217 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1218 const struct lxc_devs *d = &lxc_devs[i];
1219
1220 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
1221 if (ret < 0 || ret >= MAXPATHLEN)
1222 return -1;
1223
1224 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1225 if (ret < 0) {
1226 char hostpath[MAXPATHLEN];
1227 FILE *pathfile;
1228
1229 if (errno == EEXIST) {
1230 DEBUG("\"%s\" device already existed", path);
1231 continue;
1232 }
1233
1234 /* Unprivileged containers cannot create devices, so
1235 * bind mount the device from the host.
1236 */
1237 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1238 if (ret < 0 || ret >= MAXPATHLEN)
1239 return -1;
1240 pathfile = fopen(path, "wb");
1241 if (!pathfile) {
1242 SYSERROR("Failed to create device mount target '%s'", path);
1243 return -1;
1244 }
1245 fclose(pathfile);
1246 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1247 SYSERROR("Failed bind mounting device %s from host into container", d->name);
1248 return -1;
1249 }
1250 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1251 } else {
1252 DEBUG("created device node \"%s\"", path);
1253 }
1254 }
1255 umask(cmask);
1256
1257 INFO("populated container /dev");
1258 return 0;
1259 }
1260
1261 static int setup_rootfs(struct lxc_conf *conf)
1262 {
1263 struct bdev *bdev;
1264 const struct lxc_rootfs *rootfs;
1265
1266 rootfs = &conf->rootfs;
1267 if (!rootfs->path) {
1268 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1269 SYSERROR("Failed to make / rslave.");
1270 return -1;
1271 }
1272 return 0;
1273 }
1274
1275 if (access(rootfs->mount, F_OK)) {
1276 SYSERROR("Failed to access to \"%s\". Check it is present.",
1277 rootfs->mount);
1278 return -1;
1279 }
1280
1281 /* First try mounting rootfs using a bdev. */
1282 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1283 if (bdev && !bdev->ops->mount(bdev)) {
1284 bdev_put(bdev);
1285 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1286 rootfs->path, rootfs->mount,
1287 rootfs->options ? rootfs->options : "(null)");
1288 return 0;
1289 }
1290 if (bdev)
1291 bdev_put(bdev);
1292 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
1293 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1294 rootfs->path, rootfs->mount,
1295 rootfs->options ? rootfs->options : "(null)");
1296 return -1;
1297 }
1298
1299 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1300 rootfs->path, rootfs->mount,
1301 rootfs->options ? rootfs->options : "(null)");
1302 return 0;
1303 }
1304
/*
 * Make @root the root directory of the calling process without pivot_root().
 * setup_pivot_root() calls this when detect_ramfs_rootfs() reports a ramfs
 * root.
 *
 * @root is bind mounted recursively over "/", the mount tree is made
 * private, the mounts listed in mountinfo (except "/" and "/proc") and then
 * "/proc" itself are lazily detached, and finally the process chroots into
 * the new root.
 *
 * Returns 0 on success. Failures in the first three steps return -errno,
 * later failures return -1.
 */
1305 int prepare_ramfs_root(char *root)
1306 {
1307 char buf[LXC_LINELEN], *p;
1308 char nroot[PATH_MAX];
1309 FILE *f;
1310 int i;
1311 char *p2;
1312
/* Fail early if @root cannot be resolved; nroot is not used afterwards. */
1313 if (realpath(root, nroot) == NULL)
1314 return -errno;
1315
/* The relative "./..." paths used below are resolved from here. */
1316 if (chdir("/") == -1)
1317 return -errno;
1318
1319 /*
1320 * We could use MS_MOVE here, but in a userns this mount is
1321 * locked and can't be moved.
1322 */
1323 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1324 SYSERROR("Failed to move %s into /", root);
1325 return -errno;
1326 }
1327
1328 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1329 SYSERROR("Failed to make . rprivate");
1330 return -errno;
1331 }
1332
1333 /*
1334 * The following code cleans up inherited mounts which are not
1335 * required for the container.
1336 *
1337 * The mountinfo file does not show all mounts if some of them have
1338 * been unmounted between read operations on mountinfo. So we need to
1339 * read mountinfo several times, until a pass unmounts nothing.
1340 *
1341 * This loop can be skipped if a container uses a userns, because all
1342 * inherited mounts are locked and we have to live with all this trash.
1343 */
1344 while (1) {
1345 int progress = 0;
1346
1347 f = fopen("./proc/self/mountinfo", "r");
1348 if (!f) {
1349 SYSERROR("Unable to open /proc/self/mountinfo");
1350 return -1;
1351 }
1352 while (fgets(buf, LXC_LINELEN, f)) {
/* p ends up on the 4th space, i.e. right before the 5th field. */
1353 for (p = buf, i=0; p && i < 4; i++)
1354 p = strchr(p+1, ' ');
1355 if (!p)
1356 continue;
1357 p2 = strchr(p+1, ' ');
1358 if (!p2)
1359 continue;
1360
/* Cut the field out in place and turn it into a "./<field>" path. */
1361 *p2 = '\0';
1362 *p = '.';
1363
/* "/" is the new root, and "/proc" is still being read from. */
1364 if (strcmp(p + 1, "/") == 0)
1365 continue;
1366 if (strcmp(p + 1, "/proc") == 0)
1367 continue;
1368
1369 if (umount2(p, MNT_DETACH) == 0)
1370 progress++;
1371 }
1372 fclose(f);
1373 if (!progress)
1374 break;
1375 }
1376
1377 /* This also can be skipped if a container uses a userns */
1378 umount2("./proc", MNT_DETACH);
1379
1380 /* It is weird, but chdir("..") moves us in a new root */
1381 if (chdir("..") == -1) {
1382 SYSERROR("Unable to change working directory");
1383 return -1;
1384 }
1385
1386 if (chroot(".") == -1) {
1387 SYSERROR("Unable to chroot");
1388 return -1;
1389 }
1390
1391 return 0;
1392 }
1393
1394 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1395 {
1396 if (!rootfs->path) {
1397 DEBUG("container does not have a rootfs, so not doing pivot root");
1398 return 0;
1399 }
1400
1401 if (detect_ramfs_rootfs()) {
1402 DEBUG("detected that container is on ramfs");
1403 if (prepare_ramfs_root(rootfs->mount)) {
1404 ERROR("failed to prepare minimal ramfs root");
1405 return -1;
1406 }
1407
1408 DEBUG("prepared ramfs root for container");
1409 return 0;
1410 }
1411
1412 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1413 ERROR("failed to pivot root");
1414 return -1;
1415 }
1416
1417 DEBUG("finished pivot root");
1418 return 0;
1419 }
1420
/*
 * Mount a new devpts instance on /dev/pts and make its ptmx reachable as
 * /dev/ptmx, preferably through a bind mount and otherwise through a
 * symlink.
 *
 * Nothing is done when @num_pts is 0. Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	const char *devpts_mntopts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	int fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	/* An already mounted devpts instance has to go first. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mount point is there; an existing one is fine. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* Whatever currently sits at /dev/ptmx is replaced. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty file serves as the target for the bind mount below. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal yet: a symlink is tried next. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file has to make room for the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1505
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build has no
 * <sys/personality.h> the function is a no-op.
 *
 * Returns 0 on success (or when nothing had to be done), -1 if
 * personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1522
1523 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1524 const struct lxc_console *console)
1525 {
1526 char path[MAXPATHLEN];
1527 int ret, fd;
1528
1529 if (console->path && !strcmp(console->path, "none"))
1530 return 0;
1531
1532 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1533 if (ret < 0 || (size_t)ret >= sizeof(path))
1534 return -1;
1535
1536 /* When we are asked to setup a console we remove any previous
1537 * /dev/console bind-mounts.
1538 */
1539 if (file_exists(path)) {
1540 ret = lxc_unstack_mountpoint(path, false);
1541 if (ret < 0) {
1542 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1543 return -ret;
1544 } else {
1545 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1546 }
1547 ret = unlink(path);
1548 if (ret < 0) {
1549 SYSERROR("error unlinking %s", path);
1550 return -errno;
1551 }
1552 }
1553
1554 /* For unprivileged containers autodev or automounts will already have
1555 * taken care of creating /dev/console.
1556 */
1557 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1558 if (fd < 0) {
1559 if (errno != EEXIST) {
1560 SYSERROR("failed to create console");
1561 return -errno;
1562 }
1563 } else {
1564 close(fd);
1565 }
1566
1567 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1568 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1569 return -errno;
1570 }
1571
1572 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1573 ERROR("failed to mount '%s' on '%s'", console->name, path);
1574 return -1;
1575 }
1576
1577 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1578 return 0;
1579 }
1580
1581 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1582 const struct lxc_console *console,
1583 char *ttydir)
1584 {
1585 int ret;
1586 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1587
1588 /* create rootfs/dev/<ttydir> directory */
1589 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1590 if (ret < 0 || (size_t)ret >= sizeof(path))
1591 return -1;
1592
1593 ret = mkdir(path, 0755);
1594 if (ret && errno != EEXIST) {
1595 SYSERROR("failed with errno %d to create %s", errno, path);
1596 return -errno;
1597 }
1598 DEBUG("created directory for console and tty devices at \%s\"", path);
1599
1600 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1601 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1602 return -1;
1603
1604 ret = creat(lxcpath, 0660);
1605 if (ret == -1 && errno != EEXIST) {
1606 SYSERROR("error %d creating %s", errno, lxcpath);
1607 return -errno;
1608 }
1609 if (ret >= 0)
1610 close(ret);
1611
1612 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1613 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1614 return -1;
1615
1616 /* When we are asked to setup a console we remove any previous
1617 * /dev/console bind-mounts.
1618 */
1619 if (console->path && !strcmp(console->path, "none")) {
1620 struct stat st;
1621 ret = stat(path, &st);
1622 if (ret < 0) {
1623 if (errno == ENOENT)
1624 return 0;
1625 SYSERROR("failed stat() \"%s\"", path);
1626 return -errno;
1627 }
1628
1629 /* /dev/console must be character device with major number 5 and
1630 * minor number 1. If not, give benefit of the doubt and assume
1631 * the user has mounted something else right there on purpose.
1632 */
1633 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1634 return 0;
1635
1636 /* In case the user requested a bind-mount for /dev/console and
1637 * requests a ttydir we move the mount to the
1638 * /dev/<ttydir/console.
1639 * Note, we only move the uppermost mount and clear all other
1640 * mounts underneath for safety.
1641 * If it is a character device created via mknod() we simply
1642 * rename it.
1643 */
1644 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1645 if (ret < 0) {
1646 if (errno != EINVAL) {
1647 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1648 return -errno;
1649 }
1650 /* path was not a mountpoint */
1651 ret = rename(path, lxcpath);
1652 if (ret < 0) {
1653 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1654 return -errno;
1655 }
1656 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1657 } else {
1658 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1659 }
1660
1661 /* Clear all remaining bind-mounts. */
1662 ret = lxc_unstack_mountpoint(path, false);
1663 if (ret < 0) {
1664 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1665 return -ret;
1666 } else {
1667 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1668 }
1669 } else {
1670 if (file_exists(path)) {
1671 ret = lxc_unstack_mountpoint(path, false);
1672 if (ret < 0) {
1673 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1674 return -ret;
1675 } else {
1676 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1677 }
1678 }
1679
1680 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1681 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1682 return -1;
1683 }
1684 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1685 }
1686
1687 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1688 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1689 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1690 return -1;
1691
1692 ret = unlink(path);
1693 if (ret && errno != ENOENT) {
1694 SYSERROR("error unlinking %s", path);
1695 return -errno;
1696 }
1697
1698 ret = symlink(lxcpath, path);
1699 if (ret < 0) {
1700 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1701 return -1;
1702 }
1703
1704 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1705 return 0;
1706 }
1707
1708 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1709 const struct lxc_console *console, char *ttydir)
1710 {
1711 /* We don't have a rootfs, /dev/console will be shared. */
1712 if (!rootfs->path) {
1713 DEBUG("/dev/console will be shared with the host");
1714 return 0;
1715 }
1716
1717 if (!ttydir)
1718 return lxc_setup_dev_console(rootfs, console);
1719
1720 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1721 }
1722
1723 static int setup_kmsg(const struct lxc_rootfs *rootfs,
1724 const struct lxc_console *console)
1725 {
1726 char kpath[MAXPATHLEN];
1727 int ret;
1728
1729 if (!rootfs->path)
1730 return 0;
1731 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1732 if (ret < 0 || ret >= sizeof(kpath))
1733 return -1;
1734
1735 ret = unlink(kpath);
1736 if (ret && errno != ENOENT) {
1737 SYSERROR("error unlinking %s", kpath);
1738 return -1;
1739 }
1740
1741 ret = symlink("console", kpath);
1742 if (ret) {
1743 SYSERROR("failed to create symlink for kmsg");
1744 return -1;
1745 }
1746
1747 return 0;
1748 }
1749
1750 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1751 {
1752 struct mount_opt *mo;
1753
1754 /* If opt is found in mount_opt, set or clear flags.
1755 * Otherwise append it to data. */
1756
1757 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1758 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1759 if (mo->clear)
1760 *flags &= ~mo->flag;
1761 else
1762 *flags |= mo->flag;
1763 return;
1764 }
1765 }
1766
1767 if (strlen(*data))
1768 strcat(*data, ",");
1769 strcat(*data, opt);
1770 }
1771
/*
 * Split a comma-separated mount option string into mount flags and a data
 * string.
 *
 * *@mntflags receives the flags and *@mntdata a newly allocated string with
 * the remaining options, or NULL if there are none. The caller owns
 * *@mntdata and has to free() it. Both outputs are reset before anything
 * else happens, so they are defined on every return.
 *
 * Returns 0 on success (including @mntopts == NULL), -1 if memory could not
 * be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (!mntopts)
		return 0;

	/* strtok_r() writes into its input, so work on a copy. */
	copy = strdup(mntopts);
	if (!copy) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never get longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (!extra) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;
	free(copy);

	return 0;
}
1810
/*
 * Terminate @word in place at its first space or tab. A string without
 * either is left as it is.
 */
static void null_endofword(char *word)
{
	char *end = word + strcspn(word, " \t");

	*end = '\0';
}
1817
/*
 * Return a pointer into @src just past its first @nfields blank-terminated
 * fields.
 *
 * A field ends at a single space or tab, so consecutive blanks delimit
 * empty fields. If @src runs out early, the pointer to its terminating
 * '\0' is returned.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Advance to the blank (or the end) that closes this field. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
	}

	return cur;
}
1835
/*
 * Mount @fsname on @target and, for bind or remount requests, follow up
 * with a remount so that the requested flags take effect.
 *
 * @optional: a failing mount is only logged and 0 is returned.
 * @dev:      when non-zero, MS_NODEV found on the source is not carried over
 *            to the remount.
 * @rootfs:   handed to safe_mount() unchanged.
 *
 * Returns 0 on success (or on a failed optional mount), -1 on failure.
 */
1836 static int mount_entry(const char *fsname, const char *target,
1837 const char *fstype, unsigned long mountflags,
1838 const char *data, int optional, int dev, const char *rootfs)
1839 {
1840 #ifdef HAVE_STATVFS
1841 struct statvfs sb;
1842 #endif
1843
/* The initial mount is always made without MS_REMOUNT. */
1844 if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
1845 if (optional) {
1846 INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
1847 target, strerror(errno));
1848 return 0;
1849 }
1850 else {
1851 SYSERROR("failed to mount '%s' on '%s'", fsname, target);
1852 return -1;
1853 }
1854 }
1855
1856 if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) {
1857 DEBUG("remounting %s on %s to respect bind or remount options",
1858 fsname ? fsname : "(none)", target ? target : "(none)");
/* Set only when read-only was requested explicitly. */
1859 unsigned long rqd_flags = 0;
1860 if (mountflags & MS_RDONLY)
1861 rqd_flags |= MS_RDONLY;
1862 #ifdef HAVE_STATVFS
/* Carry the restrictive flags reported for the source over to the remount. */
1863 if (statvfs(fsname, &sb) == 0) {
1864 unsigned long required_flags = rqd_flags;
1865 if (sb.f_flag & MS_NOSUID)
1866 required_flags |= MS_NOSUID;
1867 if (sb.f_flag & MS_NODEV && !dev)
1868 required_flags |= MS_NODEV;
1869 if (sb.f_flag & MS_RDONLY)
1870 required_flags |= MS_RDONLY;
1871 if (sb.f_flag & MS_NOEXEC)
1872 required_flags |= MS_NOEXEC;
1873 DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);
1874 /*
1875 * If this was a bind mount request, and required_flags
1876 * does not have any flags which are not already in
1877 * mountflags, then skip the remount.
1878 * (The skip also requires that MS_RDONLY was not requested.)
1879 */
1879 if (!(mountflags & MS_REMOUNT)) {
1880 if (!(required_flags & ~mountflags) && rqd_flags == 0) {
1881 DEBUG("mountflags already was %lu, skipping remount",
1882 mountflags);
1883 goto skipremount;
1884 }
1885 }
1886 mountflags |= required_flags;
1887 }
1888 #endif
1889
1890 if (mount(fsname, target, fstype,
1891 mountflags | MS_REMOUNT, data) < 0) {
1892 if (optional) {
1893 INFO("failed to mount '%s' on '%s' (optional): %s",
1894 fsname, target, strerror(errno));
1895 return 0;
1896 }
1897 else {
1898 SYSERROR("failed to mount '%s' on '%s'",
1899 fsname, target);
1900 return -1;
1901 }
1902 }
1903 }
1904
1905 #ifdef HAVE_STATVFS
1906 skipremount:
1907 #endif
1908 DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);
1909
1910 return 0;
1911 }
1912
/*
 * Remove the lxc-specific options 'optional', 'create=dir' and
 * 'create=file' from @mntent->mnt_opts, editing the string in place.
 *
 * Only whole comma-separated options are removed, so an option that merely
 * contains one of these names (e.g. "nooptional") is kept intact. The
 * remaining options keep their order, every occurrence of a culled option
 * is removed, and no dangling comma is left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	char *src = mntent->mnt_opts;
	char *dst = mntent->mnt_opts;
	int wrote = 0;

	while (*src) {
		size_t len = strcspn(src, ",");
		size_t i;
		int drop = 0;

		for (i = 0; i < sizeof(culled) / sizeof(culled[0]); i++) {
			if (strlen(culled[i]) == len &&
			    strncmp(src, culled[i], len) == 0) {
				drop = 1;
				break;
			}
		}

		if (!drop) {
			/* dst never overtakes src, so compacting in place is safe. */
			if (wrote)
				*dst++ = ',';
			memmove(dst, src, len);
			dst += len;
			wrote = 1;
		}

		src += len;
		if (*src == ',')
			src++;
	}

	*dst = '\0';
}
1937
/*
 * Create the mount target @path if the mount entry asks for it.
 *
 * For "overlay" and "aufs" entries the backend's mkdir helper runs first.
 * "create=dir" creates @path as a directory (with parents). "create=file"
 * creates an empty file at @path, creating its parent directory first, but
 * only when @path does not exist yet.
 *
 * Returns 0 on success, -1 if a target could not be created or memory ran
 * out.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
		const char* path, const struct lxc_rootfs *rootfs,
		const char *lxc_name, const char *lxc_path)
{
	int ret = 0;
	FILE *pathfile = NULL;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		char *pathcopy, *parent;

		/*
		 * dirname() may modify its argument and may return a pointer
		 * that is not the one passed in, so the allocation is tracked
		 * separately and is the only pointer handed to free().
		 */
		pathcopy = strdup(path);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}

		parent = dirname(pathcopy);
		if (mkdir_p(parent, 0755) < 0) {
			WARN("Failed to create target directory");
		}
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1978
1979 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1980 * without a rootfs. */
1981 static inline int mount_entry_on_generic(struct mntent *mntent,
1982 const char* path, const struct lxc_rootfs *rootfs,
1983 const char *lxc_name, const char *lxc_path)
1984 {
1985 unsigned long mntflags;
1986 char *mntdata;
1987 int ret;
1988 bool optional = hasmntopt(mntent, "optional") != NULL;
1989 bool dev = hasmntopt(mntent, "dev") != NULL;
1990
1991 char *rootfs_path = NULL;
1992 if (rootfs && rootfs->path)
1993 rootfs_path = rootfs->mount;
1994
1995 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
1996
1997 if (ret < 0)
1998 return optional ? 0 : -1;
1999
2000 cull_mntent_opt(mntent);
2001
2002 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
2003 free(mntdata);
2004 return -1;
2005 }
2006
2007 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
2008 mntdata, optional, dev, rootfs_path);
2009
2010 free(mntdata);
2011 return ret;
2012 }
2013
/*
 * Mount an entry for a container that has no rootfs of its own.
 *
 * All such mounts are treated as absolute paths starting at / on the host,
 * so a leading '/' is added to the mount directory when it lacks one.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	const char *lead = (mntent->mnt_dir[0] == '/') ? "" : "/";
	char path[MAXPATHLEN];
	int len;

	len = snprintf(path, sizeof(path), "%s%s", lead, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
2033
2034 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
2035 const struct lxc_rootfs *rootfs,
2036 const char *lxc_name,
2037 const char *lxc_path)
2038 {
2039 char *aux;
2040 char path[MAXPATHLEN];
2041 int r, ret = 0, offset;
2042 const char *lxcpath;
2043
2044 lxcpath = lxc_global_config_value("lxc.lxcpath");
2045 if (!lxcpath) {
2046 ERROR("Out of memory");
2047 return -1;
2048 }
2049
2050 /* if rootfs->path is a blockdev path, allow container fstab to
2051 * use $lxcpath/CN/rootfs as the target prefix */
2052 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
2053 if (r < 0 || r >= MAXPATHLEN)
2054 goto skipvarlib;
2055
2056 aux = strstr(mntent->mnt_dir, path);
2057 if (aux) {
2058 offset = strlen(path);
2059 goto skipabs;
2060 }
2061
2062 skipvarlib:
2063 aux = strstr(mntent->mnt_dir, rootfs->path);
2064 if (!aux) {
2065 WARN("ignoring mount point '%s'", mntent->mnt_dir);
2066 return ret;
2067 }
2068 offset = strlen(rootfs->path);
2069
2070 skipabs:
2071
2072 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
2073 aux + offset);
2074 if (r < 0 || r >= MAXPATHLEN) {
2075 WARN("pathnme too long for '%s'", mntent->mnt_dir);
2076 return -1;
2077 }
2078
2079 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2080 }
2081
2082 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
2083 const struct lxc_rootfs *rootfs,
2084 const char *lxc_name,
2085 const char *lxc_path)
2086 {
2087 char path[MAXPATHLEN];
2088 int ret;
2089
2090 /* relative to root mount point */
2091 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
2092 if (ret < 0 || ret >= sizeof(path)) {
2093 ERROR("path name too long");
2094 return -1;
2095 }
2096
2097 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
2098 }
2099
2100 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2101 const char *lxc_name, const char *lxc_path)
2102 {
2103 struct mntent mntent;
2104 char buf[4096];
2105 int ret = -1;
2106
2107 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2108
2109 if (!rootfs->path) {
2110 if (mount_entry_on_systemfs(&mntent))
2111 goto out;
2112 continue;
2113 }
2114
2115 /* We have a separate root, mounts are relative to it */
2116 if (mntent.mnt_dir[0] != '/') {
2117 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2118 goto out;
2119 continue;
2120 }
2121
2122 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2123 goto out;
2124 }
2125
2126 ret = 0;
2127
2128 INFO("mount points have been setup");
2129 out:
2130 return ret;
2131 }
2132
/*
 * Mount all entries of the fstab file @fstab.
 *
 * A NULL @fstab means there is nothing to do. Returns 0 on success, -1 if
 * the file cannot be opened or an entry fails to mount.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	FILE *entries;
	int ret;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	endmntent(entries);

	return ret;
}
2153
2154 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2155 {
2156 int ret;
2157 char *mount_entry;
2158 struct lxc_list *iterator;
2159 FILE *file;
2160 int fd = -1;
2161
2162 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2163 if (fd < 0) {
2164 if (errno != ENOSYS)
2165 return NULL;
2166 file = tmpfile();
2167 } else {
2168 file = fdopen(fd, "r+");
2169 }
2170
2171 if (!file) {
2172 int saved_errno = errno;
2173 if (fd != -1)
2174 close(fd);
2175 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
2176 return NULL;
2177 }
2178
2179 lxc_list_for_each(iterator, mount) {
2180 mount_entry = iterator->elem;
2181 ret = fprintf(file, "%s\n", mount_entry);
2182 if (ret < strlen(mount_entry))
2183 WARN("Could not write mount entry to anonymous mount file.");
2184 }
2185
2186 if (fseek(file, 0, SEEK_SET) < 0) {
2187 fclose(file);
2188 return NULL;
2189 }
2190
2191 return file;
2192 }
2193
/*
 * Mount the entries of the list @mount by writing them to an anonymous
 * file and processing that file like an fstab.
 *
 * Returns 0 on success, -1 if the file cannot be created or an entry fails
 * to mount.
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	FILE *entries = make_anonymous_mount_file(mount);
	int ret;

	if (entries == NULL)
		return -1;

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return ret;
}
2210
2211 static int parse_cap(const char *cap)
2212 {
2213 char *ptr = NULL;
2214 size_t i;
2215 int capid = -1;
2216
2217 if (!strcmp(cap, "none"))
2218 return -2;
2219
2220 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2221
2222 if (strcmp(cap, caps_opt[i].name))
2223 continue;
2224
2225 capid = caps_opt[i].value;
2226 break;
2227 }
2228
2229 if (capid < 0) {
2230 /* try to see if it's numeric, so the user may specify
2231 * capabilities that the running kernel knows about but
2232 * we don't */
2233 errno = 0;
2234 capid = strtol(cap, &ptr, 10);
2235 if (!ptr || *ptr != '\0' || errno != 0)
2236 /* not a valid number */
2237 capid = -1;
2238 else if (capid > lxc_caps_last_cap())
2239 /* we have a number but it's not a valid
2240 * capability */
2241 capid = -1;
2242 }
2243
2244 return capid;
2245 }
2246
2247 int in_caplist(int cap, struct lxc_list *caps)
2248 {
2249 struct lxc_list *iterator;
2250 int capid;
2251
2252 lxc_list_for_each(iterator, caps) {
2253 capid = parse_cap(iterator->elem);
2254 if (capid == cap)
2255 return 1;
2256 }
2257
2258 return 0;
2259 }
2260
2261 static int setup_caps(struct lxc_list *caps)
2262 {
2263 struct lxc_list *iterator;
2264 char *drop_entry;
2265 int capid;
2266
2267 lxc_list_for_each(iterator, caps) {
2268
2269 drop_entry = iterator->elem;
2270
2271 capid = parse_cap(drop_entry);
2272
2273 if (capid < 0) {
2274 ERROR("unknown capability %s", drop_entry);
2275 return -1;
2276 }
2277
2278 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2279
2280 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2281 SYSERROR("failed to remove %s capability", drop_entry);
2282 return -1;
2283 }
2284
2285 }
2286
2287 DEBUG("capabilities have been setup");
2288
2289 return 0;
2290 }
2291
2292 static int dropcaps_except(struct lxc_list *caps)
2293 {
2294 struct lxc_list *iterator;
2295 char *keep_entry;
2296 int i, capid;
2297 int numcaps = lxc_caps_last_cap() + 1;
2298 INFO("found %d capabilities", numcaps);
2299
2300 if (numcaps <= 0 || numcaps > 200)
2301 return -1;
2302
2303 // caplist[i] is 1 if we keep capability i
2304 int *caplist = alloca(numcaps * sizeof(int));
2305 memset(caplist, 0, numcaps * sizeof(int));
2306
2307 lxc_list_for_each(iterator, caps) {
2308
2309 keep_entry = iterator->elem;
2310
2311 capid = parse_cap(keep_entry);
2312
2313 if (capid == -2)
2314 continue;
2315
2316 if (capid < 0) {
2317 ERROR("unknown capability %s", keep_entry);
2318 return -1;
2319 }
2320
2321 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2322
2323 caplist[capid] = 1;
2324 }
2325 for (i=0; i<numcaps; i++) {
2326 if (caplist[i])
2327 continue;
2328 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2329 SYSERROR("failed to remove capability %d", i);
2330 return -1;
2331 }
2332 }
2333
2334 DEBUG("capabilities have been setup");
2335
2336 return 0;
2337 }
2338
2339 static int setup_hw_addr(char *hwaddr, const char *ifname)
2340 {
2341 struct sockaddr sockaddr;
2342 struct ifreq ifr;
2343 int ret, fd, saved_errno;
2344
2345 ret = lxc_convert_mac(hwaddr, &sockaddr);
2346 if (ret) {
2347 ERROR("mac address '%s' conversion failed : %s",
2348 hwaddr, strerror(-ret));
2349 return -1;
2350 }
2351
2352 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2353 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2354 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2355
2356 fd = socket(AF_INET, SOCK_DGRAM, 0);
2357 if (fd < 0) {
2358 ERROR("socket failure : %s", strerror(errno));
2359 return -1;
2360 }
2361
2362 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2363 saved_errno = errno;
2364 close(fd);
2365 if (ret)
2366 ERROR("ioctl failure : %s", strerror(saved_errno));
2367
2368 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2369
2370 return ret;
2371 }
2372
2373 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2374 {
2375 struct lxc_list *iterator;
2376 struct lxc_inetdev *inetdev;
2377 int err;
2378
2379 lxc_list_for_each(iterator, ip) {
2380
2381 inetdev = iterator->elem;
2382
2383 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2384 &inetdev->bcast, inetdev->prefix);
2385 if (err) {
2386 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2387 ifindex, strerror(-err));
2388 return -1;
2389 }
2390 }
2391
2392 return 0;
2393 }
2394
2395 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2396 {
2397 struct lxc_list *iterator;
2398 struct lxc_inet6dev *inet6dev;
2399 int err;
2400
2401 lxc_list_for_each(iterator, ip) {
2402
2403 inet6dev = iterator->elem;
2404
2405 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2406 &inet6dev->mcast, &inet6dev->acast,
2407 inet6dev->prefix);
2408 if (err) {
2409 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2410 ifindex, strerror(-err));
2411 return -1;
2412 }
2413 }
2414
2415 return 0;
2416 }
2417
2418 static int setup_netdev(struct lxc_netdev *netdev)
2419 {
2420 char ifname[IFNAMSIZ];
2421 char *current_ifname = ifname;
2422 int err;
2423
2424 /* empty network namespace */
2425 if (!netdev->ifindex) {
2426 if (netdev->flags & IFF_UP) {
2427 err = lxc_netdev_up("lo");
2428 if (err) {
2429 ERROR("failed to set the loopback up : %s",
2430 strerror(-err));
2431 return -1;
2432 }
2433 }
2434 if (netdev->type != LXC_NET_VETH)
2435 return 0;
2436 netdev->ifindex = if_nametoindex(netdev->name);
2437 }
2438
2439 /* get the new ifindex in case of physical netdev */
2440 if (netdev->type == LXC_NET_PHYS) {
2441 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2442 ERROR("failed to get ifindex for %s",
2443 netdev->link);
2444 return -1;
2445 }
2446 }
2447
2448 /* retrieve the name of the interface */
2449 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2450 ERROR("no interface corresponding to index '%d'",
2451 netdev->ifindex);
2452 return -1;
2453 }
2454
2455 /* default: let the system to choose one interface name */
2456 if (!netdev->name)
2457 netdev->name = netdev->type == LXC_NET_PHYS ?
2458 netdev->link : "eth%d";
2459
2460 /* rename the interface name */
2461 if (strcmp(ifname, netdev->name) != 0) {
2462 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2463 if (err) {
2464 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2465 strerror(-err));
2466 return -1;
2467 }
2468 }
2469
2470 /* Re-read the name of the interface because its name has changed
2471 * and would be automatically allocated by the system
2472 */
2473 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2474 ERROR("no interface corresponding to index '%d'",
2475 netdev->ifindex);
2476 return -1;
2477 }
2478
2479 /* set a mac address */
2480 if (netdev->hwaddr) {
2481 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2482 ERROR("failed to setup hw address for '%s'",
2483 current_ifname);
2484 return -1;
2485 }
2486 }
2487
2488 /* setup ipv4 addresses on the interface */
2489 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2490 ERROR("failed to setup ip addresses for '%s'",
2491 ifname);
2492 return -1;
2493 }
2494
2495 /* setup ipv6 addresses on the interface */
2496 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2497 ERROR("failed to setup ipv6 addresses for '%s'",
2498 ifname);
2499 return -1;
2500 }
2501
2502 /* set the network device up */
2503 if (netdev->flags & IFF_UP) {
2504 int err;
2505
2506 err = lxc_netdev_up(current_ifname);
2507 if (err) {
2508 ERROR("failed to set '%s' up : %s", current_ifname,
2509 strerror(-err));
2510 return -1;
2511 }
2512
2513 /* the network is up, make the loopback up too */
2514 err = lxc_netdev_up("lo");
2515 if (err) {
2516 ERROR("failed to set the loopback up : %s",
2517 strerror(-err));
2518 return -1;
2519 }
2520 }
2521
2522 /* We can only set up the default routes after bringing
2523 * up the interface, sine bringing up the interface adds
2524 * the link-local routes and we can't add a default
2525 * route if the gateway is not reachable. */
2526
2527 /* setup ipv4 gateway on the interface */
2528 if (netdev->ipv4_gateway) {
2529 if (!(netdev->flags & IFF_UP)) {
2530 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2531 return -1;
2532 }
2533
2534 if (lxc_list_empty(&netdev->ipv4)) {
2535 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2536 return -1;
2537 }
2538
2539 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2540 if (err) {
2541 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2542 if (err) {
2543 ERROR("failed to add ipv4 dest for '%s': %s",
2544 ifname, strerror(-err));
2545 }
2546
2547 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2548 if (err) {
2549 ERROR("failed to setup ipv4 gateway for '%s': %s",
2550 ifname, strerror(-err));
2551 if (netdev->ipv4_gateway_auto) {
2552 char buf[INET_ADDRSTRLEN];
2553 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2554 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2555 }
2556 return -1;
2557 }
2558 }
2559 }
2560
2561 /* setup ipv6 gateway on the interface */
2562 if (netdev->ipv6_gateway) {
2563 if (!(netdev->flags & IFF_UP)) {
2564 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2565 return -1;
2566 }
2567
2568 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2569 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2570 return -1;
2571 }
2572
2573 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2574 if (err) {
2575 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2576 if (err) {
2577 ERROR("failed to add ipv6 dest for '%s': %s",
2578 ifname, strerror(-err));
2579 }
2580
2581 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2582 if (err) {
2583 ERROR("failed to setup ipv6 gateway for '%s': %s",
2584 ifname, strerror(-err));
2585 if (netdev->ipv6_gateway_auto) {
2586 char buf[INET6_ADDRSTRLEN];
2587 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2588 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2589 }
2590 return -1;
2591 }
2592 }
2593 }
2594
2595 DEBUG("'%s' has been setup", current_ifname);
2596
2597 return 0;
2598 }
2599
2600 static int setup_network(struct lxc_list *network)
2601 {
2602 struct lxc_list *iterator;
2603 struct lxc_netdev *netdev;
2604
2605 lxc_list_for_each(iterator, network) {
2606
2607 netdev = iterator->elem;
2608
2609 if (setup_netdev(netdev)) {
2610 ERROR("failed to setup netdev");
2611 return -1;
2612 }
2613 }
2614
2615 if (!lxc_list_empty(network))
2616 INFO("network has been setup");
2617
2618 return 0;
2619 }
2620
2621 static int parse_resource(const char *res) {
2622 size_t i;
2623 int resid = -1;
2624
2625 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2626 if (strcmp(res, limit_opt[i].name) == 0)
2627 return limit_opt[i].value;
2628 }
2629
2630 /* try to see if it's numeric, so the user may specify
2631 * resources that the running kernel knows about but
2632 * we don't */
2633 if (lxc_safe_int(res, &resid) == 0)
2634 return resid;
2635 return -1;
2636 }
2637
2638 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2639 struct lxc_list *it;
2640 struct lxc_limit *lim;
2641 int resid;
2642
2643 lxc_list_for_each(it, limits) {
2644 lim = it->elem;
2645
2646 resid = parse_resource(lim->resource);
2647 if (resid < 0) {
2648 ERROR("unknown resource %s", lim->resource);
2649 return -1;
2650 }
2651
2652 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2653 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2654 return -1;
2655 }
2656 }
2657 return 0;
2658 }
2659
2660 /* try to move physical nics to the init netns */
2661 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2662 {
2663 int i, oldfd;
2664 char ifname[IFNAMSIZ];
2665
2666 if (netnsfd < 0 || conf->num_savednics == 0)
2667 return;
2668
2669 INFO("Running to reset %d nic names.", conf->num_savednics);
2670
2671 oldfd = lxc_preserve_ns(getpid(), "net");
2672 if (oldfd < 0) {
2673 SYSERROR("Failed to open monitor netns fd.");
2674 return;
2675 }
2676
2677 if (setns(netnsfd, 0) != 0) {
2678 SYSERROR("Failed to enter container netns to reset nics");
2679 close(oldfd);
2680 return;
2681 }
2682 for (i=0; i<conf->num_savednics; i++) {
2683 struct saved_nic *s = &conf->saved_nics[i];
2684 /* retrieve the name of the interface */
2685 if (!if_indextoname(s->ifindex, ifname)) {
2686 WARN("no interface corresponding to index '%d'", s->ifindex);
2687 continue;
2688 }
2689 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2690 WARN("Error moving nic name:%s back to host netns", ifname);
2691 free(s->orig_name);
2692 }
2693 conf->num_savednics = 0;
2694
2695 if (setns(oldfd, 0) != 0)
2696 SYSERROR("Failed to re-enter monitor's netns");
2697 close(oldfd);
2698 }
2699
2700 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2701
2702 struct lxc_conf *lxc_conf_init(void)
2703 {
2704 struct lxc_conf *new;
2705 int i;
2706
2707 new = malloc(sizeof(*new));
2708 if (!new) {
2709 ERROR("lxc_conf_init : %m");
2710 return NULL;
2711 }
2712 memset(new, 0, sizeof(*new));
2713
2714 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
2715 new->personality = -1;
2716 new->autodev = 1;
2717 new->console.log_path = NULL;
2718 new->console.log_fd = -1;
2719 new->console.path = NULL;
2720 new->console.peer = -1;
2721 new->console.peerpty.busy = -1;
2722 new->console.peerpty.master = -1;
2723 new->console.peerpty.slave = -1;
2724 new->console.master = -1;
2725 new->console.slave = -1;
2726 new->console.name[0] = '\0';
2727 new->maincmd_fd = -1;
2728 new->nbd_idx = -1;
2729 new->rootfs.mount = strdup(default_rootfs_mount);
2730 if (!new->rootfs.mount) {
2731 ERROR("lxc_conf_init : %m");
2732 free(new);
2733 return NULL;
2734 }
2735 new->kmsg = 0;
2736 new->logfd = -1;
2737 lxc_list_init(&new->cgroup);
2738 lxc_list_init(&new->network);
2739 lxc_list_init(&new->mount_list);
2740 lxc_list_init(&new->caps);
2741 lxc_list_init(&new->keepcaps);
2742 lxc_list_init(&new->id_map);
2743 lxc_list_init(&new->includes);
2744 lxc_list_init(&new->aliens);
2745 lxc_list_init(&new->environment);
2746 lxc_list_init(&new->limits);
2747 for (i=0; i<NUM_LXC_HOOKS; i++)
2748 lxc_list_init(&new->hooks[i]);
2749 lxc_list_init(&new->groups);
2750 new->lsm_aa_profile = NULL;
2751 new->lsm_se_context = NULL;
2752 new->tmp_umount_proc = 0;
2753
2754 for (i = 0; i < LXC_NS_MAX; i++)
2755 new->inherit_ns_fd[i] = -1;
2756
2757 /* if running in a new user namespace, init and COMMAND
2758 * default to running as UID/GID 0 when using lxc-execute */
2759 new->init_uid = 0;
2760 new->init_gid = 0;
2761
2762 return new;
2763 }
2764
2765 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2766 {
2767 char *veth1, *veth2;
2768 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2769 int bridge_index, err;
2770 unsigned int mtu = 0;
2771
2772 if (netdev->priv.veth_attr.pair) {
2773 veth1 = netdev->priv.veth_attr.pair;
2774 if (handler->conf->reboot)
2775 lxc_netdev_delete_by_name(veth1);
2776 } else {
2777 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2778 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2779 ERROR("veth1 name too long");
2780 return -1;
2781 }
2782 veth1 = lxc_mkifname(veth1buf);
2783 if (!veth1) {
2784 ERROR("failed to allocate a temporary name");
2785 return -1;
2786 }
2787 /* store away for deconf */
2788 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2789 }
2790
2791 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2792 veth2 = lxc_mkifname(veth2buf);
2793 if (!veth2) {
2794 ERROR("failed to allocate a temporary name");
2795 goto out_delete;
2796 }
2797
2798 err = lxc_veth_create(veth1, veth2);
2799 if (err) {
2800 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2801 veth2, strerror(-err));
2802 goto out_delete;
2803 }
2804
2805 /* changing the high byte of the mac address to 0xfe, the bridge interface
2806 * will always keep the host's mac address and not take the mac address
2807 * of a container */
2808 err = setup_private_host_hw_addr(veth1);
2809 if (err) {
2810 ERROR("failed to change mac address of host interface \"%s\": %s",
2811 veth1, strerror(-err));
2812 goto out_delete;
2813 }
2814
2815 netdev->ifindex = if_nametoindex(veth2);
2816 if (!netdev->ifindex) {
2817 ERROR("failed to retrieve the index for \"%s\"", veth2);
2818 goto out_delete;
2819 }
2820
2821 if (netdev->mtu) {
2822 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2823 WARN("failed to parse mtu from");
2824 else
2825 INFO("retrieved mtu %d", mtu);
2826 } else if (netdev->link) {
2827 bridge_index = if_nametoindex(netdev->link);
2828 if (bridge_index) {
2829 mtu = netdev_get_mtu(bridge_index);
2830 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2831 } else {
2832 mtu = netdev_get_mtu(netdev->ifindex);
2833 INFO("retrieved mtu %d from %s", mtu, veth2);
2834 }
2835 }
2836
2837 if (mtu) {
2838 err = lxc_netdev_set_mtu(veth1, mtu);
2839 if (!err)
2840 err = lxc_netdev_set_mtu(veth2, mtu);
2841 if (err) {
2842 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2843 "and \"%s\": %s",
2844 mtu, veth1, veth2, strerror(-err));
2845 goto out_delete;
2846 }
2847 }
2848
2849 if (netdev->link) {
2850 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
2851 if (err) {
2852 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2853 veth1, netdev->link, strerror(-err));
2854 goto out_delete;
2855 }
2856 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2857 }
2858
2859 err = lxc_netdev_up(veth1);
2860 if (err) {
2861 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2862 goto out_delete;
2863 }
2864
2865 if (netdev->upscript) {
2866 err = run_script(handler->name, "net", netdev->upscript, "up",
2867 "veth", veth1, (char*) NULL);
2868 if (err)
2869 goto out_delete;
2870 }
2871
2872 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2873 netdev->ifindex);
2874
2875 return 0;
2876
2877 out_delete:
2878 if (netdev->ifindex != 0)
2879 lxc_netdev_delete_by_name(veth1);
2880 if (!netdev->priv.veth_attr.pair)
2881 free(veth1);
2882 free(veth2);
2883 return -1;
2884 }
2885
2886 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2887 {
2888 char *veth1;
2889 int err;
2890
2891 if (netdev->priv.veth_attr.pair)
2892 veth1 = netdev->priv.veth_attr.pair;
2893 else
2894 veth1 = netdev->priv.veth_attr.veth1;
2895
2896 if (netdev->downscript) {
2897 err = run_script(handler->name, "net", netdev->downscript,
2898 "down", "veth", veth1, (char*) NULL);
2899 if (err)
2900 return -1;
2901 }
2902 return 0;
2903 }
2904
2905 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2906 {
2907 char peerbuf[IFNAMSIZ], *peer;
2908 int err;
2909
2910 if (!netdev->link) {
2911 ERROR("no link specified for macvlan netdev");
2912 return -1;
2913 }
2914
2915 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2916 if (err >= sizeof(peerbuf))
2917 return -1;
2918
2919 peer = lxc_mkifname(peerbuf);
2920 if (!peer) {
2921 ERROR("failed to make a temporary name");
2922 return -1;
2923 }
2924
2925 err = lxc_macvlan_create(netdev->link, peer,
2926 netdev->priv.macvlan_attr.mode);
2927 if (err) {
2928 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2929 peer, netdev->link, strerror(-err));
2930 goto out;
2931 }
2932
2933 netdev->ifindex = if_nametoindex(peer);
2934 if (!netdev->ifindex) {
2935 ERROR("failed to retrieve the index for %s", peer);
2936 goto out;
2937 }
2938
2939 if (netdev->upscript) {
2940 err = run_script(handler->name, "net", netdev->upscript, "up",
2941 "macvlan", netdev->link, (char*) NULL);
2942 if (err)
2943 goto out;
2944 }
2945
2946 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2947 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2948
2949 return 0;
2950 out:
2951 lxc_netdev_delete_by_name(peer);
2952 free(peer);
2953 return -1;
2954 }
2955
2956 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2957 {
2958 int err;
2959
2960 if (netdev->downscript) {
2961 err = run_script(handler->name, "net", netdev->downscript,
2962 "down", "macvlan", netdev->link,
2963 (char*) NULL);
2964 if (err)
2965 return -1;
2966 }
2967 return 0;
2968 }
2969
2970 /* XXX: merge with instantiate_macvlan */
2971 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2972 {
2973 char peer[IFNAMSIZ];
2974 int err;
2975 static uint16_t vlan_cntr = 0;
2976 unsigned int mtu = 0;
2977
2978 if (!netdev->link) {
2979 ERROR("no link specified for vlan netdev");
2980 return -1;
2981 }
2982
2983 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2984 if (err >= sizeof(peer)) {
2985 ERROR("peer name too long");
2986 return -1;
2987 }
2988
2989 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2990 if (err) {
2991 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2992 peer, netdev->link, strerror(-err));
2993 return -1;
2994 }
2995
2996 netdev->ifindex = if_nametoindex(peer);
2997 if (!netdev->ifindex) {
2998 ERROR("failed to retrieve the ifindex for %s", peer);
2999 lxc_netdev_delete_by_name(peer);
3000 return -1;
3001 }
3002
3003 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
3004 netdev->ifindex);
3005 if (netdev->mtu) {
3006 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
3007 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3008 netdev->ifindex, netdev->name);
3009 return -1;
3010 }
3011 err = lxc_netdev_set_mtu(peer, mtu);
3012 if (err) {
3013 ERROR("failed to set mtu '%s' for %s : %s",
3014 netdev->mtu, peer, strerror(-err));
3015 lxc_netdev_delete_by_name(peer);
3016 return -1;
3017 }
3018 }
3019
3020 return 0;
3021 }
3022
/*
 * Deconfiguration callback for vlan devices: there is nothing to do.
 *
 * Always returns 0.
 */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3027
3028 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3029 {
3030 if (!netdev->link) {
3031 ERROR("no link specified for the physical interface");
3032 return -1;
3033 }
3034
3035 netdev->ifindex = if_nametoindex(netdev->link);
3036 if (!netdev->ifindex) {
3037 ERROR("failed to retrieve the index for %s", netdev->link);
3038 return -1;
3039 }
3040
3041 if (netdev->upscript) {
3042 int err;
3043 err = run_script(handler->name, "net", netdev->upscript,
3044 "up", "phys", netdev->link, (char*) NULL);
3045 if (err)
3046 return -1;
3047 }
3048
3049 return 0;
3050 }
3051
3052 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
3053 {
3054 int err;
3055
3056 if (netdev->downscript) {
3057 err = run_script(handler->name, "net", netdev->downscript,
3058 "down", "phys", netdev->link, (char*) NULL);
3059 if (err)
3060 return -1;
3061 }
3062 return 0;
3063 }
3064
3065 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
3066 {
3067 netdev->ifindex = 0;
3068 return 0;
3069 }
3070
3071 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3072 {
3073 netdev->ifindex = 0;
3074 if (netdev->upscript) {
3075 int err;
3076 err = run_script(handler->name, "net", netdev->upscript,
3077 "up", "empty", (char*) NULL);
3078 if (err)
3079 return -1;
3080 }
3081 return 0;
3082 }
3083
3084 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3085 {
3086 int err;
3087
3088 if (netdev->downscript) {
3089 err = run_script(handler->name, "net", netdev->downscript,
3090 "down", "empty", (char*) NULL);
3091 if (err)
3092 return -1;
3093 }
3094 return 0;
3095 }
3096
/*
 * Deconfiguration callback for the "none" network type: there is nothing
 * to do.
 *
 * Always returns 0.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3101
3102 int lxc_requests_empty_network(struct lxc_handler *handler)
3103 {
3104 struct lxc_list *network = &handler->conf->network;
3105 struct lxc_list *iterator;
3106 struct lxc_netdev *netdev;
3107 bool found_none = false, found_nic = false;
3108
3109 if (lxc_list_empty(network))
3110 return 0;
3111
3112 lxc_list_for_each(iterator, network) {
3113
3114 netdev = iterator->elem;
3115
3116 if (netdev->type == LXC_NET_NONE)
3117 found_none = true;
3118 else
3119 found_nic = true;
3120 }
3121 if (found_none && !found_nic)
3122 return 1;
3123 return 0;
3124 }
3125
3126 int lxc_create_network(struct lxc_handler *handler)
3127 {
3128 struct lxc_list *network = &handler->conf->network;
3129 struct lxc_list *iterator;
3130 struct lxc_netdev *netdev;
3131 int am_root = (getuid() == 0);
3132
3133 if (!am_root)
3134 return 0;
3135
3136 lxc_list_for_each(iterator, network) {
3137
3138 netdev = iterator->elem;
3139
3140 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3141 ERROR("invalid network configuration type '%d'",
3142 netdev->type);
3143 return -1;
3144 }
3145
3146 if (netdev_conf[netdev->type](handler, netdev)) {
3147 ERROR("failed to create netdev");
3148 return -1;
3149 }
3150
3151 }
3152
3153 return 0;
3154 }
3155
3156 bool lxc_delete_network(struct lxc_handler *handler)
3157 {
3158 int ret;
3159 struct lxc_list *network = &handler->conf->network;
3160 struct lxc_list *iterator;
3161 struct lxc_netdev *netdev;
3162 bool deleted_all = true;
3163
3164 lxc_list_for_each(iterator, network) {
3165 netdev = iterator->elem;
3166
3167 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3168 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3169 WARN("Failed to rename interface with index %d "
3170 "to its initial name \"%s\".",
3171 netdev->ifindex, netdev->link);
3172 continue;
3173 }
3174
3175 if (netdev_deconf[netdev->type](handler, netdev)) {
3176 WARN("Failed to destroy netdev");
3177 }
3178
3179 /* Recent kernel remove the virtual interfaces when the network
3180 * namespace is destroyed but in case we did not moved the
3181 * interface to the network namespace, we have to destroy it
3182 */
3183 if (netdev->ifindex != 0) {
3184 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3185 if (-ret == ENODEV) {
3186 INFO("Interface \"%s\" with index %d already "
3187 "deleted or existing in different network "
3188 "namespace.",
3189 netdev->name ? netdev->name : "(null)",
3190 netdev->ifindex);
3191 } else if (ret < 0) {
3192 deleted_all = false;
3193 WARN("Failed to remove interface \"%s\" with "
3194 "index %d: %s.",
3195 netdev->name ? netdev->name : "(null)",
3196 netdev->ifindex, strerror(-ret));
3197 } else {
3198 INFO("Removed interface \"%s\" with index %d.",
3199 netdev->name ? netdev->name : "(null)",
3200 netdev->ifindex);
3201 }
3202 }
3203
3204 /* Explicitly delete host veth device to prevent lingering
3205 * devices. We had issues in LXD around this.
3206 */
3207 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
3208 char *hostveth;
3209 if (netdev->priv.veth_attr.pair) {
3210 hostveth = netdev->priv.veth_attr.pair;
3211 ret = lxc_netdev_delete_by_name(hostveth);
3212 if (ret < 0) {
3213 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3214 } else {
3215 INFO("Removed interface \"%s\" from host.", hostveth);
3216 }
3217 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
3218 hostveth = netdev->priv.veth_attr.veth1;
3219 ret = lxc_netdev_delete_by_name(hostveth);
3220 if (ret < 0) {
3221 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3222 } else {
3223 INFO("Removed interface \"%s\" from host.", hostveth);
3224 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3225 }
3226 }
3227 }
3228 }
3229
3230 return deleted_all;
3231 }
3232
3233 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3234
3235 /* lxc-user-nic returns "interface_name:interface_name\n" */
3236 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3237 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3238 struct lxc_netdev *netdev, pid_t pid)
3239 {
3240 pid_t child;
3241 int bytes, pipefd[2];
3242 char *token, *saveptr = NULL;
3243 char buffer[MAX_BUFFER_SIZE];
3244 char netdev_link[IFNAMSIZ + 1];
3245
3246 if (netdev->type != LXC_NET_VETH) {
3247 ERROR("nic type %d not support for unprivileged use",
3248 netdev->type);
3249 return -1;
3250 }
3251
3252 if (pipe(pipefd) < 0) {
3253 SYSERROR("pipe failed");
3254 return -1;
3255 }
3256
3257 child = fork();
3258 if (child < 0) {
3259 SYSERROR("fork");
3260 close(pipefd[0]);
3261 close(pipefd[1]);
3262 return -1;
3263 }
3264
3265 if (child == 0) { // child
3266 /* Call lxc-user-nic pid type bridge. */
3267 int ret;
3268 char pidstr[LXC_NUMSTRLEN64];
3269
3270 close(pipefd[0]); /* Close the read-end of the pipe. */
3271
3272 /* Redirect stdout to write-end of the pipe. */
3273 ret = dup2(pipefd[1], STDOUT_FILENO);
3274 close(pipefd[1]); /* Close the write-end of the pipe. */
3275 if (ret < 0) {
3276 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3277 exit(EXIT_FAILURE);
3278 }
3279
3280 if (netdev->link)
3281 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3282 else
3283 strncpy(netdev_link, "none", IFNAMSIZ);
3284
3285 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3286 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3287 exit(EXIT_FAILURE);
3288 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3289
3290 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3291 lxcname, pidstr, netdev_link, netdev->name);
3292 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
3293 pidstr, "veth", netdev_link, netdev->name, NULL);
3294
3295 SYSERROR("Failed to exec lxc-user-nic.");
3296 exit(EXIT_FAILURE);
3297 }
3298
3299 /* close the write-end of the pipe */
3300 close(pipefd[1]);
3301
3302 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3303 if (bytes < 0)
3304 SYSERROR("Failed to read from pipe file descriptor.");
3305 buffer[bytes - 1] = '\0';
3306
3307 if (wait_for_pid(child) != 0) {
3308 close(pipefd[0]);
3309 return -1;
3310 }
3311
3312 /* close the read-end of the pipe */
3313 close(pipefd[0]);
3314
3315 /* fill netdev->name field */
3316 token = strtok_r(buffer, ":", &saveptr);
3317 if (!token)
3318 return -1;
3319
3320 netdev->name = malloc(IFNAMSIZ + 1);
3321 if (!netdev->name) {
3322 SYSERROR("Failed to allocate memory.");
3323 return -1;
3324 }
3325 memset(netdev->name, 0, IFNAMSIZ + 1);
3326 strncpy(netdev->name, token, IFNAMSIZ);
3327
3328 /* fill netdev->veth_attr.pair field */
3329 token = strtok_r(NULL, ":", &saveptr);
3330 if (!token)
3331 return -1;
3332
3333 netdev->priv.veth_attr.pair = strdup(token);
3334 if (!netdev->priv.veth_attr.pair) {
3335 ERROR("Failed to allocate memory.");
3336 return -1;
3337 }
3338
3339 return 0;
3340 }
3341
3342 int lxc_assign_network(const char *lxcpath, char *lxcname,
3343 struct lxc_list *network, pid_t pid)
3344 {
3345 struct lxc_list *iterator;
3346 struct lxc_netdev *netdev;
3347 char ifname[IFNAMSIZ];
3348 int am_root = (getuid() == 0);
3349 int err;
3350
3351 lxc_list_for_each(iterator, network) {
3352
3353 netdev = iterator->elem;
3354
3355 if (netdev->type == LXC_NET_VETH && !am_root) {
3356 if (netdev->mtu)
3357 INFO("mtu ignored due to insufficient privilege");
3358 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3359 return -1;
3360 // lxc-user-nic has moved the nic to the new ns.
3361 // unpriv_assign_nic() fills in netdev->name.
3362 // netdev->ifindex will be filed in at setup_netdev.
3363 continue;
3364 }
3365
3366 /* empty network namespace, nothing to move */
3367 if (!netdev->ifindex)
3368 continue;
3369
3370 /* retrieve the name of the interface */
3371 if (!if_indextoname(netdev->ifindex, ifname)) {
3372 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3373 return -1;
3374 }
3375
3376 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3377 if (err) {
3378 ERROR("failed to move '%s' to the container : %s",
3379 netdev->link, strerror(-err));
3380 return -1;
3381 }
3382
3383 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3384 }
3385
3386 return 0;
3387 }
3388
3389 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3390 size_t buf_size)
3391 {
3392 char path[MAXPATHLEN];
3393 int fd, ret;
3394
3395 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3396 idtype == ID_TYPE_UID ? 'u' : 'g');
3397 if (ret < 0 || ret >= MAXPATHLEN) {
3398 ERROR("failed to create path \"%s\"", path);
3399 return -E2BIG;
3400 }
3401
3402 fd = open(path, O_WRONLY);
3403 if (fd < 0) {
3404 SYSERROR("failed to open \"%s\"", path);
3405 return -1;
3406 }
3407
3408 errno = 0;
3409 ret = lxc_write_nointr(fd, buf, buf_size);
3410 if (ret != buf_size) {
3411 SYSERROR("failed to write %cid mapping to \"%s\"",
3412 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3413 close(fd);
3414 return -1;
3415 }
3416 close(fd);
3417
3418 return 0;
3419 }
3420
3421 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3422 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3423 {
3424 char *path;
3425 int ret;
3426 struct stat st;
3427 int fret = 0;
3428
3429 path = on_path(binary, NULL);
3430 if (!path)
3431 return -ENOENT;
3432
3433 ret = stat(path, &st);
3434 if (ret < 0) {
3435 fret = -errno;
3436 goto cleanup;
3437 }
3438
3439 /* Check if the binary is setuid. */
3440 if (st.st_mode & S_ISUID) {
3441 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3442 fret = 1;
3443 goto cleanup;
3444 }
3445
3446 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3447 /* Check if it has the CAP_SETUID capability. */
3448 if ((cap & CAP_SETUID) &&
3449 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3450 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3451 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3452 "and CAP_PERMITTED sets.", path);
3453 fret = 1;
3454 goto cleanup;
3455 }
3456
3457 /* Check if it has the CAP_SETGID capability. */
3458 if ((cap & CAP_SETGID) &&
3459 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3460 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3461 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3462 "and CAP_PERMITTED sets.", path);
3463 fret = 1;
3464 goto cleanup;
3465 }
3466 #else
3467 /* If we cannot check for file capabilities we need to give the benefit
3468 * of the doubt. Otherwise we might fail even though all the necessary
3469 * file capabilities are set.
3470 */
3471 DEBUG("Cannot check for file capabilites as full capability support is "
3472 "missing. Manual intervention needed.");
3473 fret = 1;
3474 #endif
3475
3476 cleanup:
3477 free(path);
3478 return fret;
3479 }
3480
/*
 * Callback that replaces the calling process with "/bin/sh -c <args>".
 *
 * @args: the command line to hand to the shell, as a C string.
 *
 * Only returns, with -1, when execl() itself fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *cmdline = args;

	execl("/bin/sh", "sh", "-c", cmdline, (char *)NULL);

	/* Reached only if the exec did not happen. */
	return -1;
}
3486
3487 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3488 {
3489 struct id_map *map;
3490 struct lxc_list *iterator;
3491 enum idtype type;
3492 char u_or_g;
3493 char *pos;
3494 int fill, left;
3495 char cmd_output[MAXPATHLEN];
3496 /* strlen("new@idmap") = 9
3497 * +
3498 * strlen(" ") = 1
3499 * +
3500 * LXC_NUMSTRLEN64
3501 * +
3502 * strlen(" ") = 1
3503 *
3504 * We add some additional space to make sure that we really have
3505 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3506 */
3507 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3508 int ret = 0, uidmap = 0, gidmap = 0;
3509 bool use_shadow = false, had_entry = false;
3510
3511 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3512 * ranges, then insist that root also reserve ranges in subuid. This
3513 * will protected it by preventing another user from being handed the
3514 * range by shadow.
3515 */
3516 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3517 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3518 if (uidmap > 0 && gidmap > 0) {
3519 DEBUG("Functional newuidmap and newgidmap binary found.");
3520 use_shadow = true;
3521 } else {
3522 /* In case unprivileged users run application containers via
3523 * execute() or a start*() there are valid cases where they may
3524 * only want to map their own {g,u}id. Let's not block them from
3525 * doing so by requiring geteuid() == 0.
3526 */
3527 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3528 "write directly with euid %d.", geteuid());
3529 }
3530
3531 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3532 type++, u_or_g = 'g') {
3533 pos = mapbuf;
3534
3535 if (use_shadow)
3536 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3537
3538 lxc_list_for_each(iterator, idmap) {
3539 /* The kernel only takes <= 4k for writes to
3540 * /proc/<nr>/[ug]id_map
3541 */
3542 map = iterator->elem;
3543 if (map->idtype != type)
3544 continue;
3545
3546 had_entry = true;
3547
3548 left = LXC_IDMAPLEN - (pos - mapbuf);
3549 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3550 use_shadow ? " " : "", map->nsid,
3551 map->hostid, map->range,
3552 use_shadow ? "" : "\n");
3553 if (fill <= 0 || fill >= left)
3554 SYSERROR("Too many {g,u}id mappings defined.");
3555
3556 pos += fill;
3557 }
3558 if (!had_entry)
3559 continue;
3560
3561 /* Try to catch the ouput of new{g,u}idmap to make debugging
3562 * easier.
3563 */
3564 if (use_shadow) {
3565 ret = run_command(cmd_output, sizeof(cmd_output),
3566 lxc_map_ids_exec_wrapper,
3567 (void *)mapbuf);
3568 if (ret < 0) {
3569 ERROR("new%cidmap failed to write mapping: %s",
3570 u_or_g, cmd_output);
3571 return -1;
3572 }
3573 } else {
3574 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3575 if (ret < 0)
3576 return -1;
3577 }
3578
3579 memset(mapbuf, 0, sizeof(mapbuf));
3580 }
3581
3582 return 0;
3583 }
3584
3585 /*
3586 * return the host uid/gid to which the container root is mapped in
3587 * *val.
3588 * Return true if id was found, false otherwise.
3589 */
3590 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3591 unsigned long *val)
3592 {
3593 struct lxc_list *it;
3594 struct id_map *map;
3595
3596 lxc_list_for_each(it, &conf->id_map) {
3597 map = it->elem;
3598 if (map->idtype != idtype)
3599 continue;
3600 if (map->nsid != 0)
3601 continue;
3602 *val = map->hostid;
3603 return true;
3604 }
3605 return false;
3606 }
3607
3608 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3609 {
3610 struct lxc_list *it;
3611 struct id_map *map;
3612 lxc_list_for_each(it, &conf->id_map) {
3613 map = it->elem;
3614 if (map->idtype != idtype)
3615 continue;
3616 if (id >= map->hostid && id < map->hostid + map->range)
3617 return (id - map->hostid) + map->nsid;
3618 }
3619 return -1;
3620 }
3621
3622 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3623 {
3624 struct lxc_list *it;
3625 struct id_map *map;
3626 unsigned int freeid = 0;
3627 again:
3628 lxc_list_for_each(it, &conf->id_map) {
3629 map = it->elem;
3630 if (map->idtype != idtype)
3631 continue;
3632 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3633 freeid = map->nsid + map->range;
3634 goto again;
3635 }
3636 }
3637 return freeid;
3638 }
3639
3640 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3641 {
3642 struct lxc_list *network = &handler->conf->network;
3643 struct lxc_list *iterator;
3644 struct lxc_netdev *netdev;
3645 int link_index;
3646
3647 lxc_list_for_each(iterator, network) {
3648 netdev = iterator->elem;
3649
3650 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3651 continue;
3652
3653 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3654 ERROR("gateway = auto only supported for "
3655 "veth and macvlan");
3656 return -1;
3657 }
3658
3659 if (!netdev->link) {
3660 ERROR("gateway = auto needs a link interface");
3661 return -1;
3662 }
3663
3664 link_index = if_nametoindex(netdev->link);
3665 if (!link_index)
3666 return -EINVAL;
3667
3668 if (netdev->ipv4_gateway_auto) {
3669 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3670 ERROR("failed to automatically find ipv4 gateway "
3671 "address from link interface '%s'", netdev->link);
3672 return -1;
3673 }
3674 }
3675
3676 if (netdev->ipv6_gateway_auto) {
3677 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3678 ERROR("failed to automatically find ipv6 gateway "
3679 "address from link interface '%s'", netdev->link);
3680 return -1;
3681 }
3682 }
3683 }
3684
3685 return 0;
3686 }
3687
3688 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3689 {
3690 struct lxc_tty_info *tty_info = &conf->tty_info;
3691 int i, ret;
3692
3693 /* no tty in the configuration */
3694 if (!conf->tty)
3695 return 0;
3696
3697 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3698 if (!tty_info->pty_info) {
3699 SYSERROR("failed to allocate struct *pty_info");
3700 return -ENOMEM;
3701 }
3702
3703 for (i = 0; i < conf->tty; i++) {
3704 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3705
3706 process_lock();
3707 ret = openpty(&pty_info->master, &pty_info->slave,
3708 pty_info->name, NULL, NULL);
3709 process_unlock();
3710 if (ret) {
3711 SYSERROR("failed to create pty device number %d", i);
3712 tty_info->nbtty = i;
3713 lxc_delete_tty(tty_info);
3714 return -ENOTTY;
3715 }
3716
3717 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3718 pty_info->name, pty_info->master, pty_info->slave);
3719
3720 /* Prevent leaking the file descriptors to the container */
3721 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3722 if (ret < 0)
3723 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3724 "pty device \"%s\": %s",
3725 pty_info->master, pty_info->name, strerror(errno));
3726
3727 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3728 if (ret < 0)
3729 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3730 "pty device \"%s\": %s",
3731 pty_info->slave, pty_info->name, strerror(errno));
3732
3733 pty_info->busy = 0;
3734 }
3735
3736 tty_info->nbtty = conf->tty;
3737
3738 INFO("finished allocating %d pts devices", conf->tty);
3739 return 0;
3740 }
3741
3742 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3743 {
3744 int i;
3745
3746 for (i = 0; i < tty_info->nbtty; i++) {
3747 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3748
3749 close(pty_info->master);
3750 close(pty_info->slave);
3751 }
3752
3753 free(tty_info->pty_info);
3754 tty_info->pty_info = NULL;
3755 tty_info->nbtty = 0;
3756 }
3757
3758
/*
 * Callback that execs "lxc-usernsexec" (looked up through PATH by execvp())
 * with @args as its NULL-terminated argument vector.
 *
 * Only returns, with -1, when the exec fails.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	/* Reached only if the exec did not happen. */
	return -1;
}
3764
3765 /*
3766 * chown_mapped_root: for an unprivileged user with uid/gid X to
3767 * chown a dir to subuid/subgid Y, he needs to run chown as root
3768 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3769 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3770 * root is privileged with respect to hostuid/hostgid X, allowing
3771 * him to do the chown.
3772 */
3773 int chown_mapped_root(char *path, struct lxc_conf *conf)
3774 {
3775 uid_t rootuid, rootgid;
3776 unsigned long val;
3777 char *chownpath = path;
3778 int hostuid, hostgid, ret;
3779 struct stat sb;
3780 char map1[100], map2[100], map3[100], map4[100], map5[100];
3781 char ugid[100];
3782 char *args1[] = {"lxc-usernsexec",
3783 "-m", map1,
3784 "-m", map2,
3785 "-m", map3,
3786 "-m", map5,
3787 "--", "chown", ugid, path,
3788 NULL};
3789 char *args2[] = {"lxc-usernsexec",
3790 "-m", map1,
3791 "-m", map2,
3792 "-m", map3,
3793 "-m", map4,
3794 "-m", map5,
3795 "--", "chown", ugid, path,
3796 NULL};
3797 char cmd_output[MAXPATHLEN];
3798
3799 hostuid = geteuid();
3800 hostgid = getegid();
3801
3802 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3803 ERROR("No uid mapping for container root");
3804 return -1;
3805 }
3806 rootuid = (uid_t)val;
3807 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3808 ERROR("No gid mapping for container root");
3809 return -1;
3810 }
3811 rootgid = (gid_t)val;
3812
3813 /*
3814 * In case of overlay, we want only the writeable layer to be chowned
3815 */
3816 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3817 chownpath = strchr(path, ':');
3818 if (!chownpath) {
3819 ERROR("Bad overlay path: %s", path);
3820 return -1;
3821 }
3822 chownpath = strchr(chownpath + 1, ':');
3823 if (!chownpath) {
3824 ERROR("Bad overlay path: %s", path);
3825 return -1;
3826 }
3827 chownpath++;
3828 }
3829 path = chownpath;
3830 if (hostuid == 0) {
3831 if (chown(path, rootuid, rootgid) < 0) {
3832 ERROR("Error chowning %s", path);
3833 return -1;
3834 }
3835 return 0;
3836 }
3837
3838 if (rootuid == hostuid) {
3839 // nothing to do
3840 INFO("%s: container root is our uid; no need to chown" ,__func__);
3841 return 0;
3842 }
3843
3844 // save the current gid of "path"
3845 if (stat(path, &sb) < 0) {
3846 ERROR("Error stat %s", path);
3847 return -1;
3848 }
3849
3850 /*
3851 * A file has to be group-owned by a gid mapped into the
3852 * container, or the container won't be privileged over it.
3853 */
3854 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3855 if (sb.st_uid == hostuid &&
3856 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3857 chown(path, -1, hostgid) < 0) {
3858 ERROR("Failed chgrping %s", path);
3859 return -1;
3860 }
3861
3862 // "u:0:rootuid:1"
3863 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3864 if (ret < 0 || ret >= 100) {
3865 ERROR("Error uid printing map string");
3866 return -1;
3867 }
3868
3869 // "u:hostuid:hostuid:1"
3870 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3871 if (ret < 0 || ret >= 100) {
3872 ERROR("Error uid printing map string");
3873 return -1;
3874 }
3875
3876 // "g:0:rootgid:1"
3877 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3878 if (ret < 0 || ret >= 100) {
3879 ERROR("Error gid printing map string");
3880 return -1;
3881 }
3882
3883 // "g:pathgid:rootgid+pathgid:1"
3884 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3885 rootgid + (gid_t)sb.st_gid);
3886 if (ret < 0 || ret >= 100) {
3887 ERROR("Error gid printing map string");
3888 return -1;
3889 }
3890
3891 // "g:hostgid:hostgid:1"
3892 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3893 if (ret < 0 || ret >= 100) {
3894 ERROR("Error gid printing map string");
3895 return -1;
3896 }
3897
3898 // "0:pathgid" (chown)
3899 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3900 if (ret < 0 || ret >= 100) {
3901 ERROR("Error owner printing format string for chown");
3902 return -1;
3903 }
3904
3905 if (hostgid == sb.st_gid)
3906 ret = run_command(cmd_output, sizeof(cmd_output),
3907 chown_mapped_root_exec_wrapper,
3908 (void *)args1);
3909 else
3910 ret = run_command(cmd_output, sizeof(cmd_output),
3911 chown_mapped_root_exec_wrapper,
3912 (void *)args2);
3913 if (ret < 0)
3914 ERROR("lxc-usernsexec failed: %s", cmd_output);
3915
3916 return ret;
3917 }
3918
3919 int ttys_shift_ids(struct lxc_conf *c)
3920 {
3921 if (lxc_list_empty(&c->id_map))
3922 return 0;
3923
3924 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
3925 ERROR("Failed to chown %s", c->console.name);
3926 return -1;
3927 }
3928
3929 return 0;
3930 }
3931
3932 /* NOTE: Must not be called from inside the container namespace! */
3933 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3934 {
3935 int mounted;
3936
3937 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3938 if (mounted == -1) {
3939 SYSERROR("failed to mount /proc in the container");
3940 /* continue only if there is no rootfs */
3941 if (conf->rootfs.path)
3942 return -1;
3943 } else if (mounted == 1) {
3944 conf->tmp_umount_proc = 1;
3945 }
3946
3947 return 0;
3948 }
3949
3950 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3951 {
3952 if (lxc_conf->tmp_umount_proc == 1) {
3953 umount("/proc");
3954 lxc_conf->tmp_umount_proc = 0;
3955 }
3956 }
3957
/*
 * Walk /proc/self/mountinfo and change every entry whose options field
 * contains "shared" to MS_SLAVE.
 *
 * Failures are logged and never abort: if the file cannot be opened the
 * function just returns, and a failing mount() only skips that entry.
 */
void remount_all_slave(void)
{
	char *buf = NULL;
	size_t cap = 0;
	FILE *mountinfo;

	mountinfo = fopen("/proc/self/mountinfo", "r");
	if (!mountinfo) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	for (;;) {
		char *mntpoint, *propagation;

		if (getline(&buf, &cap, mountinfo) == -1)
			break;

		/* NOTE(review): presumably field 4 is the mount point column
		 * and the options field sits two fields after it. Confirm
		 * against get_field().
		 */
		mntpoint = get_field(buf, 4);
		if (!mntpoint)
			continue;

		propagation = get_field(mntpoint, 2);
		if (!propagation)
			continue;

		null_endofword(propagation);
		if (!strstr(propagation, "shared"))
			continue;

		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3991
3992 void lxc_execute_bind_init(struct lxc_conf *conf)
3993 {
3994 int ret;
3995 char path[PATH_MAX], destpath[PATH_MAX], *p;
3996
3997 /* If init exists in the container, don't bind mount a static one */
3998 p = choose_init(conf->rootfs.mount);
3999 if (p) {
4000 free(p);
4001 return;
4002 }
4003
4004 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
4005 if (ret < 0 || ret >= PATH_MAX) {
4006 WARN("Path name too long searching for lxc.init.static");
4007 return;
4008 }
4009
4010 if (!file_exists(path)) {
4011 INFO("%s does not exist on host", path);
4012 return;
4013 }
4014
4015 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
4016 if (ret < 0 || ret >= PATH_MAX) {
4017 WARN("Path name too long for container's lxc.init.static");
4018 return;
4019 }
4020
4021 if (!file_exists(destpath)) {
4022 FILE * pathfile = fopen(destpath, "wb");
4023 if (!pathfile) {
4024 SYSERROR("Failed to create mount target '%s'", destpath);
4025 return;
4026 }
4027 fclose(pathfile);
4028 }
4029
4030 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
4031 if (ret < 0)
4032 SYSERROR("Failed to bind lxc.init.static into container");
4033 INFO("lxc.init.static bound into container at %s", path);
4034 }
4035
4036 /*
4037 * This does the work of remounting / if it is shared, calling the
4038 * container pre-mount hooks, and mounting the rootfs.
4039 */
4040 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4041 {
4042 if (conf->rootfs_setup) {
4043 /*
4044 * rootfs was set up in another namespace. bind-mount it
4045 * to give us a mount in our own ns so we can pivot_root to it
4046 */
4047 const char *path = conf->rootfs.mount;
4048 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4049 ERROR("Failed to bind-mount container / onto itself");
4050 return -1;
4051 }
4052 return 0;
4053 }
4054
4055 remount_all_slave();
4056
4057 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4058 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4059 return -1;
4060 }
4061
4062 if (setup_rootfs(conf)) {
4063 ERROR("failed to setup rootfs for '%s'", name);
4064 return -1;
4065 }
4066
4067 conf->rootfs_setup = true;
4068 return 0;
4069 }
4070
4071 static bool verify_start_hooks(struct lxc_conf *conf)
4072 {
4073 struct lxc_list *it;
4074 char path[MAXPATHLEN];
4075 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4076 char *hookname = it->elem;
4077 struct stat st;
4078 int ret;
4079
4080 ret = snprintf(path, MAXPATHLEN, "%s%s",
4081 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4082 if (ret < 0 || ret >= MAXPATHLEN)
4083 return false;
4084 ret = stat(path, &st);
4085 if (ret) {
4086 SYSERROR("Start hook %s not found in container",
4087 hookname);
4088 return false;
4089 }
4090 return true;
4091 }
4092
4093 return true;
4094 }
4095
/*
 * Send file descriptor @fd over @sock with lxc_abstract_unix_send_fd().
 * Returns 0 on success; logs and returns -1 on failure.
 */
static int send_fd(int sock, int fd)
{
	if (lxc_abstract_unix_send_fd(sock, fd, NULL, 0) >= 0)
		return 0;

	SYSERROR("Error sending tty fd to parent");
	return -1;
}
4108
4109 static int send_ttys_to_parent(struct lxc_handler *handler)
4110 {
4111 int i, ret;
4112 struct lxc_conf *conf = handler->conf;
4113 const struct lxc_tty_info *tty_info = &conf->tty_info;
4114 int sock = handler->ttysock[0];
4115
4116 for (i = 0; i < tty_info->nbtty; i++) {
4117 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
4118 ret = send_fd(sock, pty_info->slave);
4119 if (ret >= 0)
4120 send_fd(sock, pty_info->master);
4121 TRACE("sending pty \"%s\" with master fd %d and slave fd %d to "
4122 "parent",
4123 pty_info->name, pty_info->master, pty_info->slave);
4124 close(pty_info->slave);
4125 pty_info->slave = -1;
4126 close(pty_info->master);
4127 pty_info->master = -1;
4128 if (ret < 0) {
4129 ERROR("failed to send pty \"%s\" with master fd %d and "
4130 "slave fd %d to parent : %s",
4131 pty_info->name, pty_info->master, pty_info->slave,
4132 strerror(errno));
4133 goto bad;
4134 }
4135 }
4136
4137 close(handler->ttysock[0]);
4138 close(handler->ttysock[1]);
4139
4140 return 0;
4141
4142 bad:
4143 ERROR("Error writing tty fd to parent");
4144 return -1;
4145 }
4146
/*
 * Run the container setup sequence for @handler.
 *
 * The steps run strictly in the order written below: rootfs, utsname,
 * network, /dev, automatic mounts, fstab and mount entries, hooks, console,
 * /proc, pivot_root, devpts, ttys, personality and finally capabilities.
 * Most failures are logged and abort with -1. A few steps (kmsg setup,
 * exporting container_ttys, binding the static init) only log.
 *
 * Returns 0 once the container is fully set up, -1 otherwise.
 */
int lxc_setup(struct lxc_handler *handler)
{
	const char *name = handler->name;
	struct lxc_conf *lxc_conf = handler->conf;
	const char *lxcpath = handler->lxcpath;

	if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
		ERROR("Error setting up rootfs mount after spawn");
		return -1;
	}

	/* Only set the utsname when no UTS namespace fd is inherited. */
	if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
		if (setup_utsname(lxc_conf->utsname)) {
			ERROR("failed to setup the utsname for '%s'", name);
			return -1;
		}
	}

	if (setup_network(&lxc_conf->network)) {
		ERROR("failed to setup the network for '%s'", name);
		return -1;
	}

	if (lxc_conf->autodev > 0) {
		if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
			ERROR("failed to mount /dev in the container");
			return -1;
		}
	}

	/* do automatic mounts (mainly /proc and /sys), but exclude
	 * those that need to wait until other stuff has finished
	 */
	if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
		ERROR("failed to setup the automatic mounts for '%s'", name);
		return -1;
	}

	if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
		ERROR("failed to setup the mounts for '%s'", name);
		return -1;
	}

	if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
		ERROR("failed to setup the mount entries for '%s'", name);
		return -1;
	}

	/* Make sure any start hooks are in the container */
	if (!verify_start_hooks(lxc_conf))
		return -1;

	if (lxc_conf->is_execute)
		lxc_execute_bind_init(lxc_conf);

	/* now mount only cgroup, if wanted;
	 * before, /sys could not have been mounted
	 * (is either mounted automatically or via fstab entries)
	 */
	if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
		ERROR("failed to setup the automatic mounts for '%s'", name);
		return -1;
	}

	if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
		ERROR("failed to run mount hooks for container '%s'.", name);
		return -1;
	}

	if (lxc_conf->autodev > 0) {
		if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
			ERROR("failed to run autodev hooks for container '%s'.", name);
			return -1;
		}
		if (lxc_fill_autodev(&lxc_conf->rootfs)) {
			ERROR("failed to populate /dev in the container");
			return -1;
		}
	}

	/* The console, /dev symlink and tty setup steps are skipped when
	 * is_execute is set.
	 */
	if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
		ERROR("failed to setup the console for '%s'", name);
		return -1;
	}

	if (lxc_conf->kmsg) {
		if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
			ERROR("failed to setup kmsg for '%s'", name);
	}

	if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
		ERROR("failed to setup /dev symlinks for '%s'", name);
		return -1;
	}

	/* mount /proc if it's not already there */
	if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
		ERROR("failed to LSM mount proc for '%s'", name);
		return -1;
	}

	if (setup_pivot_root(&lxc_conf->rootfs)) {
		ERROR("failed to set rootfs for '%s'", name);
		return -1;
	}

	if (lxc_setup_devpts(lxc_conf->pts)) {
		ERROR("failed to setup the new pts instance");
		return -1;
	}

	if (lxc_create_tty(name, lxc_conf)) {
		ERROR("failed to create the ttys");
		return -1;
	}

	if (send_ttys_to_parent(handler) < 0) {
		ERROR("failure sending console info to parent");
		return -1;
	}

	if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
		ERROR("failed to setup the ttys for '%s'", name);
		return -1;
	}

	/* Failing to export the pty names is logged but not fatal. */
	if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
		SYSERROR("failed to set environment variable for container ptys");


	if (setup_personality(lxc_conf->personality)) {
		ERROR("failed to setup personality");
		return -1;
	}

	/* lxc.cap.keep and lxc.cap.drop are mutually exclusive. */
	if (!lxc_list_empty(&lxc_conf->keepcaps)) {
		if (!lxc_list_empty(&lxc_conf->caps)) {
			ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
			return -1;
		}
		if (dropcaps_except(&lxc_conf->keepcaps)) {
			ERROR("failed to keep requested caps");
			return -1;
		}
	} else if (setup_caps(&lxc_conf->caps)) {
		ERROR("failed to drop capabilities");
		return -1;
	}

	NOTICE("'%s' is setup.", name);

	return 0;
}
4300
4301 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4302 const char *lxcpath, char *argv[])
4303 {
4304 int which = -1;
4305 struct lxc_list *it;
4306
4307 if (strcmp(hook, "pre-start") == 0)
4308 which = LXCHOOK_PRESTART;
4309 else if (strcmp(hook, "pre-mount") == 0)
4310 which = LXCHOOK_PREMOUNT;
4311 else if (strcmp(hook, "mount") == 0)
4312 which = LXCHOOK_MOUNT;
4313 else if (strcmp(hook, "autodev") == 0)
4314 which = LXCHOOK_AUTODEV;
4315 else if (strcmp(hook, "start") == 0)
4316 which = LXCHOOK_START;
4317 else if (strcmp(hook, "stop") == 0)
4318 which = LXCHOOK_STOP;
4319 else if (strcmp(hook, "post-stop") == 0)
4320 which = LXCHOOK_POSTSTOP;
4321 else if (strcmp(hook, "clone") == 0)
4322 which = LXCHOOK_CLONE;
4323 else if (strcmp(hook, "destroy") == 0)
4324 which = LXCHOOK_DESTROY;
4325 else
4326 return -1;
4327 lxc_list_for_each(it, &conf->hooks[which]) {
4328 int ret;
4329 char *hookname = it->elem;
4330 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4331 if (ret)
4332 return ret;
4333 }
4334 return 0;
4335 }
4336
4337 static void lxc_remove_nic(struct lxc_list *it)
4338 {
4339 struct lxc_netdev *netdev = it->elem;
4340 struct lxc_list *it2,*next;
4341
4342 lxc_list_del(it);
4343
4344 free(netdev->link);
4345 free(netdev->name);
4346 if (netdev->type == LXC_NET_VETH)
4347 free(netdev->priv.veth_attr.pair);
4348 free(netdev->upscript);
4349 free(netdev->hwaddr);
4350 free(netdev->mtu);
4351 free(netdev->ipv4_gateway);
4352 free(netdev->ipv6_gateway);
4353 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
4354 lxc_list_del(it2);
4355 free(it2->elem);
4356 free(it2);
4357 }
4358 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
4359 lxc_list_del(it2);
4360 free(it2->elem);
4361 free(it2);
4362 }
4363 free(netdev);
4364 free(it);
4365 }
4366
4367 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
4368 int lxc_clear_nic(struct lxc_conf *c, const char *key)
4369 {
4370 char *p1;
4371 int ret, idx, i;
4372 struct lxc_list *it;
4373 struct lxc_netdev *netdev;
4374
4375 p1 = strchr(key, '.');
4376 if (!p1 || *(p1+1) == '\0')
4377 p1 = NULL;
4378
4379 ret = sscanf(key, "%d", &idx);
4380 if (ret != 1) return -1;
4381 if (idx < 0)
4382 return -1;
4383
4384 i = 0;
4385 lxc_list_for_each(it, &c->network) {
4386 if (i == idx)
4387 break;
4388 i++;
4389 }
4390 if (i < idx) // we don't have that many nics defined
4391 return -1;
4392
4393 if (!it || !it->elem)
4394 return -1;
4395
4396 netdev = it->elem;
4397
4398 if (!p1) {
4399 lxc_remove_nic(it);
4400 } else if (strcmp(p1, ".ipv4") == 0) {
4401 struct lxc_list *it2,*next;
4402 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
4403 lxc_list_del(it2);
4404 free(it2->elem);
4405 free(it2);
4406 }
4407 } else if (strcmp(p1, ".ipv6") == 0) {
4408 struct lxc_list *it2,*next;
4409 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
4410 lxc_list_del(it2);
4411 free(it2->elem);
4412 free(it2);
4413 }
4414 }
4415 else return -1;
4416
4417 return 0;
4418 }
4419
4420 int lxc_clear_config_network(struct lxc_conf *c)
4421 {
4422 struct lxc_list *it,*next;
4423 lxc_list_for_each_safe(it, &c->network, next) {
4424 lxc_remove_nic(it);
4425 }
4426 return 0;
4427 }
4428
4429 int lxc_clear_config_caps(struct lxc_conf *c)
4430 {
4431 struct lxc_list *it,*next;
4432
4433 lxc_list_for_each_safe(it, &c->caps, next) {
4434 lxc_list_del(it);
4435 free(it->elem);
4436 free(it);
4437 }
4438 return 0;
4439 }
4440
4441 static int lxc_free_idmap(struct lxc_list *id_map) {
4442 struct lxc_list *it, *next;
4443
4444 lxc_list_for_each_safe(it, id_map, next) {
4445 lxc_list_del(it);
4446 free(it->elem);
4447 free(it);
4448 }
4449 return 0;
4450 }
4451
4452 int lxc_clear_idmaps(struct lxc_conf *c)
4453 {
4454 return lxc_free_idmap(&c->id_map);
4455 }
4456
4457 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4458 {
4459 struct lxc_list *it,*next;
4460
4461 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4462 lxc_list_del(it);
4463 free(it->elem);
4464 free(it);
4465 }
4466 return 0;
4467 }
4468
4469 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4470 {
4471 struct lxc_list *it,*next;
4472 bool all = false;
4473 const char *k = NULL;
4474
4475 if (strcmp(key, "lxc.cgroup") == 0)
4476 all = true;
4477 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4478 k = key + sizeof("lxc.cgroup.")-1;
4479 else
4480 return -1;
4481
4482 lxc_list_for_each_safe(it, &c->cgroup, next) {
4483 struct lxc_cgroup *cg = it->elem;
4484 if (!all && strcmp(cg->subsystem, k) != 0)
4485 continue;
4486 lxc_list_del(it);
4487 free(cg->subsystem);
4488 free(cg->value);
4489 free(cg);
4490 free(it);
4491 }
4492 return 0;
4493 }
4494
4495 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4496 {
4497 struct lxc_list *it, *next;
4498 bool all = false;
4499 const char *k = NULL;
4500
4501 if (strcmp(key, "lxc.limit") == 0)
4502 all = true;
4503 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4504 k = key + sizeof("lxc.limit.")-1;
4505 else
4506 return -1;
4507
4508 lxc_list_for_each_safe(it, &c->limits, next) {
4509 struct lxc_limit *lim = it->elem;
4510 if (!all && strcmp(lim->resource, k) != 0)
4511 continue;
4512 lxc_list_del(it);
4513 free(lim->resource);
4514 free(lim);
4515 free(it);
4516 }
4517 return 0;
4518 }
4519
4520 int lxc_clear_groups(struct lxc_conf *c)
4521 {
4522 struct lxc_list *it,*next;
4523
4524 lxc_list_for_each_safe(it, &c->groups, next) {
4525 lxc_list_del(it);
4526 free(it->elem);
4527 free(it);
4528 }
4529 return 0;
4530 }
4531
4532 int lxc_clear_environment(struct lxc_conf *c)
4533 {
4534 struct lxc_list *it,*next;
4535
4536 lxc_list_for_each_safe(it, &c->environment, next) {
4537 lxc_list_del(it);
4538 free(it->elem);
4539 free(it);
4540 }
4541 return 0;
4542 }
4543
4544
4545 int lxc_clear_mount_entries(struct lxc_conf *c)
4546 {
4547 struct lxc_list *it,*next;
4548
4549 lxc_list_for_each_safe(it, &c->mount_list, next) {
4550 lxc_list_del(it);
4551 free(it->elem);
4552 free(it);
4553 }
4554 return 0;
4555 }
4556
/* Reset the automatic mount flags in c->auto_mounts to 0. Always returns 0. */
int lxc_clear_automounts(struct lxc_conf *c)
{
	c->auto_mounts = 0;
	return 0;
}
4562
4563 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4564 {
4565 struct lxc_list *it,*next;
4566 bool all = false, done = false;
4567 const char *k = NULL;
4568 int i;
4569
4570 if (strcmp(key, "lxc.hook") == 0)
4571 all = true;
4572 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4573 k = key + sizeof("lxc.hook.")-1;
4574 else
4575 return -1;
4576
4577 for (i=0; i<NUM_LXC_HOOKS; i++) {
4578 if (all || strcmp(k, lxchook_names[i]) == 0) {
4579 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4580 lxc_list_del(it);
4581 free(it->elem);
4582 free(it);
4583 }
4584 done = true;
4585 }
4586 }
4587
4588 if (!done) {
4589 ERROR("Invalid hook key: %s", key);
4590 return -1;
4591 }
4592 return 0;
4593 }
4594
4595 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4596 {
4597 int i;
4598
4599 if (!conf->saved_nics)
4600 return;
4601 for (i=0; i < conf->num_savednics; i++)
4602 free(conf->saved_nics[i].orig_name);
4603 free(conf->saved_nics);
4604 }
4605
4606 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4607 {
4608 struct lxc_list *it,*next;
4609
4610 lxc_list_for_each_safe(it, &conf->aliens, next) {
4611 lxc_list_del(it);
4612 free(it->elem);
4613 free(it);
4614 }
4615 }
4616
4617 void lxc_clear_includes(struct lxc_conf *conf)
4618 {
4619 struct lxc_list *it,*next;
4620
4621 lxc_list_for_each_safe(it, &conf->includes, next) {
4622 lxc_list_del(it);
4623 free(it->elem);
4624 free(it);
4625 }
4626 }
4627
4628 void lxc_conf_free(struct lxc_conf *conf)
4629 {
4630 if (!conf)
4631 return;
4632 if (current_config == conf)
4633 current_config = NULL;
4634 free(conf->console.log_path);
4635 free(conf->console.path);
4636 free(conf->rootfs.mount);
4637 free(conf->rootfs.bdev_type);
4638 free(conf->rootfs.options);
4639 free(conf->rootfs.path);
4640 free(conf->logfile);
4641 if (conf->logfd != -1)
4642 close(conf->logfd);
4643 free(conf->utsname);
4644 free(conf->ttydir);
4645 free(conf->fstab);
4646 free(conf->rcfile);
4647 free(conf->init_cmd);
4648 free(conf->unexpanded_config);
4649 free(conf->pty_names);
4650 free(conf->syslog);
4651 lxc_clear_config_network(conf);
4652 free(conf->lsm_aa_profile);
4653 free(conf->lsm_se_context);
4654 lxc_seccomp_free(conf);
4655 lxc_clear_config_caps(conf);
4656 lxc_clear_config_keepcaps(conf);
4657 lxc_clear_cgroups(conf, "lxc.cgroup");
4658 lxc_clear_hooks(conf, "lxc.hook");
4659 lxc_clear_mount_entries(conf);
4660 lxc_clear_saved_nics(conf);
4661 lxc_clear_idmaps(conf);
4662 lxc_clear_groups(conf);
4663 lxc_clear_includes(conf);
4664 lxc_clear_aliens(conf);
4665 lxc_clear_environment(conf);
4666 lxc_clear_limits(conf, "lxc.limit");
4667 free(conf);
4668 }
4669
/* Argument bundle consumed by run_userns_fn(). */
struct userns_fn_data {
	/* Function that run_userns_fn() calls once it has been released. */
	int (*fn)(void *);
	/* Optional name of fn, used only for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to fn. */
	void *arg;
	/* Pipe used for synchronization: run_userns_fn() closes p[1] and
	 * reads one byte from p[0] before calling fn.
	 */
	int p[2];
};
4676
4677 static int run_userns_fn(void *data)
4678 {
4679 struct userns_fn_data *d = data;
4680 char c;
4681
4682 /* Close write end of the pipe. */
4683 close(d->p[1]);
4684
4685 /* Wait for parent to finish establishing a new mapping in the user
4686 * namespace we are executing in.
4687 */
4688 if (read(d->p[0], &c, 1) != 1)
4689 return -1;
4690
4691 /* Close read end of the pipe. */
4692 close(d->p[0]);
4693
4694 if (d->fn_name)
4695 TRACE("calling function \"%s\"", d->fn_name);
4696 /* Call function to run. */
4697 return d->fn(d->arg);
4698 }
4699
4700 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4701 enum idtype idtype)
4702 {
4703 struct lxc_list *it;
4704 struct id_map *map;
4705 struct id_map *retmap = NULL;
4706
4707 lxc_list_for_each(it, &conf->id_map) {
4708 map = it->elem;
4709 if (map->idtype != idtype)
4710 continue;
4711
4712 if (id >= map->hostid && id < map->hostid + map->range) {
4713 retmap = map;
4714 break;
4715 }
4716 }
4717
4718 if (!retmap)
4719 return NULL;
4720
4721 retmap = malloc(sizeof(*retmap));
4722 if (!retmap)
4723 return NULL;
4724
4725 memcpy(retmap, map, sizeof(*retmap));
4726 return retmap;
4727 }
4728
4729 /*
4730 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4731 * existing one or establish a new one.
4732 */
4733 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4734 {
4735 int hostid_mapped;
4736 struct id_map *entry = NULL;
4737
4738 /* Reuse existing mapping. */
4739 entry = mapped_hostid_entry(conf, id, type);
4740 if (entry)
4741 return entry;
4742
4743 /* Find new mapping. */
4744 hostid_mapped = find_unmapped_nsid(conf, type);
4745 if (hostid_mapped < 0) {
4746 DEBUG("failed to find free mapping for id %d", id);
4747 return NULL;
4748 }
4749
4750 entry = malloc(sizeof(*entry));
4751 if (!entry)
4752 return NULL;
4753
4754 entry->idtype = type;
4755 entry->nsid = hostid_mapped;
4756 entry->hostid = (unsigned long)id;
4757 entry->range = 1;
4758
4759 return entry;
4760 }
4761
4762 /* Run a function in a new user namespace.
4763 * The caller's euid/egid will be mapped if it is not already.
4764 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4765 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4766 * This means we require only to establish a mapping from:
4767 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4768 * - the container root -> some sub{g,u}id
4769 * The former we add, if the user did not specifiy a mapping. The latter we
4770 * retrieve from the ontainer's configured {g,u}id mappings as it must have been
4771 * there to start the container in the first place.
4772 */
4773 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4774 const char *fn_name)
4775 {
4776 pid_t pid;
4777 uid_t euid, egid;
4778 struct userns_fn_data d;
4779 int p[2];
4780 struct lxc_list *it;
4781 struct id_map *map;
4782 char c = '1';
4783 int ret = -1;
4784 struct lxc_list *idmap = NULL, *tmplist = NULL;
4785 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4786 *host_uid_map = NULL, *host_gid_map = NULL;
4787
4788 ret = pipe(p);
4789 if (ret < 0) {
4790 SYSERROR("opening pipe");
4791 return -1;
4792 }
4793 d.fn = fn;
4794 d.fn_name = fn_name;
4795 d.arg = data;
4796 d.p[0] = p[0];
4797 d.p[1] = p[1];
4798
4799 /* Clone child in new user namespace. */
4800 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4801 if (pid < 0) {
4802 ERROR("failed to clone child process in new user namespace");
4803 goto on_error;
4804 }
4805
4806 close(p[0]);
4807 p[0] = -1;
4808
4809 /* Find container root. */
4810 lxc_list_for_each(it, &conf->id_map) {
4811 map = it->elem;
4812
4813 if (map->nsid != 0)
4814 continue;
4815
4816 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4817 container_root_uid = malloc(sizeof(*container_root_uid));
4818 if (!container_root_uid)
4819 goto on_error;
4820 container_root_uid->idtype = map->idtype;
4821 container_root_uid->hostid = map->hostid;
4822 container_root_uid->nsid = 0;
4823 container_root_uid->range = map->range;
4824 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4825 container_root_gid = malloc(sizeof(*container_root_gid));
4826 if (!container_root_gid)
4827 goto on_error;
4828 container_root_gid->idtype = map->idtype;
4829 container_root_gid->hostid = map->hostid;
4830 container_root_gid->nsid = 0;
4831 container_root_gid->range = map->range;
4832 }
4833
4834 /* Found container root. */
4835 if (container_root_uid && container_root_gid)
4836 break;
4837 }
4838
4839 /* This is actually checked earlier but it can't hurt. */
4840 if (!container_root_uid || !container_root_gid) {
4841 ERROR("no mapping for container root found");
4842 goto on_error;
4843 }
4844
4845 host_uid_map = container_root_uid;
4846 host_gid_map = container_root_gid;
4847
4848 /* Check whether the {g,u}id of the user has a mapping. */
4849 euid = geteuid();
4850 egid = getegid();
4851 if (euid != container_root_uid->hostid)
4852 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4853
4854 if (egid != container_root_gid->hostid)
4855 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4856
4857 if (!host_uid_map) {
4858 DEBUG("failed to find mapping for uid %d", euid);
4859 goto on_error;
4860 }
4861
4862 if (!host_gid_map) {
4863 DEBUG("failed to find mapping for gid %d", egid);
4864 goto on_error;
4865 }
4866
4867 /* Allocate new {g,u}id map list. */
4868 idmap = malloc(sizeof(*idmap));
4869 if (!idmap)
4870 goto on_error;
4871 lxc_list_init(idmap);
4872
4873 /* Add container root to the map. */
4874 tmplist = malloc(sizeof(*tmplist));
4875 if (!tmplist)
4876 goto on_error;
4877 lxc_list_add_elem(tmplist, container_root_uid);
4878 lxc_list_add_tail(idmap, tmplist);
4879
4880 if (host_uid_map && (host_uid_map != container_root_uid)) {
4881 /* idmap will now keep track of that memory. */
4882 container_root_uid = NULL;
4883
4884 /* Add container root to the map. */
4885 tmplist = malloc(sizeof(*tmplist));
4886 if (!tmplist)
4887 goto on_error;
4888 lxc_list_add_elem(tmplist, host_uid_map);
4889 lxc_list_add_tail(idmap, tmplist);
4890 }
4891 /* idmap will now keep track of that memory. */
4892 container_root_uid = NULL;
4893 /* idmap will now keep track of that memory. */
4894 host_uid_map = NULL;
4895
4896 tmplist = malloc(sizeof(*tmplist));
4897 if (!tmplist)
4898 goto on_error;
4899 lxc_list_add_elem(tmplist, container_root_gid);
4900 lxc_list_add_tail(idmap, tmplist);
4901
4902 if (host_gid_map && (host_gid_map != container_root_gid)) {
4903 /* idmap will now keep track of that memory. */
4904 container_root_gid = NULL;
4905
4906 tmplist = malloc(sizeof(*tmplist));
4907 if (!tmplist)
4908 goto on_error;
4909 lxc_list_add_elem(tmplist, host_gid_map);
4910 lxc_list_add_tail(idmap, tmplist);
4911 }
4912 /* idmap will now keep track of that memory. */
4913 container_root_gid = NULL;
4914 /* idmap will now keep track of that memory. */
4915 host_gid_map = NULL;
4916
4917 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE ||
4918 conf->loglevel == LXC_LOG_PRIORITY_TRACE) {
4919 lxc_list_for_each(it, idmap) {
4920 map = it->elem;
4921 TRACE("establishing %cid mapping for \"%d\" in new "
4922 "user namespace: nsuid %lu - hostid %lu - range "
4923 "%lu",
4924 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4925 map->nsid, map->hostid, map->range);
4926 }
4927 }
4928
4929 /* Set up {g,u}id mapping for user namespace of child process. */
4930 ret = lxc_map_ids(idmap, pid);
4931 if (ret < 0) {
4932 ERROR("error setting up {g,u}id mappings for child process "
4933 "\"%d\"",
4934 pid);
4935 goto on_error;
4936 }
4937
4938 /* Tell child to proceed. */
4939 if (write(p[1], &c, 1) != 1) {
4940 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4941 goto on_error;
4942 }
4943
4944 /* Wait for child to finish. */
4945 ret = wait_for_pid(pid);
4946
4947 on_error:
4948 if (idmap)
4949 lxc_free_idmap(idmap);
4950 if (container_root_uid)
4951 free(container_root_uid);
4952 if (container_root_gid)
4953 free(container_root_gid);
4954 if (host_uid_map && (host_uid_map != container_root_uid))
4955 free(host_uid_map);
4956 if (host_gid_map && (host_gid_map != container_root_gid))
4957 free(host_gid_map);
4958
4959 if (p[0] != -1)
4960 close(p[0]);
4961 close(p[1]);
4962
4963 return ret;
4964 }
4965
/* Return a newly allocated copy of the name that getpwuid() reports for the
 * effective uid, or NULL if the lookup or the allocation fails.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4977
/* Return a newly allocated copy of the name that getgrgid() reports for the
 * effective gid, or NULL if the lookup or the allocation fails.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4989
4990 /* not thread-safe, do not use from api without first forking */
4991 void suggest_default_idmap(void)
4992 {
4993 FILE *f;
4994 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4995 char *line = NULL;
4996 char *uname, *gname;
4997 size_t len = 0;
4998
4999 if (!(uname = getuname()))
5000 return;
5001
5002 if (!(gname = getgname())) {
5003 free(uname);
5004 return;
5005 }
5006
5007 f = fopen(subuidfile, "r");
5008 if (!f) {
5009 ERROR("Your system is not configured with subuids");
5010 free(gname);
5011 free(uname);
5012 return;
5013 }
5014 while (getline(&line, &len, f) != -1) {
5015 size_t no_newline = 0;
5016 char *p = strchr(line, ':'), *p2;
5017 if (*line == '#')
5018 continue;
5019 if (!p)
5020 continue;
5021 *p = '\0';
5022 p++;
5023 if (strcmp(line, uname))
5024 continue;
5025 p2 = strchr(p, ':');
5026 if (!p2)
5027 continue;
5028 *p2 = '\0';
5029 p2++;
5030 if (!*p2)
5031 continue;
5032 no_newline = strcspn(p2, "\n");
5033 p2[no_newline] = '\0';
5034
5035 if (lxc_safe_uint(p, &uid) < 0)
5036 WARN("Could not parse UID.");
5037 if (lxc_safe_uint(p2, &urange) < 0)
5038 WARN("Could not parse UID range.");
5039 }
5040 fclose(f);
5041
5042 f = fopen(subgidfile, "r");
5043 if (!f) {
5044 ERROR("Your system is not configured with subgids");
5045 free(gname);
5046 free(uname);
5047 return;
5048 }
5049 while (getline(&line, &len, f) != -1) {
5050 size_t no_newline = 0;
5051 char *p = strchr(line, ':'), *p2;
5052 if (*line == '#')
5053 continue;
5054 if (!p)
5055 continue;
5056 *p = '\0';
5057 p++;
5058 if (strcmp(line, uname))
5059 continue;
5060 p2 = strchr(p, ':');
5061 if (!p2)
5062 continue;
5063 *p2 = '\0';
5064 p2++;
5065 if (!*p2)
5066 continue;
5067 no_newline = strcspn(p2, "\n");
5068 p2[no_newline] = '\0';
5069
5070 if (lxc_safe_uint(p, &gid) < 0)
5071 WARN("Could not parse GID.");
5072 if (lxc_safe_uint(p2, &grange) < 0)
5073 WARN("Could not parse GID range.");
5074 }
5075 fclose(f);
5076
5077 free(line);
5078
5079 if (!urange || !grange) {
5080 ERROR("You do not have subuids or subgids allocated");
5081 ERROR("Unprivileged containers require subuids and subgids");
5082 return;
5083 }
5084
5085 ERROR("You must either run as root, or define uid mappings");
5086 ERROR("To pass uid mappings to lxc-create, you could create");
5087 ERROR("~/.config/lxc/default.conf:");
5088 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
5089 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
5090 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
5091
5092 free(gname);
5093 free(uname);
5094 }
5095
5096 static void free_cgroup_settings(struct lxc_list *result)
5097 {
5098 struct lxc_list *iterator, *next;
5099
5100 lxc_list_for_each_safe(iterator, result, next) {
5101 lxc_list_del(iterator);
5102 free(iterator);
5103 }
5104 free(result);
5105 }
5106
5107 /*
5108 * Return the list of cgroup_settings sorted according to the following rules
5109 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5110 */
5111 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
5112 {
5113 struct lxc_list *result;
5114 struct lxc_list *memsw_limit = NULL;
5115 struct lxc_list *it = NULL;
5116 struct lxc_cgroup *cg = NULL;
5117 struct lxc_list *item = NULL;
5118
5119 result = malloc(sizeof(*result));
5120 if (!result) {
5121 ERROR("failed to allocate memory to sort cgroup settings");
5122 return NULL;
5123 }
5124 lxc_list_init(result);
5125
5126 /*Iterate over the cgroup settings and copy them to the output list*/
5127 lxc_list_for_each(it, cgroup_settings) {
5128 item = malloc(sizeof(*item));
5129 if (!item) {
5130 ERROR("failed to allocate memory to sort cgroup settings");
5131 free_cgroup_settings(result);
5132 return NULL;
5133 }
5134 item->elem = it->elem;
5135 cg = it->elem;
5136 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
5137 /* Store the memsw_limit location */
5138 memsw_limit = item;
5139 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5140 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5141 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5142 item->elem = memsw_limit->elem;
5143 memsw_limit->elem = it->elem;
5144 }
5145 lxc_list_add_tail(result, item);
5146 }
5147
5148 return result;
5149 }