]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
Merge pull request #1723 from brauner/2017-07-31/remove_utmp_watch
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #define _GNU_SOURCE
25 #include "config.h"
26
27 #include <dirent.h>
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <grp.h>
31 #include <inttypes.h>
32 #include <libgen.h>
33 #include <pwd.h>
34 #include <stdarg.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <time.h>
39 #include <unistd.h>
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
42 #include <net/if.h>
43 #include <netinet/in.h>
44 #include <sys/mman.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
54 #include <sys/wait.h>
55
56 /* makedev() */
57 #ifdef MAJOR_IN_MKDEV
58 # include <sys/mkdev.h>
59 #endif
60
61 #ifdef HAVE_STATVFS
62 #include <sys/statvfs.h>
63 #endif
64
65 #if HAVE_PTY_H
66 #include <pty.h>
67 #else
68 #include <../include/openpty.h>
69 #endif
70
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
73 #endif
74
75 #include "af_unix.h"
76 #include "bdev.h"
77 #include "caps.h" /* for lxc_caps_last_cap() */
78 #include "cgroup.h"
79 #include "conf.h"
80 #include "confile_utils.h"
81 #include "error.h"
82 #include "log.h"
83 #include "lxcaufs.h"
84 #include "lxclock.h"
85 #include "lxcoverlay.h"
86 #include "lxcseccomp.h"
87 #include "namespace.h"
88 #include "network.h"
89 #include "parse.h"
90 #include "utils.h"
91 #include "lsm/lsm.h"
92
93 #if HAVE_LIBCAP
94 #include <sys/capability.h>
95 #endif
96
97 #if HAVE_SYS_PERSONALITY_H
98 #include <sys/personality.h>
99 #endif
100
101 #if IS_BIONIC
102 #include <../include/lxcmntent.h>
103 #ifndef HAVE_PRLIMIT
104 #include <../include/prlimit.h>
105 #endif
106 #else
107 #include <mntent.h>
108 #endif
109
110 lxc_log_define(lxc_conf, lxc);
111
112 #if HAVE_LIBCAP
113 #ifndef CAP_SETFCAP
114 #define CAP_SETFCAP 31
115 #endif
116
117 #ifndef CAP_MAC_OVERRIDE
118 #define CAP_MAC_OVERRIDE 32
119 #endif
120
121 #ifndef CAP_MAC_ADMIN
122 #define CAP_MAC_ADMIN 33
123 #endif
124 #endif
125
126 #ifndef PR_CAPBSET_DROP
127 #define PR_CAPBSET_DROP 24
128 #endif
129
130 #ifndef LO_FLAGS_AUTOCLEAR
131 #define LO_FLAGS_AUTOCLEAR 4
132 #endif
133
134 #ifndef CAP_SETUID
135 #define CAP_SETUID 7
136 #endif
137
138 #ifndef CAP_SETGID
139 #define CAP_SETGID 6
140 #endif
141
142 /* needed for cgroup automount checks, regardless of whether we
143 * have included linux/capability.h or not */
144 #ifndef CAP_SYS_ADMIN
145 #define CAP_SYS_ADMIN 21
146 #endif
147
/*
 * pivot_root(): use the C library's version when the build system found one,
 * otherwise fall back to a raw syscall wrapper. If the kernel headers do not
 * even know the syscall number, the wrapper fails with ENOSYS.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char * new_root, const char * put_old);
#else
static int pivot_root(const char * new_root, const char * put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
162
/*
 * sethostname(): only defined here when the C library lacks it. The fallback
 * issues the raw syscall, or fails with ENOSYS when the syscall number is
 * unknown at build time.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char * name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
175
176 #ifndef MS_PRIVATE
177 #define MS_PRIVATE (1<<18)
178 #endif
179
180 #ifndef MS_LAZYTIME
181 #define MS_LAZYTIME (1<<25)
182 #endif
183
184 /* memfd_create() */
185 #ifndef MFD_CLOEXEC
186 #define MFD_CLOEXEC 0x0001U
187 #endif
188
189 #ifndef MFD_ALLOW_SEALING
190 #define MFD_ALLOW_SEALING 0x0002U
191 #endif
192
#ifdef HAVE_MEMFD_CREATE
/* The C library provides memfd_create(); only a declaration is needed. */
extern int memfd_create(const char *name, unsigned int flags);
#else
/*
 * No memfd_create() in the C library. If the kernel headers do not supply
 * the syscall number either, pick it by architecture.
 */
#ifndef __NR_memfd_create
	#if defined __i386__
		#define __NR_memfd_create 356
	#elif defined __x86_64__
		#define __NR_memfd_create 319
	#elif defined __arm__
		#define __NR_memfd_create 385
	#elif defined __aarch64__
		#define __NR_memfd_create 279
	#elif defined __s390__
		#define __NR_memfd_create 350
	#elif defined __powerpc__
		#define __NR_memfd_create 360
	#elif defined __sparc__
		#define __NR_memfd_create 348
	#elif defined __blackfin__
		#define __NR_memfd_create 390
	#elif defined __ia64__
		#define __NR_memfd_create 1340
	#elif defined _MIPS_SIM
		#if _MIPS_SIM == _MIPS_SIM_ABI32
			#define __NR_memfd_create 4354
		#endif
		#if _MIPS_SIM == _MIPS_SIM_NABI32
			#define __NR_memfd_create 6318
		#endif
		#if _MIPS_SIM == _MIPS_SIM_ABI64
			#define __NR_memfd_create 5314
		#endif
	#endif
#endif

/*
 * Raw syscall wrapper; fails with ENOSYS when no syscall number could be
 * determined for this architecture.
 */
static int memfd_create(const char *name, unsigned int flags)
{
#ifndef __NR_memfd_create
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_memfd_create, name, flags);
#endif
}
#endif
236
/*
 * Textual names of the container hooks, NUM_LXC_HOOKS entries long.
 * NOTE(review): presumably indexed by the hook enum declared next to
 * NUM_LXC_HOOKS, so the order here must match it - confirm against conf.h.
 */
char *lxchook_names[NUM_LXC_HOOKS] = {
	"pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
239
/*
 * Signature shared by the per-network-type callbacks stored in the
 * netdev_conf[] and netdev_deconf[] tables in this file.
 */
typedef int (*instantiate_cb)(struct lxc_handler *, struct lxc_netdev *);
241
/* One entry of the mount option table: option keyword and its MS_* flag. */
struct mount_opt {
	char *name;	/* option keyword, e.g. "ro"; NULL in the table terminator */
	int clear;	/* presumably non-zero when the keyword clears `flag`
			 * instead of setting it - the parser is not in view */
	int flag;	/* MS_* bit(s) the keyword refers to */
};
247
/* Maps a capability keyword to its CAP_* number. */
struct caps_opt {
	char *name;	/* lower-case keyword, e.g. "sys_admin" */
	int value;	/* matching CAP_* constant */
};
252
/* Maps a resource limit keyword to its RLIMIT_* number. */
struct limit_opt {
	char *name;	/* lower-case keyword, e.g. "nofile" */
	int value;	/* matching RLIMIT_* constant */
};
257
/*
 * The lxc_conf of the container currently being worked on in an API call.
 * It is used by the error reporting calls. When the toolchain supports
 * thread-local storage (HAVE_TLS) each thread gets its own pointer;
 * otherwise a single pointer is shared by the whole process.
 */
#ifdef HAVE_TLS
__thread struct lxc_conf *current_config;
#else
struct lxc_conf *current_config;
#endif
268
269 /* Declare this here, since we don't want to reshuffle the whole file. */
270 static int in_caplist(int cap, struct lxc_list *caps);
271
/* Per-network-type setup callbacks; defined further down in this file. */
static int instantiate_veth(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_vlan(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_phys(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_empty(struct lxc_handler *, struct lxc_netdev *);
static int instantiate_none(struct lxc_handler *, struct lxc_netdev *);

/*
 * Dispatch table from LXC_NET_* type to its setup callback. Slots that are
 * not named below are left NULL by the designated initializer.
 */
static instantiate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instantiate_veth,
	[LXC_NET_MACVLAN] = instantiate_macvlan,
	[LXC_NET_VLAN] = instantiate_vlan,
	[LXC_NET_PHYS] = instantiate_phys,
	[LXC_NET_EMPTY] = instantiate_empty,
	[LXC_NET_NONE] = instantiate_none,
};
287
/* Per-network-type teardown callbacks; defined further down in this file. */
static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);

/*
 * Dispatch table from LXC_NET_* type to its teardown callback. It reuses the
 * instantiate_cb type because the signature is the same.
 */
static instantiate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
303
/*
 * Known mount option keywords as { name, clear, flag } triples, terminated
 * by an entry whose name is NULL. Keywords come in pairs that share a flag
 * (e.g. "ro"/"rw" for MS_RDONLY) and differ only in the `clear` column.
 */
static struct mount_opt mount_opt[] = {
	{ "async", 1, MS_SYNCHRONOUS },
	{ "atime", 1, MS_NOATIME },
	{ "bind", 0, MS_BIND },
	{ "defaults", 0, 0 },
	{ "dev", 1, MS_NODEV },
	{ "diratime", 1, MS_NODIRATIME },
	{ "dirsync", 0, MS_DIRSYNC },
	{ "exec", 1, MS_NOEXEC },
	{ "lazytime", 0, MS_LAZYTIME },
	{ "mand", 0, MS_MANDLOCK },
	{ "noatime", 0, MS_NOATIME },
	{ "nodev", 0, MS_NODEV },
	{ "nodiratime", 0, MS_NODIRATIME },
	{ "noexec", 0, MS_NOEXEC },
	{ "nomand", 1, MS_MANDLOCK },
	{ "norelatime", 1, MS_RELATIME },
	{ "nostrictatime", 1, MS_STRICTATIME },
	{ "nosuid", 0, MS_NOSUID },
	{ "rbind", 0, MS_BIND|MS_REC },
	{ "relatime", 0, MS_RELATIME },
	{ "remount", 0, MS_REMOUNT },
	{ "ro", 0, MS_RDONLY },
	{ "rw", 1, MS_RDONLY },
	{ "strictatime", 0, MS_STRICTATIME },
	{ "suid", 1, MS_NOSUID },
	{ "sync", 0, MS_SYNCHRONOUS },
	{ NULL, 0, 0 },	/* terminator */
};
333
/*
 * Capability keyword table. It is only populated when built with libcap;
 * without it the table is empty. Capabilities that newer kernels added are
 * listed only if the headers define them.
 */
#if HAVE_LIBCAP
static struct caps_opt caps_opt[] = {
	{ "chown", CAP_CHOWN },
	{ "dac_override", CAP_DAC_OVERRIDE },
	{ "dac_read_search", CAP_DAC_READ_SEARCH },
	{ "fowner", CAP_FOWNER },
	{ "fsetid", CAP_FSETID },
	{ "kill", CAP_KILL },
	{ "setgid", CAP_SETGID },
	{ "setuid", CAP_SETUID },
	{ "setpcap", CAP_SETPCAP },
	{ "linux_immutable", CAP_LINUX_IMMUTABLE },
	{ "net_bind_service", CAP_NET_BIND_SERVICE },
	{ "net_broadcast", CAP_NET_BROADCAST },
	{ "net_admin", CAP_NET_ADMIN },
	{ "net_raw", CAP_NET_RAW },
	{ "ipc_lock", CAP_IPC_LOCK },
	{ "ipc_owner", CAP_IPC_OWNER },
	{ "sys_module", CAP_SYS_MODULE },
	{ "sys_rawio", CAP_SYS_RAWIO },
	{ "sys_chroot", CAP_SYS_CHROOT },
	{ "sys_ptrace", CAP_SYS_PTRACE },
	{ "sys_pacct", CAP_SYS_PACCT },
	{ "sys_admin", CAP_SYS_ADMIN },
	{ "sys_boot", CAP_SYS_BOOT },
	{ "sys_nice", CAP_SYS_NICE },
	{ "sys_resource", CAP_SYS_RESOURCE },
	{ "sys_time", CAP_SYS_TIME },
	{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
	{ "mknod", CAP_MKNOD },
	{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_READ
	{ "audit_read", CAP_AUDIT_READ },
#endif
#ifdef CAP_AUDIT_WRITE
	{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
	{ "audit_control", CAP_AUDIT_CONTROL },
#endif
	{ "setfcap", CAP_SETFCAP },
	{ "mac_override", CAP_MAC_OVERRIDE },
	{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
	{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
	{ "wake_alarm", CAP_WAKE_ALARM },
#endif
#ifdef CAP_BLOCK_SUSPEND
	{ "block_suspend", CAP_BLOCK_SUSPEND },
#endif
};
#else
static struct caps_opt caps_opt[] = {};
#endif
390
/*
 * Resource limit keyword table. Each entry is compiled in only when the
 * platform headers define the corresponding RLIMIT_* constant.
 */
static struct limit_opt limit_opt[] = {
#ifdef RLIMIT_AS
	{ "as", RLIMIT_AS },
#endif
#ifdef RLIMIT_CORE
	{ "core", RLIMIT_CORE },
#endif
#ifdef RLIMIT_CPU
	{ "cpu", RLIMIT_CPU },
#endif
#ifdef RLIMIT_DATA
	{ "data", RLIMIT_DATA },
#endif
#ifdef RLIMIT_FSIZE
	{ "fsize", RLIMIT_FSIZE },
#endif
#ifdef RLIMIT_LOCKS
	{ "locks", RLIMIT_LOCKS },
#endif
#ifdef RLIMIT_MEMLOCK
	{ "memlock", RLIMIT_MEMLOCK },
#endif
#ifdef RLIMIT_MSGQUEUE
	{ "msgqueue", RLIMIT_MSGQUEUE },
#endif
#ifdef RLIMIT_NICE
	{ "nice", RLIMIT_NICE },
#endif
#ifdef RLIMIT_NOFILE
	{ "nofile", RLIMIT_NOFILE },
#endif
#ifdef RLIMIT_NPROC
	{ "nproc", RLIMIT_NPROC },
#endif
#ifdef RLIMIT_RSS
	{ "rss", RLIMIT_RSS },
#endif
#ifdef RLIMIT_RTPRIO
	{ "rtprio", RLIMIT_RTPRIO },
#endif
#ifdef RLIMIT_RTTIME
	{ "rttime", RLIMIT_RTTIME },
#endif
#ifdef RLIMIT_SIGPENDING
	{ "sigpending", RLIMIT_SIGPENDING },
#endif
#ifdef RLIMIT_STACK
	{ "stack", RLIMIT_STACK },
#endif
};
441
442 static int run_buffer(char *buffer)
443 {
444 struct lxc_popen_FILE *f;
445 char *output;
446 int ret;
447
448 f = lxc_popen(buffer);
449 if (!f) {
450 SYSERROR("Failed to popen() %s.", buffer);
451 return -1;
452 }
453
454 output = malloc(LXC_LOG_BUFFER_SIZE);
455 if (!output) {
456 ERROR("Failed to allocate memory for %s.", buffer);
457 lxc_pclose(f);
458 return -1;
459 }
460
461 while (fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
462 DEBUG("Script %s with output: %s.", buffer, output);
463
464 free(output);
465
466 ret = lxc_pclose(f);
467 if (ret == -1) {
468 SYSERROR("Script exited with error.");
469 return -1;
470 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
471 ERROR("Script exited with status %d.", WEXITSTATUS(ret));
472 return -1;
473 } else if (WIFSIGNALED(ret)) {
474 ERROR("Script terminated by signal %d.", WTERMSIG(ret));
475 return -1;
476 }
477
478 return 0;
479 }
480
/*
 * run_script_argv: build the command line
 *     "<script> <name> <section> <hook> [<argsin[0]> <argsin[1]> ...]"
 * and execute it via run_buffer().
 *
 * @name    container name (second word of the command line)
 * @section config section the script belongs to (third word)
 * @script  script to execute (first word)
 * @hook    hook name (fourth word)
 * @lxcpath not used by this function
 * @argsin  optional NULL-terminated list of extra arguments; may be NULL
 *
 * Returns the result of run_buffer(), or -1 if the command line is too long
 * or memory could not be allocated.
 *
 * The command buffer lives on the heap: its size depends on caller-supplied
 * strings and is only bounded by INT_MAX, which is far more than a stack
 * allocation can safely hold.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook,
			   const char *lxcpath, char **argsin)
{
	int ret, i;
	char *buffer;
	size_t size = 0;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* every extra argument is preceded by one space */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* together with the +1 above: three separators and the final '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto on_error;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			goto on_error;
		}
		ret += rc;
	}

	ret = run_buffer(buffer);
	free(buffer);
	return ret;

on_error:
	free(buffer);
	return -1;
}
531
/*
 * run_script: build the command line
 *     "<script> <name> <section> [<arg> ...]"
 * and execute it via run_buffer().
 *
 * @name    container name (second word of the command line)
 * @section config section the script belongs to (third word)
 * @script  script to execute (first word)
 * @...     extra arguments as char pointers; the list MUST end with a NULL
 *          pointer
 *
 * Returns the result of run_buffer(), or -1 if the command line is too long
 * or memory could not be allocated.
 *
 * The command buffer lives on the heap because its size depends on
 * caller-supplied strings, and every va_start() is paired with a va_end()
 * on all paths, including the error return inside the argument loop.
 */
static int run_script(const char *name, const char *section, const char *script,
		      ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
	     script, name, section);

	/* first pass: every extra argument is preceded by one space */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two separators and the final '\0' */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("Failed to allocate memory.");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long.");
		goto on_error;
	}

	/* second pass: append the arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long.");
			va_end(ap);
			goto on_error;
		}
		ret += rc;
	}
	va_end(ap);

	ret = run_buffer(buffer);
	free(buffer);
	return ret;

on_error:
	free(buffer);
	return -1;
}
583
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing for the
 * duration of the container run, to prevent the container from marking the
 * underlying fs readonly on shutdown. The file is unlinked immediately so
 * that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, the path
 * cannot be resolved, or it is not a directory).
 * Returns an open fd (>= 0) if we pinned it; the caller owns the fd.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* reject output errors as well as truncation */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;
	/* the open fd keeps the fs busy; the name itself is not needed */
	(void)unlink(absrootfspin);
	return fd;
}
626
/*
 * For a remount request, carry over the restrictive flags (nosuid, nodev,
 * ro, noexec) that are already in effect on the existing mount, so the
 * remount does not silently drop them.
 *
 * @s     mount source; when NULL, @d is inspected instead
 * @d     mount destination
 * @flags requested mount flags
 *
 * Returns @flags with the required bits added. @flags is returned unchanged
 * when it does not contain MS_REMOUNT, when both paths are NULL, when
 * statvfs() fails, or when built without statvfs() support.
 */
unsigned long add_required_remount_flags(const char *s, const char *d,
					 unsigned long flags)
{
#ifdef HAVE_STATVFS
	static const unsigned long sticky[] = {
		MS_NOSUID, MS_NODEV, MS_RDONLY, MS_NOEXEC,
	};
	struct statvfs sb;
	const char *target;
	size_t i;

	if ((flags & MS_REMOUNT) == 0)
		return flags;

	target = s ? s : d;
	if (target == NULL || statvfs(target, &sb) < 0)
		return flags;

	for (i = 0; i < sizeof(sticky) / sizeof(sticky[0]); i++) {
		if (sb.f_flag & sticky[i])
			flags |= sticky[i];
	}

	return flags;
#else
	return flags;
#endif
}
663
664 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
665 {
666 int r;
667 int i;
668 static struct {
669 int match_mask;
670 int match_flag;
671 const char *source;
672 const char *destination;
673 const char *fstype;
674 unsigned long flags;
675 const char *options;
676 } default_mounts[] = {
677 /* Read-only bind-mounting... In older kernels, doing that required
678 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
679 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
680 * kernel 2.6.26 onwards. However, this apparently does not work on
681 * kernel 3.8. Unfortunately, on that very same kernel, doing the
682 * same trick as above doesn't seem to work either, there one needs
683 * to ALSO specify MS_BIND for the remount, otherwise the entire
684 * fs is remounted read-only or the mount fails because it's busy...
685 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
686 * 2.6.32...
687 */
688 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
689 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
690 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL },
691 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
692 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
693 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL },
694 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
695 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
697 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
698 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
699 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys", "sysfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
700 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys", "%r/sys", NULL, MS_BIND, NULL },
701 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL, MS_BIND, NULL },
704 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED, NULL, "%r/sys/devices/virtual/net", NULL, MS_REMOUNT|MS_BIND|MS_NOSUID|MS_NODEV|MS_NOEXEC, NULL },
705 { 0, 0, NULL, NULL, NULL, 0, NULL }
706 };
707
708 for (i = 0; default_mounts[i].match_mask; i++) {
709 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
710 char *source = NULL;
711 char *destination = NULL;
712 int saved_errno;
713 unsigned long mflags;
714
715 if (default_mounts[i].source) {
716 /* will act like strdup if %r is not present */
717 source = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].source);
718 if (!source) {
719 SYSERROR("memory allocation error");
720 return -1;
721 }
722 }
723 if (!default_mounts[i].destination) {
724 ERROR("BUG: auto mounts destination %d was NULL", i);
725 free(source);
726 return -1;
727 }
728 /* will act like strdup if %r is not present */
729 destination = lxc_string_replace("%r", conf->rootfs.path ? conf->rootfs.mount : "", default_mounts[i].destination);
730 if (!destination) {
731 saved_errno = errno;
732 SYSERROR("memory allocation error");
733 free(source);
734 errno = saved_errno;
735 return -1;
736 }
737 mflags = add_required_remount_flags(source, destination,
738 default_mounts[i].flags);
739 r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL);
740 saved_errno = errno;
741 if (r < 0 && errno == ENOENT) {
742 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination);
743 r = 0;
744 }
745 else if (r < 0)
746 SYSERROR("error mounting %s on %s flags %lu", source, destination, mflags);
747
748 free(source);
749 free(destination);
750 if (r < 0) {
751 errno = saved_errno;
752 return -1;
753 }
754 }
755 }
756
757 if (flags & LXC_AUTO_CGROUP_MASK) {
758 int cg_flags;
759
760 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
761 /* If the type of cgroup mount was not specified, it depends on the
762 * container's capabilities as to what makes sense: if we have
763 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
764 * anyway, so we may as well default to read-write; then the admin
765 * will not be given a false sense of security. (And if they really
766 * want mixed r/o r/w, then they can explicitly specify :mixed.)
767 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
768 * :mixed, because then the container can't remount it read-write. */
769 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
770 int has_sys_admin = 0;
771
772 if (!lxc_list_empty(&conf->keepcaps))
773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
774 else
775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
776
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC)
778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
779 else
780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
781 }
782
783 if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) {
784 SYSERROR("error mounting /sys/fs/cgroup");
785 return -1;
786 }
787 }
788
789 return 0;
790 }
791
/*
 * setup_utsname: set the hostname to utsname->nodename.
 * A NULL @utsname means there is nothing to configure and counts as success.
 * Returns 0 on success, -1 if sethostname() failed.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);
	return 0;
}
806
/* A symlink to create below the container's /dev directory. */
struct dev_symlinks {
	const char *oldpath;	/* link target */
	const char *name;	/* link name, relative to /dev */
};

/* Standard /dev entries that point into /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{"/proc/self/fd", "fd"},
	{"/proc/self/fd/0", "stdin"},
	{"/proc/self/fd/1", "stdout"},
	{"/proc/self/fd/2", "stderr"},
};
818
819 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
820 {
821 char path[MAXPATHLEN];
822 int ret,i;
823 struct stat s;
824
825
826 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
827 const struct dev_symlinks *d = &dev_symlinks[i];
828 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
829 if (ret < 0 || ret >= MAXPATHLEN)
830 return -1;
831
832 /*
833 * Stat the path first. If we don't get an error
834 * accept it as is and don't try to create it
835 */
836 if (!stat(path, &s)) {
837 continue;
838 }
839
840 ret = symlink(d->oldpath, path);
841
842 if (ret && errno != EEXIST) {
843 if ( errno == EROFS ) {
844 WARN("Warning: Read Only file system while creating %s", path);
845 } else {
846 SYSERROR("Error creating %s", path);
847 return -1;
848 }
849 }
850 }
851 return 0;
852 }
853
/*
 * Build a space-separated list of ptys to pass to systemd.
 *
 * The first call (with *pp == NULL) allocates "container_ttys=<name>"; later
 * calls grow the string and append " <name>". Returns false on allocation
 * failure, in which case *pp is left untouched. The caller owns *pp.
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	size_t name_len = strlen(name);
	size_t cur_len;
	char *grown;

	if (*pp == NULL) {
		char *fresh = malloc(name_len + prefix_len + 1);

		if (fresh == NULL)
			return false;

		memcpy(fresh, prefix, prefix_len);
		memcpy(fresh + prefix_len, name, name_len + 1);
		*pp = fresh;
		return true;
	}

	cur_len = strlen(*pp);
	/* room for the separating space and the terminating '\0' */
	grown = realloc(*pp, cur_len + name_len + 2);
	if (grown == NULL)
		return false;

	grown[cur_len] = ' ';
	memcpy(grown + cur_len + 1, name, name_len + 1);
	*pp = grown;
	return true;
}
876
877 static int lxc_setup_tty(struct lxc_conf *conf)
878 {
879 int i, ret;
880 const struct lxc_tty_info *tty_info = &conf->tty_info;
881 char *ttydir = conf->ttydir;
882 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
883
884 if (!conf->rootfs.path)
885 return 0;
886
887 for (i = 0; i < tty_info->nbtty; i++) {
888 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
889
890 ret = snprintf(path, sizeof(path), "/dev/tty%d", i + 1);
891 if (ret < 0 || (size_t)ret >= sizeof(path)) {
892 ERROR("pathname too long for ttys");
893 return -1;
894 }
895
896 if (ttydir) {
897 /* create dev/lxc/tty%d" */
898 ret = snprintf(lxcpath, sizeof(lxcpath),
899 "/dev/%s/tty%d", ttydir, i + 1);
900 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
901 ERROR("pathname too long for ttys");
902 return -1;
903 }
904
905 ret = creat(lxcpath, 0660);
906 if (ret < 0 && errno != EEXIST) {
907 SYSERROR("failed to create \"%s\"", lxcpath);
908 return -1;
909 }
910 if (ret >= 0)
911 close(ret);
912
913 ret = unlink(path);
914 if (ret < 0 && errno != ENOENT) {
915 SYSERROR("failed to unlink \"%s\"", path);
916 return -1;
917 }
918
919 ret = mount(pty_info->name, lxcpath, "none", MS_BIND, 0);
920 if (ret < 0) {
921 WARN("failed to bind mount \"%s\" onto \"%s\"",
922 pty_info->name, path);
923 continue;
924 }
925 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
926 path);
927
928 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d",
929 ttydir, i + 1);
930 if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
931 ERROR("tty pathname too long");
932 return -1;
933 }
934
935 ret = symlink(lxcpath, path);
936 if (ret < 0) {
937 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
938 path, lxcpath);
939 return -1;
940 }
941 } else {
942 /* If we populated /dev, then we need to create
943 * /dev/ttyN
944 */
945 ret = access(path, F_OK);
946 if (ret < 0) {
947 ret = creat(path, 0660);
948 if (ret < 0) {
949 SYSERROR("failed to create \"%s\"", path);
950 /* this isn't fatal, continue */
951 } else {
952 close(ret);
953 }
954 }
955
956 ret = mount(pty_info->name, path, "none", MS_BIND, 0);
957 if (ret < 0) {
958 SYSERROR("failed to mount '%s'->'%s'", pty_info->name, path);
959 continue;
960 }
961
962 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info->name,
963 path);
964 }
965
966 if (!append_ptyname(&conf->pty_names, pty_info->name)) {
967 ERROR("Error setting up container_ttys string");
968 return -1;
969 }
970 }
971
972 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info->nbtty);
973 return 0;
974 }
975
976 static int setup_rootfs_pivot_root(const char *rootfs)
977 {
978 int oldroot = -1, newroot = -1;
979
980 oldroot = open("/", O_DIRECTORY | O_RDONLY);
981 if (oldroot < 0) {
982 SYSERROR("Error opening old-/ for fchdir");
983 return -1;
984 }
985 newroot = open(rootfs, O_DIRECTORY | O_RDONLY);
986 if (newroot < 0) {
987 SYSERROR("Error opening new-/ for fchdir");
988 goto fail;
989 }
990
991 /* change into new root fs */
992 if (fchdir(newroot)) {
993 SYSERROR("can't chdir to new rootfs '%s'", rootfs);
994 goto fail;
995 }
996
997 /* pivot_root into our new root fs */
998 if (pivot_root(".", ".")) {
999 SYSERROR("pivot_root syscall failed");
1000 goto fail;
1001 }
1002
1003 /*
1004 * at this point the old-root is mounted on top of our new-root
1005 * To unmounted it we must not be chdir'd into it, so escape back
1006 * to old-root
1007 */
1008 if (fchdir(oldroot) < 0) {
1009 SYSERROR("Error entering oldroot");
1010 goto fail;
1011 }
1012 if (umount2(".", MNT_DETACH) < 0) {
1013 SYSERROR("Error detaching old root");
1014 goto fail;
1015 }
1016
1017 if (fchdir(newroot) < 0) {
1018 SYSERROR("Error re-entering newroot");
1019 goto fail;
1020 }
1021
1022 close(oldroot);
1023 close(newroot);
1024
1025 DEBUG("pivot_root syscall to '%s' successful", rootfs);
1026
1027 return 0;
1028
1029 fail:
1030 if (oldroot != -1)
1031 close(oldroot);
1032 if (newroot != -1)
1033 close(newroot);
1034 return -1;
1035 }
1036
1037 /*
1038 * Just create a path for /dev under $lxcpath/$name and in rootfs
1039 * If we hit an error, log it but don't fail yet.
1040 */
1041 static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, const char *lxcpath)
1042 {
1043 int ret;
1044 size_t clen;
1045 char *path;
1046
1047 INFO("Mounting container /dev");
1048
1049 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1050 clen = (rootfs->path ? strlen(rootfs->mount) : 0) + 9;
1051 path = alloca(clen);
1052
1053 ret = snprintf(path, clen, "%s/dev", rootfs->path ? rootfs->mount : "");
1054 if (ret < 0 || ret >= clen)
1055 return -1;
1056
1057 if (!dir_exists(path)) {
1058 WARN("No /dev in container.");
1059 WARN("Proceeding without autodev setup");
1060 return 0;
1061 }
1062
1063 ret = safe_mount("none", path, "tmpfs", 0, "size=500000,mode=755",
1064 rootfs->path ? rootfs->mount : NULL);
1065 if (ret != 0) {
1066 SYSERROR("Failed mounting tmpfs onto %s\n", path);
1067 return -1;
1068 }
1069
1070 INFO("Mounted tmpfs onto %s", path);
1071
1072 ret = snprintf(path, clen, "%s/dev/pts", rootfs->path ? rootfs->mount : "");
1073 if (ret < 0 || ret >= clen)
1074 return -1;
1075
1076 /*
1077 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1078 * If not, then create it and exit if that fails...
1079 */
1080 if (!dir_exists(path)) {
1081 ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1082 if (ret) {
1083 SYSERROR("Failed to create /dev/pts in container");
1084 return -1;
1085 }
1086 }
1087
1088 INFO("Mounted container /dev");
1089 return 0;
1090 }
1091
/* Description of one device node that lxc_fill_autodev() creates in /dev. */
struct lxc_devs {
	const char *name;	/* node name below /dev */
	mode_t mode;		/* file type and permission bits handed to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/*
 * Basic character devices every container gets. The execute bits in the
 * modes below are masked out by the umask lxc_fill_autodev() installs while
 * creating the nodes.
 */
static const struct lxc_devs lxc_devs[] = {
	{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
	{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
	{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
	{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
	{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
	{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
};
1107
1108 static int lxc_fill_autodev(const struct lxc_rootfs *rootfs)
1109 {
1110 int ret;
1111 char path[MAXPATHLEN];
1112 int i;
1113 mode_t cmask;
1114
1115 ret = snprintf(path, MAXPATHLEN, "%s/dev", rootfs->path ? rootfs->mount : "");
1116 if (ret < 0 || ret >= MAXPATHLEN) {
1117 ERROR("Error calculating container /dev location");
1118 return -1;
1119 }
1120
1121 /* ignore, just don't try to fill in */
1122 if (!dir_exists(path))
1123 return 0;
1124
1125 INFO("populating container /dev");
1126 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1127 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1128 const struct lxc_devs *d = &lxc_devs[i];
1129
1130 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", rootfs->path ? rootfs->mount : "", d->name);
1131 if (ret < 0 || ret >= MAXPATHLEN)
1132 return -1;
1133
1134 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1135 if (ret < 0) {
1136 char hostpath[MAXPATHLEN];
1137 FILE *pathfile;
1138
1139 if (errno == EEXIST) {
1140 DEBUG("\"%s\" device already existed", path);
1141 continue;
1142 }
1143
1144 /* Unprivileged containers cannot create devices, so
1145 * bind mount the device from the host.
1146 */
1147 ret = snprintf(hostpath, MAXPATHLEN, "/dev/%s", d->name);
1148 if (ret < 0 || ret >= MAXPATHLEN)
1149 return -1;
1150 pathfile = fopen(path, "wb");
1151 if (!pathfile) {
1152 SYSERROR("Failed to create device mount target '%s'", path);
1153 return -1;
1154 }
1155 fclose(pathfile);
1156 if (safe_mount(hostpath, path, 0, MS_BIND, NULL, rootfs->path ? rootfs->mount : NULL) != 0) {
1157 SYSERROR("Failed bind mounting device %s from host into container", d->name);
1158 return -1;
1159 }
1160 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath, path);
1161 } else {
1162 DEBUG("created device node \"%s\"", path);
1163 }
1164 }
1165 umask(cmask);
1166
1167 INFO("populated container /dev");
1168 return 0;
1169 }
1170
1171 static int lxc_setup_rootfs(struct lxc_conf *conf)
1172 {
1173 int ret;
1174 struct bdev *bdev;
1175 const struct lxc_rootfs *rootfs;
1176
1177 rootfs = &conf->rootfs;
1178 if (!rootfs->path) {
1179 if (mount("", "/", NULL, MS_SLAVE | MS_REC, 0)) {
1180 SYSERROR("Failed to make / rslave.");
1181 return -1;
1182 }
1183 return 0;
1184 }
1185
1186 if (access(rootfs->mount, F_OK)) {
1187 SYSERROR("Failed to access to \"%s\". Check it is present.",
1188 rootfs->mount);
1189 return -1;
1190 }
1191
1192 bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1193 if (!bdev) {
1194 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1195 rootfs->path, rootfs->mount,
1196 rootfs->options ? rootfs->options : "(null)");
1197 return -1;
1198 }
1199
1200 ret = bdev->ops->mount(bdev);
1201 bdev_put(bdev);
1202 if (ret < 0) {
1203 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1204 rootfs->path, rootfs->mount,
1205 rootfs->options ? rootfs->options : "(null)");
1206 return -1;
1207 }
1208
1209 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1210 rootfs->path, rootfs->mount,
1211 rootfs->options ? rootfs->options : "(null)");
1212
1213 return 0;
1214 }
1215
1216 int prepare_ramfs_root(char *root)
1217 {
1218 char buf[LXC_LINELEN], *p;
1219 char nroot[PATH_MAX];
1220 FILE *f;
1221 int i;
1222 char *p2;
1223
1224 if (realpath(root, nroot) == NULL)
1225 return -errno;
1226
1227 if (chdir("/") == -1)
1228 return -errno;
1229
1230 /*
1231 * We could use here MS_MOVE, but in userns this mount is
1232 * locked and can't be moved.
1233 */
1234 if (mount(root, "/", NULL, MS_REC | MS_BIND, NULL) < 0) {
1235 SYSERROR("Failed to move %s into /", root);
1236 return -errno;
1237 }
1238
1239 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
1240 SYSERROR("Failed to make . rprivate");
1241 return -errno;
1242 }
1243
1244 /*
1245 * The following code cleans up inhereted mounts which are not
1246 * required for CT.
1247 *
1248 * The mountinfo file shows not all mounts, if a few points have been
1249 * unmounted between read operations from the mountinfo. So we need to
1250 * read mountinfo a few times.
1251 *
1252 * This loop can be skipped if a container uses unserns, because all
1253 * inherited mounts are locked and we should live with all this trash.
1254 */
1255 while (1) {
1256 int progress = 0;
1257
1258 f = fopen("./proc/self/mountinfo", "r");
1259 if (!f) {
1260 SYSERROR("Unable to open /proc/self/mountinfo");
1261 return -1;
1262 }
1263 while (fgets(buf, LXC_LINELEN, f)) {
1264 for (p = buf, i=0; p && i < 4; i++)
1265 p = strchr(p+1, ' ');
1266 if (!p)
1267 continue;
1268 p2 = strchr(p+1, ' ');
1269 if (!p2)
1270 continue;
1271
1272 *p2 = '\0';
1273 *p = '.';
1274
1275 if (strcmp(p + 1, "/") == 0)
1276 continue;
1277 if (strcmp(p + 1, "/proc") == 0)
1278 continue;
1279
1280 if (umount2(p, MNT_DETACH) == 0)
1281 progress++;
1282 }
1283 fclose(f);
1284 if (!progress)
1285 break;
1286 }
1287
1288 /* This also can be skipped if a container uses unserns */
1289 umount2("./proc", MNT_DETACH);
1290
1291 /* It is weird, but chdir("..") moves us in a new root */
1292 if (chdir("..") == -1) {
1293 SYSERROR("Unable to change working directory");
1294 return -1;
1295 }
1296
1297 if (chroot(".") == -1) {
1298 SYSERROR("Unable to chroot");
1299 return -1;
1300 }
1301
1302 return 0;
1303 }
1304
1305 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1306 {
1307 if (!rootfs->path) {
1308 DEBUG("container does not have a rootfs, so not doing pivot root");
1309 return 0;
1310 }
1311
1312 if (detect_ramfs_rootfs()) {
1313 DEBUG("detected that container is on ramfs");
1314 if (prepare_ramfs_root(rootfs->mount)) {
1315 ERROR("failed to prepare minimal ramfs root");
1316 return -1;
1317 }
1318
1319 DEBUG("prepared ramfs root for container");
1320 return 0;
1321 }
1322
1323 if (setup_rootfs_pivot_root(rootfs->mount) < 0) {
1324 ERROR("failed to pivot root");
1325 return -1;
1326 }
1327
1328 DEBUG("finished pivot root");
1329 return 0;
1330 }
1331
/*
 * Mount a new devpts instance on /dev/pts limited to @num_pts ptys and make
 * /dev/ptmx refer to /dev/pts/ptmx.
 *
 * /dev/ptmx is preferably a bind mount of /dev/pts/ptmx; when the bind mount
 * fails a symlink is created instead. Nothing is done when @num_pts is 0.
 *
 * Returns 0 on success, -1 on failure.
 */
static int lxc_setup_devpts(int num_pts)
{
	static const char *const base_opts = "newinstance,ptmxmode=0666,mode=0620,gid=5";
	char devpts_mntopts[256];
	int len, fd;

	if (num_pts == 0) {
		DEBUG("no new devpts instance will be mounted since no pts "
		      "devices are requested");
		return 0;
	}

	len = snprintf(devpts_mntopts, sizeof(devpts_mntopts), "%s,max=%d",
		       base_opts, num_pts);
	if (len < 0 || (size_t)len >= sizeof(devpts_mntopts))
		return -1;

	/* Get rid of a devpts instance that is already mounted. */
	if (access("/dev/pts/ptmx", F_OK) == 0) {
		if (umount("/dev/pts") < 0) {
			SYSERROR("failed to unmount old devpts instance");
			return -1;
		}
		DEBUG("unmounted old /dev/pts instance");
	}

	/* Make sure the mountpoint exists. */
	if (mkdir("/dev/pts", 0755) < 0 && errno != EEXIST) {
		SYSERROR("failed to create the \"/dev/pts\" directory");
		return -1;
	}

	/* Mount the new instance. */
	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, devpts_mntopts) < 0) {
		SYSERROR("failed to mount new devpts instance");
		return -1;
	}
	DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts);

	/* A stale /dev/ptmx has to go before a fresh one is set up. */
	if (access("/dev/ptmx", F_OK) == 0) {
		if (remove("/dev/ptmx") < 0) {
			SYSERROR("failed to remove existing \"/dev/ptmx\"");
			return -1;
		}
		DEBUG("removed existing \"/dev/ptmx\"");
	}

	/* An empty regular file serves as the bind mount target. */
	fd = open("/dev/ptmx", O_CREAT, 0666);
	if (fd < 0) {
		SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
		return -1;
	}
	close(fd);
	DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");

	/* Preferred option: bind mount /dev/pts/ptmx onto /dev/ptmx. */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) == 0) {
		DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
		return 0;
	}

	/* Not fatal: fall back to a symlink below. */
	ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");

	/* The dummy file would be in the way of the symlink. */
	if (remove("/dev/ptmx") < 0) {
		SYSERROR("failed to remove existing \"/dev/ptmx\"");
		return -1;
	}

	/* Fallback option: symlink /dev/ptmx -> /dev/pts/ptmx. */
	if (symlink("/dev/pts/ptmx", "/dev/ptmx") < 0) {
		SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
		return -1;
	}
	DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");

	return 0;
}
1422
/*
 * Apply the execution domain @persona to the calling process.
 *
 * A value of -1 means "leave the personality alone". When the build lacks
 * <sys/personality.h> the function does nothing.
 *
 * Returns 0 on success, -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1439
1440 static int lxc_setup_dev_console(const struct lxc_rootfs *rootfs,
1441 const struct lxc_console *console)
1442 {
1443 char path[MAXPATHLEN];
1444 int ret, fd;
1445
1446 if (console->path && !strcmp(console->path, "none"))
1447 return 0;
1448
1449 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1450 if (ret < 0 || (size_t)ret >= sizeof(path))
1451 return -1;
1452
1453 /* When we are asked to setup a console we remove any previous
1454 * /dev/console bind-mounts.
1455 */
1456 if (file_exists(path)) {
1457 ret = lxc_unstack_mountpoint(path, false);
1458 if (ret < 0) {
1459 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1460 return -ret;
1461 } else {
1462 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1463 }
1464
1465 ret = unlink(path);
1466 if (ret < 0) {
1467 SYSERROR("error unlinking %s", path);
1468 return -errno;
1469 }
1470 }
1471
1472 /* For unprivileged containers autodev or automounts will already have
1473 * taken care of creating /dev/console.
1474 */
1475 fd = open(path, O_CREAT | O_EXCL, S_IXUSR | S_IXGRP | S_IXOTH);
1476 if (fd < 0) {
1477 if (errno != EEXIST) {
1478 SYSERROR("failed to create console");
1479 return -errno;
1480 }
1481 } else {
1482 close(fd);
1483 }
1484
1485 if (chmod(console->name, S_IXUSR | S_IXGRP | S_IXOTH)) {
1486 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR | S_IXGRP | S_IXOTH, console->name);
1487 return -errno;
1488 }
1489
1490 if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount) < 0) {
1491 ERROR("failed to mount '%s' on '%s'", console->name, path);
1492 return -1;
1493 }
1494
1495 DEBUG("mounted pts device \"%s\" onto \"%s\"", console->name, path);
1496 return 0;
1497 }
1498
1499 static int lxc_setup_ttydir_console(const struct lxc_rootfs *rootfs,
1500 const struct lxc_console *console,
1501 char *ttydir)
1502 {
1503 int ret;
1504 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1505
1506 /* create rootfs/dev/<ttydir> directory */
1507 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir);
1508 if (ret < 0 || (size_t)ret >= sizeof(path))
1509 return -1;
1510
1511 ret = mkdir(path, 0755);
1512 if (ret && errno != EEXIST) {
1513 SYSERROR("failed with errno %d to create %s", errno, path);
1514 return -errno;
1515 }
1516 DEBUG("created directory for console and tty devices at \%s\"", path);
1517
1518 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir);
1519 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1520 return -1;
1521
1522 ret = creat(lxcpath, 0660);
1523 if (ret == -1 && errno != EEXIST) {
1524 SYSERROR("error %d creating %s", errno, lxcpath);
1525 return -errno;
1526 }
1527 if (ret >= 0)
1528 close(ret);
1529
1530 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1531 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1532 return -1;
1533
1534 /* When we are asked to setup a console we remove any previous
1535 * /dev/console bind-mounts.
1536 */
1537 if (console->path && !strcmp(console->path, "none")) {
1538 struct stat st;
1539 ret = stat(path, &st);
1540 if (ret < 0) {
1541 if (errno == ENOENT)
1542 return 0;
1543 SYSERROR("failed stat() \"%s\"", path);
1544 return -errno;
1545 }
1546
1547 /* /dev/console must be character device with major number 5 and
1548 * minor number 1. If not, give benefit of the doubt and assume
1549 * the user has mounted something else right there on purpose.
1550 */
1551 if (((st.st_mode & S_IFMT) != S_IFCHR) || major(st.st_rdev) != 5 || minor(st.st_rdev) != 1)
1552 return 0;
1553
1554 /* In case the user requested a bind-mount for /dev/console and
1555 * requests a ttydir we move the mount to the
1556 * /dev/<ttydir/console.
1557 * Note, we only move the uppermost mount and clear all other
1558 * mounts underneath for safety.
1559 * If it is a character device created via mknod() we simply
1560 * rename it.
1561 */
1562 ret = safe_mount(path, lxcpath, "none", MS_MOVE, NULL, rootfs->mount);
1563 if (ret < 0) {
1564 if (errno != EINVAL) {
1565 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1566 return -errno;
1567 }
1568 /* path was not a mountpoint */
1569 ret = rename(path, lxcpath);
1570 if (ret < 0) {
1571 ERROR("failed to rename \"%s\" to \"%s\": %s", path, lxcpath, strerror(errno));
1572 return -errno;
1573 }
1574 DEBUG("renamed \"%s\" to \"%s\"", path, lxcpath);
1575 } else {
1576 DEBUG("moved mount \"%s\" to \"%s\"", path, lxcpath);
1577 }
1578
1579 /* Clear all remaining bind-mounts. */
1580 ret = lxc_unstack_mountpoint(path, false);
1581 if (ret < 0) {
1582 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1583 return -ret;
1584 } else {
1585 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1586 }
1587 } else {
1588 if (file_exists(path)) {
1589 ret = lxc_unstack_mountpoint(path, false);
1590 if (ret < 0) {
1591 ERROR("failed to unmount \"%s\": %s", path, strerror(errno));
1592 return -ret;
1593 } else {
1594 DEBUG("cleared all (%d) mounts from \"%s\"", ret, path);
1595 }
1596 }
1597
1598 if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount) < 0) {
1599 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1600 return -1;
1601 }
1602 DEBUG("mounted \"%s\" onto \"%s\"", console->name, lxcpath);
1603 }
1604
1605 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1606 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1607 if (ret < 0 || (size_t)ret >= sizeof(lxcpath))
1608 return -1;
1609
1610 ret = unlink(path);
1611 if (ret && errno != ENOENT) {
1612 SYSERROR("error unlinking %s", path);
1613 return -errno;
1614 }
1615
1616 ret = symlink(lxcpath, path);
1617 if (ret < 0) {
1618 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath, path);
1619 return -1;
1620 }
1621
1622 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath, path);
1623 return 0;
1624 }
1625
1626 static int lxc_setup_console(const struct lxc_rootfs *rootfs,
1627 const struct lxc_console *console, char *ttydir)
1628 {
1629 /* We don't have a rootfs, /dev/console will be shared. */
1630 if (!rootfs->path) {
1631 DEBUG("/dev/console will be shared with the host");
1632 return 0;
1633 }
1634
1635 if (!ttydir)
1636 return lxc_setup_dev_console(rootfs, console);
1637
1638 return lxc_setup_ttydir_console(rootfs, console, ttydir);
1639 }
1640
1641 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1642 {
1643 struct mount_opt *mo;
1644
1645 /* If opt is found in mount_opt, set or clear flags.
1646 * Otherwise append it to data. */
1647
1648 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1649 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1650 if (mo->clear)
1651 *flags &= ~mo->flag;
1652 else
1653 *flags |= mo->flag;
1654 return;
1655 }
1656 }
1657
1658 if (strlen(*data))
1659 strcat(*data, ",");
1660 strcat(*data, opt);
1661 }
1662
/*
 * Split the comma-separated option string @mntopts into mount flags and
 * filesystem specific data.
 *
 * @mntflags receives the flags. @mntdata receives a newly allocated string
 * with the remaining options, or NULL when there are none; the caller frees
 * it.
 *
 * Returns 0 on success, -1 when memory allocation fails.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra, *tok;
	char *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The leftover options can never be longer than the input. */
	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != '\0')
		*mntdata = extra;
	else
		free(extra);
	free(copy);

	return 0;
}
1701
/* Terminate @word in place at its first space or tab, if there is one. */
static void null_endofword(char *word)
{
	char *end;

	for (end = word; *end != '\0'; end++)
		if (*end == ' ' || *end == '\t')
			break;

	*end = '\0';
}
1708
/*
 * Return a pointer into @src positioned after @nfields space or tab
 * separators. When @src ends first, the returned pointer refers to the
 * terminating NUL byte.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int remaining = nfields;

	while (remaining-- > 0) {
		/* Jump over the current word. */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		/* Step over the separator itself. */
		cur++;
	}

	return cur;
}
1726
/*
 * Mount @fsname on @target and, for bind mounts and remount requests, follow
 * up with a remount so that the requested flags take effect.
 *
 * With statvfs() available, the nosuid, nodev (unless @dev is set), ro and
 * noexec flags reported for @fsname are carried over to the remount. A plain
 * bind mount that needs no additional flags is not remounted.
 *
 * Returns 0 on success, or when an @optional mount fails, and -1 otherwise.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional, int dev, const char *rootfs)
{
	unsigned long rqd_flags;
#ifdef HAVE_STATVFS
	struct statvfs sb;
#endif

	if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	/* Only bind mounts and explicit remounts need a second pass. */
	if (!(mountflags & (MS_REMOUNT | MS_BIND)))
		goto out;

	DEBUG("remounting %s on %s to respect bind or remount options",
	      fsname ? fsname : "(none)", target ? target : "(none)");

	rqd_flags = (mountflags & MS_RDONLY) ? MS_RDONLY : 0;

#ifdef HAVE_STATVFS
	if (statvfs(fsname, &sb) == 0) {
		unsigned long required_flags = rqd_flags;

		/* Carry over the restrictive flags of the source. */
		required_flags |= sb.f_flag & (MS_NOSUID | MS_RDONLY | MS_NOEXEC);
		if (!dev)
			required_flags |= sb.f_flag & MS_NODEV;

		DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname, sb.f_flag, required_flags);

		/* If this was a bind mount request, and required_flags
		 * does not have any flags which are not already in
		 * mountflags, then skip the remount.
		 */
		if (!(mountflags & MS_REMOUNT) &&
		    !(required_flags & ~mountflags) && rqd_flags == 0) {
			DEBUG("mountflags already was %lu, skipping remount",
			      mountflags);
			goto out;
		}

		mountflags |= required_flags;
	}
#endif

	if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data) < 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}

		INFO("failed to mount '%s' on '%s' (optional): %s",
		     fsname, target, strerror(errno));
		return 0;
	}

out:
	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1803
/*
 * Strip the first occurrence of each of "create=dir", "create=file" and
 * "optional" from mntent->mnt_opts. The string is edited in place.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit, *comma;

		hit = strstr(mntent->mnt_opts, culled[idx]);
		if (hit == NULL)
			continue;

		comma = strchr(hit, ',');
		if (comma != NULL)
			/* Close the gap, including the terminating NUL. */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* Last option in the list: simply cut the string here. */
			*hit = '\0';
	}
}
1828
/*
 * Create the mount target @path for @mntent when the entry asks for it.
 *
 * - For "overlay" and "aufs" entries the backend specific mkdir helper runs
 *   first.
 * - "create=dir" creates @path as a directory, including parents.
 * - "create=file" creates the parent directory of @path and then @path as an
 *   empty file, unless @path already exists.
 *
 * Returns 0 on success and -1 when a requested target could not be created.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		FILE *pathfile;
		char *pathcopy;

		/* dirname() may modify its argument and may return a pointer
		 * to static storage, so it gets a private copy and only that
		 * copy is ever passed to free().
		 */
		pathcopy = strdup(path);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1869
1870 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1871 * without a rootfs. */
1872 static inline int mount_entry_on_generic(struct mntent *mntent,
1873 const char* path, const struct lxc_rootfs *rootfs,
1874 const char *lxc_name, const char *lxc_path)
1875 {
1876 unsigned long mntflags;
1877 char *mntdata;
1878 int ret;
1879 bool optional = hasmntopt(mntent, "optional") != NULL;
1880 bool dev = hasmntopt(mntent, "dev") != NULL;
1881
1882 char *rootfs_path = NULL;
1883 if (rootfs && rootfs->path)
1884 rootfs_path = rootfs->mount;
1885
1886 ret = mount_entry_create_dir_file(mntent, path, rootfs, lxc_name, lxc_path);
1887
1888 if (ret < 0)
1889 return optional ? 0 : -1;
1890
1891 cull_mntent_opt(mntent);
1892
1893 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1894 free(mntdata);
1895 return -1;
1896 }
1897
1898 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags,
1899 mntdata, optional, dev, rootfs_path);
1900
1901 free(mntdata);
1902 return ret;
1903 }
1904
/*
 * Mount @mntent for a container without a rootfs: the mount directory is
 * taken as an absolute host path, with a leading '/' added when missing.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	char path[MAXPATHLEN];
	int len;
	/* For containers created without a rootfs all mounts are treated as
	 * absolute paths starting at / on the host.
	 */
	const char *prefix = (mntent->mnt_dir[0] == '/') ? "" : "/";

	len = snprintf(path, sizeof(path), "%s%s", prefix, mntent->mnt_dir);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	return mount_entry_on_generic(mntent, path, NULL, NULL, NULL);
}
1924
1925 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1926 const struct lxc_rootfs *rootfs,
1927 const char *lxc_name,
1928 const char *lxc_path)
1929 {
1930 char *aux;
1931 char path[MAXPATHLEN];
1932 int r, ret = 0, offset;
1933 const char *lxcpath;
1934
1935 lxcpath = lxc_global_config_value("lxc.lxcpath");
1936 if (!lxcpath) {
1937 ERROR("Out of memory");
1938 return -1;
1939 }
1940
1941 /* if rootfs->path is a blockdev path, allow container fstab to
1942 * use $lxcpath/CN/rootfs as the target prefix */
1943 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1944 if (r < 0 || r >= MAXPATHLEN)
1945 goto skipvarlib;
1946
1947 aux = strstr(mntent->mnt_dir, path);
1948 if (aux) {
1949 offset = strlen(path);
1950 goto skipabs;
1951 }
1952
1953 skipvarlib:
1954 aux = strstr(mntent->mnt_dir, rootfs->path);
1955 if (!aux) {
1956 WARN("ignoring mount point '%s'", mntent->mnt_dir);
1957 return ret;
1958 }
1959 offset = strlen(rootfs->path);
1960
1961 skipabs:
1962
1963 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
1964 aux + offset);
1965 if (r < 0 || r >= MAXPATHLEN) {
1966 WARN("pathnme too long for '%s'", mntent->mnt_dir);
1967 return -1;
1968 }
1969
1970 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1971 }
1972
1973 static int mount_entry_on_relative_rootfs(struct mntent *mntent,
1974 const struct lxc_rootfs *rootfs,
1975 const char *lxc_name,
1976 const char *lxc_path)
1977 {
1978 char path[MAXPATHLEN];
1979 int ret;
1980
1981 /* relative to root mount point */
1982 ret = snprintf(path, sizeof(path), "%s/%s", rootfs->mount, mntent->mnt_dir);
1983 if (ret < 0 || ret >= sizeof(path)) {
1984 ERROR("path name too long");
1985 return -1;
1986 }
1987
1988 return mount_entry_on_generic(mntent, path, rootfs, lxc_name, lxc_path);
1989 }
1990
1991 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
1992 const char *lxc_name, const char *lxc_path)
1993 {
1994 struct mntent mntent;
1995 char buf[4096];
1996 int ret = -1;
1997
1998 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
1999
2000 if (!rootfs->path) {
2001 if (mount_entry_on_systemfs(&mntent))
2002 goto out;
2003 continue;
2004 }
2005
2006 /* We have a separate root, mounts are relative to it */
2007 if (mntent.mnt_dir[0] != '/') {
2008 if (mount_entry_on_relative_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2009 goto out;
2010 continue;
2011 }
2012
2013 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name, lxc_path))
2014 goto out;
2015 }
2016
2017 ret = 0;
2018
2019 INFO("mount points have been setup");
2020 out:
2021 return ret;
2022 }
2023
/*
 * Mount all entries of the fstab file @fstab. A NULL @fstab is not an error.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name, const char *lxc_path)
{
	int ret;
	FILE *entries;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	endmntent(entries);

	return ret;
}
2044
2045 FILE *make_anonymous_mount_file(struct lxc_list *mount)
2046 {
2047 int ret;
2048 char *mount_entry;
2049 struct lxc_list *iterator;
2050 FILE *file;
2051 int fd = -1;
2052
2053 fd = memfd_create("lxc_mount_file", MFD_CLOEXEC);
2054 if (fd < 0) {
2055 if (errno != ENOSYS)
2056 return NULL;
2057 file = tmpfile();
2058 } else {
2059 file = fdopen(fd, "r+");
2060 }
2061
2062 if (!file) {
2063 int saved_errno = errno;
2064 if (fd != -1)
2065 close(fd);
2066 ERROR("Could not create mount entry file: %s.", strerror(saved_errno));
2067 return NULL;
2068 }
2069
2070 lxc_list_for_each(iterator, mount) {
2071 mount_entry = iterator->elem;
2072 ret = fprintf(file, "%s\n", mount_entry);
2073 if (ret < strlen(mount_entry))
2074 WARN("Could not write mount entry to anonymous mount file.");
2075 }
2076
2077 if (fseek(file, 0, SEEK_SET) < 0) {
2078 fclose(file);
2079 return NULL;
2080 }
2081
2082 return file;
2083 }
2084
/*
 * Mount all entries of the list @mount by writing them to an anonymous file
 * and processing that file like an fstab.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs,
			       struct lxc_list *mount, const char *lxc_name,
			       const char *lxc_path)
{
	int ret;
	FILE *entries = make_anonymous_mount_file(mount);

	if (entries == NULL)
		return -1;

	ret = mount_file_entries(rootfs, entries, lxc_name, lxc_path);
	fclose(entries);

	return ret;
}
2101
2102 static int parse_cap(const char *cap)
2103 {
2104 char *ptr = NULL;
2105 size_t i;
2106 int capid = -1;
2107
2108 if (!strcmp(cap, "none"))
2109 return -2;
2110
2111 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2112
2113 if (strcmp(cap, caps_opt[i].name))
2114 continue;
2115
2116 capid = caps_opt[i].value;
2117 break;
2118 }
2119
2120 if (capid < 0) {
2121 /* try to see if it's numeric, so the user may specify
2122 * capabilities that the running kernel knows about but
2123 * we don't */
2124 errno = 0;
2125 capid = strtol(cap, &ptr, 10);
2126 if (!ptr || *ptr != '\0' || errno != 0)
2127 /* not a valid number */
2128 capid = -1;
2129 else if (capid > lxc_caps_last_cap())
2130 /* we have a number but it's not a valid
2131 * capability */
2132 capid = -1;
2133 }
2134
2135 return capid;
2136 }
2137
2138 int in_caplist(int cap, struct lxc_list *caps)
2139 {
2140 struct lxc_list *iterator;
2141 int capid;
2142
2143 lxc_list_for_each(iterator, caps) {
2144 capid = parse_cap(iterator->elem);
2145 if (capid == cap)
2146 return 1;
2147 }
2148
2149 return 0;
2150 }
2151
2152 static int setup_caps(struct lxc_list *caps)
2153 {
2154 struct lxc_list *iterator;
2155 char *drop_entry;
2156 int capid;
2157
2158 lxc_list_for_each(iterator, caps) {
2159
2160 drop_entry = iterator->elem;
2161
2162 capid = parse_cap(drop_entry);
2163
2164 if (capid < 0) {
2165 ERROR("unknown capability %s", drop_entry);
2166 return -1;
2167 }
2168
2169 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2170
2171 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2172 SYSERROR("failed to remove %s capability", drop_entry);
2173 return -1;
2174 }
2175
2176 }
2177
2178 DEBUG("capabilities have been setup");
2179
2180 return 0;
2181 }
2182
2183 static int dropcaps_except(struct lxc_list *caps)
2184 {
2185 struct lxc_list *iterator;
2186 char *keep_entry;
2187 int i, capid;
2188 int numcaps = lxc_caps_last_cap() + 1;
2189 INFO("found %d capabilities", numcaps);
2190
2191 if (numcaps <= 0 || numcaps > 200)
2192 return -1;
2193
2194 // caplist[i] is 1 if we keep capability i
2195 int *caplist = alloca(numcaps * sizeof(int));
2196 memset(caplist, 0, numcaps * sizeof(int));
2197
2198 lxc_list_for_each(iterator, caps) {
2199
2200 keep_entry = iterator->elem;
2201
2202 capid = parse_cap(keep_entry);
2203
2204 if (capid == -2)
2205 continue;
2206
2207 if (capid < 0) {
2208 ERROR("unknown capability %s", keep_entry);
2209 return -1;
2210 }
2211
2212 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2213
2214 caplist[capid] = 1;
2215 }
2216 for (i=0; i<numcaps; i++) {
2217 if (caplist[i])
2218 continue;
2219 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2220 SYSERROR("failed to remove capability %d", i);
2221 return -1;
2222 }
2223 }
2224
2225 DEBUG("capabilities have been setup");
2226
2227 return 0;
2228 }
2229
2230 static int setup_hw_addr(char *hwaddr, const char *ifname)
2231 {
2232 struct sockaddr sockaddr;
2233 struct ifreq ifr;
2234 int ret, fd, saved_errno;
2235
2236 ret = lxc_convert_mac(hwaddr, &sockaddr);
2237 if (ret) {
2238 ERROR("mac address '%s' conversion failed : %s",
2239 hwaddr, strerror(-ret));
2240 return -1;
2241 }
2242
2243 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2244 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2245 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2246
2247 fd = socket(AF_INET, SOCK_DGRAM, 0);
2248 if (fd < 0) {
2249 ERROR("socket failure : %s", strerror(errno));
2250 return -1;
2251 }
2252
2253 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2254 saved_errno = errno;
2255 close(fd);
2256 if (ret)
2257 ERROR("ioctl failure : %s", strerror(saved_errno));
2258
2259 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2260
2261 return ret;
2262 }
2263
2264 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2265 {
2266 struct lxc_list *iterator;
2267 struct lxc_inetdev *inetdev;
2268 int err;
2269
2270 lxc_list_for_each(iterator, ip) {
2271
2272 inetdev = iterator->elem;
2273
2274 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2275 &inetdev->bcast, inetdev->prefix);
2276 if (err) {
2277 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2278 ifindex, strerror(-err));
2279 return -1;
2280 }
2281 }
2282
2283 return 0;
2284 }
2285
2286 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2287 {
2288 struct lxc_list *iterator;
2289 struct lxc_inet6dev *inet6dev;
2290 int err;
2291
2292 lxc_list_for_each(iterator, ip) {
2293
2294 inet6dev = iterator->elem;
2295
2296 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2297 &inet6dev->mcast, &inet6dev->acast,
2298 inet6dev->prefix);
2299 if (err) {
2300 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2301 ifindex, strerror(-err));
2302 return -1;
2303 }
2304 }
2305
2306 return 0;
2307 }
2308
/* Configure a single network device described by @netdev.
 *
 * The steps are performed in this order: resolve the interface index,
 * rename the interface to the configured name, set the hardware address,
 * add the IPv4 and IPv6 addresses, bring the interface (and "lo") up and
 * finally install the IPv4 and IPv6 default gateways.
 *
 * Returns 0 on success and -1 on the first step that fails.
 */
static int lxc_setup_netdev_in_child_namespaces(struct lxc_netdev *netdev)
{
	char ifname[IFNAMSIZ];
	int err;
	const char *net_type_name;
	/* current_ifname points at ifname: both names refer to one buffer. */
	char *current_ifname = ifname;

	/* empty network namespace */
	if (!netdev->ifindex) {
		if (netdev->flags & IFF_UP) {
			err = lxc_netdev_up("lo");
			if (err) {
				ERROR("failed to set the loopback up : %s",
				      strerror(-err));
				return -1;
			}
		}

		/* Nothing else to configure for empty and none networks. */
		if (netdev->type == LXC_NET_EMPTY)
			return 0;

		if (netdev->type == LXC_NET_NONE)
			return 0;

		/* Only veth devices may reach this point without an ifindex. */
		if (netdev->type != LXC_NET_VETH) {
			net_type_name = lxc_net_type_to_str(netdev->type);
			ERROR("%s networks are not supported for containers "
			      "not setup up by privileged users",
			      net_type_name);
			return -1;
		}

		/* NOTE(review): the result is not checked here; an index of 0
		 * is caught by the if_indextoname() call further down.
		 */
		netdev->ifindex = if_nametoindex(netdev->name);
	}

	/* get the new ifindex in case of physical netdev */
	if (netdev->type == LXC_NET_PHYS) {
		if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
			ERROR("failed to get ifindex for %s",
			      netdev->link);
			return -1;
		}
	}

	/* retrieve the name of the interface */
	if (!if_indextoname(netdev->ifindex, current_ifname)) {
		ERROR("no interface corresponding to index '%d'",
		      netdev->ifindex);
		return -1;
	}

	/* default: let the system to choose one interface name */
	if (!netdev->name)
		netdev->name = netdev->type == LXC_NET_PHYS ?
			netdev->link : "eth%d";

	/* rename the interface name */
	if (strcmp(ifname, netdev->name) != 0) {
		err = lxc_netdev_rename_by_name(ifname, netdev->name);
		if (err) {
			ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
			      strerror(-err));
			return -1;
		}
	}

	/* Re-read the name of the interface because its name has changed
	 * and would be automatically allocated by the system
	 */
	if (!if_indextoname(netdev->ifindex, current_ifname)) {
		ERROR("no interface corresponding to index '%d'",
		      netdev->ifindex);
		return -1;
	}

	/* set a mac address */
	if (netdev->hwaddr) {
		if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
			ERROR("failed to setup hw address for '%s'",
			      current_ifname);
			return -1;
		}
	}

	/* setup ipv4 addresses on the interface */
	if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
		ERROR("failed to setup ip addresses for '%s'",
		      ifname);
		return -1;
	}

	/* setup ipv6 addresses on the interface */
	if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
		ERROR("failed to setup ipv6 addresses for '%s'",
		      ifname);
		return -1;
	}

	/* set the network device up */
	if (netdev->flags & IFF_UP) {
		/* NOTE(review): this declaration shadows the function-level
		 * 'err' declared above.
		 */
		int err;

		err = lxc_netdev_up(current_ifname);
		if (err) {
			ERROR("failed to set '%s' up : %s", current_ifname,
			      strerror(-err));
			return -1;
		}

		/* the network is up, make the loopback up too */
		err = lxc_netdev_up("lo");
		if (err) {
			ERROR("failed to set the loopback up : %s",
			      strerror(-err));
			return -1;
		}
	}

	/* We can only set up the default routes after bringing
	 * up the interface, since bringing up the interface adds
	 * the link-local routes and we can't add a default
	 * route if the gateway is not reachable. */

	/* setup ipv4 gateway on the interface */
	if (netdev->ipv4_gateway) {
		if (!(netdev->flags & IFF_UP)) {
			ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
			return -1;
		}

		if (lxc_list_empty(&netdev->ipv4)) {
			ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
			return -1;
		}

		/* First attempt; on failure add a route to the gateway itself
		 * and retry once. A failure of the dest route is only logged.
		 */
		err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
		if (err) {
			err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
			if (err) {
				ERROR("failed to add ipv4 dest for '%s': %s",
				      ifname, strerror(-err));
			}

			err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
			if (err) {
				ERROR("failed to setup ipv4 gateway for '%s': %s",
				      ifname, strerror(-err));
				if (netdev->ipv4_gateway_auto) {
					char buf[INET_ADDRSTRLEN];
					inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
					ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
				}
				return -1;
			}
		}
	}

	/* setup ipv6 gateway on the interface */
	if (netdev->ipv6_gateway) {
		if (!(netdev->flags & IFF_UP)) {
			ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
			return -1;
		}

		/* A link-local gateway is accepted even without a configured
		 * IPv6 address.
		 */
		if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
			ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
			return -1;
		}

		/* Same try / add dest route / retry sequence as for IPv4. */
		err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
		if (err) {
			err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
			if (err) {
				ERROR("failed to add ipv6 dest for '%s': %s",
				      ifname, strerror(-err));
			}

			err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
			if (err) {
				ERROR("failed to setup ipv6 gateway for '%s': %s",
				      ifname, strerror(-err));
				if (netdev->ipv6_gateway_auto) {
					char buf[INET6_ADDRSTRLEN];
					inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
					ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
				}
				return -1;
			}
		}
	}

	DEBUG("'%s' has been setup", current_ifname);

	return 0;
}
2504
2505 static int lxc_setup_networks_in_child_namespaces(const struct lxc_conf *conf,
2506 struct lxc_list *network)
2507 {
2508 struct lxc_list *iterator;
2509 struct lxc_netdev *netdev;
2510
2511 lxc_log_configured_netdevs(conf);
2512
2513 lxc_list_for_each(iterator, network) {
2514 netdev = iterator->elem;
2515
2516 /* REMOVE in LXC 3.0 */
2517 if (netdev->idx < 0) {
2518 ERROR("WARNING: using \"lxc.network.*\" keys to define "
2519 "networks is DEPRECATED, please switch to using "
2520 "\"lxc.net.[i].* keys\"");
2521 }
2522
2523 if (lxc_setup_netdev_in_child_namespaces(netdev)) {
2524 ERROR("failed to setup netdev");
2525 return -1;
2526 }
2527 }
2528
2529 if (!lxc_list_empty(network))
2530 INFO("network has been setup");
2531
2532 return 0;
2533 }
2534
2535 static int parse_resource(const char *res) {
2536 size_t i;
2537 int resid = -1;
2538
2539 for (i = 0; i < sizeof(limit_opt)/sizeof(limit_opt[0]); ++i) {
2540 if (strcmp(res, limit_opt[i].name) == 0)
2541 return limit_opt[i].value;
2542 }
2543
2544 /* try to see if it's numeric, so the user may specify
2545 * resources that the running kernel knows about but
2546 * we don't */
2547 if (lxc_safe_int(res, &resid) == 0)
2548 return resid;
2549 return -1;
2550 }
2551
2552 int setup_resource_limits(struct lxc_list *limits, pid_t pid) {
2553 struct lxc_list *it;
2554 struct lxc_limit *lim;
2555 int resid;
2556
2557 lxc_list_for_each(it, limits) {
2558 lim = it->elem;
2559
2560 resid = parse_resource(lim->resource);
2561 if (resid < 0) {
2562 ERROR("unknown resource %s", lim->resource);
2563 return -1;
2564 }
2565
2566 if (prlimit(pid, resid, &lim->limit, NULL) != 0) {
2567 ERROR("failed to set limit %s: %s", lim->resource, strerror(errno));
2568 return -1;
2569 }
2570 }
2571 return 0;
2572 }
2573
2574 /* try to move physical nics to the init netns */
2575 void lxc_restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2576 {
2577 int i, oldfd;
2578 char ifname[IFNAMSIZ];
2579
2580 if (netnsfd < 0 || conf->num_savednics == 0)
2581 return;
2582
2583 INFO("Running to reset %d nic names.", conf->num_savednics);
2584
2585 oldfd = lxc_preserve_ns(getpid(), "net");
2586 if (oldfd < 0) {
2587 SYSERROR("Failed to open monitor netns fd.");
2588 return;
2589 }
2590
2591 if (setns(netnsfd, 0) != 0) {
2592 SYSERROR("Failed to enter container netns to reset nics");
2593 close(oldfd);
2594 return;
2595 }
2596 for (i=0; i<conf->num_savednics; i++) {
2597 struct saved_nic *s = &conf->saved_nics[i];
2598 /* retrieve the name of the interface */
2599 if (!if_indextoname(s->ifindex, ifname)) {
2600 WARN("no interface corresponding to index '%d'", s->ifindex);
2601 continue;
2602 }
2603 if (lxc_netdev_move_by_name(ifname, 1, s->orig_name))
2604 WARN("Error moving nic name:%s back to host netns", ifname);
2605 free(s->orig_name);
2606 }
2607 conf->num_savednics = 0;
2608
2609 if (setns(oldfd, 0) != 0)
2610 SYSERROR("Failed to re-enter monitor's netns");
2611 close(oldfd);
2612 }
2613
2614 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2615
2616 struct lxc_conf *lxc_conf_init(void)
2617 {
2618 struct lxc_conf *new;
2619 int i;
2620
2621 new = malloc(sizeof(*new));
2622 if (!new) {
2623 ERROR("lxc_conf_init : %s", strerror(errno));
2624 return NULL;
2625 }
2626 memset(new, 0, sizeof(*new));
2627
2628 new->loglevel = LXC_LOG_LEVEL_NOTSET;
2629 new->personality = -1;
2630 new->autodev = 1;
2631 new->console.log_path = NULL;
2632 new->console.log_fd = -1;
2633 new->console.path = NULL;
2634 new->console.peer = -1;
2635 new->console.peerpty.busy = -1;
2636 new->console.peerpty.master = -1;
2637 new->console.peerpty.slave = -1;
2638 new->console.master = -1;
2639 new->console.slave = -1;
2640 new->console.name[0] = '\0';
2641 new->maincmd_fd = -1;
2642 new->nbd_idx = -1;
2643 new->rootfs.mount = strdup(default_rootfs_mount);
2644 if (!new->rootfs.mount) {
2645 ERROR("lxc_conf_init : %s", strerror(errno));
2646 free(new);
2647 return NULL;
2648 }
2649 new->logfd = -1;
2650 lxc_list_init(&new->cgroup);
2651 lxc_list_init(&new->network);
2652 lxc_list_init(&new->mount_list);
2653 lxc_list_init(&new->caps);
2654 lxc_list_init(&new->keepcaps);
2655 lxc_list_init(&new->id_map);
2656 lxc_list_init(&new->includes);
2657 lxc_list_init(&new->aliens);
2658 lxc_list_init(&new->environment);
2659 lxc_list_init(&new->limits);
2660 for (i=0; i<NUM_LXC_HOOKS; i++)
2661 lxc_list_init(&new->hooks[i]);
2662 lxc_list_init(&new->groups);
2663 new->lsm_aa_profile = NULL;
2664 new->lsm_se_context = NULL;
2665 new->tmp_umount_proc = 0;
2666
2667 for (i = 0; i < LXC_NS_MAX; i++)
2668 new->inherit_ns_fd[i] = -1;
2669
2670 /* if running in a new user namespace, init and COMMAND
2671 * default to running as UID/GID 0 when using lxc-execute */
2672 new->init_uid = 0;
2673 new->init_gid = 0;
2674
2675 return new;
2676 }
2677
2678 static int instantiate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2679 {
2680 char *veth1, *veth2;
2681 char veth1buf[IFNAMSIZ], veth2buf[IFNAMSIZ];
2682 int bridge_index, err;
2683 unsigned int mtu = 0;
2684
2685 if (netdev->priv.veth_attr.pair) {
2686 veth1 = netdev->priv.veth_attr.pair;
2687 if (handler->conf->reboot)
2688 lxc_netdev_delete_by_name(veth1);
2689 } else {
2690 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2691 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2692 ERROR("veth1 name too long");
2693 return -1;
2694 }
2695 veth1 = lxc_mkifname(veth1buf);
2696 if (!veth1) {
2697 ERROR("failed to allocate a temporary name");
2698 return -1;
2699 }
2700 /* store away for deconf */
2701 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2702 }
2703
2704 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2705 veth2 = lxc_mkifname(veth2buf);
2706 if (!veth2) {
2707 ERROR("failed to allocate a temporary name");
2708 goto out_delete;
2709 }
2710
2711 err = lxc_veth_create(veth1, veth2);
2712 if (err) {
2713 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1,
2714 veth2, strerror(-err));
2715 goto out_delete;
2716 }
2717
2718 /* changing the high byte of the mac address to 0xfe, the bridge interface
2719 * will always keep the host's mac address and not take the mac address
2720 * of a container */
2721 err = setup_private_host_hw_addr(veth1);
2722 if (err) {
2723 ERROR("failed to change mac address of host interface \"%s\": %s",
2724 veth1, strerror(-err));
2725 goto out_delete;
2726 }
2727
2728 netdev->ifindex = if_nametoindex(veth2);
2729 if (!netdev->ifindex) {
2730 ERROR("failed to retrieve the index for \"%s\"", veth2);
2731 goto out_delete;
2732 }
2733
2734 if (netdev->mtu) {
2735 if (lxc_safe_uint(netdev->mtu, &mtu) < 0)
2736 WARN("failed to parse mtu from");
2737 else
2738 INFO("retrieved mtu %d", mtu);
2739 } else if (netdev->link) {
2740 bridge_index = if_nametoindex(netdev->link);
2741 if (bridge_index) {
2742 mtu = netdev_get_mtu(bridge_index);
2743 INFO("retrieved mtu %d from %s", mtu, netdev->link);
2744 } else {
2745 mtu = netdev_get_mtu(netdev->ifindex);
2746 INFO("retrieved mtu %d from %s", mtu, veth2);
2747 }
2748 }
2749
2750 if (mtu) {
2751 err = lxc_netdev_set_mtu(veth1, mtu);
2752 if (!err)
2753 err = lxc_netdev_set_mtu(veth2, mtu);
2754 if (err) {
2755 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2756 "and \"%s\": %s",
2757 mtu, veth1, veth2, strerror(-err));
2758 goto out_delete;
2759 }
2760 }
2761
2762 if (netdev->link) {
2763 err = lxc_bridge_attach(handler->lxcpath, handler->name, netdev->link, veth1);
2764 if (err) {
2765 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2766 veth1, netdev->link, strerror(-err));
2767 goto out_delete;
2768 }
2769 INFO("attached \"%s\" to bridge \"%s\"", veth1, netdev->link);
2770 }
2771
2772 err = lxc_netdev_up(veth1);
2773 if (err) {
2774 ERROR("failed to set \"%s\" up: %s", veth1, strerror(-err));
2775 goto out_delete;
2776 }
2777
2778 if (netdev->upscript) {
2779 err = run_script(handler->name, "net", netdev->upscript, "up",
2780 "veth", veth1, (char*) NULL);
2781 if (err)
2782 goto out_delete;
2783 }
2784
2785 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1, veth2,
2786 netdev->ifindex);
2787
2788 return 0;
2789
2790 out_delete:
2791 if (netdev->ifindex != 0)
2792 lxc_netdev_delete_by_name(veth1);
2793 if (!netdev->priv.veth_attr.pair)
2794 free(veth1);
2795 free(veth2);
2796 return -1;
2797 }
2798
2799 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2800 {
2801 char *veth1;
2802 int err;
2803
2804 if (netdev->priv.veth_attr.pair)
2805 veth1 = netdev->priv.veth_attr.pair;
2806 else
2807 veth1 = netdev->priv.veth_attr.veth1;
2808
2809 if (netdev->downscript) {
2810 err = run_script(handler->name, "net", netdev->downscript,
2811 "down", "veth", veth1, (char*) NULL);
2812 if (err)
2813 return -1;
2814 }
2815 return 0;
2816 }
2817
2818 static int instantiate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2819 {
2820 char peerbuf[IFNAMSIZ], *peer;
2821 int err;
2822
2823 if (!netdev->link) {
2824 ERROR("no link specified for macvlan netdev");
2825 return -1;
2826 }
2827
2828 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2829 if (err >= sizeof(peerbuf))
2830 return -1;
2831
2832 peer = lxc_mkifname(peerbuf);
2833 if (!peer) {
2834 ERROR("failed to make a temporary name");
2835 return -1;
2836 }
2837
2838 err = lxc_macvlan_create(netdev->link, peer,
2839 netdev->priv.macvlan_attr.mode);
2840 if (err) {
2841 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2842 peer, netdev->link, strerror(-err));
2843 goto out;
2844 }
2845
2846 netdev->ifindex = if_nametoindex(peer);
2847 if (!netdev->ifindex) {
2848 ERROR("failed to retrieve the index for %s", peer);
2849 goto out;
2850 }
2851
2852 if (netdev->upscript) {
2853 err = run_script(handler->name, "net", netdev->upscript, "up",
2854 "macvlan", netdev->link, (char*) NULL);
2855 if (err)
2856 goto out;
2857 }
2858
2859 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2860 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2861
2862 return 0;
2863 out:
2864 lxc_netdev_delete_by_name(peer);
2865 free(peer);
2866 return -1;
2867 }
2868
2869 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2870 {
2871 int err;
2872
2873 if (netdev->downscript) {
2874 err = run_script(handler->name, "net", netdev->downscript,
2875 "down", "macvlan", netdev->link,
2876 (char*) NULL);
2877 if (err)
2878 return -1;
2879 }
2880 return 0;
2881 }
2882
2883 /* XXX: merge with instantiate_macvlan */
2884 static int instantiate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2885 {
2886 char peer[IFNAMSIZ];
2887 int err;
2888 static uint16_t vlan_cntr = 0;
2889 unsigned int mtu = 0;
2890
2891 if (!netdev->link) {
2892 ERROR("no link specified for vlan netdev");
2893 return -1;
2894 }
2895
2896 err = snprintf(peer, sizeof(peer), "vlan%d-%d", netdev->priv.vlan_attr.vid, vlan_cntr++);
2897 if (err >= sizeof(peer)) {
2898 ERROR("peer name too long");
2899 return -1;
2900 }
2901
2902 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2903 if (err) {
2904 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2905 peer, netdev->link, strerror(-err));
2906 return -1;
2907 }
2908
2909 netdev->ifindex = if_nametoindex(peer);
2910 if (!netdev->ifindex) {
2911 ERROR("failed to retrieve the ifindex for %s", peer);
2912 lxc_netdev_delete_by_name(peer);
2913 return -1;
2914 }
2915
2916 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
2917 netdev->ifindex);
2918 if (netdev->mtu) {
2919 if (lxc_safe_uint(netdev->mtu, &mtu) < 0) {
2920 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
2921 netdev->ifindex, netdev->name);
2922 return -1;
2923 }
2924 err = lxc_netdev_set_mtu(peer, mtu);
2925 if (err) {
2926 ERROR("failed to set mtu '%s' for %s : %s",
2927 netdev->mtu, peer, strerror(-err));
2928 lxc_netdev_delete_by_name(peer);
2929 return -1;
2930 }
2931 }
2932
2933 return 0;
2934 }
2935
/* Shutdown handler for vlan devices: nothing is done, always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2940
2941 static int instantiate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2942 {
2943 if (!netdev->link) {
2944 ERROR("no link specified for the physical interface");
2945 return -1;
2946 }
2947
2948 netdev->ifindex = if_nametoindex(netdev->link);
2949 if (!netdev->ifindex) {
2950 ERROR("failed to retrieve the index for %s", netdev->link);
2951 return -1;
2952 }
2953
2954 if (netdev->upscript) {
2955 int err;
2956 err = run_script(handler->name, "net", netdev->upscript,
2957 "up", "phys", netdev->link, (char*) NULL);
2958 if (err)
2959 return -1;
2960 }
2961
2962 return 0;
2963 }
2964
2965 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2966 {
2967 int err;
2968
2969 if (netdev->downscript) {
2970 err = run_script(handler->name, "net", netdev->downscript,
2971 "down", "phys", netdev->link, (char*) NULL);
2972 if (err)
2973 return -1;
2974 }
2975 return 0;
2976 }
2977
2978 static int instantiate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
2979 {
2980 netdev->ifindex = 0;
2981 return 0;
2982 }
2983
2984 static int instantiate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2985 {
2986 netdev->ifindex = 0;
2987 if (netdev->upscript) {
2988 int err;
2989 err = run_script(handler->name, "net", netdev->upscript,
2990 "up", "empty", (char*) NULL);
2991 if (err)
2992 return -1;
2993 }
2994 return 0;
2995 }
2996
2997 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2998 {
2999 int err;
3000
3001 if (netdev->downscript) {
3002 err = run_script(handler->name, "net", netdev->downscript,
3003 "down", "empty", (char*) NULL);
3004 if (err)
3005 return -1;
3006 }
3007 return 0;
3008 }
3009
/* Shutdown handler for "none" networks: nothing is done, always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3014
3015 int lxc_requests_empty_network(struct lxc_handler *handler)
3016 {
3017 struct lxc_list *network = &handler->conf->network;
3018 struct lxc_list *iterator;
3019 struct lxc_netdev *netdev;
3020 bool found_none = false, found_nic = false;
3021
3022 if (lxc_list_empty(network))
3023 return 0;
3024
3025 lxc_list_for_each(iterator, network) {
3026
3027 netdev = iterator->elem;
3028
3029 if (netdev->type == LXC_NET_NONE)
3030 found_none = true;
3031 else
3032 found_nic = true;
3033 }
3034 if (found_none && !found_nic)
3035 return 1;
3036 return 0;
3037 }
3038
3039 int lxc_setup_networks_in_parent_namespaces(struct lxc_handler *handler)
3040 {
3041 bool am_root;
3042 struct lxc_netdev *netdev;
3043 struct lxc_list *iterator;
3044 struct lxc_list *network = &handler->conf->network;
3045
3046 /* We need to be root. */
3047 am_root = (getuid() == 0);
3048 if (!am_root)
3049 return 0;
3050
3051 lxc_list_for_each(iterator, network) {
3052 netdev = iterator->elem;
3053
3054 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3055 ERROR("invalid network configuration type '%d'",
3056 netdev->type);
3057 return -1;
3058 }
3059
3060 if (netdev->type != LXC_NET_MACVLAN &&
3061 netdev->priv.macvlan_attr.mode) {
3062 ERROR("Invalid macvlan.mode for a non-macvlan netdev");
3063 return -1;
3064 }
3065
3066 if (netdev->type != LXC_NET_VETH &&
3067 netdev->priv.veth_attr.pair) {
3068 ERROR("Invalid veth pair for a non-veth netdev");
3069 return -1;
3070 }
3071
3072 if (netdev->type != LXC_NET_VLAN &&
3073 netdev->priv.vlan_attr.vid > 0) {
3074 ERROR("Invalid vlan.id for a non-macvlan netdev");
3075 return -1;
3076 }
3077
3078 if (netdev_conf[netdev->type](handler, netdev)) {
3079 ERROR("failed to create netdev");
3080 return -1;
3081 }
3082
3083 }
3084
3085 return 0;
3086 }
3087
3088 bool lxc_delete_network(struct lxc_handler *handler)
3089 {
3090 int ret;
3091 struct lxc_list *network = &handler->conf->network;
3092 struct lxc_list *iterator;
3093 struct lxc_netdev *netdev;
3094 bool deleted_all = true;
3095
3096 lxc_list_for_each(iterator, network) {
3097 netdev = iterator->elem;
3098
3099 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3100 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3101 WARN("Failed to rename interface with index %d "
3102 "to its initial name \"%s\".",
3103 netdev->ifindex, netdev->link);
3104 continue;
3105 }
3106
3107 if (netdev_deconf[netdev->type](handler, netdev)) {
3108 WARN("Failed to destroy netdev");
3109 }
3110
3111 /* Recent kernel remove the virtual interfaces when the network
3112 * namespace is destroyed but in case we did not moved the
3113 * interface to the network namespace, we have to destroy it
3114 */
3115 if (netdev->ifindex != 0) {
3116 ret = lxc_netdev_delete_by_index(netdev->ifindex);
3117 if (-ret == ENODEV) {
3118 INFO("Interface \"%s\" with index %d already "
3119 "deleted or existing in different network "
3120 "namespace.",
3121 netdev->name ? netdev->name : "(null)",
3122 netdev->ifindex);
3123 } else if (ret < 0) {
3124 deleted_all = false;
3125 WARN("Failed to remove interface \"%s\" with "
3126 "index %d: %s.",
3127 netdev->name ? netdev->name : "(null)",
3128 netdev->ifindex, strerror(-ret));
3129 } else {
3130 INFO("Removed interface \"%s\" with index %d.",
3131 netdev->name ? netdev->name : "(null)",
3132 netdev->ifindex);
3133 }
3134 }
3135
3136 /* Explicitly delete host veth device to prevent lingering
3137 * devices. We had issues in LXD around this.
3138 */
3139 if (netdev->ifindex != 0 && netdev->type == LXC_NET_VETH && !am_unpriv()) {
3140 char *hostveth;
3141 if (netdev->priv.veth_attr.pair) {
3142 hostveth = netdev->priv.veth_attr.pair;
3143 ret = lxc_netdev_delete_by_name(hostveth);
3144 if (ret < 0) {
3145 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth, strerror(-ret));
3146 } else {
3147 INFO("Removed interface \"%s\" from host.", hostveth);
3148 }
3149 } else if (strlen(netdev->priv.veth_attr.veth1) > 0) {
3150 hostveth = netdev->priv.veth_attr.veth1;
3151 ret = lxc_netdev_delete_by_name(hostveth);
3152 if (ret < 0) {
3153 WARN("Failed to remove \"%s\" from host: %s.", hostveth, strerror(-ret));
3154 } else {
3155 INFO("Removed interface \"%s\" from host.", hostveth);
3156 memset((void *)&netdev->priv.veth_attr.veth1, 0, sizeof(netdev->priv.veth_attr.veth1));
3157 }
3158 }
3159 }
3160 }
3161
3162 return deleted_all;
3163 }
3164
3165 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3166
3167 /* lxc-user-nic returns "interface_name:interface_name\n" */
3168 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3169 static int unpriv_assign_nic(const char *lxcpath, char *lxcname,
3170 struct lxc_netdev *netdev, pid_t pid)
3171 {
3172 pid_t child;
3173 int bytes, pipefd[2];
3174 char *token, *saveptr = NULL;
3175 char buffer[MAX_BUFFER_SIZE];
3176 char netdev_link[IFNAMSIZ + 1];
3177
3178 if (netdev->type != LXC_NET_VETH) {
3179 ERROR("nic type %d not support for unprivileged use",
3180 netdev->type);
3181 return -1;
3182 }
3183
3184 if (pipe(pipefd) < 0) {
3185 SYSERROR("pipe failed");
3186 return -1;
3187 }
3188
3189 child = fork();
3190 if (child < 0) {
3191 SYSERROR("fork");
3192 close(pipefd[0]);
3193 close(pipefd[1]);
3194 return -1;
3195 }
3196
3197 if (child == 0) { // child
3198 /* Call lxc-user-nic pid type bridge. */
3199 int ret;
3200 char pidstr[LXC_NUMSTRLEN64];
3201
3202 close(pipefd[0]); /* Close the read-end of the pipe. */
3203
3204 /* Redirect stdout to write-end of the pipe. */
3205 ret = dup2(pipefd[1], STDOUT_FILENO);
3206 close(pipefd[1]); /* Close the write-end of the pipe. */
3207 if (ret < 0) {
3208 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3209 exit(EXIT_FAILURE);
3210 }
3211
3212 if (netdev->link)
3213 strncpy(netdev_link, netdev->link, IFNAMSIZ);
3214 else
3215 strncpy(netdev_link, "none", IFNAMSIZ);
3216
3217 ret = snprintf(pidstr, LXC_NUMSTRLEN64, "%d", pid);
3218 if (ret < 0 || ret >= LXC_NUMSTRLEN64)
3219 exit(EXIT_FAILURE);
3220 pidstr[LXC_NUMSTRLEN64 - 1] = '\0';
3221
3222 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath,
3223 lxcname, pidstr, netdev_link, netdev->name);
3224 execlp(LXC_USERNIC_PATH, LXC_USERNIC_PATH, lxcpath, lxcname,
3225 pidstr, "veth", netdev_link, netdev->name, NULL);
3226
3227 SYSERROR("Failed to exec lxc-user-nic.");
3228 exit(EXIT_FAILURE);
3229 }
3230
3231 /* close the write-end of the pipe */
3232 close(pipefd[1]);
3233
3234 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3235 if (bytes < 0)
3236 SYSERROR("Failed to read from pipe file descriptor.");
3237 buffer[bytes - 1] = '\0';
3238
3239 if (wait_for_pid(child) != 0) {
3240 close(pipefd[0]);
3241 return -1;
3242 }
3243
3244 /* close the read-end of the pipe */
3245 close(pipefd[0]);
3246
3247 /* fill netdev->name field */
3248 token = strtok_r(buffer, ":", &saveptr);
3249 if (!token)
3250 return -1;
3251
3252 netdev->name = malloc(IFNAMSIZ + 1);
3253 if (!netdev->name) {
3254 SYSERROR("Failed to allocate memory.");
3255 return -1;
3256 }
3257 memset(netdev->name, 0, IFNAMSIZ + 1);
3258 strncpy(netdev->name, token, IFNAMSIZ);
3259
3260 /* fill netdev->veth_attr.pair field */
3261 token = strtok_r(NULL, ":", &saveptr);
3262 if (!token)
3263 return -1;
3264
3265 netdev->priv.veth_attr.pair = strdup(token);
3266 if (!netdev->priv.veth_attr.pair) {
3267 ERROR("Failed to allocate memory.");
3268 return -1;
3269 }
3270
3271 return 0;
3272 }
3273
3274 int lxc_assign_network(const char *lxcpath, char *lxcname,
3275 struct lxc_list *network, pid_t pid)
3276 {
3277 struct lxc_list *iterator;
3278 struct lxc_netdev *netdev;
3279 char ifname[IFNAMSIZ];
3280 int am_root = (getuid() == 0);
3281 int err;
3282
3283 lxc_list_for_each(iterator, network) {
3284
3285 netdev = iterator->elem;
3286
3287 if (netdev->type == LXC_NET_VETH && !am_root) {
3288 if (netdev->mtu)
3289 INFO("mtu ignored due to insufficient privilege");
3290 if (unpriv_assign_nic(lxcpath, lxcname, netdev, pid))
3291 return -1;
3292 /* lxc-user-nic has moved the nic to the new ns.
3293 * unpriv_assign_nic() fills in netdev->name.
3294 * netdev->ifindex will be filed in at
3295 * lxc_setup_netdev_in_child_namespaces.
3296 */
3297 continue;
3298 }
3299
3300 /* empty network namespace, nothing to move */
3301 if (!netdev->ifindex)
3302 continue;
3303
3304 /* retrieve the name of the interface */
3305 if (!if_indextoname(netdev->ifindex, ifname)) {
3306 ERROR("no interface corresponding to index '%d'", netdev->ifindex);
3307 return -1;
3308 }
3309
3310 err = lxc_netdev_move_by_name(ifname, pid, NULL);
3311 if (err) {
3312 ERROR("failed to move '%s' to the container : %s",
3313 netdev->link, strerror(-err));
3314 return -1;
3315 }
3316
3317 DEBUG("move '%s'/'%s' to '%d': .", ifname, netdev->name, pid);
3318 }
3319
3320 return 0;
3321 }
3322
3323 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3324 size_t buf_size)
3325 {
3326 char path[MAXPATHLEN];
3327 int fd, ret;
3328
3329 ret = snprintf(path, MAXPATHLEN, "/proc/%d/%cid_map", pid,
3330 idtype == ID_TYPE_UID ? 'u' : 'g');
3331 if (ret < 0 || ret >= MAXPATHLEN) {
3332 ERROR("failed to create path \"%s\"", path);
3333 return -E2BIG;
3334 }
3335
3336 fd = open(path, O_WRONLY);
3337 if (fd < 0) {
3338 SYSERROR("failed to open \"%s\"", path);
3339 return -1;
3340 }
3341
3342 errno = 0;
3343 ret = lxc_write_nointr(fd, buf, buf_size);
3344 if (ret != buf_size) {
3345 SYSERROR("failed to write %cid mapping to \"%s\"",
3346 idtype == ID_TYPE_UID ? 'u' : 'g', path);
3347 close(fd);
3348 return -1;
3349 }
3350 close(fd);
3351
3352 return 0;
3353 }
3354
3355 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both.
3356 *
3357 * @return 1 if functional binary was found
3358 * @return 0 if binary exists but is lacking privilege
3359 * @return -ENOENT if binary does not exist
3360 * @return -EINVAL if cap to check is neither CAP_SETUID nor CAP_SETGID
3361 *
3362 */
3363 static int idmaptool_on_path_and_privileged(const char *binary, cap_value_t cap)
3364 {
3365 char *path;
3366 int ret;
3367 struct stat st;
3368 int fret = 0;
3369
3370 if (cap != CAP_SETUID && cap != CAP_SETGID)
3371 return -EINVAL;
3372
3373 path = on_path(binary, NULL);
3374 if (!path)
3375 return -ENOENT;
3376
3377 ret = stat(path, &st);
3378 if (ret < 0) {
3379 fret = -errno;
3380 goto cleanup;
3381 }
3382
3383 /* Check if the binary is setuid. */
3384 if (st.st_mode & S_ISUID) {
3385 DEBUG("The binary \"%s\" does have the setuid bit set.", path);
3386 fret = 1;
3387 goto cleanup;
3388 }
3389
3390 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3391 /* Check if it has the CAP_SETUID capability. */
3392 if ((cap & CAP_SETUID) &&
3393 lxc_file_cap_is_set(path, CAP_SETUID, CAP_EFFECTIVE) &&
3394 lxc_file_cap_is_set(path, CAP_SETUID, CAP_PERMITTED)) {
3395 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3396 "and CAP_PERMITTED sets.", path);
3397 fret = 1;
3398 goto cleanup;
3399 }
3400
3401 /* Check if it has the CAP_SETGID capability. */
3402 if ((cap & CAP_SETGID) &&
3403 lxc_file_cap_is_set(path, CAP_SETGID, CAP_EFFECTIVE) &&
3404 lxc_file_cap_is_set(path, CAP_SETGID, CAP_PERMITTED)) {
3405 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3406 "and CAP_PERMITTED sets.", path);
3407 fret = 1;
3408 goto cleanup;
3409 }
3410 #else
3411 /* If we cannot check for file capabilities we need to give the benefit
3412 * of the doubt. Otherwise we might fail even though all the necessary
3413 * file capabilities are set.
3414 */
3415 DEBUG("Cannot check for file capabilites as full capability support is "
3416 "missing. Manual intervention needed.");
3417 fret = 1;
3418 #endif
3419
3420 cleanup:
3421 free(path);
3422 return fret;
3423 }
3424
/* Replace the current process image with "/bin/sh -c <args>", where @args
 * is the command string to run.
 *
 * Only returns (with -1) when execl() fails.
 */
int lxc_map_ids_exec_wrapper(void *args)
{
	char *shell_cmd = args;

	execl("/bin/sh", "sh", "-c", shell_cmd, (char *)NULL);

	/* Reached only if the exec itself failed. */
	return -1;
}
3430
3431 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3432 {
3433 struct id_map *map;
3434 struct lxc_list *iterator;
3435 enum idtype type;
3436 char u_or_g;
3437 char *pos;
3438 int fill, left;
3439 char cmd_output[MAXPATHLEN];
3440 /* strlen("new@idmap") = 9
3441 * +
3442 * strlen(" ") = 1
3443 * +
3444 * LXC_NUMSTRLEN64
3445 * +
3446 * strlen(" ") = 1
3447 *
3448 * We add some additional space to make sure that we really have
3449 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3450 */
3451 char mapbuf[9 + 1 + LXC_NUMSTRLEN64 + 1 + LXC_IDMAPLEN] = {0};
3452 int ret = 0, uidmap = 0, gidmap = 0;
3453 bool use_shadow = false, had_entry = false;
3454
3455 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3456 * ranges, then insist that root also reserve ranges in subuid. This
3457 * will protected it by preventing another user from being handed the
3458 * range by shadow.
3459 */
3460 uidmap = idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID);
3461 if (uidmap == -ENOENT)
3462 WARN("newuidmap binary is missing");
3463 else if (!uidmap)
3464 WARN("newuidmap is lacking necessary privileges");
3465
3466 gidmap = idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID);
3467 if (gidmap == -ENOENT)
3468 WARN("newgidmap binary is missing");
3469 else if (!gidmap)
3470 WARN("newgidmap is lacking necessary privileges");
3471
3472 if (uidmap > 0 && gidmap > 0) {
3473 DEBUG("Functional newuidmap and newgidmap binary found.");
3474 use_shadow = true;
3475 } else {
3476 /* In case unprivileged users run application containers via
3477 * execute() or a start*() there are valid cases where they may
3478 * only want to map their own {g,u}id. Let's not block them from
3479 * doing so by requiring geteuid() == 0.
3480 */
3481 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3482 "write directly with euid %d.", geteuid());
3483 }
3484
3485 for (type = ID_TYPE_UID, u_or_g = 'u'; type <= ID_TYPE_GID;
3486 type++, u_or_g = 'g') {
3487 pos = mapbuf;
3488
3489 if (use_shadow)
3490 pos += sprintf(mapbuf, "new%cidmap %d", u_or_g, pid);
3491
3492 lxc_list_for_each(iterator, idmap) {
3493 /* The kernel only takes <= 4k for writes to
3494 * /proc/<nr>/[ug]id_map
3495 */
3496 map = iterator->elem;
3497 if (map->idtype != type)
3498 continue;
3499
3500 had_entry = true;
3501
3502 left = LXC_IDMAPLEN - (pos - mapbuf);
3503 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3504 use_shadow ? " " : "", map->nsid,
3505 map->hostid, map->range,
3506 use_shadow ? "" : "\n");
3507 if (fill <= 0 || fill >= left)
3508 SYSERROR("Too many {g,u}id mappings defined.");
3509
3510 pos += fill;
3511 }
3512 if (!had_entry)
3513 continue;
3514
3515 /* Try to catch the ouput of new{g,u}idmap to make debugging
3516 * easier.
3517 */
3518 if (use_shadow) {
3519 ret = run_command(cmd_output, sizeof(cmd_output),
3520 lxc_map_ids_exec_wrapper,
3521 (void *)mapbuf);
3522 if (ret < 0) {
3523 ERROR("new%cidmap failed to write mapping: %s",
3524 u_or_g, cmd_output);
3525 return -1;
3526 }
3527 } else {
3528 ret = write_id_mapping(type, pid, mapbuf, pos - mapbuf);
3529 if (ret < 0)
3530 return -1;
3531 }
3532
3533 memset(mapbuf, 0, sizeof(mapbuf));
3534 }
3535
3536 return 0;
3537 }
3538
3539 /*
3540 * return the host uid/gid to which the container root is mapped in
3541 * *val.
3542 * Return true if id was found, false otherwise.
3543 */
3544 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3545 unsigned long *val)
3546 {
3547 struct lxc_list *it;
3548 struct id_map *map;
3549
3550 lxc_list_for_each(it, &conf->id_map) {
3551 map = it->elem;
3552 if (map->idtype != idtype)
3553 continue;
3554 if (map->nsid != 0)
3555 continue;
3556 *val = map->hostid;
3557 return true;
3558 }
3559 return false;
3560 }
3561
3562 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3563 {
3564 struct lxc_list *it;
3565 struct id_map *map;
3566 lxc_list_for_each(it, &conf->id_map) {
3567 map = it->elem;
3568 if (map->idtype != idtype)
3569 continue;
3570 if (id >= map->hostid && id < map->hostid + map->range)
3571 return (id - map->hostid) + map->nsid;
3572 }
3573 return -1;
3574 }
3575
3576 int find_unmapped_nsid(struct lxc_conf *conf, enum idtype idtype)
3577 {
3578 struct lxc_list *it;
3579 struct id_map *map;
3580 unsigned int freeid = 0;
3581 again:
3582 lxc_list_for_each(it, &conf->id_map) {
3583 map = it->elem;
3584 if (map->idtype != idtype)
3585 continue;
3586 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3587 freeid = map->nsid + map->range;
3588 goto again;
3589 }
3590 }
3591 return freeid;
3592 }
3593
3594 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3595 {
3596 struct lxc_list *network = &handler->conf->network;
3597 struct lxc_list *iterator;
3598 struct lxc_netdev *netdev;
3599 int link_index;
3600
3601 lxc_list_for_each(iterator, network) {
3602 netdev = iterator->elem;
3603
3604 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3605 continue;
3606
3607 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3608 ERROR("gateway = auto only supported for "
3609 "veth and macvlan");
3610 return -1;
3611 }
3612
3613 if (!netdev->link) {
3614 ERROR("gateway = auto needs a link interface");
3615 return -1;
3616 }
3617
3618 link_index = if_nametoindex(netdev->link);
3619 if (!link_index)
3620 return -EINVAL;
3621
3622 if (netdev->ipv4_gateway_auto) {
3623 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3624 ERROR("failed to automatically find ipv4 gateway "
3625 "address from link interface '%s'", netdev->link);
3626 return -1;
3627 }
3628 }
3629
3630 if (netdev->ipv6_gateway_auto) {
3631 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3632 ERROR("failed to automatically find ipv6 gateway "
3633 "address from link interface '%s'", netdev->link);
3634 return -1;
3635 }
3636 }
3637 }
3638
3639 return 0;
3640 }
3641
3642 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3643 {
3644 struct lxc_tty_info *tty_info = &conf->tty_info;
3645 int i, ret;
3646
3647 /* no tty in the configuration */
3648 if (!conf->tty)
3649 return 0;
3650
3651 tty_info->pty_info = malloc(sizeof(*tty_info->pty_info) * conf->tty);
3652 if (!tty_info->pty_info) {
3653 SYSERROR("failed to allocate struct *pty_info");
3654 return -ENOMEM;
3655 }
3656
3657 for (i = 0; i < conf->tty; i++) {
3658 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3659
3660 process_lock();
3661 ret = openpty(&pty_info->master, &pty_info->slave,
3662 pty_info->name, NULL, NULL);
3663 process_unlock();
3664 if (ret) {
3665 SYSERROR("failed to create pty device number %d", i);
3666 tty_info->nbtty = i;
3667 lxc_delete_tty(tty_info);
3668 return -ENOTTY;
3669 }
3670
3671 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3672 pty_info->name, pty_info->master, pty_info->slave);
3673
3674 /* Prevent leaking the file descriptors to the container */
3675 ret = fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3676 if (ret < 0)
3677 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3678 "pty device \"%s\": %s",
3679 pty_info->master, pty_info->name, strerror(errno));
3680
3681 ret = fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3682 if (ret < 0)
3683 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3684 "pty device \"%s\": %s",
3685 pty_info->slave, pty_info->name, strerror(errno));
3686
3687 pty_info->busy = 0;
3688 }
3689
3690 tty_info->nbtty = conf->tty;
3691
3692 INFO("finished allocating %d pts devices", conf->tty);
3693 return 0;
3694 }
3695
3696 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3697 {
3698 int i;
3699
3700 for (i = 0; i < tty_info->nbtty; i++) {
3701 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3702
3703 close(pty_info->master);
3704 close(pty_info->slave);
3705 }
3706
3707 free(tty_info->pty_info);
3708 tty_info->pty_info = NULL;
3709 tty_info->nbtty = 0;
3710 }
3711
3712
/* Callback handed to run_command() by chown_mapped_root(): executes
 * lxc-usernsexec (looked up via PATH) with @args as its NULL-terminated
 * argument vector. Only returns (with -1) when execvp() failed.
 */
int chown_mapped_root_exec_wrapper(void *args)
{
	char **argv = args;

	execvp("lxc-usernsexec", argv);

	/* Only reached if the exec itself failed. */
	return -1;
}
3718
3719 /*
3720 * chown_mapped_root: for an unprivileged user with uid/gid X to
3721 * chown a dir to subuid/subgid Y, he needs to run chown as root
3722 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3723 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3724 * root is privileged with respect to hostuid/hostgid X, allowing
3725 * him to do the chown.
3726 */
3727 int chown_mapped_root(char *path, struct lxc_conf *conf)
3728 {
3729 uid_t rootuid, rootgid;
3730 unsigned long val;
3731 char *chownpath = path;
3732 int hostuid, hostgid, ret;
3733 struct stat sb;
3734 char map1[100], map2[100], map3[100], map4[100], map5[100];
3735 char ugid[100];
3736 char *args1[] = {"lxc-usernsexec",
3737 "-m", map1,
3738 "-m", map2,
3739 "-m", map3,
3740 "-m", map5,
3741 "--", "chown", ugid, path,
3742 NULL};
3743 char *args2[] = {"lxc-usernsexec",
3744 "-m", map1,
3745 "-m", map2,
3746 "-m", map3,
3747 "-m", map4,
3748 "-m", map5,
3749 "--", "chown", ugid, path,
3750 NULL};
3751 char cmd_output[MAXPATHLEN];
3752
3753 hostuid = geteuid();
3754 hostgid = getegid();
3755
3756 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3757 ERROR("No uid mapping for container root");
3758 return -1;
3759 }
3760 rootuid = (uid_t)val;
3761 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3762 ERROR("No gid mapping for container root");
3763 return -1;
3764 }
3765 rootgid = (gid_t)val;
3766
3767 /*
3768 * In case of overlay, we want only the writeable layer to be chowned
3769 */
3770 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3771 chownpath = strchr(path, ':');
3772 if (!chownpath) {
3773 ERROR("Bad overlay path: %s", path);
3774 return -1;
3775 }
3776 chownpath = strchr(chownpath + 1, ':');
3777 if (!chownpath) {
3778 ERROR("Bad overlay path: %s", path);
3779 return -1;
3780 }
3781 chownpath++;
3782 }
3783 path = chownpath;
3784 if (hostuid == 0) {
3785 if (chown(path, rootuid, rootgid) < 0) {
3786 ERROR("Error chowning %s", path);
3787 return -1;
3788 }
3789 return 0;
3790 }
3791
3792 if (rootuid == hostuid) {
3793 // nothing to do
3794 INFO("Container root is our uid; no need to chown");
3795 return 0;
3796 }
3797
3798 /* save the current gid of "path" */
3799 if (stat(path, &sb) < 0) {
3800 ERROR("Error stat %s", path);
3801 return -1;
3802 }
3803
3804 /* Update the path argument in case this was overlayfs. */
3805 args1[sizeof(args1) / sizeof(args1[0]) - 2] = path;
3806 args2[sizeof(args2) / sizeof(args2[0]) - 2] = path;
3807
3808 /*
3809 * A file has to be group-owned by a gid mapped into the
3810 * container, or the container won't be privileged over it.
3811 */
3812 DEBUG("trying to chown \"%s\" to %d", path, hostgid);
3813 if (sb.st_uid == hostuid &&
3814 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3815 chown(path, -1, hostgid) < 0) {
3816 ERROR("Failed chgrping %s", path);
3817 return -1;
3818 }
3819
3820 // "u:0:rootuid:1"
3821 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3822 if (ret < 0 || ret >= 100) {
3823 ERROR("Error uid printing map string");
3824 return -1;
3825 }
3826
3827 // "u:hostuid:hostuid:1"
3828 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3829 if (ret < 0 || ret >= 100) {
3830 ERROR("Error uid printing map string");
3831 return -1;
3832 }
3833
3834 // "g:0:rootgid:1"
3835 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3836 if (ret < 0 || ret >= 100) {
3837 ERROR("Error gid printing map string");
3838 return -1;
3839 }
3840
3841 // "g:pathgid:rootgid+pathgid:1"
3842 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3843 rootgid + (gid_t)sb.st_gid);
3844 if (ret < 0 || ret >= 100) {
3845 ERROR("Error gid printing map string");
3846 return -1;
3847 }
3848
3849 // "g:hostgid:hostgid:1"
3850 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3851 if (ret < 0 || ret >= 100) {
3852 ERROR("Error gid printing map string");
3853 return -1;
3854 }
3855
3856 // "0:pathgid" (chown)
3857 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3858 if (ret < 0 || ret >= 100) {
3859 ERROR("Error owner printing format string for chown");
3860 return -1;
3861 }
3862
3863 if (hostgid == sb.st_gid)
3864 ret = run_command(cmd_output, sizeof(cmd_output),
3865 chown_mapped_root_exec_wrapper,
3866 (void *)args1);
3867 else
3868 ret = run_command(cmd_output, sizeof(cmd_output),
3869 chown_mapped_root_exec_wrapper,
3870 (void *)args2);
3871 if (ret < 0)
3872 ERROR("lxc-usernsexec failed: %s", cmd_output);
3873
3874 return ret;
3875 }
3876
3877 int lxc_ttys_shift_ids(struct lxc_conf *c)
3878 {
3879 if (lxc_list_empty(&c->id_map))
3880 return 0;
3881
3882 if (!strcmp(c->console.name, ""))
3883 return 0;
3884
3885 if (chown_mapped_root(c->console.name, c) < 0) {
3886 ERROR("failed to chown console \"%s\"", c->console.name);
3887 return -1;
3888 }
3889
3890 TRACE("chowned console \"%s\"", c->console.name);
3891
3892 return 0;
3893 }
3894
3895 /* NOTE: Must not be called from inside the container namespace! */
3896 int lxc_create_tmp_proc_mount(struct lxc_conf *conf)
3897 {
3898 int mounted;
3899
3900 mounted = lxc_mount_proc_if_needed(conf->rootfs.path ? conf->rootfs.mount : "");
3901 if (mounted == -1) {
3902 SYSERROR("failed to mount /proc in the container");
3903 /* continue only if there is no rootfs */
3904 if (conf->rootfs.path)
3905 return -1;
3906 } else if (mounted == 1) {
3907 conf->tmp_umount_proc = 1;
3908 }
3909
3910 return 0;
3911 }
3912
3913 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3914 {
3915 if (lxc_conf->tmp_umount_proc == 1) {
3916 umount("/proc");
3917 lxc_conf->tmp_umount_proc = 0;
3918 }
3919 }
3920
/*
 * Walk /proc/self/mountinfo and call mount(..., MS_SLAVE, ...) on every
 * entry whose options field contains "shared". Failures are logged but do
 * not abort the walk.
 */
void remount_all_slave(void)
{
	char *line = NULL;
	size_t len = 0;
	FILE *f;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *target, *opts;

		target = get_field(line, 4);
		opts = target ? get_field(target, 2) : NULL;
		if (!opts)
			continue;

		/* Terminate the options field before searching it. */
		null_endofword(opts);
		if (!strstr(opts, "shared"))
			continue;

		null_endofword(target);
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			SYSERROR("Failed to make %s rslave", target);
			ERROR("Continuing...");
		}
	}

	fclose(f);
	free(line);
}
3954
3955 void lxc_execute_bind_init(struct lxc_conf *conf)
3956 {
3957 int ret;
3958 char path[PATH_MAX], destpath[PATH_MAX], *p;
3959
3960 /* If init exists in the container, don't bind mount a static one */
3961 p = choose_init(conf->rootfs.mount);
3962 if (p) {
3963 free(p);
3964 return;
3965 }
3966
3967 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3968 if (ret < 0 || ret >= PATH_MAX) {
3969 WARN("Path name too long searching for lxc.init.static");
3970 return;
3971 }
3972
3973 if (!file_exists(path)) {
3974 INFO("%s does not exist on host", path);
3975 return;
3976 }
3977
3978 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3979 if (ret < 0 || ret >= PATH_MAX) {
3980 WARN("Path name too long for container's lxc.init.static");
3981 return;
3982 }
3983
3984 if (!file_exists(destpath)) {
3985 FILE * pathfile = fopen(destpath, "wb");
3986 if (!pathfile) {
3987 SYSERROR("Failed to create mount target '%s'", destpath);
3988 return;
3989 }
3990 fclose(pathfile);
3991 }
3992
3993 ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount);
3994 if (ret < 0)
3995 SYSERROR("Failed to bind lxc.init.static into container");
3996 INFO("lxc.init.static bound into container at %s", path);
3997 }
3998
3999 /*
4000 * This does the work of remounting / if it is shared, calling the
4001 * container pre-mount hooks, and mounting the rootfs.
4002 */
4003 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
4004 {
4005 if (conf->rootfs_setup) {
4006 /*
4007 * rootfs was set up in another namespace. bind-mount it
4008 * to give us a mount in our own ns so we can pivot_root to it
4009 */
4010 const char *path = conf->rootfs.mount;
4011 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
4012 ERROR("Failed to bind-mount container / onto itself");
4013 return -1;
4014 }
4015 return 0;
4016 }
4017
4018 remount_all_slave();
4019
4020 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
4021 ERROR("failed to run pre-mount hooks for container '%s'.", name);
4022 return -1;
4023 }
4024
4025 if (lxc_setup_rootfs(conf)) {
4026 ERROR("failed to setup rootfs for '%s'", name);
4027 return -1;
4028 }
4029
4030 conf->rootfs_setup = true;
4031 return 0;
4032 }
4033
4034 static bool verify_start_hooks(struct lxc_conf *conf)
4035 {
4036 struct lxc_list *it;
4037 char path[MAXPATHLEN];
4038 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
4039 char *hookname = it->elem;
4040 struct stat st;
4041 int ret;
4042
4043 ret = snprintf(path, MAXPATHLEN, "%s%s",
4044 conf->rootfs.path ? conf->rootfs.mount : "", hookname);
4045 if (ret < 0 || ret >= MAXPATHLEN)
4046 return false;
4047 ret = stat(path, &st);
4048 if (ret) {
4049 SYSERROR("Start hook %s not found in container",
4050 hookname);
4051 return false;
4052 }
4053 return true;
4054 }
4055
4056 return true;
4057 }
4058
4059 static int lxc_send_ttys_to_parent(struct lxc_handler *handler)
4060 {
4061 int i;
4062 int *ttyfds;
4063 struct lxc_pty_info *pty_info;
4064 struct lxc_conf *conf = handler->conf;
4065 const struct lxc_tty_info *tty_info = &conf->tty_info;
4066 int sock = handler->ttysock[0];
4067 int ret = -1;
4068 size_t num_ttyfds = (2 * conf->tty);
4069
4070 ttyfds = malloc(num_ttyfds * sizeof(int));
4071 if (!ttyfds)
4072 return -1;
4073
4074 for (i = 0; i < num_ttyfds; i++) {
4075 pty_info = &tty_info->pty_info[i / 2];
4076 ttyfds[i++] = pty_info->slave;
4077 ttyfds[i] = pty_info->master;
4078 TRACE("send pty \"%s\" with master fd %d and slave fd %d to "
4079 "parent",
4080 pty_info->name, pty_info->master, pty_info->slave);
4081 }
4082
4083 ret = lxc_abstract_unix_send_fds(sock, ttyfds, num_ttyfds, NULL, 0);
4084 if (ret < 0)
4085 ERROR("failed to send %d ttys to parent: %s", conf->tty,
4086 strerror(errno));
4087 else
4088 TRACE("sent %d ttys to parent", conf->tty);
4089
4090 close(handler->ttysock[0]);
4091 close(handler->ttysock[1]);
4092
4093 for (i = 0; i < num_ttyfds; i++)
4094 close(ttyfds[i]);
4095
4096 free(ttyfds);
4097
4098 return ret;
4099 }
4100
4101 int lxc_setup(struct lxc_handler *handler)
4102 {
4103 const char *name = handler->name;
4104 struct lxc_conf *lxc_conf = handler->conf;
4105 const char *lxcpath = handler->lxcpath;
4106
4107 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4108 ERROR("Error setting up rootfs mount after spawn");
4109 return -1;
4110 }
4111
4112 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4113 if (setup_utsname(lxc_conf->utsname)) {
4114 ERROR("failed to setup the utsname for '%s'", name);
4115 return -1;
4116 }
4117 }
4118
4119 if (lxc_setup_networks_in_child_namespaces(lxc_conf,
4120 &lxc_conf->network)) {
4121 ERROR("failed to setup the network for '%s'", name);
4122 return -1;
4123 }
4124
4125 if (lxc_conf->autodev > 0) {
4126 if (mount_autodev(name, &lxc_conf->rootfs, lxcpath)) {
4127 ERROR("failed to mount /dev in the container");
4128 return -1;
4129 }
4130 }
4131
4132 /* do automatic mounts (mainly /proc and /sys), but exclude
4133 * those that need to wait until other stuff has finished
4134 */
4135 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4136 ERROR("failed to setup the automatic mounts for '%s'", name);
4137 return -1;
4138 }
4139
4140 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name, lxcpath)) {
4141 ERROR("failed to setup the mounts for '%s'", name);
4142 return -1;
4143 }
4144
4145 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name, lxcpath)) {
4146 ERROR("failed to setup the mount entries for '%s'", name);
4147 return -1;
4148 }
4149
4150 /* Make sure any start hooks are in the container */
4151 if (!verify_start_hooks(lxc_conf))
4152 return -1;
4153
4154 if (lxc_conf->is_execute)
4155 lxc_execute_bind_init(lxc_conf);
4156
4157 /* now mount only cgroup, if wanted;
4158 * before, /sys could not have been mounted
4159 * (is either mounted automatically or via fstab entries)
4160 */
4161 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4162 ERROR("failed to setup the automatic mounts for '%s'", name);
4163 return -1;
4164 }
4165
4166 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4167 ERROR("failed to run mount hooks for container '%s'.", name);
4168 return -1;
4169 }
4170
4171 if (lxc_conf->autodev > 0) {
4172 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4173 ERROR("failed to run autodev hooks for container '%s'.", name);
4174 return -1;
4175 }
4176 if (lxc_fill_autodev(&lxc_conf->rootfs)) {
4177 ERROR("failed to populate /dev in the container");
4178 return -1;
4179 }
4180 }
4181
4182 if (!lxc_conf->is_execute && lxc_setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4183 ERROR("failed to setup the console for '%s'", name);
4184 return -1;
4185 }
4186
4187 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4188 ERROR("failed to setup /dev symlinks for '%s'", name);
4189 return -1;
4190 }
4191
4192 /* mount /proc if it's not already there */
4193 if (lxc_create_tmp_proc_mount(lxc_conf) < 0) {
4194 ERROR("failed to LSM mount proc for '%s'", name);
4195 return -1;
4196 }
4197
4198 if (setup_pivot_root(&lxc_conf->rootfs)) {
4199 ERROR("failed to set rootfs for '%s'", name);
4200 return -1;
4201 }
4202
4203 if (lxc_setup_devpts(lxc_conf->pts)) {
4204 ERROR("failed to setup the new pts instance");
4205 return -1;
4206 }
4207
4208 if (lxc_create_tty(name, lxc_conf)) {
4209 ERROR("failed to create the ttys");
4210 return -1;
4211 }
4212
4213 if (lxc_send_ttys_to_parent(handler) < 0) {
4214 ERROR("failure sending console info to parent");
4215 return -1;
4216 }
4217
4218 if (!lxc_conf->is_execute && lxc_setup_tty(lxc_conf)) {
4219 ERROR("failed to setup the ttys for '%s'", name);
4220 return -1;
4221 }
4222
4223 if (lxc_conf->pty_names && setenv("container_ttys", lxc_conf->pty_names, 1))
4224 SYSERROR("failed to set environment variable for container ptys");
4225
4226
4227 if (setup_personality(lxc_conf->personality)) {
4228 ERROR("failed to setup personality");
4229 return -1;
4230 }
4231
4232 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4233 if (!lxc_list_empty(&lxc_conf->caps)) {
4234 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4235 return -1;
4236 }
4237 if (dropcaps_except(&lxc_conf->keepcaps)) {
4238 ERROR("failed to keep requested caps");
4239 return -1;
4240 }
4241 } else if (setup_caps(&lxc_conf->caps)) {
4242 ERROR("failed to drop capabilities");
4243 return -1;
4244 }
4245
4246 NOTICE("Container \"%s\" is set up", name);
4247
4248 return 0;
4249 }
4250
4251 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4252 const char *lxcpath, char *argv[])
4253 {
4254 int which = -1;
4255 struct lxc_list *it;
4256
4257 if (strcmp(hook, "pre-start") == 0)
4258 which = LXCHOOK_PRESTART;
4259 else if (strcmp(hook, "pre-mount") == 0)
4260 which = LXCHOOK_PREMOUNT;
4261 else if (strcmp(hook, "mount") == 0)
4262 which = LXCHOOK_MOUNT;
4263 else if (strcmp(hook, "autodev") == 0)
4264 which = LXCHOOK_AUTODEV;
4265 else if (strcmp(hook, "start") == 0)
4266 which = LXCHOOK_START;
4267 else if (strcmp(hook, "stop") == 0)
4268 which = LXCHOOK_STOP;
4269 else if (strcmp(hook, "post-stop") == 0)
4270 which = LXCHOOK_POSTSTOP;
4271 else if (strcmp(hook, "clone") == 0)
4272 which = LXCHOOK_CLONE;
4273 else if (strcmp(hook, "destroy") == 0)
4274 which = LXCHOOK_DESTROY;
4275 else
4276 return -1;
4277 lxc_list_for_each(it, &conf->hooks[which]) {
4278 int ret;
4279 char *hookname = it->elem;
4280 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4281 if (ret)
4282 return ret;
4283 }
4284 return 0;
4285 }
4286
4287 int lxc_clear_config_caps(struct lxc_conf *c)
4288 {
4289 struct lxc_list *it,*next;
4290
4291 lxc_list_for_each_safe(it, &c->caps, next) {
4292 lxc_list_del(it);
4293 free(it->elem);
4294 free(it);
4295 }
4296 return 0;
4297 }
4298
4299 static int lxc_free_idmap(struct lxc_list *id_map) {
4300 struct lxc_list *it, *next;
4301
4302 lxc_list_for_each_safe(it, id_map, next) {
4303 lxc_list_del(it);
4304 free(it->elem);
4305 free(it);
4306 }
4307 return 0;
4308 }
4309
4310 int lxc_clear_idmaps(struct lxc_conf *c)
4311 {
4312 return lxc_free_idmap(&c->id_map);
4313 }
4314
4315 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4316 {
4317 struct lxc_list *it,*next;
4318
4319 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4320 lxc_list_del(it);
4321 free(it->elem);
4322 free(it);
4323 }
4324 return 0;
4325 }
4326
4327 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4328 {
4329 struct lxc_list *it,*next;
4330 bool all = false;
4331 const char *k = NULL;
4332
4333 if (strcmp(key, "lxc.cgroup") == 0)
4334 all = true;
4335 else if (strncmp(key, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4336 k = key + sizeof("lxc.cgroup.")-1;
4337 else
4338 return -1;
4339
4340 lxc_list_for_each_safe(it, &c->cgroup, next) {
4341 struct lxc_cgroup *cg = it->elem;
4342 if (!all && strcmp(cg->subsystem, k) != 0)
4343 continue;
4344 lxc_list_del(it);
4345 free(cg->subsystem);
4346 free(cg->value);
4347 free(cg);
4348 free(it);
4349 }
4350 return 0;
4351 }
4352
4353 int lxc_clear_limits(struct lxc_conf *c, const char *key)
4354 {
4355 struct lxc_list *it, *next;
4356 bool all = false;
4357 const char *k = NULL;
4358
4359 if (strcmp(key, "lxc.limit") == 0
4360 || strcmp(key, "lxc.prlimit"))
4361 all = true;
4362 else if (strncmp(key, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4363 k = key + sizeof("lxc.limit.")-1;
4364 else if (strncmp(key, "lxc.prlimit.", sizeof("lxc.prlimit.")-1) == 0)
4365 k = key + sizeof("lxc.prlimit.")-1;
4366 else
4367 return -1;
4368
4369 lxc_list_for_each_safe(it, &c->limits, next) {
4370 struct lxc_limit *lim = it->elem;
4371 if (!all && strcmp(lim->resource, k) != 0)
4372 continue;
4373 lxc_list_del(it);
4374 free(lim->resource);
4375 free(lim);
4376 free(it);
4377 }
4378 return 0;
4379 }
4380
4381 int lxc_clear_groups(struct lxc_conf *c)
4382 {
4383 struct lxc_list *it,*next;
4384
4385 lxc_list_for_each_safe(it, &c->groups, next) {
4386 lxc_list_del(it);
4387 free(it->elem);
4388 free(it);
4389 }
4390 return 0;
4391 }
4392
4393 int lxc_clear_environment(struct lxc_conf *c)
4394 {
4395 struct lxc_list *it,*next;
4396
4397 lxc_list_for_each_safe(it, &c->environment, next) {
4398 lxc_list_del(it);
4399 free(it->elem);
4400 free(it);
4401 }
4402 return 0;
4403 }
4404
4405
4406 int lxc_clear_mount_entries(struct lxc_conf *c)
4407 {
4408 struct lxc_list *it,*next;
4409
4410 lxc_list_for_each_safe(it, &c->mount_list, next) {
4411 lxc_list_del(it);
4412 free(it->elem);
4413 free(it);
4414 }
4415 return 0;
4416 }
4417
4418 int lxc_clear_automounts(struct lxc_conf *c)
4419 {
4420 c->auto_mounts = 0;
4421 return 0;
4422 }
4423
4424 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4425 {
4426 struct lxc_list *it,*next;
4427 bool all = false, done = false;
4428 const char *k = NULL;
4429 int i;
4430
4431 if (strcmp(key, "lxc.hook") == 0)
4432 all = true;
4433 else if (strncmp(key, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4434 k = key + sizeof("lxc.hook.")-1;
4435 else
4436 return -1;
4437
4438 for (i=0; i<NUM_LXC_HOOKS; i++) {
4439 if (all || strcmp(k, lxchook_names[i]) == 0) {
4440 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4441 lxc_list_del(it);
4442 free(it->elem);
4443 free(it);
4444 }
4445 done = true;
4446 }
4447 }
4448
4449 if (!done) {
4450 ERROR("Invalid hook key: %s", key);
4451 return -1;
4452 }
4453 return 0;
4454 }
4455
4456 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4457 {
4458 int i;
4459
4460 if (!conf->saved_nics)
4461 return;
4462 for (i=0; i < conf->num_savednics; i++)
4463 free(conf->saved_nics[i].orig_name);
4464 free(conf->saved_nics);
4465 }
4466
4467 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4468 {
4469 struct lxc_list *it,*next;
4470
4471 lxc_list_for_each_safe(it, &conf->aliens, next) {
4472 lxc_list_del(it);
4473 free(it->elem);
4474 free(it);
4475 }
4476 }
4477
4478 void lxc_clear_includes(struct lxc_conf *conf)
4479 {
4480 struct lxc_list *it,*next;
4481
4482 lxc_list_for_each_safe(it, &conf->includes, next) {
4483 lxc_list_del(it);
4484 free(it->elem);
4485 free(it);
4486 }
4487 }
4488
4489 void lxc_conf_free(struct lxc_conf *conf)
4490 {
4491 if (!conf)
4492 return;
4493 if (current_config == conf)
4494 current_config = NULL;
4495 free(conf->console.log_path);
4496 free(conf->console.path);
4497 free(conf->rootfs.mount);
4498 free(conf->rootfs.bdev_type);
4499 free(conf->rootfs.options);
4500 free(conf->rootfs.path);
4501 free(conf->logfile);
4502 if (conf->logfd != -1)
4503 close(conf->logfd);
4504 free(conf->utsname);
4505 free(conf->ttydir);
4506 free(conf->fstab);
4507 free(conf->rcfile);
4508 free(conf->init_cmd);
4509 free(conf->unexpanded_config);
4510 free(conf->pty_names);
4511 free(conf->syslog);
4512 lxc_free_networks(&conf->network);
4513 free(conf->lsm_aa_profile);
4514 free(conf->lsm_se_context);
4515 lxc_seccomp_free(conf);
4516 lxc_clear_config_caps(conf);
4517 lxc_clear_config_keepcaps(conf);
4518 lxc_clear_cgroups(conf, "lxc.cgroup");
4519 lxc_clear_hooks(conf, "lxc.hook");
4520 lxc_clear_mount_entries(conf);
4521 lxc_clear_saved_nics(conf);
4522 lxc_clear_idmaps(conf);
4523 lxc_clear_groups(conf);
4524 lxc_clear_includes(conf);
4525 lxc_clear_aliens(conf);
4526 lxc_clear_environment(conf);
4527 lxc_clear_limits(conf, "lxc.prlimit");
4528 free(conf);
4529 }
4530
/* Arguments handed to run_userns_fn(). */
struct userns_fn_data {
	/* Function to call once the parent has signalled via the pipe. */
	int (*fn)(void *arg);
	/* Name of @fn used for trace logging; may be NULL. */
	const char *fn_name;
	/* Argument passed to @fn. */
	void *arg;
	/* Pipe: p[0] is read by run_userns_fn(), p[1] is closed there. */
	int p[2];
};
4537
4538 static int run_userns_fn(void *data)
4539 {
4540 struct userns_fn_data *d = data;
4541 char c;
4542
4543 /* Close write end of the pipe. */
4544 close(d->p[1]);
4545
4546 /* Wait for parent to finish establishing a new mapping in the user
4547 * namespace we are executing in.
4548 */
4549 if (read(d->p[0], &c, 1) != 1)
4550 return -1;
4551
4552 /* Close read end of the pipe. */
4553 close(d->p[0]);
4554
4555 if (d->fn_name)
4556 TRACE("calling function \"%s\"", d->fn_name);
4557 /* Call function to run. */
4558 return d->fn(d->arg);
4559 }
4560
4561 static struct id_map *mapped_hostid_entry(struct lxc_conf *conf, unsigned id,
4562 enum idtype idtype)
4563 {
4564 struct lxc_list *it;
4565 struct id_map *map;
4566 struct id_map *retmap = NULL;
4567
4568 lxc_list_for_each(it, &conf->id_map) {
4569 map = it->elem;
4570 if (map->idtype != idtype)
4571 continue;
4572
4573 if (id >= map->hostid && id < map->hostid + map->range) {
4574 retmap = map;
4575 break;
4576 }
4577 }
4578
4579 if (!retmap)
4580 return NULL;
4581
4582 retmap = malloc(sizeof(*retmap));
4583 if (!retmap)
4584 return NULL;
4585
4586 memcpy(retmap, map, sizeof(*retmap));
4587 return retmap;
4588 }
4589
4590 /*
4591 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4592 * existing one or establish a new one.
4593 */
4594 static struct id_map *idmap_add(struct lxc_conf *conf, uid_t id, enum idtype type)
4595 {
4596 int hostid_mapped;
4597 struct id_map *entry = NULL;
4598
4599 /* Reuse existing mapping. */
4600 entry = mapped_hostid_entry(conf, id, type);
4601 if (entry)
4602 return entry;
4603
4604 /* Find new mapping. */
4605 hostid_mapped = find_unmapped_nsid(conf, type);
4606 if (hostid_mapped < 0) {
4607 DEBUG("failed to find free mapping for id %d", id);
4608 return NULL;
4609 }
4610
4611 entry = malloc(sizeof(*entry));
4612 if (!entry)
4613 return NULL;
4614
4615 entry->idtype = type;
4616 entry->nsid = hostid_mapped;
4617 entry->hostid = (unsigned long)id;
4618 entry->range = 1;
4619
4620 return entry;
4621 }
4622
/* Run a function in a new user namespace.
 * The caller's euid/egid will be mapped if it is not already.
 * Afaict, userns_exec_1() is only used to operate based on privileges for the
 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
 * This means we only need to establish a mapping from:
 * - the container root {g,u}id as seen from the host > user's host {g,u}id
 * - the container root -> some sub{g,u}id
 * The former we add if the user did not specify a mapping. The latter we
 * retrieve from the container's configured {g,u}id mappings, as it must have
 * been there to start the container in the first place.
 */
4634 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data,
4635 const char *fn_name)
4636 {
4637 pid_t pid;
4638 uid_t euid, egid;
4639 struct userns_fn_data d;
4640 int p[2];
4641 struct lxc_list *it;
4642 struct id_map *map;
4643 char c = '1';
4644 int ret = -1;
4645 struct lxc_list *idmap = NULL, *tmplist = NULL;
4646 struct id_map *container_root_uid = NULL, *container_root_gid = NULL,
4647 *host_uid_map = NULL, *host_gid_map = NULL;
4648
4649 ret = pipe(p);
4650 if (ret < 0) {
4651 SYSERROR("opening pipe");
4652 return -1;
4653 }
4654 d.fn = fn;
4655 d.fn_name = fn_name;
4656 d.arg = data;
4657 d.p[0] = p[0];
4658 d.p[1] = p[1];
4659
4660 /* Clone child in new user namespace. */
4661 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4662 if (pid < 0) {
4663 ERROR("failed to clone child process in new user namespace");
4664 goto on_error;
4665 }
4666
4667 close(p[0]);
4668 p[0] = -1;
4669
4670 /* Find container root. */
4671 lxc_list_for_each(it, &conf->id_map) {
4672 map = it->elem;
4673
4674 if (map->nsid != 0)
4675 continue;
4676
4677 if (map->idtype == ID_TYPE_UID && container_root_uid == NULL) {
4678 container_root_uid = malloc(sizeof(*container_root_uid));
4679 if (!container_root_uid)
4680 goto on_error;
4681 container_root_uid->idtype = map->idtype;
4682 container_root_uid->hostid = map->hostid;
4683 container_root_uid->nsid = 0;
4684 container_root_uid->range = map->range;
4685 } else if (map->idtype == ID_TYPE_GID && container_root_gid == NULL) {
4686 container_root_gid = malloc(sizeof(*container_root_gid));
4687 if (!container_root_gid)
4688 goto on_error;
4689 container_root_gid->idtype = map->idtype;
4690 container_root_gid->hostid = map->hostid;
4691 container_root_gid->nsid = 0;
4692 container_root_gid->range = map->range;
4693 }
4694
4695 /* Found container root. */
4696 if (container_root_uid && container_root_gid)
4697 break;
4698 }
4699
4700 /* This is actually checked earlier but it can't hurt. */
4701 if (!container_root_uid || !container_root_gid) {
4702 ERROR("no mapping for container root found");
4703 goto on_error;
4704 }
4705
4706 host_uid_map = container_root_uid;
4707 host_gid_map = container_root_gid;
4708
4709 /* Check whether the {g,u}id of the user has a mapping. */
4710 euid = geteuid();
4711 egid = getegid();
4712 if (euid != container_root_uid->hostid)
4713 host_uid_map = idmap_add(conf, euid, ID_TYPE_UID);
4714
4715 if (egid != container_root_gid->hostid)
4716 host_gid_map = idmap_add(conf, egid, ID_TYPE_GID);
4717
4718 if (!host_uid_map) {
4719 DEBUG("failed to find mapping for uid %d", euid);
4720 goto on_error;
4721 }
4722
4723 if (!host_gid_map) {
4724 DEBUG("failed to find mapping for gid %d", egid);
4725 goto on_error;
4726 }
4727
4728 /* Allocate new {g,u}id map list. */
4729 idmap = malloc(sizeof(*idmap));
4730 if (!idmap)
4731 goto on_error;
4732 lxc_list_init(idmap);
4733
4734 /* Add container root to the map. */
4735 tmplist = malloc(sizeof(*tmplist));
4736 if (!tmplist)
4737 goto on_error;
4738 lxc_list_add_elem(tmplist, container_root_uid);
4739 lxc_list_add_tail(idmap, tmplist);
4740
4741 if (host_uid_map && (host_uid_map != container_root_uid)) {
4742 /* idmap will now keep track of that memory. */
4743 container_root_uid = NULL;
4744
4745 /* Add container root to the map. */
4746 tmplist = malloc(sizeof(*tmplist));
4747 if (!tmplist)
4748 goto on_error;
4749 lxc_list_add_elem(tmplist, host_uid_map);
4750 lxc_list_add_tail(idmap, tmplist);
4751 }
4752 /* idmap will now keep track of that memory. */
4753 container_root_uid = NULL;
4754 /* idmap will now keep track of that memory. */
4755 host_uid_map = NULL;
4756
4757 tmplist = malloc(sizeof(*tmplist));
4758 if (!tmplist)
4759 goto on_error;
4760 lxc_list_add_elem(tmplist, container_root_gid);
4761 lxc_list_add_tail(idmap, tmplist);
4762
4763 if (host_gid_map && (host_gid_map != container_root_gid)) {
4764 /* idmap will now keep track of that memory. */
4765 container_root_gid = NULL;
4766
4767 tmplist = malloc(sizeof(*tmplist));
4768 if (!tmplist)
4769 goto on_error;
4770 lxc_list_add_elem(tmplist, host_gid_map);
4771 lxc_list_add_tail(idmap, tmplist);
4772 }
4773 /* idmap will now keep track of that memory. */
4774 container_root_gid = NULL;
4775 /* idmap will now keep track of that memory. */
4776 host_gid_map = NULL;
4777
4778 if (lxc_log_get_level() == LXC_LOG_LEVEL_TRACE ||
4779 conf->loglevel == LXC_LOG_LEVEL_TRACE) {
4780 lxc_list_for_each(it, idmap) {
4781 map = it->elem;
4782 TRACE("establishing %cid mapping for \"%d\" in new "
4783 "user namespace: nsuid %lu - hostid %lu - range "
4784 "%lu",
4785 (map->idtype == ID_TYPE_UID) ? 'u' : 'g', pid,
4786 map->nsid, map->hostid, map->range);
4787 }
4788 }
4789
4790 /* Set up {g,u}id mapping for user namespace of child process. */
4791 ret = lxc_map_ids(idmap, pid);
4792 if (ret < 0) {
4793 ERROR("error setting up {g,u}id mappings for child process "
4794 "\"%d\"",
4795 pid);
4796 goto on_error;
4797 }
4798
4799 /* Tell child to proceed. */
4800 if (write(p[1], &c, 1) != 1) {
4801 SYSERROR("failed telling child process \"%d\" to proceed", pid);
4802 goto on_error;
4803 }
4804
4805 /* Wait for child to finish. */
4806 ret = wait_for_pid(pid);
4807
4808 on_error:
4809 if (idmap)
4810 lxc_free_idmap(idmap);
4811 if (container_root_uid)
4812 free(container_root_uid);
4813 if (container_root_gid)
4814 free(container_root_gid);
4815 if (host_uid_map && (host_uid_map != container_root_uid))
4816 free(host_uid_map);
4817 if (host_gid_map && (host_gid_map != container_root_gid))
4818 free(host_gid_map);
4819
4820 if (p[0] != -1)
4821 close(p[0]);
4822 close(p[1]);
4823
4824 return ret;
4825 }
4826
/*
 * Return a strdup()ed copy of the login name belonging to the effective uid,
 * or NULL when getpwuid() finds no entry. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getuname(void)
{
	struct passwd *pw = getpwuid(geteuid());

	return pw ? strdup(pw->pw_name) : NULL;
}
4838
/*
 * Return a strdup()ed copy of the group name belonging to the effective gid,
 * or NULL when getgrgid() finds no entry. The caller frees the result.
 *
 * not thread-safe, do not use from api without first forking
 */
static char *getgname(void)
{
	struct group *gr = getgrgid(getegid());

	return gr ? strdup(gr->gr_name) : NULL;
}
4850
4851 /* not thread-safe, do not use from api without first forking */
4852 void suggest_default_idmap(void)
4853 {
4854 FILE *f;
4855 unsigned int uid = 0, urange = 0, gid = 0, grange = 0;
4856 char *line = NULL;
4857 char *uname, *gname;
4858 size_t len = 0;
4859
4860 if (!(uname = getuname()))
4861 return;
4862
4863 if (!(gname = getgname())) {
4864 free(uname);
4865 return;
4866 }
4867
4868 f = fopen(subuidfile, "r");
4869 if (!f) {
4870 ERROR("Your system is not configured with subuids");
4871 free(gname);
4872 free(uname);
4873 return;
4874 }
4875 while (getline(&line, &len, f) != -1) {
4876 size_t no_newline = 0;
4877 char *p = strchr(line, ':'), *p2;
4878 if (*line == '#')
4879 continue;
4880 if (!p)
4881 continue;
4882 *p = '\0';
4883 p++;
4884 if (strcmp(line, uname))
4885 continue;
4886 p2 = strchr(p, ':');
4887 if (!p2)
4888 continue;
4889 *p2 = '\0';
4890 p2++;
4891 if (!*p2)
4892 continue;
4893 no_newline = strcspn(p2, "\n");
4894 p2[no_newline] = '\0';
4895
4896 if (lxc_safe_uint(p, &uid) < 0)
4897 WARN("Could not parse UID.");
4898 if (lxc_safe_uint(p2, &urange) < 0)
4899 WARN("Could not parse UID range.");
4900 }
4901 fclose(f);
4902
4903 f = fopen(subgidfile, "r");
4904 if (!f) {
4905 ERROR("Your system is not configured with subgids");
4906 free(gname);
4907 free(uname);
4908 return;
4909 }
4910 while (getline(&line, &len, f) != -1) {
4911 size_t no_newline = 0;
4912 char *p = strchr(line, ':'), *p2;
4913 if (*line == '#')
4914 continue;
4915 if (!p)
4916 continue;
4917 *p = '\0';
4918 p++;
4919 if (strcmp(line, uname))
4920 continue;
4921 p2 = strchr(p, ':');
4922 if (!p2)
4923 continue;
4924 *p2 = '\0';
4925 p2++;
4926 if (!*p2)
4927 continue;
4928 no_newline = strcspn(p2, "\n");
4929 p2[no_newline] = '\0';
4930
4931 if (lxc_safe_uint(p, &gid) < 0)
4932 WARN("Could not parse GID.");
4933 if (lxc_safe_uint(p2, &grange) < 0)
4934 WARN("Could not parse GID range.");
4935 }
4936 fclose(f);
4937
4938 free(line);
4939
4940 if (!urange || !grange) {
4941 ERROR("You do not have subuids or subgids allocated");
4942 ERROR("Unprivileged containers require subuids and subgids");
4943 return;
4944 }
4945
4946 ERROR("You must either run as root, or define uid mappings");
4947 ERROR("To pass uid mappings to lxc-create, you could create");
4948 ERROR("~/.config/lxc/default.conf:");
4949 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG);
4950 ERROR("lxc.id_map = u 0 %u %u", uid, urange);
4951 ERROR("lxc.id_map = g 0 %u %u", gid, grange);
4952
4953 free(gname);
4954 free(uname);
4955 }
4956
4957 static void free_cgroup_settings(struct lxc_list *result)
4958 {
4959 struct lxc_list *iterator, *next;
4960
4961 lxc_list_for_each_safe(iterator, result, next) {
4962 lxc_list_del(iterator);
4963 free(iterator);
4964 }
4965 free(result);
4966 }
4967
4968 /*
4969 * Return the list of cgroup_settings sorted according to the following rules
4970 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
4971 */
4972 struct lxc_list *sort_cgroup_settings(struct lxc_list* cgroup_settings)
4973 {
4974 struct lxc_list *result;
4975 struct lxc_list *memsw_limit = NULL;
4976 struct lxc_list *it = NULL;
4977 struct lxc_cgroup *cg = NULL;
4978 struct lxc_list *item = NULL;
4979
4980 result = malloc(sizeof(*result));
4981 if (!result) {
4982 ERROR("failed to allocate memory to sort cgroup settings");
4983 return NULL;
4984 }
4985 lxc_list_init(result);
4986
4987 /*Iterate over the cgroup settings and copy them to the output list*/
4988 lxc_list_for_each(it, cgroup_settings) {
4989 item = malloc(sizeof(*item));
4990 if (!item) {
4991 ERROR("failed to allocate memory to sort cgroup settings");
4992 free_cgroup_settings(result);
4993 return NULL;
4994 }
4995 item->elem = it->elem;
4996 cg = it->elem;
4997 if (strcmp(cg->subsystem, "memory.memsw.limit_in_bytes") == 0) {
4998 /* Store the memsw_limit location */
4999 memsw_limit = item;
5000 } else if (strcmp(cg->subsystem, "memory.limit_in_bytes") == 0 && memsw_limit != NULL) {
5001 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5002 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5003 item->elem = memsw_limit->elem;
5004 memsw_limit->elem = it->elem;
5005 }
5006 lxc_list_add_tail(result, item);
5007 }
5008
5009 return result;
5010 }