]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
Support providing env vars to container init
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #include "config.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <stdarg.h>
28 #include <errno.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <unistd.h>
32 #include <inttypes.h>
33 #include <sys/wait.h>
34 #include <sys/syscall.h>
35 #include <time.h>
36
37 #if HAVE_PTY_H
38 #include <pty.h>
39 #else
40 #include <../include/openpty.h>
41 #endif
42
43 #include <linux/loop.h>
44
45 #include <sys/types.h>
46 #include <sys/utsname.h>
47 #include <sys/param.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/mount.h>
51 #include <sys/mman.h>
52 #include <sys/prctl.h>
53
54 #include <arpa/inet.h>
55 #include <fcntl.h>
56 #include <netinet/in.h>
57 #include <net/if.h>
58 #include <libgen.h>
59
60 #include "network.h"
61 #include "error.h"
62 #include "parse.h"
63 #include "utils.h"
64 #include "conf.h"
65 #include "log.h"
66 #include "caps.h" /* for lxc_caps_last_cap() */
67 #include "bdev.h"
68 #include "cgroup.h"
69 #include "lxclock.h"
70 #include "namespace.h"
71 #include "lsm/lsm.h"
72
73 #if HAVE_SYS_CAPABILITY_H
74 #include <sys/capability.h>
75 #endif
76
77 #if HAVE_SYS_PERSONALITY_H
78 #include <sys/personality.h>
79 #endif
80
81 #if IS_BIONIC
82 #include <../include/lxcmntent.h>
83 #else
84 #include <mntent.h>
85 #endif
86
87 #include "lxcseccomp.h"
88
89 lxc_log_define(lxc_conf, lxc);
90
91 #define MAXHWLEN 18
92 #define MAXINDEXLEN 20
93 #define MAXMTULEN 16
94 #define MAXLINELEN 128
95
96 #if HAVE_SYS_CAPABILITY_H
97 #ifndef CAP_SETFCAP
98 #define CAP_SETFCAP 31
99 #endif
100
101 #ifndef CAP_MAC_OVERRIDE
102 #define CAP_MAC_OVERRIDE 32
103 #endif
104
105 #ifndef CAP_MAC_ADMIN
106 #define CAP_MAC_ADMIN 33
107 #endif
108 #endif
109
110 #ifndef PR_CAPBSET_DROP
111 #define PR_CAPBSET_DROP 24
112 #endif
113
114 #ifndef LO_FLAGS_AUTOCLEAR
115 #define LO_FLAGS_AUTOCLEAR 4
116 #endif
117
118 /* needed for cgroup automount checks, regardless of whether we
119 * have included linux/capability.h or not */
120 #ifndef CAP_SYS_ADMIN
121 #define CAP_SYS_ADMIN 21
122 #endif
123
124 /* Define pivot_root() if missing from the C library */
125 #ifndef HAVE_PIVOT_ROOT
/*
 * Fallback for C libraries without pivot_root(): issue the raw syscall
 * when its number is known, otherwise fail with errno set to ENOSYS.
 */
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
135 #else
136 extern int pivot_root(const char * new_root, const char * put_old);
137 #endif
138
139 /* Define sethostname() if missing from the C library */
140 #ifndef HAVE_SETHOSTNAME
/*
 * Fallback for C libraries without sethostname(): issue the raw syscall
 * when its number is known, otherwise fail with errno set to ENOSYS.
 */
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
150 #endif
151
152 /* Define __S_ISTYPE if missing from the C library */
153 #ifndef __S_ISTYPE
154 #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
155 #endif
156
157 char *lxchook_names[NUM_LXC_HOOKS] = {
158 "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
159
/* Signature shared by the per-network-type setup and shutdown callbacks. */
typedef int (*instanciate_cb)(struct lxc_handler *handler,
			      struct lxc_netdev *netdev);
161
/* One mount option keyword and the MS_* flag it is associated with. */
struct mount_opt {
	/* option keyword as written in a mount option string */
	char *name;
	/* presumably non-zero when the keyword clears the flag instead of
	 * setting it - confirm against the option parser */
	int clear;
	/* MS_* bit(s) affected by the keyword */
	int flag;
};
167
/* A capability name paired with its numeric CAP_* value. */
struct caps_opt {
	/* capability name */
	char *name;
	/* matching CAP_* constant */
	int value;
};
172
173 /* Declare this here, since we don't want to reshuffle the whole file. */
174 static int in_caplist(int cap, struct lxc_list *caps);
175
176 static int instanciate_veth(struct lxc_handler *, struct lxc_netdev *);
177 static int instanciate_macvlan(struct lxc_handler *, struct lxc_netdev *);
178 static int instanciate_vlan(struct lxc_handler *, struct lxc_netdev *);
179 static int instanciate_phys(struct lxc_handler *, struct lxc_netdev *);
180 static int instanciate_empty(struct lxc_handler *, struct lxc_netdev *);
181 static int instanciate_none(struct lxc_handler *, struct lxc_netdev *);
182
183 static instanciate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
184 [LXC_NET_VETH] = instanciate_veth,
185 [LXC_NET_MACVLAN] = instanciate_macvlan,
186 [LXC_NET_VLAN] = instanciate_vlan,
187 [LXC_NET_PHYS] = instanciate_phys,
188 [LXC_NET_EMPTY] = instanciate_empty,
189 [LXC_NET_NONE] = instanciate_none,
190 };
191
192 static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
193 static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
194 static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
195 static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
196 static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
197 static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
198
199 static instanciate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
200 [LXC_NET_VETH] = shutdown_veth,
201 [LXC_NET_MACVLAN] = shutdown_macvlan,
202 [LXC_NET_VLAN] = shutdown_vlan,
203 [LXC_NET_PHYS] = shutdown_phys,
204 [LXC_NET_EMPTY] = shutdown_empty,
205 [LXC_NET_NONE] = shutdown_none,
206 };
207
208 static struct mount_opt mount_opt[] = {
209 { "defaults", 0, 0 },
210 { "ro", 0, MS_RDONLY },
211 { "rw", 1, MS_RDONLY },
212 { "suid", 1, MS_NOSUID },
213 { "nosuid", 0, MS_NOSUID },
214 { "dev", 1, MS_NODEV },
215 { "nodev", 0, MS_NODEV },
216 { "exec", 1, MS_NOEXEC },
217 { "noexec", 0, MS_NOEXEC },
218 { "sync", 0, MS_SYNCHRONOUS },
219 { "async", 1, MS_SYNCHRONOUS },
220 { "dirsync", 0, MS_DIRSYNC },
221 { "remount", 0, MS_REMOUNT },
222 { "mand", 0, MS_MANDLOCK },
223 { "nomand", 1, MS_MANDLOCK },
224 { "atime", 1, MS_NOATIME },
225 { "noatime", 0, MS_NOATIME },
226 { "diratime", 1, MS_NODIRATIME },
227 { "nodiratime", 0, MS_NODIRATIME },
228 { "bind", 0, MS_BIND },
229 { "rbind", 0, MS_BIND|MS_REC },
230 { "relatime", 0, MS_RELATIME },
231 { "norelatime", 1, MS_RELATIME },
232 { "strictatime", 0, MS_STRICTATIME },
233 { "nostrictatime", 1, MS_STRICTATIME },
234 { NULL, 0, 0 },
235 };
236
237 #if HAVE_SYS_CAPABILITY_H
238 static struct caps_opt caps_opt[] = {
239 { "chown", CAP_CHOWN },
240 { "dac_override", CAP_DAC_OVERRIDE },
241 { "dac_read_search", CAP_DAC_READ_SEARCH },
242 { "fowner", CAP_FOWNER },
243 { "fsetid", CAP_FSETID },
244 { "kill", CAP_KILL },
245 { "setgid", CAP_SETGID },
246 { "setuid", CAP_SETUID },
247 { "setpcap", CAP_SETPCAP },
248 { "linux_immutable", CAP_LINUX_IMMUTABLE },
249 { "net_bind_service", CAP_NET_BIND_SERVICE },
250 { "net_broadcast", CAP_NET_BROADCAST },
251 { "net_admin", CAP_NET_ADMIN },
252 { "net_raw", CAP_NET_RAW },
253 { "ipc_lock", CAP_IPC_LOCK },
254 { "ipc_owner", CAP_IPC_OWNER },
255 { "sys_module", CAP_SYS_MODULE },
256 { "sys_rawio", CAP_SYS_RAWIO },
257 { "sys_chroot", CAP_SYS_CHROOT },
258 { "sys_ptrace", CAP_SYS_PTRACE },
259 { "sys_pacct", CAP_SYS_PACCT },
260 { "sys_admin", CAP_SYS_ADMIN },
261 { "sys_boot", CAP_SYS_BOOT },
262 { "sys_nice", CAP_SYS_NICE },
263 { "sys_resource", CAP_SYS_RESOURCE },
264 { "sys_time", CAP_SYS_TIME },
265 { "sys_tty_config", CAP_SYS_TTY_CONFIG },
266 { "mknod", CAP_MKNOD },
267 { "lease", CAP_LEASE },
268 #ifdef CAP_AUDIT_WRITE
269 { "audit_write", CAP_AUDIT_WRITE },
270 #endif
271 #ifdef CAP_AUDIT_CONTROL
272 { "audit_control", CAP_AUDIT_CONTROL },
273 #endif
274 { "setfcap", CAP_SETFCAP },
275 { "mac_override", CAP_MAC_OVERRIDE },
276 { "mac_admin", CAP_MAC_ADMIN },
277 #ifdef CAP_SYSLOG
278 { "syslog", CAP_SYSLOG },
279 #endif
280 #ifdef CAP_WAKE_ALARM
281 { "wake_alarm", CAP_WAKE_ALARM },
282 #endif
283 };
284 #else
285 static struct caps_opt caps_opt[] = {};
286 #endif
287
288 static int run_buffer(char *buffer)
289 {
290 struct lxc_popen_FILE *f;
291 char *output;
292 int ret;
293
294 f = lxc_popen(buffer);
295 if (!f) {
296 SYSERROR("popen failed");
297 return -1;
298 }
299
300 output = malloc(LXC_LOG_BUFFER_SIZE);
301 if (!output) {
302 ERROR("failed to allocate memory for script output");
303 lxc_pclose(f);
304 return -1;
305 }
306
307 while(fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
308 DEBUG("script output: %s", output);
309
310 free(output);
311
312 ret = lxc_pclose(f);
313 if (ret == -1) {
314 SYSERROR("Script exited on error");
315 return -1;
316 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
317 ERROR("Script exited with status %d", WEXITSTATUS(ret));
318 return -1;
319 } else if (WIFSIGNALED(ret)) {
320 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret),
321 strsignal(WTERMSIG(ret)));
322 return -1;
323 }
324
325 return 0;
326 }
327
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it via run_buffer().
 *
 * @argsin may be NULL; otherwise it is a NULL-terminated array of extra
 * arguments appended in order. @lxcpath is not used by this function.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook, const char *lxcpath,
			   char **argsin)
{
	char *cmd;
	char **arg;
	size_t total;
	int used;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* every argument costs its length plus one separating space */
	total = 0;
	if (argsin) {
		for (arg = argsin; *arg; arg++)
			total += strlen(*arg) + 1;
	}

	/* the hook's "+ 1" and the trailing "+ 3" together cover the three
	 * spaces of the fixed part and the terminating NUL */
	total += strlen(hook) + 1;
	total += strlen(script) + strlen(name) + strlen(section) + 3;

	if (total > INT_MAX)
		return -1;

	cmd = alloca(total);
	if (!cmd) {
		ERROR("failed to allocate memory");
		return -1;
	}

	used = snprintf(cmd, total, "%s %s %s %s", script, name, section, hook);
	if (used < 0 || (size_t)used >= total) {
		ERROR("Script name too long");
		return -1;
	}

	if (argsin) {
		for (arg = argsin; *arg; arg++) {
			int room = total - used;
			int written = snprintf(cmd + used, room, " %s", *arg);

			if (written < 0 || written >= room) {
				ERROR("Script args too long");
				return -1;
			}
			used += written;
		}
	}

	return run_buffer(cmd);
}
377
/*
 * Build the command line "<script> <name> <section> [args...]" and execute
 * it via run_buffer().
 *
 * The variadic part is a list of char * arguments that MUST be terminated
 * by a NULL pointer; each one is appended, separated by a single space.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script(const char *name, const char *section,
		      const char *script, ...)
{
	int ret;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* first pass: every argument costs its length plus one space */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	/* fixed part: two separating spaces plus the terminating NUL */
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = alloca(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		return -1;
	}

	/* second pass: append the arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		int len = size - ret;
		int rc;

		rc = snprintf(buffer + ret, len, " %s", p);
		if (rc < 0 || rc >= len) {
			ERROR("Script args too long");
			/* every va_start() needs a matching va_end(), also
			 * when bailing out early */
			va_end(ap);
			return -1;
		}
		ret += rc;
	}
	va_end(ap);

	return run_buffer(buffer);
}
429
/*
 * Per-line callback used while scanning a list of filesystem types: treat
 * the line as a filesystem type and try to mount cbarg->rootfs on
 * cbarg->target with it.
 *
 * @buffer is one line of the scanned file and is modified in place.
 * @data points to a structure with the layout declared below.
 *
 * Returns 1 once a mount succeeded, 0 if this line was skipped or the
 * mount attempt failed, and -1 if the mount options could not be parsed.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *cbarg = data;

	unsigned long mntflags = 0;
	/* initialised so the free() on the parse failure path is always
	 * well defined, whatever parse_mntopts() stored before failing */
	char *mntdata = NULL;
	char *fstype;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* strip the characters the lxc_char_*_gc() helpers report on
	 * either side of the type name */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	/* ignore blank line and comment */
	if (fstype[0] == '\0' || fstype[0] == '#')
		return 0;

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
		/* not fatal: the caller goes on with the next type */
		DEBUG("mount failed with error: %s", strerror(errno));
		free(mntdata);
		return 0;
	}
	free(mntdata);

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);

	return 1;
}
474
475 static int mount_unknown_fs(const char *rootfs, const char *target,
476 const char *options)
477 {
478 int i;
479
480 struct cbarg {
481 const char *rootfs;
482 const char *target;
483 const char *options;
484 } cbarg = {
485 .rootfs = rootfs,
486 .target = target,
487 .options = options,
488 };
489
490 /*
491 * find the filesystem type with brute force:
492 * first we check with /etc/filesystems, in case the modules
493 * are auto-loaded and fall back to the supported kernel fs
494 */
495 char *fsfile[] = {
496 "/etc/filesystems",
497 "/proc/filesystems",
498 };
499
500 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
501
502 int ret;
503
504 if (access(fsfile[i], F_OK))
505 continue;
506
507 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
508 if (ret < 0) {
509 ERROR("failed to parse '%s'", fsfile[i]);
510 return -1;
511 }
512
513 if (ret)
514 return 0;
515 }
516
517 ERROR("failed to determine fs type for '%s'", rootfs);
518 return -1;
519 }
520
/*
 * Mount a directory rootfs by recursively bind-mounting @rootfs onto
 * @target, adding the flags and data parsed from @options.
 *
 * Returns the result of mount(2), or -1 if @options could not be parsed.
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	unsigned long mntflags = 0;
	/* initialised so the free() on the parse failure path is always
	 * well defined, whatever parse_mntopts() stored before failing */
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
538
/*
 * Attach the file @rootfs to the loop device open on @fd and configure the
 * device with LO_FLAGS_AUTOCLEAR.
 *
 * @loinfo is zeroed and filled in by this function.
 *
 * Returns 0 on success, -1 on error. On error the loop device is left
 * detached.
 */
static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
{
	int rfd;
	int ret = -1;

	rfd = open(rootfs, O_RDWR);
	if (rfd < 0) {
		SYSERROR("failed to open '%s'", rootfs);
		return -1;
	}

	memset(loinfo, 0, sizeof(*loinfo));

	loinfo->lo_flags = LO_FLAGS_AUTOCLEAR;

	if (ioctl(fd, LOOP_SET_FD, rfd)) {
		SYSERROR("failed to LOOP_SET_FD");
		goto out;
	}

	if (ioctl(fd, LOOP_SET_STATUS64, loinfo)) {
		SYSERROR("failed to LOOP_SET_STATUS64");
		/* the autoclear flag was not applied, so the device would
		 * stay bound to the file: detach it explicitly */
		if (ioctl(fd, LOOP_CLR_FD, 0))
			SYSERROR("failed to LOOP_CLR_FD");
		goto out;
	}

	ret = 0;
out:
	/* the loop device keeps its own reference to the backing file */
	close(rfd);

	return ret;
}
570
/*
 * Mount a regular-file rootfs: find a free /dev/loop* device, attach
 * @rootfs to it and mount the device on @target, probing for the
 * filesystem type.
 *
 * Returns 0 on success, -1 on error (including when no free loop device
 * could be found).
 */
static int mount_rootfs_file(const char *rootfs, const char *target,
			     const char *options)
{
	struct dirent *direntp;
	struct loop_info64 loinfo;
	int ret = -1, fd = -1, rc;
	int found = 0;
	DIR *dir;
	char path[MAXPATHLEN];

	dir = opendir("/dev");
	if (!dir) {
		SYSERROR("failed to open '/dev'");
		return -1;
	}

	/* readdir_r() is deprecated; readdir() is fine here because the
	 * directory stream is private to this function */
	while ((direntp = readdir(dir)) != NULL) {

		/* only loop devices are of interest (this also skips "."
		 * and "..") */
		if (strncmp(direntp->d_name, "loop", 4))
			continue;

		rc = snprintf(path, MAXPATHLEN, "/dev/%s", direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN)
			continue;

		fd = open(path, O_RDWR);
		if (fd < 0)
			continue;

		/* a status can be read: the device is already in use */
		if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
			close(fd);
			continue;
		}

		/* ENXIO is what marks a free loop device */
		if (errno != ENXIO) {
			WARN("unexpected error for ioctl on '%s': %m",
			     direntp->d_name);
			close(fd);
			continue;
		}

		DEBUG("found '%s' free lodev", path);
		found = 1;

		ret = setup_lodev(rootfs, fd, &loinfo);
		if (!ret)
			ret = mount_unknown_fs(path, target, options);
		close(fd);

		break;
	}

	if (!found)
		ERROR("no free loop device found for '%s'", rootfs);

	if (closedir(dir))
		WARN("failed to close directory");

	return ret;
}
635
/*
 * Mount a block-device rootfs. The filesystem type is not known up front,
 * so the work is delegated to mount_unknown_fs() and its result returned.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int ret;

	ret = mount_unknown_fs(rootfs, target, options);

	return ret;
}
641
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep
 * it open for the duration of the container run, to prevent the container
 * from marking the underlying fs readonly on shutdown. The file is
 * unlinked immediately so that no name is left behind.
 *
 * return -1 on error.
 * return -2 if nothing needed to be pinned (rootfs is unset or empty,
 *           cannot be resolved, or is not a directory).
 * return an open fd (>=0) if we pinned it.
 */
int pin_rootfs(const char *rootfs)
{
	char absrootfs[MAXPATHLEN];
	char absrootfspin[MAXPATHLEN];
	struct stat s;
	int ret, fd;

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (access(absrootfs, F_OK))
		return -1;

	if (stat(absrootfs, &s))
		return -1;

	if (!S_ISDIR(s.st_mode))
		return -2;

	ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs);
	/* reject output errors as well as truncation */
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (fd < 0)
		return fd;

	/* the open descriptor is what pins the fs; the name can go */
	(void)unlink(absrootfspin);
	return fd;
}
684
685 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
686 {
687 int r;
688 size_t i;
689 static struct {
690 int match_mask;
691 int match_flag;
692 const char *source;
693 const char *destination;
694 const char *fstype;
695 unsigned long flags;
696 const char *options;
697 } default_mounts[] = {
698 /* Read-only bind-mounting... In older kernels, doing that required
699 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
700 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
701 * kernel 2.6.26 onwards. However, this apparently does not work on
702 * kernel 3.8. Unfortunately, on that very same kernel, doing the
703 * same trick as above doesn't seem to work either, there one needs
704 * to ALSO specify MS_BIND for the remount, otherwise the entire
705 * fs is remounted read-only or the mount fails because it's busy...
706 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
707 * 2.6.32...
708 */
709 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
710 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
711 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
712 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
713 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
714 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
715 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
716 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
717 { 0, 0, NULL, NULL, NULL, 0, NULL }
718 };
719
720 for (i = 0; default_mounts[i].match_mask; i++) {
721 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
722 char *source = NULL;
723 char *destination = NULL;
724 int saved_errno;
725
726 if (default_mounts[i].source) {
727 /* will act like strdup if %r is not present */
728 source = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].source);
729 if (!source) {
730 SYSERROR("memory allocation error");
731 return -1;
732 }
733 }
734 if (default_mounts[i].destination) {
735 /* will act like strdup if %r is not present */
736 destination = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].destination);
737 if (!destination) {
738 saved_errno = errno;
739 SYSERROR("memory allocation error");
740 free(source);
741 errno = saved_errno;
742 return -1;
743 }
744 }
745 r = mount(source, destination, default_mounts[i].fstype, default_mounts[i].flags, default_mounts[i].options);
746 saved_errno = errno;
747 if (r < 0)
748 SYSERROR("error mounting %s on %s", source, destination);
749 free(source);
750 free(destination);
751 if (r < 0) {
752 errno = saved_errno;
753 return -1;
754 }
755 }
756 }
757
758 if (flags & LXC_AUTO_CGROUP_MASK) {
759 int cg_flags;
760
761 cg_flags = flags & LXC_AUTO_CGROUP_MASK;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC || cg_flags == LXC_AUTO_CGROUP_FULL_NOSPEC) {
771 int has_sys_admin = 0;
772 if (!lxc_list_empty(&conf->keepcaps)) {
773 has_sys_admin = in_caplist(CAP_SYS_ADMIN, &conf->keepcaps);
774 } else {
775 has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &conf->caps);
776 }
777 if (cg_flags == LXC_AUTO_CGROUP_NOSPEC) {
778 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_RW : LXC_AUTO_CGROUP_MIXED;
779 } else {
780 cg_flags = has_sys_admin ? LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED;
781 }
782 }
783
784 if (!cgroup_mount(conf->rootfs.mount, handler, cg_flags)) {
785 SYSERROR("error mounting /sys/fs/cgroup");
786 return -1;
787 }
788 }
789
790 return 0;
791 }
792
793 static int mount_rootfs(const char *rootfs, const char *target, const char *options)
794 {
795 char absrootfs[MAXPATHLEN];
796 struct stat s;
797 int i;
798
799 typedef int (*rootfs_cb)(const char *, const char *, const char *);
800
801 struct rootfs_type {
802 int type;
803 rootfs_cb cb;
804 } rtfs_type[] = {
805 { S_IFDIR, mount_rootfs_dir },
806 { S_IFBLK, mount_rootfs_block },
807 { S_IFREG, mount_rootfs_file },
808 };
809
810 if (!realpath(rootfs, absrootfs)) {
811 SYSERROR("failed to get real path for '%s'", rootfs);
812 return -1;
813 }
814
815 if (access(absrootfs, F_OK)) {
816 SYSERROR("'%s' is not accessible", absrootfs);
817 return -1;
818 }
819
820 if (stat(absrootfs, &s)) {
821 SYSERROR("failed to stat '%s'", absrootfs);
822 return -1;
823 }
824
825 for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) {
826
827 if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type))
828 continue;
829
830 return rtfs_type[i].cb(absrootfs, target, options);
831 }
832
833 ERROR("unsupported rootfs type for '%s'", absrootfs);
834 return -1;
835 }
836
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL @utsname means there is nothing to do. Returns 0 on success or
 * when nothing had to be done, -1 if sethostname() failed.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;

	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
851
/* A symlink to create below /dev: <name> pointing at <oldpath>. */
struct dev_symlinks {
	/* link target */
	const char *oldpath;
	/* link name, relative to /dev */
	const char *name;
};

/* The standard descriptor links backed by /proc/self/fd. */
static const struct dev_symlinks dev_symlinks[] = {
	{ .oldpath = "/proc/self/fd",   .name = "fd"     },
	{ .oldpath = "/proc/self/fd/0", .name = "stdin"  },
	{ .oldpath = "/proc/self/fd/1", .name = "stdout" },
	{ .oldpath = "/proc/self/fd/2", .name = "stderr" },
};
863
864 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
865 {
866 char path[MAXPATHLEN];
867 int ret,i;
868 struct stat s;
869
870
871 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
872 const struct dev_symlinks *d = &dev_symlinks[i];
873 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, d->name);
874 if (ret < 0 || ret >= MAXPATHLEN)
875 return -1;
876
877 /*
878 * Stat the path first. If we don't get an error
879 * accept it as is and don't try to create it
880 */
881 if (!stat(path, &s)) {
882 continue;
883 }
884
885 ret = symlink(d->oldpath, path);
886
887 if (ret && errno != EEXIST) {
888 if ( errno == EROFS ) {
889 WARN("Warning: Read Only file system while creating %s", path);
890 } else {
891 SYSERROR("Error creating %s", path);
892 return -1;
893 }
894 }
895 }
896 return 0;
897 }
898
899 static int setup_tty(const struct lxc_rootfs *rootfs,
900 const struct lxc_tty_info *tty_info, char *ttydir)
901 {
902 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
903 int i, ret;
904
905 if (!rootfs->path)
906 return 0;
907
908 for (i = 0; i < tty_info->nbtty; i++) {
909
910 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
911
912 ret = snprintf(path, sizeof(path), "%s/dev/tty%d",
913 rootfs->mount, i + 1);
914 if (ret >= sizeof(path)) {
915 ERROR("pathname too long for ttys");
916 return -1;
917 }
918 if (ttydir) {
919 /* create dev/lxc/tty%d" */
920 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/tty%d",
921 rootfs->mount, ttydir, i + 1);
922 if (ret >= sizeof(lxcpath)) {
923 ERROR("pathname too long for ttys");
924 return -1;
925 }
926 ret = creat(lxcpath, 0660);
927 if (ret==-1 && errno != EEXIST) {
928 SYSERROR("error creating %s", lxcpath);
929 return -1;
930 }
931 if (ret >= 0)
932 close(ret);
933 ret = unlink(path);
934 if (ret && errno != ENOENT) {
935 SYSERROR("error unlinking %s", path);
936 return -1;
937 }
938
939 if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) {
940 WARN("failed to mount '%s'->'%s'",
941 pty_info->name, path);
942 continue;
943 }
944
945 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1);
946 if (ret >= sizeof(lxcpath)) {
947 ERROR("tty pathname too long");
948 return -1;
949 }
950 ret = symlink(lxcpath, path);
951 if (ret) {
952 SYSERROR("failed to create symlink for tty %d", i+1);
953 return -1;
954 }
955 } else {
956 /* If we populated /dev, then we need to create /dev/ttyN */
957 if (access(path, F_OK)) {
958 ret = creat(path, 0660);
959 if (ret==-1) {
960 SYSERROR("error creating %s", path);
961 /* this isn't fatal, continue */
962 } else {
963 close(ret);
964 }
965 }
966 if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
967 WARN("failed to mount '%s'->'%s'",
968 pty_info->name, path);
969 continue;
970 }
971 }
972 }
973
974 INFO("%d tty(s) has been setup", tty_info->nbtty);
975
976 return 0;
977 }
978
979 static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
980 {
981 struct lxc_list *mountlist, *listentry, *iterator;
982 char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL;
983 int found;
984 void **cbparm;
985
986 mountentry = buffer;
987 cbparm = (void **)data;
988
989 mountlist = cbparm[0];
990 pivotdir = cbparm[1];
991
992 /* parse entry, first field is mountname, ignore */
993 mountpoint = strtok_r(mountentry, " ", &saveptr);
994 if (!mountpoint)
995 return -1;
996
997 /* second field is mountpoint */
998 mountpoint = strtok_r(NULL, " ", &saveptr);
999 if (!mountpoint)
1000 return -1;
1001
1002 /* only consider mountpoints below old root fs */
1003 if (strncmp(mountpoint, pivotdir, strlen(pivotdir)))
1004 return 0;
1005
1006 /* filter duplicate mountpoints */
1007 found = 0;
1008 lxc_list_for_each(iterator, mountlist) {
1009 if (!strcmp(iterator->elem, mountpoint)) {
1010 found = 1;
1011 break;
1012 }
1013 }
1014 if (found)
1015 return 0;
1016
1017 /* add entry to list */
1018 listentry = malloc(sizeof(*listentry));
1019 if (!listentry) {
1020 SYSERROR("malloc for mountpoint listentry failed");
1021 return -1;
1022 }
1023
1024 listentry->elem = strdup(mountpoint);
1025 if (!listentry->elem) {
1026 SYSERROR("strdup failed");
1027 free(listentry);
1028 return -1;
1029 }
1030 lxc_list_add_tail(mountlist, listentry);
1031
1032 return 0;
1033 }
1034
1035 static int umount_oldrootfs(const char *oldrootfs)
1036 {
1037 char path[MAXPATHLEN];
1038 void *cbparm[2];
1039 struct lxc_list mountlist, *iterator, *next;
1040 int ok, still_mounted, last_still_mounted;
1041 int rc;
1042
1043 /* read and parse /proc/mounts in old root fs */
1044 lxc_list_init(&mountlist);
1045
1046 /* oldrootfs is on the top tree directory now */
1047 rc = snprintf(path, sizeof(path), "/%s", oldrootfs);
1048 if (rc >= sizeof(path)) {
1049 ERROR("rootfs name too long");
1050 return -1;
1051 }
1052 cbparm[0] = &mountlist;
1053
1054 cbparm[1] = strdup(path);
1055 if (!cbparm[1]) {
1056 SYSERROR("strdup failed");
1057 return -1;
1058 }
1059
1060 rc = snprintf(path, sizeof(path), "%s/proc/mounts", oldrootfs);
1061 if (rc >= sizeof(path)) {
1062 ERROR("container proc/mounts name too long");
1063 return -1;
1064 }
1065
1066 ok = lxc_file_for_each_line(path,
1067 setup_rootfs_pivot_root_cb, &cbparm);
1068 if (ok < 0) {
1069 SYSERROR("failed to read or parse mount list '%s'", path);
1070 return -1;
1071 }
1072
1073 /* umount filesystems until none left or list no longer shrinks */
1074 still_mounted = 0;
1075 do {
1076 last_still_mounted = still_mounted;
1077 still_mounted = 0;
1078
1079 lxc_list_for_each_safe(iterator, &mountlist, next) {
1080
1081 /* umount normally */
1082 if (!umount(iterator->elem)) {
1083 DEBUG("umounted '%s'", (char *)iterator->elem);
1084 lxc_list_del(iterator);
1085 continue;
1086 }
1087
1088 still_mounted++;
1089 }
1090
1091 } while (still_mounted > 0 && still_mounted != last_still_mounted);
1092
1093
1094 lxc_list_for_each(iterator, &mountlist) {
1095
1096 /* let's try a lazy umount */
1097 if (!umount2(iterator->elem, MNT_DETACH)) {
1098 INFO("lazy unmount of '%s'", (char *)iterator->elem);
1099 continue;
1100 }
1101
1102 /* be more brutal (nfs) */
1103 if (!umount2(iterator->elem, MNT_FORCE)) {
1104 INFO("forced unmount of '%s'", (char *)iterator->elem);
1105 continue;
1106 }
1107
1108 WARN("failed to unmount '%s'", (char *)iterator->elem);
1109 }
1110
1111 return 0;
1112 }
1113
/*
 * Make @rootfs the root filesystem with pivot_root() and unmount the old
 * root afterwards.
 *
 * @pivotdir is the directory, relative to @rootfs, on which the old root
 * is parked; "lxc_putold" is used when it is NULL. A directory created
 * here is removed again at the end.
 *
 * Returns 0 on success, -1 on error.
 */
static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir)
{
	char putold[MAXPATHLEN];
	int created = 0;
	int len;

	/* change into new root fs */
	if (chdir(rootfs) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		return -1;
	}

	if (pivotdir == NULL)
		pivotdir = "lxc_putold";

	/* compute the full path to pivotdir under rootfs */
	len = snprintf(putold, sizeof(putold), "%s/%s", rootfs, pivotdir);
	if (len < 0 || (size_t)len >= sizeof(putold)) {
		ERROR("pivot dir name too long");
		return -1;
	}

	if (access(putold, F_OK) != 0) {
		if (mkdir_p(putold, 0755) < 0) {
			SYSERROR("failed to create pivotdir '%s'", putold);
			return -1;
		}

		/* remember to clean up what was created here */
		created = 1;
		DEBUG("created '%s' directory", putold);
	}

	DEBUG("mountpoint for old rootfs is '%s'", putold);

	/* pivot_root into our new root fs */
	if (pivot_root(".", putold) != 0) {
		SYSERROR("pivot_root syscall failed");
		return -1;
	}

	if (chdir("/") != 0) {
		SYSERROR("can't chdir to / after pivot_root");
		return -1;
	}

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	/* we switch from absolute path to relative path */
	if (umount_oldrootfs(pivotdir) != 0)
		return -1;

	/* remove temporary mount point, we don't consider the removing
	 * as fatal */
	if (created && rmdir(pivotdir) != 0)
		WARN("can't remove mountpoint '%s': %m", pivotdir);

	return 0;
}
1173
/*
 * Check to see if a directory has something mounted on it and,
 * if it does, return the fstype.
 *
 * Code largely based on detect_shared_rootfs below
 *
 * @dir    is compared verbatim against the mountpoint field of each
 *         entry in /proc/self/mounts.
 * @fstype may be NULL; otherwise it must have room for MAX_FSTYPE_LEN
 *         bytes.
 *
 * Returns: # of matching entries in /proc/self/mounts
 *	if != 0 fstype is filled with the last filesystem value.
 *	if == 0 no matches found, fstype unchanged. This is also returned
 *	        when dir is not an accessible directory or the mounts file
 *	        cannot be opened.
 *
 * ToDo: Maybe return the mount options in another parameter...
 */

#define LINELEN 4096
#define MAX_FSTYPE_LEN 128
static int mount_check_fs( const char *dir, char *fstype )
{
	char buf[LINELEN], *p;
	struct stat s;
	FILE *f;
	int found_fs = 0;
	char *p2;

	DEBUG("entering mount_check_fs for %s", dir);

	if ( 0 != access(dir, F_OK) || 0 != stat(dir, &s) || 0 == S_ISDIR(s.st_mode) ) {
		return 0;
	}

	f = fopen("/proc/self/mounts", "r");
	if (!f)
		return 0;
	while (fgets(buf, LINELEN, f)) {
		/* first field: skipped */
		p = strchr(buf, ' ');
		if( !p )
			continue;
		*p = '\0';
		p2 = p + 1;

		/* second field: the mountpoint */
		p = strchr(p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		/* Compare the directory in the entry to desired */
		if( strcmp( p2, dir ) ) {
			continue;
		}

		/* third field: the filesystem type */
		p2 = p + 1;
		p = strchr( p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		++found_fs;

		if( fstype ) {
			strncpy( fstype, p2, MAX_FSTYPE_LEN - 1 );
			fstype [ MAX_FSTYPE_LEN - 1 ] = '\0';
		}
	}

	fclose(f);

	/* fstype holds a valid string only if a match was stored in it */
	if (found_fs && fstype)
		DEBUG("mount_check_fs returning %d last %s", found_fs, fstype);
	else
		DEBUG("mount_check_fs returning %d", found_fs);

	return found_fs;
}
1243
1244 /*
1245 * Locate a devtmpfs mount (should be on /dev) and create a container
1246 * subdirectory on it which we can then bind mount to the container
1247 * /dev instead of mounting a tmpfs there.
1248 * If we fail, return NULL.
1249 * Else return the pointer to the name buffer with the string to
1250 * the devtmpfs subdirectory.
1251 */
1252
1253 static char *mk_devtmpfs(const char *name, char *path, const char *lxcpath)
1254 {
1255 int ret;
1256 struct stat s;
1257 char tmp_path[MAXPATHLEN];
1258 char fstype[MAX_FSTYPE_LEN];
1259 char *base_path = "/dev/.lxc";
1260 char *user_path = "/dev/.lxc/user";
1261 uint64_t hash;
1262
1263 if ( 0 != access(base_path, F_OK) || 0 != stat(base_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1264 /* This is just making /dev/.lxc it better work or we're done */
1265 ret = mkdir(base_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1266 if ( ret ) {
1267 SYSERROR( "Unable to create /dev/.lxc for autodev" );
1268 return NULL;
1269 }
1270 }
1271
1272 /*
1273 * Programmers notes:
1274 * We can not do mounts in this area of code that we want
1275 * to be visible in the host. Consequently, /dev/.lxc must
1276 * be set up earlier if we need a tmpfs mounted there.
1277 * That only affects the rare cases where autodev is enabled
1278 * for a container and devtmpfs is not mounted on /dev in the
1279 * host. In that case, we'll fall back to the old method
1280 * of mounting a tmpfs in the container and have no visibility
1281 * into the container /dev.
1282 */
1283 if( ! mount_check_fs( "/dev", fstype )
1284 || strcmp( "devtmpfs", fstype ) ) {
1285 /* Either /dev was not mounted or was not devtmpfs */
1286
1287 if ( ! mount_check_fs( "/dev/.lxc", NULL ) ) {
1288 /*
1289 * /dev/.lxc is not already mounted
1290 * Doing a mount here does no good, since
1291 * it's not visible in the host.
1292 */
1293
1294 ERROR("/dev/.lxc is not setup - taking fallback" );
1295 return NULL;
1296 }
1297 }
1298
1299 if ( 0 != access(user_path, F_OK) || 0 != stat(user_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1300 /*
1301 * This is making /dev/.lxc/user path for non-priv users.
1302 * If this doesn't work, we'll have to fall back in the
1303 * case of non-priv users. It's mode 1777 like /tmp.
1304 */
1305 ret = mkdir(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
1306 if ( ret ) {
1307 /* Issue an error but don't fail yet! */
1308 ERROR("Unable to create /dev/.lxc/user");
1309 }
1310 /* Umask tends to screw us up here */
1311 chmod(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
1312 }
1313
1314 /*
1315 * Since the container name must be unique within a given
1316 * lxcpath, we're going to use a hash of the path
1317 * /lxcpath/name as our hash name in /dev/.lxc/
1318 */
1319
1320 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s", lxcpath, name);
1321 if (ret < 0 || ret >= MAXPATHLEN)
1322 return NULL;
1323
1324 hash = fnv_64a_buf(tmp_path, ret, FNV1A_64_INIT);
1325
1326 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, base_path, name, hash);
1327 if (ret < 0 || ret >= MAXPATHLEN)
1328 return NULL;
1329
1330 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1331 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1332 if ( ret ) {
1333 /* Something must have failed with the base_path...
1334 * Maybe unpriv user. Try user_path now... */
1335 INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );
1336
1337 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, user_path, name, hash);
1338 if (ret < 0 || ret >= MAXPATHLEN)
1339 return NULL;
1340
1341 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1342 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1343 if ( ret ) {
1344 ERROR("Container /dev setup in host /dev failed - taking fallback" );
1345 return NULL;
1346 }
1347 }
1348 }
1349 }
1350
1351 strcpy( path, tmp_path );
1352 return path;
1353 }
1354
1355
/*
 * Mount a /dev for the container at <root>/dev and make sure
 * <root>/dev/pts exists.
 *
 * name    : container name
 * root    : path under which the container's dev directory lives
 * lxcpath : used to build <lxcpath>/<name>/rootfs.dev on the host
 *
 * Preference order: bind mount the directory returned by mk_devtmpfs(),
 * else bind mount <lxcpath>/<name>/rootfs.dev if something is already
 * mounted there, else mount a fresh tmpfs.
 *
 * Returns 0 on success, -1 on failure.
 *
 * Do we want to add options for max size of /dev and a file to
 * specify which devices to create?
 */
static int mount_autodev(const char *name, char *root, const char *lxcpath)
{
	int ret;
	struct stat s;
	char path[MAXPATHLEN];
	char host_path[MAXPATHLEN];
	char devtmpfs_path[MAXPATHLEN];

	INFO("Mounting /dev under %s", root);

	/* snprintf returns the untruncated length: == MAXPATHLEN is truncation too */
	ret = snprintf(host_path, MAXPATHLEN, "%s/%s/rootfs.dev", lxcpath, name);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	if (mk_devtmpfs(name, devtmpfs_path, lxcpath)) {
		/*
		 * Get rid of old links and directories.
		 * This could be either a symlink and we remove it,
		 * or an empty directory and we remove it,
		 * or non-existent and we don't care,
		 * or a non-empty directory, and we will then emit an error
		 * but we will not fail out the process.
		 */
		unlink(host_path);
		rmdir(host_path);
		ret = symlink(devtmpfs_path, host_path);
		if (ret < 0)
			SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path, devtmpfs_path);

		DEBUG("Bind mounting %s to %s", devtmpfs_path , path );
		ret = mount(devtmpfs_path, path, NULL, MS_BIND, 0 );
	} else {
		/* Only mount a tmpfs on here if we don't already have a mount */
		if (!mount_check_fs(host_path, NULL)) {
			/* the tmpfs lands on the container's dev directory */
			DEBUG("Mounting tmpfs to %s", path);
			ret = mount("none", path, "tmpfs", 0, "size=100000,mode=755");
		} else {
			/* This allows someone to manually set up a mount */
			DEBUG("Bind mounting %s to %s", host_path, path );
			ret = mount(host_path , path, NULL, MS_BIND, 0 );
		}
	}
	if (ret) {
		SYSERROR("Failed to mount /dev at %s", root);
		return -1;
	}

	ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	/*
	 * If we are running on a devtmpfs mapping, dev/pts may already exist.
	 * If not, then create it and exit if that fails...
	 */
	if (access(path, F_OK) != 0 || stat(path, &s) != 0 || !S_ISDIR(s.st_mode)) {
		ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
		if (ret) {
			SYSERROR("Failed to create /dev/pts in container");
			return -1;
		}
	}

	INFO("Mounted /dev under %s", root);
	return 0;
}
1429
/* One device node to create under the container's /dev. */
struct lxc_devs {
	const char *name;	/* node name relative to /dev */
	mode_t mode;		/* file type and permission bits given to mknod() */
	int maj;		/* device major number */
	int min;		/* device minor number */
};

/* Device nodes populated by setup_autodev(). */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "console", .mode = S_IFCHR | S_IRUSR | S_IWUSR,           .maj = 5, .min = 1 },
};
1446
1447 static int setup_autodev(const char *root)
1448 {
1449 int ret;
1450 char path[MAXPATHLEN];
1451 int i;
1452 mode_t cmask;
1453
1454 INFO("Creating initial consoles under %s/dev", root);
1455
1456 ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
1457 if (ret < 0 || ret >= MAXPATHLEN) {
1458 ERROR("Error calculating container /dev location");
1459 return -1;
1460 }
1461
1462 INFO("Populating /dev under %s", root);
1463 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1464 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1465 const struct lxc_devs *d = &lxc_devs[i];
1466 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name);
1467 if (ret < 0 || ret >= MAXPATHLEN)
1468 return -1;
1469 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1470 if (ret && errno != EEXIST) {
1471 SYSERROR("Error creating %s", d->name);
1472 return -1;
1473 }
1474 }
1475 umask(cmask);
1476
1477 INFO("Populated /dev under %s", root);
1478 return 0;
1479 }
1480
1481 /*
1482 * I'll forgive you for asking whether all of this is needed :) The
1483 * answer is yes.
1484 * pivot_root will fail if the new root, the put_old dir, or the parent
1485 * of current->fs->root are MS_SHARED. (parent of current->fs_root may
1486 * or may not be current->fs_root - if we assumed it always was, we could
1487 * just mount --make-rslave /). So,
1488 * 1. mount a tiny tmpfs to be parent of current->fs->root.
1489 * 2. make that MS_SLAVE
1490 * 3. make a 'root' directory under that
1491 * 4. mount --rbind / under the $tinyroot/root.
1492 * 5. make that rslave
1493 * 6. chdir and chroot into $tinyroot/root
1494 * 7. $tinyroot will be unmounted by our parent in start.c
1495 */
1496 static int chroot_into_slave(struct lxc_conf *conf)
1497 {
1498 char path[MAXPATHLEN];
1499 const char *destpath = conf->rootfs.mount;
1500 int ret;
1501
1502 if (mount(destpath, destpath, NULL, MS_BIND, 0)) {
1503 SYSERROR("failed to mount %s bind", destpath);
1504 return -1;
1505 }
1506 if (mount("", destpath, NULL, MS_SLAVE, 0)) {
1507 SYSERROR("failed to make %s slave", destpath);
1508 return -1;
1509 }
1510 if (mount("none", destpath, "tmpfs", 0, "size=10000,mode=755")) {
1511 SYSERROR("Failed to mount tmpfs / at %s", destpath);
1512 return -1;
1513 }
1514 ret = snprintf(path, MAXPATHLEN, "%s/root", destpath);
1515 if (ret < 0 || ret >= MAXPATHLEN) {
1516 ERROR("out of memory making root path");
1517 return -1;
1518 }
1519 if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
1520 SYSERROR("Failed to create /dev/pts in container");
1521 return -1;
1522 }
1523 if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) {
1524 SYSERROR("Failed to rbind mount / to %s", path);
1525 return -1;
1526 }
1527 if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) {
1528 SYSERROR("Failed to make tmp-/ at %s rslave", path);
1529 return -1;
1530 }
1531 if (chroot(path)) {
1532 SYSERROR("Failed to chroot into tmp-/");
1533 return -1;
1534 }
1535 if (chdir("/")) {
1536 SYSERROR("Failed to chdir into tmp-/");
1537 return -1;
1538 }
1539 INFO("Chrooted into tmp-/ at %s", path);
1540 return 0;
1541 }
1542
1543 static int setup_rootfs(struct lxc_conf *conf)
1544 {
1545 const struct lxc_rootfs *rootfs = &conf->rootfs;
1546
1547 if (!rootfs->path) {
1548 if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
1549 SYSERROR("Failed to make / rslave");
1550 return -1;
1551 }
1552 return 0;
1553 }
1554
1555 if (access(rootfs->mount, F_OK)) {
1556 SYSERROR("failed to access to '%s', check it is present",
1557 rootfs->mount);
1558 return -1;
1559 }
1560
1561 // First try mounting rootfs using a bdev
1562 struct bdev *bdev = bdev_init(conf, rootfs->path, rootfs->mount, rootfs->options);
1563 if (bdev && bdev->ops->mount(bdev) == 0) {
1564 bdev_put(bdev);
1565 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
1566 return 0;
1567 }
1568 if (bdev)
1569 bdev_put(bdev);
1570 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
1571 ERROR("failed to mount rootfs");
1572 return -1;
1573 }
1574
1575 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
1576
1577 return 0;
1578 }
1579
1580 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1581 {
1582 if (!rootfs->path)
1583 return 0;
1584
1585 if (setup_rootfs_pivot_root(rootfs->mount, rootfs->pivot)) {
1586 ERROR("failed to setup pivot root");
1587 return -1;
1588 }
1589
1590 return 0;
1591 }
1592
1593 static int setup_pts(int pts)
1594 {
1595 char target[PATH_MAX];
1596
1597 if (!pts)
1598 return 0;
1599
1600 if (!access("/dev/pts/ptmx", F_OK) && umount("/dev/pts")) {
1601 SYSERROR("failed to umount 'dev/pts'");
1602 return -1;
1603 }
1604
1605 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
1606 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
1607 SYSERROR("failed to mount a new instance of '/dev/pts'");
1608 return -1;
1609 }
1610
1611 if (access("/dev/ptmx", F_OK)) {
1612 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1613 goto out;
1614 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
1615 return -1;
1616 }
1617
1618 if (realpath("/dev/ptmx", target) && !strcmp(target, "/dev/pts/ptmx"))
1619 goto out;
1620
1621 /* fallback here, /dev/pts/ptmx exists just mount bind */
1622 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0)) {
1623 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
1624 return -1;
1625 }
1626
1627 INFO("created new pts instance");
1628
1629 out:
1630 return 0;
1631 }
1632
/*
 * Apply the requested personality to the calling process.
 *
 * persona : value handed to personality(); -1 means "leave unchanged".
 *
 * Compiles to a no-op when HAVE_SYS_PERSONALITY_H is not set.
 * Returns 0 on success, -1 when personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1649
1650 static int setup_dev_console(const struct lxc_rootfs *rootfs,
1651 const struct lxc_console *console)
1652 {
1653 char path[MAXPATHLEN];
1654 struct stat s;
1655 int ret;
1656
1657 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1658 if (ret >= sizeof(path)) {
1659 ERROR("console path too long");
1660 return -1;
1661 }
1662
1663 if (access(path, F_OK)) {
1664 WARN("rootfs specified but no console found at '%s'", path);
1665 return 0;
1666 }
1667
1668 if (console->master < 0) {
1669 INFO("no console");
1670 return 0;
1671 }
1672
1673 if (stat(path, &s)) {
1674 SYSERROR("failed to stat '%s'", path);
1675 return -1;
1676 }
1677
1678 if (chmod(console->name, s.st_mode)) {
1679 SYSERROR("failed to set mode '0%o' to '%s'",
1680 s.st_mode, console->name);
1681 return -1;
1682 }
1683
1684 if (mount(console->name, path, "none", MS_BIND, 0)) {
1685 ERROR("failed to mount '%s' on '%s'", console->name, path);
1686 return -1;
1687 }
1688
1689 INFO("console has been setup");
1690 return 0;
1691 }
1692
1693 static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
1694 const struct lxc_console *console,
1695 char *ttydir)
1696 {
1697 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1698 int ret;
1699
1700 /* create rootfs/dev/<ttydir> directory */
1701 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount,
1702 ttydir);
1703 if (ret >= sizeof(path))
1704 return -1;
1705 ret = mkdir(path, 0755);
1706 if (ret && errno != EEXIST) {
1707 SYSERROR("failed with errno %d to create %s", errno, path);
1708 return -1;
1709 }
1710 INFO("created %s", path);
1711
1712 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console",
1713 rootfs->mount, ttydir);
1714 if (ret >= sizeof(lxcpath)) {
1715 ERROR("console path too long");
1716 return -1;
1717 }
1718
1719 snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1720 ret = unlink(path);
1721 if (ret && errno != ENOENT) {
1722 SYSERROR("error unlinking %s", path);
1723 return -1;
1724 }
1725
1726 ret = creat(lxcpath, 0660);
1727 if (ret==-1 && errno != EEXIST) {
1728 SYSERROR("error %d creating %s", errno, lxcpath);
1729 return -1;
1730 }
1731 if (ret >= 0)
1732 close(ret);
1733
1734 if (console->master < 0) {
1735 INFO("no console");
1736 return 0;
1737 }
1738
1739 if (mount(console->name, lxcpath, "none", MS_BIND, 0)) {
1740 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1741 return -1;
1742 }
1743
1744 /* create symlink from rootfs/dev/console to 'lxc/console' */
1745 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1746 if (ret >= sizeof(lxcpath)) {
1747 ERROR("lxc/console path too long");
1748 return -1;
1749 }
1750 ret = symlink(lxcpath, path);
1751 if (ret) {
1752 SYSERROR("failed to create symlink for console");
1753 return -1;
1754 }
1755
1756 INFO("console has been setup on %s", lxcpath);
1757
1758 return 0;
1759 }
1760
1761 static int setup_console(const struct lxc_rootfs *rootfs,
1762 const struct lxc_console *console,
1763 char *ttydir)
1764 {
1765 /* We don't have a rootfs, /dev/console will be shared */
1766 if (!rootfs->path)
1767 return 0;
1768 if (!ttydir)
1769 return setup_dev_console(rootfs, console);
1770
1771 return setup_ttydir_console(rootfs, console, ttydir);
1772 }
1773
1774 static int setup_kmsg(const struct lxc_rootfs *rootfs,
1775 const struct lxc_console *console)
1776 {
1777 char kpath[MAXPATHLEN];
1778 int ret;
1779
1780 if (!rootfs->path)
1781 return 0;
1782 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1783 if (ret < 0 || ret >= sizeof(kpath))
1784 return -1;
1785
1786 ret = unlink(kpath);
1787 if (ret && errno != ENOENT) {
1788 SYSERROR("error unlinking %s", kpath);
1789 return -1;
1790 }
1791
1792 ret = symlink("console", kpath);
1793 if (ret) {
1794 SYSERROR("failed to create symlink for kmsg");
1795 return -1;
1796 }
1797
1798 return 0;
1799 }
1800
1801 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1802 {
1803 struct mount_opt *mo;
1804
1805 /* If opt is found in mount_opt, set or clear flags.
1806 * Otherwise append it to data. */
1807
1808 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1809 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1810 if (mo->clear)
1811 *flags &= ~mo->flag;
1812 else
1813 *flags |= mo->flag;
1814 return;
1815 }
1816 }
1817
1818 if (strlen(*data))
1819 strcat(*data, ",");
1820 strcat(*data, opt);
1821 }
1822
/*
 * Split a comma separated mount option string into mount flags and a
 * data string.
 *
 * mntopts  : option string, may be NULL
 * mntflags : receives the accumulated flags (reset to 0 first)
 * mntdata  : receives a malloc()ed string with the options that are not
 *            flags, or NULL when there are none; the caller frees it
 *
 * Returns 0 on success, -1 on allocation failure (with *mntdata NULL).
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* the data string can never be longer than the input */
	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = '\0';

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] == '\0')
		free(extra);
	else
		*mntdata = extra;
	free(copy);

	return 0;
}
1861
/*
 * Perform one mount, followed by a remount when MS_REMOUNT or MS_BIND
 * is among the flags.
 *
 * The first mount() is done without MS_REMOUNT; the second one, when
 * needed, adds MS_REMOUNT to the full flag set.
 *
 * optional : when non-zero a failing mount is only logged and 0 is
 *            returned.
 *
 * Returns 0 on success (or tolerated failure), -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data, int optional)
{
	const int wants_remount = (mountflags & MS_REMOUNT) || (mountflags & MS_BIND);

	if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data) != 0) {
		if (!optional) {
			SYSERROR("failed to mount '%s' on '%s'", fsname, target);
			return -1;
		}
		INFO("failed to mount '%s' on '%s' (optional): %s", fsname,
		     target, strerror(errno));
		return 0;
	}

	if (wants_remount) {
		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname, target);

		if (mount(fsname, target, fstype,
			  mountflags | MS_REMOUNT, data) != 0) {
			if (!optional) {
				SYSERROR("failed to mount '%s' on '%s'",
					 fsname, target);
				return -1;
			}
			INFO("failed to mount '%s' on '%s' (optional): %s",
			     fsname, target, strerror(errno));
			return 0;
		}
	}

	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1902
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntopt
 *
 * mntent->mnt_opts is rewritten in place.  Only whole comma separated
 * options are removed (an option that merely contains one of the names
 * is kept), every occurrence is removed, and no leading or trailing
 * comma is left behind.  Empty options (",,") are dropped as well.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {"create=dir",
					     "create=file",
					     "optional",
					     NULL };
	char *opts = mntent->mnt_opts;
	char *rd, *wr;

	if (!opts)
		return;

	/*
	 * Compact the list in place: rd walks the options, wr is where the
	 * next kept option goes.  wr never gets ahead of rd.
	 */
	rd = wr = opts;
	while (*rd) {
		size_t len = strcspn(rd, ",");
		int keep = len > 0;
		int i;

		for (i = 0; keep && culled[i]; i++) {
			if (strlen(culled[i]) == len && !strncmp(rd, culled[i], len))
				keep = 0;
		}

		if (keep) {
			if (wr != opts)
				*wr++ = ',';
			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}
	*wr = '\0';
}
1927
/*
 * Mount an fstab entry as is, i.e. with mnt_dir used verbatim as target.
 *
 * Honors the lxc specific options 'create=dir', 'create=file' and
 * 'optional'; they are stripped from the option string before it is
 * parsed.  Failing to create the mount target is only warned about: the
 * result of the mount itself decides the return value.
 *
 * Returns 0 on success, -1 on failure.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathcopy = NULL;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(mntent->mnt_dir, 0755) < 0)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
	}

	if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so its result must never be free()d:
		 * keep and free the strdup()ed pointer instead.
		 */
		pathcopy = strdup(mntent->mnt_dir);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(mntent->mnt_dir, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
			  mntent->mnt_type, mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
1974
1975 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1976 const struct lxc_rootfs *rootfs,
1977 const char *lxc_name)
1978 {
1979 char *aux;
1980 char path[MAXPATHLEN];
1981 unsigned long mntflags;
1982 char *mntdata;
1983 int r, ret = 0, offset;
1984 const char *lxcpath;
1985 FILE *pathfile = NULL;
1986 char *pathdirname = NULL;
1987 bool optional = hasmntopt(mntent, "optional") != NULL;
1988
1989 lxcpath = lxc_global_config_value("lxc.lxcpath");
1990 if (!lxcpath) {
1991 ERROR("Out of memory");
1992 return -1;
1993 }
1994
1995 /* if rootfs->path is a blockdev path, allow container fstab to
1996 * use $lxcpath/CN/rootfs as the target prefix */
1997 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1998 if (r < 0 || r >= MAXPATHLEN)
1999 goto skipvarlib;
2000
2001 aux = strstr(mntent->mnt_dir, path);
2002 if (aux) {
2003 offset = strlen(path);
2004 goto skipabs;
2005 }
2006
2007 skipvarlib:
2008 aux = strstr(mntent->mnt_dir, rootfs->path);
2009 if (!aux) {
2010 WARN("ignoring mount point '%s'", mntent->mnt_dir);
2011 goto out;
2012 }
2013 offset = strlen(rootfs->path);
2014
2015 skipabs:
2016
2017 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
2018 aux + offset);
2019 if (r < 0 || r >= MAXPATHLEN) {
2020 WARN("pathnme too long for '%s'", mntent->mnt_dir);
2021 ret = -1;
2022 goto out;
2023 }
2024
2025 if (hasmntopt(mntent, "create=dir")) {
2026 if (mkdir_p(path, 0755) < 0) {
2027 WARN("Failed to create mount target '%s'", path);
2028 ret = -1;
2029 }
2030 }
2031
2032 if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
2033 pathdirname = strdup(path);
2034 pathdirname = dirname(pathdirname);
2035 if (mkdir_p(pathdirname, 0755) < 0) {
2036 WARN("Failed to create target directory");
2037 }
2038 pathfile = fopen(path, "wb");
2039 if (!pathfile) {
2040 WARN("Failed to create mount target '%s'", path);
2041 ret = -1;
2042 }
2043 else
2044 fclose(pathfile);
2045 }
2046 cull_mntent_opt(mntent);
2047
2048 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
2049 free(mntdata);
2050 return -1;
2051 }
2052
2053 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
2054 mntflags, mntdata, optional);
2055
2056 free(mntdata);
2057
2058 out:
2059 free(pathdirname);
2060 return ret;
2061 }
2062
/*
 * Mount an fstab entry whose target is relative: the target becomes
 * "<rootfs>/<mnt_dir>".
 *
 * Honors 'create=dir', 'create=file' and 'optional' like the other
 * mount_entry_on_* helpers; failing to create the target is only warned
 * about, the result of the mount itself decides the return value.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
					  const char *rootfs)
{
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathcopy = NULL;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	/* relative to root mount point */
	ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so its result must never be free()d:
		 * keep and free the strdup()ed pointer instead.
		 */
		pathcopy = strdup(path);
		if (!pathcopy || mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}
	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
2117
2118 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2119 const char *lxc_name)
2120 {
2121 struct mntent mntent;
2122 char buf[4096];
2123 int ret = -1;
2124
2125 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2126
2127 if (!rootfs->path) {
2128 if (mount_entry_on_systemfs(&mntent))
2129 goto out;
2130 continue;
2131 }
2132
2133 /* We have a separate root, mounts are relative to it */
2134 if (mntent.mnt_dir[0] != '/') {
2135 if (mount_entry_on_relative_rootfs(&mntent,
2136 rootfs->mount))
2137 goto out;
2138 continue;
2139 }
2140
2141 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name))
2142 goto out;
2143 }
2144
2145 ret = 0;
2146
2147 INFO("mount points have been setup");
2148 out:
2149 return ret;
2150 }
2151
/*
 * Mount all entries of the fstab file named by fstab.
 * A NULL fstab means there is nothing to do.
 *
 * Returns 0 on success, -1 when the file cannot be opened or a mount
 * fails.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
		       const char *lxc_name)
{
	FILE *fp;
	int rc;

	if (fstab == NULL)
		return 0;

	fp = setmntent(fstab, "r");
	if (fp == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, fp, lxc_name);
	endmntent(fp);

	return rc;
}
2172
2173 static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list *mount,
2174 const char *lxc_name)
2175 {
2176 FILE *file;
2177 struct lxc_list *iterator;
2178 char *mount_entry;
2179 int ret;
2180
2181 file = tmpfile();
2182 if (!file) {
2183 ERROR("tmpfile error: %m");
2184 return -1;
2185 }
2186
2187 lxc_list_for_each(iterator, mount) {
2188 mount_entry = iterator->elem;
2189 fprintf(file, "%s\n", mount_entry);
2190 }
2191
2192 rewind(file);
2193
2194 ret = mount_file_entries(rootfs, file, lxc_name);
2195
2196 fclose(file);
2197 return ret;
2198 }
2199
2200 static int parse_cap(const char *cap)
2201 {
2202 char *ptr = NULL;
2203 int i, capid = -1;
2204
2205 if (!strcmp(cap, "none"))
2206 return -2;
2207
2208 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2209
2210 if (strcmp(cap, caps_opt[i].name))
2211 continue;
2212
2213 capid = caps_opt[i].value;
2214 break;
2215 }
2216
2217 if (capid < 0) {
2218 /* try to see if it's numeric, so the user may specify
2219 * capabilities that the running kernel knows about but
2220 * we don't */
2221 errno = 0;
2222 capid = strtol(cap, &ptr, 10);
2223 if (!ptr || *ptr != '\0' || errno != 0)
2224 /* not a valid number */
2225 capid = -1;
2226 else if (capid > lxc_caps_last_cap())
2227 /* we have a number but it's not a valid
2228 * capability */
2229 capid = -1;
2230 }
2231
2232 return capid;
2233 }
2234
2235 int in_caplist(int cap, struct lxc_list *caps)
2236 {
2237 struct lxc_list *iterator;
2238 int capid;
2239
2240 lxc_list_for_each(iterator, caps) {
2241 capid = parse_cap(iterator->elem);
2242 if (capid == cap)
2243 return 1;
2244 }
2245
2246 return 0;
2247 }
2248
2249 static int setup_caps(struct lxc_list *caps)
2250 {
2251 struct lxc_list *iterator;
2252 char *drop_entry;
2253 int capid;
2254
2255 lxc_list_for_each(iterator, caps) {
2256
2257 drop_entry = iterator->elem;
2258
2259 capid = parse_cap(drop_entry);
2260
2261 if (capid < 0) {
2262 ERROR("unknown capability %s", drop_entry);
2263 return -1;
2264 }
2265
2266 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2267
2268 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2269 SYSERROR("failed to remove %s capability", drop_entry);
2270 return -1;
2271 }
2272
2273 }
2274
2275 DEBUG("capabilities have been setup");
2276
2277 return 0;
2278 }
2279
2280 static int dropcaps_except(struct lxc_list *caps)
2281 {
2282 struct lxc_list *iterator;
2283 char *keep_entry;
2284 int i, capid;
2285 int numcaps = lxc_caps_last_cap() + 1;
2286 INFO("found %d capabilities", numcaps);
2287
2288 if (numcaps <= 0 || numcaps > 200)
2289 return -1;
2290
2291 // caplist[i] is 1 if we keep capability i
2292 int *caplist = alloca(numcaps * sizeof(int));
2293 memset(caplist, 0, numcaps * sizeof(int));
2294
2295 lxc_list_for_each(iterator, caps) {
2296
2297 keep_entry = iterator->elem;
2298
2299 capid = parse_cap(keep_entry);
2300
2301 if (capid == -2)
2302 continue;
2303
2304 if (capid < 0) {
2305 ERROR("unknown capability %s", keep_entry);
2306 return -1;
2307 }
2308
2309 DEBUG("keep capability '%s' (%d)", keep_entry, capid);
2310
2311 caplist[capid] = 1;
2312 }
2313 for (i=0; i<numcaps; i++) {
2314 if (caplist[i])
2315 continue;
2316 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2317 SYSERROR("failed to remove capability %d", i);
2318 return -1;
2319 }
2320 }
2321
2322 DEBUG("capabilities have been setup");
2323
2324 return 0;
2325 }
2326
2327 static int setup_hw_addr(char *hwaddr, const char *ifname)
2328 {
2329 struct sockaddr sockaddr;
2330 struct ifreq ifr;
2331 int ret, fd;
2332
2333 ret = lxc_convert_mac(hwaddr, &sockaddr);
2334 if (ret) {
2335 ERROR("mac address '%s' conversion failed : %s",
2336 hwaddr, strerror(-ret));
2337 return -1;
2338 }
2339
2340 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2341 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2342 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2343
2344 fd = socket(AF_INET, SOCK_DGRAM, 0);
2345 if (fd < 0) {
2346 ERROR("socket failure : %s", strerror(errno));
2347 return -1;
2348 }
2349
2350 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2351 close(fd);
2352 if (ret)
2353 ERROR("ioctl failure : %s", strerror(errno));
2354
2355 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2356
2357 return ret;
2358 }
2359
2360 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2361 {
2362 struct lxc_list *iterator;
2363 struct lxc_inetdev *inetdev;
2364 int err;
2365
2366 lxc_list_for_each(iterator, ip) {
2367
2368 inetdev = iterator->elem;
2369
2370 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2371 &inetdev->bcast, inetdev->prefix);
2372 if (err) {
2373 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2374 ifindex, strerror(-err));
2375 return -1;
2376 }
2377 }
2378
2379 return 0;
2380 }
2381
2382 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2383 {
2384 struct lxc_list *iterator;
2385 struct lxc_inet6dev *inet6dev;
2386 int err;
2387
2388 lxc_list_for_each(iterator, ip) {
2389
2390 inet6dev = iterator->elem;
2391
2392 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2393 &inet6dev->mcast, &inet6dev->acast,
2394 inet6dev->prefix);
2395 if (err) {
2396 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2397 ifindex, strerror(-err));
2398 return -1;
2399 }
2400 }
2401
2402 return 0;
2403 }
2404
2405 static int setup_netdev(struct lxc_netdev *netdev)
2406 {
2407 char ifname[IFNAMSIZ];
2408 char *current_ifname = ifname;
2409 int err;
2410
2411 /* empty network namespace */
2412 if (!netdev->ifindex) {
2413 if (netdev->flags & IFF_UP) {
2414 err = lxc_netdev_up("lo");
2415 if (err) {
2416 ERROR("failed to set the loopback up : %s",
2417 strerror(-err));
2418 return -1;
2419 }
2420 }
2421 if (netdev->type != LXC_NET_VETH)
2422 return 0;
2423 netdev->ifindex = if_nametoindex(netdev->name);
2424 }
2425
2426 /* get the new ifindex in case of physical netdev */
2427 if (netdev->type == LXC_NET_PHYS) {
2428 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2429 ERROR("failed to get ifindex for %s",
2430 netdev->link);
2431 return -1;
2432 }
2433 }
2434
2435 /* retrieve the name of the interface */
2436 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2437 ERROR("no interface corresponding to index '%d'",
2438 netdev->ifindex);
2439 return -1;
2440 }
2441
2442 /* default: let the system to choose one interface name */
2443 if (!netdev->name)
2444 netdev->name = netdev->type == LXC_NET_PHYS ?
2445 netdev->link : "eth%d";
2446
2447 /* rename the interface name */
2448 if (strcmp(ifname, netdev->name) != 0) {
2449 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2450 if (err) {
2451 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2452 strerror(-err));
2453 return -1;
2454 }
2455 }
2456
2457 /* Re-read the name of the interface because its name has changed
2458 * and would be automatically allocated by the system
2459 */
2460 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2461 ERROR("no interface corresponding to index '%d'",
2462 netdev->ifindex);
2463 return -1;
2464 }
2465
2466 /* set a mac address */
2467 if (netdev->hwaddr) {
2468 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2469 ERROR("failed to setup hw address for '%s'",
2470 current_ifname);
2471 return -1;
2472 }
2473 }
2474
2475 /* setup ipv4 addresses on the interface */
2476 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2477 ERROR("failed to setup ip addresses for '%s'",
2478 ifname);
2479 return -1;
2480 }
2481
2482 /* setup ipv6 addresses on the interface */
2483 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2484 ERROR("failed to setup ipv6 addresses for '%s'",
2485 ifname);
2486 return -1;
2487 }
2488
2489 /* set the network device up */
2490 if (netdev->flags & IFF_UP) {
2491 int err;
2492
2493 err = lxc_netdev_up(current_ifname);
2494 if (err) {
2495 ERROR("failed to set '%s' up : %s", current_ifname,
2496 strerror(-err));
2497 return -1;
2498 }
2499
2500 /* the network is up, make the loopback up too */
2501 err = lxc_netdev_up("lo");
2502 if (err) {
2503 ERROR("failed to set the loopback up : %s",
2504 strerror(-err));
2505 return -1;
2506 }
2507 }
2508
2509 /* We can only set up the default routes after bringing
2510 * up the interface, sine bringing up the interface adds
2511 * the link-local routes and we can't add a default
2512 * route if the gateway is not reachable. */
2513
2514 /* setup ipv4 gateway on the interface */
2515 if (netdev->ipv4_gateway) {
2516 if (!(netdev->flags & IFF_UP)) {
2517 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2518 return -1;
2519 }
2520
2521 if (lxc_list_empty(&netdev->ipv4)) {
2522 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2523 return -1;
2524 }
2525
2526 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2527 if (err) {
2528 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2529 if (err) {
2530 ERROR("failed to add ipv4 dest for '%s': %s",
2531 ifname, strerror(-err));
2532 }
2533
2534 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2535 if (err) {
2536 ERROR("failed to setup ipv4 gateway for '%s': %s",
2537 ifname, strerror(-err));
2538 if (netdev->ipv4_gateway_auto) {
2539 char buf[INET_ADDRSTRLEN];
2540 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2541 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2542 }
2543 return -1;
2544 }
2545 }
2546 }
2547
2548 /* setup ipv6 gateway on the interface */
2549 if (netdev->ipv6_gateway) {
2550 if (!(netdev->flags & IFF_UP)) {
2551 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2552 return -1;
2553 }
2554
2555 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2556 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2557 return -1;
2558 }
2559
2560 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2561 if (err) {
2562 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2563 if (err) {
2564 ERROR("failed to add ipv6 dest for '%s': %s",
2565 ifname, strerror(-err));
2566 }
2567
2568 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2569 if (err) {
2570 ERROR("failed to setup ipv6 gateway for '%s': %s",
2571 ifname, strerror(-err));
2572 if (netdev->ipv6_gateway_auto) {
2573 char buf[INET6_ADDRSTRLEN];
2574 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2575 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2576 }
2577 return -1;
2578 }
2579 }
2580 }
2581
2582 DEBUG("'%s' has been setup", current_ifname);
2583
2584 return 0;
2585 }
2586
2587 static int setup_network(struct lxc_list *network)
2588 {
2589 struct lxc_list *iterator;
2590 struct lxc_netdev *netdev;
2591
2592 lxc_list_for_each(iterator, network) {
2593
2594 netdev = iterator->elem;
2595
2596 if (setup_netdev(netdev)) {
2597 ERROR("failed to setup netdev");
2598 return -1;
2599 }
2600 }
2601
2602 if (!lxc_list_empty(network))
2603 INFO("network has been setup");
2604
2605 return 0;
2606 }
2607
2608 /* try to move physical nics to the init netns */
2609 void restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2610 {
2611 int i, ret, oldfd;
2612 char path[MAXPATHLEN];
2613
2614 if (netnsfd < 0)
2615 return;
2616
2617 ret = snprintf(path, MAXPATHLEN, "/proc/self/ns/net");
2618 if (ret < 0 || ret >= MAXPATHLEN) {
2619 WARN("Failed to open monitor netns fd");
2620 return;
2621 }
2622 if ((oldfd = open(path, O_RDONLY)) < 0) {
2623 SYSERROR("Failed to open monitor netns fd");
2624 return;
2625 }
2626 if (setns(netnsfd, 0) != 0) {
2627 SYSERROR("Failed to enter container netns to reset nics");
2628 close(oldfd);
2629 return;
2630 }
2631 for (i=0; i<conf->num_savednics; i++) {
2632 struct saved_nic *s = &conf->saved_nics[i];
2633 if (lxc_netdev_move_by_index(s->ifindex, 1))
2634 WARN("Error moving nic index:%d back to host netns",
2635 s->ifindex);
2636 }
2637 if (setns(oldfd, 0) != 0)
2638 SYSERROR("Failed to re-enter monitor's netns");
2639 close(oldfd);
2640 }
2641
2642 void lxc_rename_phys_nics_on_shutdown(int netnsfd, struct lxc_conf *conf)
2643 {
2644 int i;
2645
2646 if (conf->num_savednics == 0)
2647 return;
2648
2649 INFO("running to reset %d nic names", conf->num_savednics);
2650 restore_phys_nics_to_netns(netnsfd, conf);
2651 for (i=0; i<conf->num_savednics; i++) {
2652 struct saved_nic *s = &conf->saved_nics[i];
2653 INFO("resetting nic %d to %s", s->ifindex, s->orig_name);
2654 lxc_netdev_rename_by_index(s->ifindex, s->orig_name);
2655 free(s->orig_name);
2656 }
2657 conf->num_savednics = 0;
2658 }
2659
2660 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2661
2662 struct lxc_conf *lxc_conf_init(void)
2663 {
2664 struct lxc_conf *new;
2665 int i;
2666
2667 new = malloc(sizeof(*new));
2668 if (!new) {
2669 ERROR("lxc_conf_init : %m");
2670 return NULL;
2671 }
2672 memset(new, 0, sizeof(*new));
2673
2674 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
2675 new->personality = -1;
2676 new->autodev = -1;
2677 new->console.log_path = NULL;
2678 new->console.log_fd = -1;
2679 new->console.path = NULL;
2680 new->console.peer = -1;
2681 new->console.peerpty.busy = -1;
2682 new->console.peerpty.master = -1;
2683 new->console.peerpty.slave = -1;
2684 new->console.master = -1;
2685 new->console.slave = -1;
2686 new->console.name[0] = '\0';
2687 new->maincmd_fd = -1;
2688 new->nbd_idx = -1;
2689 new->rootfs.mount = strdup(default_rootfs_mount);
2690 if (!new->rootfs.mount) {
2691 ERROR("lxc_conf_init : %m");
2692 free(new);
2693 return NULL;
2694 }
2695 new->kmsg = 1;
2696 lxc_list_init(&new->cgroup);
2697 lxc_list_init(&new->network);
2698 lxc_list_init(&new->mount_list);
2699 lxc_list_init(&new->caps);
2700 lxc_list_init(&new->keepcaps);
2701 lxc_list_init(&new->id_map);
2702 lxc_list_init(&new->includes);
2703 lxc_list_init(&new->aliens);
2704 lxc_list_init(&new->environment);
2705 for (i=0; i<NUM_LXC_HOOKS; i++)
2706 lxc_list_init(&new->hooks[i]);
2707 lxc_list_init(&new->groups);
2708 new->lsm_aa_profile = NULL;
2709 new->lsm_se_context = NULL;
2710 new->tmp_umount_proc = 0;
2711
2712 for (i = 0; i < LXC_NS_MAX; i++)
2713 new->inherit_ns_fd[i] = -1;
2714
2715 return new;
2716 }
2717
2718 static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2719 {
2720 char veth1buf[IFNAMSIZ], *veth1;
2721 char veth2buf[IFNAMSIZ], *veth2;
2722 int err;
2723
2724 if (netdev->priv.veth_attr.pair)
2725 veth1 = netdev->priv.veth_attr.pair;
2726 else {
2727 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2728 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2729 ERROR("veth1 name too long");
2730 return -1;
2731 }
2732 veth1 = lxc_mkifname(veth1buf);
2733 if (!veth1) {
2734 ERROR("failed to allocate a temporary name");
2735 return -1;
2736 }
2737 /* store away for deconf */
2738 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2739 }
2740
2741 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2742 veth2 = lxc_mkifname(veth2buf);
2743 if (!veth2) {
2744 ERROR("failed to allocate a temporary name");
2745 goto out_delete;
2746 }
2747
2748 err = lxc_veth_create(veth1, veth2);
2749 if (err) {
2750 ERROR("failed to create %s-%s : %s", veth1, veth2,
2751 strerror(-err));
2752 goto out_delete;
2753 }
2754
2755 /* changing the high byte of the mac address to 0xfe, the bridge interface
2756 * will always keep the host's mac address and not take the mac address
2757 * of a container */
2758 err = setup_private_host_hw_addr(veth1);
2759 if (err) {
2760 ERROR("failed to change mac address of host interface '%s' : %s",
2761 veth1, strerror(-err));
2762 goto out_delete;
2763 }
2764
2765 if (netdev->mtu) {
2766 err = lxc_netdev_set_mtu(veth1, atoi(netdev->mtu));
2767 if (!err)
2768 err = lxc_netdev_set_mtu(veth2, atoi(netdev->mtu));
2769 if (err) {
2770 ERROR("failed to set mtu '%s' for %s-%s : %s",
2771 netdev->mtu, veth1, veth2, strerror(-err));
2772 goto out_delete;
2773 }
2774 }
2775
2776 if (netdev->link) {
2777 err = lxc_bridge_attach(netdev->link, veth1);
2778 if (err) {
2779 ERROR("failed to attach '%s' to the bridge '%s' : %s",
2780 veth1, netdev->link, strerror(-err));
2781 goto out_delete;
2782 }
2783 }
2784
2785 netdev->ifindex = if_nametoindex(veth2);
2786 if (!netdev->ifindex) {
2787 ERROR("failed to retrieve the index for %s", veth2);
2788 goto out_delete;
2789 }
2790
2791 err = lxc_netdev_up(veth1);
2792 if (err) {
2793 ERROR("failed to set %s up : %s", veth1, strerror(-err));
2794 goto out_delete;
2795 }
2796
2797 if (netdev->upscript) {
2798 err = run_script(handler->name, "net", netdev->upscript, "up",
2799 "veth", veth1, (char*) NULL);
2800 if (err)
2801 goto out_delete;
2802 }
2803
2804 DEBUG("instanciated veth '%s/%s', index is '%d'",
2805 veth1, veth2, netdev->ifindex);
2806
2807 return 0;
2808
2809 out_delete:
2810 lxc_netdev_delete_by_name(veth1);
2811 if (!netdev->priv.veth_attr.pair && veth1)
2812 free(veth1);
2813 if(veth2)
2814 free(veth2);
2815 return -1;
2816 }
2817
2818 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2819 {
2820 char *veth1;
2821 int err;
2822
2823 if (netdev->priv.veth_attr.pair)
2824 veth1 = netdev->priv.veth_attr.pair;
2825 else
2826 veth1 = netdev->priv.veth_attr.veth1;
2827
2828 if (netdev->downscript) {
2829 err = run_script(handler->name, "net", netdev->downscript,
2830 "down", "veth", veth1, (char*) NULL);
2831 if (err)
2832 return -1;
2833 }
2834 return 0;
2835 }
2836
2837 static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2838 {
2839 char peerbuf[IFNAMSIZ], *peer;
2840 int err;
2841
2842 if (!netdev->link) {
2843 ERROR("no link specified for macvlan netdev");
2844 return -1;
2845 }
2846
2847 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2848 if (err >= sizeof(peerbuf))
2849 return -1;
2850
2851 peer = lxc_mkifname(peerbuf);
2852 if (!peer) {
2853 ERROR("failed to make a temporary name");
2854 return -1;
2855 }
2856
2857 err = lxc_macvlan_create(netdev->link, peer,
2858 netdev->priv.macvlan_attr.mode);
2859 if (err) {
2860 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2861 peer, netdev->link, strerror(-err));
2862 goto out;
2863 }
2864
2865 netdev->ifindex = if_nametoindex(peer);
2866 if (!netdev->ifindex) {
2867 ERROR("failed to retrieve the index for %s", peer);
2868 goto out;
2869 }
2870
2871 if (netdev->upscript) {
2872 err = run_script(handler->name, "net", netdev->upscript, "up",
2873 "macvlan", netdev->link, (char*) NULL);
2874 if (err)
2875 goto out;
2876 }
2877
2878 DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
2879 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2880
2881 return 0;
2882 out:
2883 lxc_netdev_delete_by_name(peer);
2884 free(peer);
2885 return -1;
2886 }
2887
2888 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2889 {
2890 int err;
2891
2892 if (netdev->downscript) {
2893 err = run_script(handler->name, "net", netdev->downscript,
2894 "down", "macvlan", netdev->link,
2895 (char*) NULL);
2896 if (err)
2897 return -1;
2898 }
2899 return 0;
2900 }
2901
2902 /* XXX: merge with instanciate_macvlan */
2903 static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2904 {
2905 char peer[IFNAMSIZ];
2906 int err;
2907
2908 if (!netdev->link) {
2909 ERROR("no link specified for vlan netdev");
2910 return -1;
2911 }
2912
2913 err = snprintf(peer, sizeof(peer), "vlan%d", netdev->priv.vlan_attr.vid);
2914 if (err >= sizeof(peer)) {
2915 ERROR("peer name too long");
2916 return -1;
2917 }
2918
2919 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2920 if (err) {
2921 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2922 peer, netdev->link, strerror(-err));
2923 return -1;
2924 }
2925
2926 netdev->ifindex = if_nametoindex(peer);
2927 if (!netdev->ifindex) {
2928 ERROR("failed to retrieve the ifindex for %s", peer);
2929 lxc_netdev_delete_by_name(peer);
2930 return -1;
2931 }
2932
2933 DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000",
2934 netdev->ifindex);
2935
2936 return 0;
2937 }
2938
/*
 * Shutdown callback for vlan devices: there is nothing to do, so both
 * arguments are ignored and 0 is always returned.
 */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2943
2944 static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2945 {
2946 if (!netdev->link) {
2947 ERROR("no link specified for the physical interface");
2948 return -1;
2949 }
2950
2951 netdev->ifindex = if_nametoindex(netdev->link);
2952 if (!netdev->ifindex) {
2953 ERROR("failed to retrieve the index for %s", netdev->link);
2954 return -1;
2955 }
2956
2957 if (netdev->upscript) {
2958 int err;
2959 err = run_script(handler->name, "net", netdev->upscript,
2960 "up", "phys", netdev->link, (char*) NULL);
2961 if (err)
2962 return -1;
2963 }
2964
2965 return 0;
2966 }
2967
2968 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2969 {
2970 int err;
2971
2972 if (netdev->downscript) {
2973 err = run_script(handler->name, "net", netdev->downscript,
2974 "down", "phys", netdev->link, (char*) NULL);
2975 if (err)
2976 return -1;
2977 }
2978 return 0;
2979 }
2980
2981 static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
2982 {
2983 netdev->ifindex = 0;
2984 return 0;
2985 }
2986
2987 static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2988 {
2989 netdev->ifindex = 0;
2990 if (netdev->upscript) {
2991 int err;
2992 err = run_script(handler->name, "net", netdev->upscript,
2993 "up", "empty", (char*) NULL);
2994 if (err)
2995 return -1;
2996 }
2997 return 0;
2998 }
2999
3000 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
3001 {
3002 int err;
3003
3004 if (netdev->downscript) {
3005 err = run_script(handler->name, "net", netdev->downscript,
3006 "down", "empty", (char*) NULL);
3007 if (err)
3008 return -1;
3009 }
3010 return 0;
3011 }
3012
/*
 * Shutdown callback for the "none" network type: there is nothing to do, so
 * both arguments are ignored and 0 is always returned.
 */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
3017
3018 int lxc_requests_empty_network(struct lxc_handler *handler)
3019 {
3020 struct lxc_list *network = &handler->conf->network;
3021 struct lxc_list *iterator;
3022 struct lxc_netdev *netdev;
3023 bool found_none = false, found_nic = false;
3024
3025 if (lxc_list_empty(network))
3026 return 0;
3027
3028 lxc_list_for_each(iterator, network) {
3029
3030 netdev = iterator->elem;
3031
3032 if (netdev->type == LXC_NET_NONE)
3033 found_none = true;
3034 else
3035 found_nic = true;
3036 }
3037 if (found_none && !found_nic)
3038 return 1;
3039 return 0;
3040 }
3041
3042 int lxc_create_network(struct lxc_handler *handler)
3043 {
3044 struct lxc_list *network = &handler->conf->network;
3045 struct lxc_list *iterator;
3046 struct lxc_netdev *netdev;
3047 int am_root = (getuid() == 0);
3048
3049 if (!am_root)
3050 return 0;
3051
3052 lxc_list_for_each(iterator, network) {
3053
3054 netdev = iterator->elem;
3055
3056 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3057 ERROR("invalid network configuration type '%d'",
3058 netdev->type);
3059 return -1;
3060 }
3061
3062 if (netdev_conf[netdev->type](handler, netdev)) {
3063 ERROR("failed to create netdev");
3064 return -1;
3065 }
3066
3067 }
3068
3069 return 0;
3070 }
3071
3072 void lxc_delete_network(struct lxc_handler *handler)
3073 {
3074 struct lxc_list *network = &handler->conf->network;
3075 struct lxc_list *iterator;
3076 struct lxc_netdev *netdev;
3077
3078 lxc_list_for_each(iterator, network) {
3079 netdev = iterator->elem;
3080
3081 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3082 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3083 WARN("failed to rename to the initial name the " \
3084 "netdev '%s'", netdev->link);
3085 continue;
3086 }
3087
3088 if (netdev_deconf[netdev->type](handler, netdev)) {
3089 WARN("failed to destroy netdev");
3090 }
3091
3092 /* Recent kernel remove the virtual interfaces when the network
3093 * namespace is destroyed but in case we did not moved the
3094 * interface to the network namespace, we have to destroy it
3095 */
3096 if (netdev->ifindex != 0 &&
3097 lxc_netdev_delete_by_index(netdev->ifindex))
3098 WARN("failed to remove interface '%s'", netdev->name);
3099 }
3100 }
3101
3102 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3103
3104 /* lxc-user-nic returns "interface_name:interface_name\n" */
3105 #define MAX_BUFFER_SIZE IFNAMSIZ*2 + 2
3106 static int unpriv_assign_nic(struct lxc_netdev *netdev, pid_t pid)
3107 {
3108 pid_t child;
3109 int bytes, pipefd[2];
3110 char *token, *saveptr = NULL;
3111 char buffer[MAX_BUFFER_SIZE];
3112
3113 if (netdev->type != LXC_NET_VETH) {
3114 ERROR("nic type %d not support for unprivileged use",
3115 netdev->type);
3116 return -1;
3117 }
3118
3119 if(pipe(pipefd) < 0) {
3120 SYSERROR("pipe failed");
3121 return -1;
3122 }
3123
3124 if ((child = fork()) < 0) {
3125 SYSERROR("fork");
3126 close(pipefd[0]);
3127 close(pipefd[1]);
3128 return -1;
3129 }
3130
3131 if (child == 0) { // child
3132 /* close the read-end of the pipe */
3133 close(pipefd[0]);
3134 /* redirect the stdout to write-end of the pipe */
3135 dup2(pipefd[1], STDOUT_FILENO);
3136 /* close the write-end of the pipe */
3137 close(pipefd[1]);
3138
3139 // Call lxc-user-nic pid type bridge
3140 char pidstr[20];
3141 char *args[] = {LXC_USERNIC_PATH, pidstr, "veth", netdev->link, netdev->name, NULL };
3142 snprintf(pidstr, 19, "%lu", (unsigned long) pid);
3143 pidstr[19] = '\0';
3144 execvp(args[0], args);
3145 SYSERROR("execvp lxc-user-nic");
3146 exit(1);
3147 }
3148
3149 /* close the write-end of the pipe */
3150 close(pipefd[1]);
3151
3152 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3153 if (bytes < 0) {
3154 SYSERROR("read failed");
3155 }
3156 buffer[bytes - 1] = '\0';
3157
3158 if (wait_for_pid(child) != 0) {
3159 close(pipefd[0]);
3160 return -1;
3161 }
3162
3163 /* close the read-end of the pipe */
3164 close(pipefd[0]);
3165
3166 /* fill netdev->name field */
3167 token = strtok_r(buffer, ":", &saveptr);
3168 if (!token)
3169 return -1;
3170 netdev->name = malloc(IFNAMSIZ+1);
3171 if (!netdev->name) {
3172 ERROR("Out of memory");
3173 return -1;
3174 }
3175 memset(netdev->name, 0, IFNAMSIZ+1);
3176 strncpy(netdev->name, token, IFNAMSIZ);
3177
3178 /* fill netdev->veth_attr.pair field */
3179 token = strtok_r(NULL, ":", &saveptr);
3180 if (!token)
3181 return -1;
3182 netdev->priv.veth_attr.pair = strdup(token);
3183 if (!netdev->priv.veth_attr.pair) {
3184 ERROR("Out of memory");
3185 return -1;
3186 }
3187
3188 return 0;
3189 }
3190
3191 int lxc_assign_network(struct lxc_list *network, pid_t pid)
3192 {
3193 struct lxc_list *iterator;
3194 struct lxc_netdev *netdev;
3195 int am_root = (getuid() == 0);
3196 int err;
3197
3198 lxc_list_for_each(iterator, network) {
3199
3200 netdev = iterator->elem;
3201
3202 if (netdev->type == LXC_NET_VETH && !am_root) {
3203 if (unpriv_assign_nic(netdev, pid))
3204 return -1;
3205 // lxc-user-nic has moved the nic to the new ns.
3206 // unpriv_assign_nic() fills in netdev->name.
3207 // netdev->ifindex will be filed in at setup_netdev.
3208 continue;
3209 }
3210
3211 /* empty network namespace, nothing to move */
3212 if (!netdev->ifindex)
3213 continue;
3214
3215 err = lxc_netdev_move_by_index(netdev->ifindex, pid);
3216 if (err) {
3217 ERROR("failed to move '%s' to the container : %s",
3218 netdev->link, strerror(-err));
3219 return -1;
3220 }
3221
3222 DEBUG("move '%s' to '%d'", netdev->name, pid);
3223 }
3224
3225 return 0;
3226 }
3227
3228 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3229 size_t buf_size)
3230 {
3231 char path[PATH_MAX];
3232 int ret, closeret;
3233 FILE *f;
3234
3235 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
3236 if (ret < 0 || ret >= PATH_MAX) {
3237 fprintf(stderr, "%s: path name too long\n", __func__);
3238 return -E2BIG;
3239 }
3240 f = fopen(path, "w");
3241 if (!f) {
3242 perror("open");
3243 return -EINVAL;
3244 }
3245 ret = fwrite(buf, buf_size, 1, f);
3246 if (ret < 0)
3247 SYSERROR("writing id mapping");
3248 closeret = fclose(f);
3249 if (closeret)
3250 SYSERROR("writing id mapping");
3251 return ret < 0 ? ret : closeret;
3252 }
3253
3254 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3255 {
3256 struct lxc_list *iterator;
3257 struct id_map *map;
3258 int ret = 0, use_shadow = 0;
3259 enum idtype type;
3260 char *buf = NULL, *pos, *cmdpath = NULL;
3261
3262 cmdpath = on_path("newuidmap", NULL);
3263 if (cmdpath) {
3264 use_shadow = 1;
3265 free(cmdpath);
3266 }
3267
3268 if (!use_shadow) {
3269 cmdpath = on_path("newgidmap", NULL);
3270 if (cmdpath) {
3271 use_shadow = 1;
3272 free(cmdpath);
3273 }
3274 }
3275
3276 if (!use_shadow && geteuid()) {
3277 ERROR("Missing newuidmap/newgidmap");
3278 return -1;
3279 }
3280
3281 for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
3282 int left, fill;
3283 int had_entry = 0;
3284 if (!buf) {
3285 buf = pos = malloc(4096);
3286 if (!buf)
3287 return -ENOMEM;
3288 }
3289 pos = buf;
3290 if (use_shadow)
3291 pos += sprintf(buf, "new%cidmap %d",
3292 type == ID_TYPE_UID ? 'u' : 'g',
3293 pid);
3294
3295 lxc_list_for_each(iterator, idmap) {
3296 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
3297 map = iterator->elem;
3298 if (map->idtype != type)
3299 continue;
3300
3301 had_entry = 1;
3302 left = 4096 - (pos - buf);
3303 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3304 use_shadow ? " " : "",
3305 map->nsid, map->hostid, map->range,
3306 use_shadow ? "" : "\n");
3307 if (fill <= 0 || fill >= left)
3308 SYSERROR("snprintf failed, too many mappings");
3309 pos += fill;
3310 }
3311 if (!had_entry)
3312 continue;
3313
3314 if (!use_shadow) {
3315 ret = write_id_mapping(type, pid, buf, pos-buf);
3316 } else {
3317 left = 4096 - (pos - buf);
3318 fill = snprintf(pos, left, "\n");
3319 if (fill <= 0 || fill >= left)
3320 SYSERROR("snprintf failed, too many mappings");
3321 pos += fill;
3322 ret = system(buf);
3323 }
3324
3325 if (ret)
3326 break;
3327 }
3328
3329 if (buf)
3330 free(buf);
3331 return ret;
3332 }
3333
3334 /*
3335 * return the host uid/gid to which the container root is mapped in
3336 * *val.
3337 * Return true if id was found, false otherwise.
3338 */
3339 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3340 unsigned long *val)
3341 {
3342 struct lxc_list *it;
3343 struct id_map *map;
3344
3345 lxc_list_for_each(it, &conf->id_map) {
3346 map = it->elem;
3347 if (map->idtype != idtype)
3348 continue;
3349 if (map->nsid != 0)
3350 continue;
3351 *val = map->hostid;
3352 return true;
3353 }
3354 return false;
3355 }
3356
3357 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3358 {
3359 struct lxc_list *it;
3360 struct id_map *map;
3361 lxc_list_for_each(it, &conf->id_map) {
3362 map = it->elem;
3363 if (map->idtype != idtype)
3364 continue;
3365 if (id >= map->hostid && id < map->hostid + map->range)
3366 return (id - map->hostid) + map->nsid;
3367 }
3368 return -1;
3369 }
3370
3371 int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
3372 {
3373 struct lxc_list *it;
3374 struct id_map *map;
3375 unsigned int freeid = 0;
3376 again:
3377 lxc_list_for_each(it, &conf->id_map) {
3378 map = it->elem;
3379 if (map->idtype != idtype)
3380 continue;
3381 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3382 freeid = map->nsid + map->range;
3383 goto again;
3384 }
3385 }
3386 return freeid;
3387 }
3388
3389 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3390 {
3391 struct lxc_list *network = &handler->conf->network;
3392 struct lxc_list *iterator;
3393 struct lxc_netdev *netdev;
3394 int link_index;
3395
3396 lxc_list_for_each(iterator, network) {
3397 netdev = iterator->elem;
3398
3399 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3400 continue;
3401
3402 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3403 ERROR("gateway = auto only supported for "
3404 "veth and macvlan");
3405 return -1;
3406 }
3407
3408 if (!netdev->link) {
3409 ERROR("gateway = auto needs a link interface");
3410 return -1;
3411 }
3412
3413 link_index = if_nametoindex(netdev->link);
3414 if (!link_index)
3415 return -EINVAL;
3416
3417 if (netdev->ipv4_gateway_auto) {
3418 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3419 ERROR("failed to automatically find ipv4 gateway "
3420 "address from link interface '%s'", netdev->link);
3421 return -1;
3422 }
3423 }
3424
3425 if (netdev->ipv6_gateway_auto) {
3426 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3427 ERROR("failed to automatically find ipv6 gateway "
3428 "address from link interface '%s'", netdev->link);
3429 return -1;
3430 }
3431 }
3432 }
3433
3434 return 0;
3435 }
3436
3437 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3438 {
3439 struct lxc_tty_info *tty_info = &conf->tty_info;
3440 int i, ret;
3441
3442 /* no tty in the configuration */
3443 if (!conf->tty)
3444 return 0;
3445
3446 tty_info->pty_info =
3447 malloc(sizeof(*tty_info->pty_info)*conf->tty);
3448 if (!tty_info->pty_info) {
3449 SYSERROR("failed to allocate pty_info");
3450 return -1;
3451 }
3452
3453 for (i = 0; i < conf->tty; i++) {
3454
3455 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3456
3457 process_lock();
3458 ret = openpty(&pty_info->master, &pty_info->slave,
3459 pty_info->name, NULL, NULL);
3460 process_unlock();
3461 if (ret) {
3462 SYSERROR("failed to create pty #%d", i);
3463 tty_info->nbtty = i;
3464 lxc_delete_tty(tty_info);
3465 return -1;
3466 }
3467
3468 DEBUG("allocated pty '%s' (%d/%d)",
3469 pty_info->name, pty_info->master, pty_info->slave);
3470
3471 /* Prevent leaking the file descriptors to the container */
3472 fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3473 fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3474
3475 pty_info->busy = 0;
3476 }
3477
3478 tty_info->nbtty = conf->tty;
3479
3480 INFO("tty's configured");
3481
3482 return 0;
3483 }
3484
3485 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3486 {
3487 int i;
3488
3489 for (i = 0; i < tty_info->nbtty; i++) {
3490 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3491
3492 close(pty_info->master);
3493 close(pty_info->slave);
3494 }
3495
3496 free(tty_info->pty_info);
3497 tty_info->nbtty = 0;
3498 }
3499
3500 /*
3501 * chown_mapped_root: for an unprivileged user with uid/gid X to
3502 * chown a dir to subuid/subgid Y, he needs to run chown as root
3503 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3504 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3505 * root is privileged with respect to hostuid/hostgid X, allowing
3506 * him to do the chown.
3507 */
3508 int chown_mapped_root(char *path, struct lxc_conf *conf)
3509 {
3510 uid_t rootuid;
3511 gid_t rootgid;
3512 pid_t pid;
3513 unsigned long val;
3514 char *chownpath = path;
3515
3516 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3517 ERROR("No mapping for container root");
3518 return -1;
3519 }
3520 rootuid = (uid_t) val;
3521 if (!get_mapped_rootid(conf, ID_TYPE_GID, &val)) {
3522 ERROR("No mapping for container root");
3523 return -1;
3524 }
3525 rootgid = (gid_t) val;
3526
3527 /*
3528 * In case of overlay, we want only the writeable layer
3529 * to be chowned
3530 */
3531 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3532 chownpath = strchr(path, ':');
3533 if (!chownpath) {
3534 ERROR("Bad overlay path: %s", path);
3535 return -1;
3536 }
3537 chownpath = strchr(chownpath+1, ':');
3538 if (!chownpath) {
3539 ERROR("Bad overlay path: %s", path);
3540 return -1;
3541 }
3542 chownpath++;
3543 }
3544 path = chownpath;
3545 if (geteuid() == 0) {
3546 if (chown(path, rootuid, rootgid) < 0) {
3547 ERROR("Error chowning %s", path);
3548 return -1;
3549 }
3550 return 0;
3551 }
3552
3553 if (rootuid == geteuid()) {
3554 // nothing to do
3555 INFO("%s: container root is our uid; no need to chown" ,__func__);
3556 return 0;
3557 }
3558
3559 pid = fork();
3560 if (pid < 0) {
3561 SYSERROR("Failed forking");
3562 return -1;
3563 }
3564 if (!pid) {
3565 int hostuid = geteuid(), hostgid = getegid(), ret;
3566 struct stat sb;
3567 char map1[100], map2[100], map3[100], map4[100], map5[100];
3568 char ugid[100];
3569 char *args1[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3570 "-m", map3, "-m", map5,
3571 "--", "chown", ugid, path, NULL };
3572 char *args2[] = { "lxc-usernsexec", "-m", map1, "-m", map2,
3573 "-m", map3, "-m", map4, "-m", map5,
3574 "--", "chown", ugid, path, NULL };
3575
3576 // save the current gid of "path"
3577 if (stat(path, &sb) < 0) {
3578 ERROR("Error stat %s", path);
3579 return -1;
3580 }
3581
3582 /*
3583 * A file has to be group-owned by a gid mapped into the
3584 * container, or the container won't be privileged over it.
3585 */
3586 if (sb.st_uid == geteuid() &&
3587 mapped_hostid(sb.st_gid, conf, ID_TYPE_GID) < 0 &&
3588 chown(path, -1, hostgid) < 0) {
3589 ERROR("Failed chgrping %s", path);
3590 return -1;
3591 }
3592
3593 // "u:0:rootuid:1"
3594 ret = snprintf(map1, 100, "u:0:%d:1", rootuid);
3595 if (ret < 0 || ret >= 100) {
3596 ERROR("Error uid printing map string");
3597 return -1;
3598 }
3599
3600 // "u:hostuid:hostuid:1"
3601 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3602 if (ret < 0 || ret >= 100) {
3603 ERROR("Error uid printing map string");
3604 return -1;
3605 }
3606
3607 // "g:0:rootgid:1"
3608 ret = snprintf(map3, 100, "g:0:%d:1", rootgid);
3609 if (ret < 0 || ret >= 100) {
3610 ERROR("Error gid printing map string");
3611 return -1;
3612 }
3613
3614 // "g:pathgid:rootgid+pathgid:1"
3615 ret = snprintf(map4, 100, "g:%d:%d:1", (gid_t)sb.st_gid,
3616 rootgid + (gid_t)sb.st_gid);
3617 if (ret < 0 || ret >= 100) {
3618 ERROR("Error gid printing map string");
3619 return -1;
3620 }
3621
3622 // "g:hostgid:hostgid:1"
3623 ret = snprintf(map5, 100, "g:%d:%d:1", hostgid, hostgid);
3624 if (ret < 0 || ret >= 100) {
3625 ERROR("Error gid printing map string");
3626 return -1;
3627 }
3628
3629 // "0:pathgid" (chown)
3630 ret = snprintf(ugid, 100, "0:%d", (gid_t)sb.st_gid);
3631 if (ret < 0 || ret >= 100) {
3632 ERROR("Error owner printing format string for chown");
3633 return -1;
3634 }
3635
3636 if (hostgid == sb.st_gid)
3637 ret = execvp("lxc-usernsexec", args1);
3638 else
3639 ret = execvp("lxc-usernsexec", args2);
3640 SYSERROR("Failed executing usernsexec");
3641 exit(1);
3642 }
3643 return wait_for_pid(pid);
3644 }
3645
3646 int ttys_shift_ids(struct lxc_conf *c)
3647 {
3648 int i;
3649
3650 if (lxc_list_empty(&c->id_map))
3651 return 0;
3652
3653 for (i = 0; i < c->tty_info.nbtty; i++) {
3654 struct lxc_pty_info *pty_info = &c->tty_info.pty_info[i];
3655
3656 if (chown_mapped_root(pty_info->name, c) < 0) {
3657 ERROR("Failed to chown %s", pty_info->name);
3658 return -1;
3659 }
3660 }
3661
3662 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
3663 ERROR("Failed to chown %s", c->console.name);
3664 return -1;
3665 }
3666
3667 return 0;
3668 }
3669
/* Opaque start data handed to check_autodev(); only argv[0] is read here. */
struct start_args {
	char *const *argv;
};

/* Upper bound on the number of symlinks check_autodev() will follow. */
#define MAX_SYMLINK_DEPTH 32

/*
 * This routine is called when the configuration does not already specify a
 * value for autodev (mounting a file system on /dev and populating it in a
 * container).  If a hard override value has not been specified, then we try
 * to apply some heuristics to determine if we should switch to autodev mode.
 *
 * For instance, if the container is running systemd as the init process it
 * needs the autodev mount to prevent it from mounting devtmpfs on /dev on
 * its own, causing conflicts in the host.
 *
 * We may also want to enable autodev if the host has devtmpfs mounted on its
 * /dev, as this then enables us to use subdirectories under /dev for the
 * container /dev directories and we can fake udev devices.
 *
 * @rootfs : directory holding the container root; resolved with realpath()
 * @data   : NULL, or a struct start_args whose argv[0] names the init command
 *           (defaults to "/sbin/init")
 *
 * Returns 1 if autodev should be enabled (the command, or a symlink target
 * reached from it, contains "systemd"), 0 if it is not required, and -2 if
 * this cannot be determined (bad rootfs, command not accessible, path too
 * long).
 */
static int check_autodev(const char *rootfs, void *data)
{
	struct start_args *arg = data;
	int ret;
	int loop_count = 0;
	struct stat s;
	char absrootfs[MAXPATHLEN];
	char path[MAXPATHLEN];
	char abs_path[MAXPATHLEN];
	char *command = "/sbin/init";

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (arg && arg->argv && arg->argv[0]) {
		command = arg->argv[0];
		DEBUG("Set exec command to %s", command );
	}

	/*
	 * snprintf() always terminates the buffer (strncpy() does not when the
	 * source fills it) and lets us reject an over-long command outright.
	 */
	ret = snprintf(path, sizeof(path), "%s", command);
	if (ret < 0 || (size_t)ret >= sizeof(path))
		return -2;

	/*
	 * NOTE(review): this probes `path` exactly as given, not underneath
	 * absrootfs -- confirm that is what is intended.
	 */
	if (0 != access(path, F_OK) || 0 != stat(path, &s))
		return -2;

	/* Dereference down the symlink merry path testing as we go. */
	/* If anything references systemd in the path - set autodev! */
	/* Renormalize to the rootfs before each dereference */
	/* Relative symlinks should fall out in the wash even with .. */
	while (1) {
		if (strstr(path, "systemd")) {
			INFO("Container with systemd init detected - enabling autodev!");
			return 1;
		}

		/* a truncated path would name the wrong file, so give up */
		ret = snprintf(abs_path, sizeof(abs_path), "%s/%s", absrootfs, path);
		if (ret < 0 || (size_t)ret >= sizeof(abs_path))
			return -2;

		/* readlink() does not terminate; keep one byte for the NUL */
		ret = readlink(abs_path, path, sizeof(path) - 1);

		if ((ret <= 0) || (++loop_count > MAX_SYMLINK_DEPTH)) {
			break; /* Break out for other tests */
		}
		path[ret] = '\0';
	}

	/*
	 * Add future checks here.
	 * Return positive if we should go autodev
	 * Return 0 if we should NOT go autodev
	 * Return negative if we encounter an error or can not determine...
	 */

	/* All else fails, we don't need autodev */
	INFO("Autodev not required.");
	return 0;
}
3751
/*
 * do_tmp_proc_mount: Mount /proc inside container if not already
 * mounted
 *
 * @rootfs : the rootfs where proc should be mounted
 *
 * The existing mount is accepted only when <rootfs>/proc/self reads "1";
 * any other target is lazily unmounted and replaced by a fresh proc mount.
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, ret;

	ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (ret < 0 || ret >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	/*
	 * readlink() does not NUL-terminate.  Zero the buffer and read at most
	 * sizeof(link) - 1 bytes so the "%s" below always sees a terminated
	 * string, even for an unexpectedly long link target.
	 */
	memset(link, 0, sizeof(link));
	linklen = readlink(path, link, sizeof(link) - 1);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);

	/* cannot truncate: strictly shorter than the string formatted above */
	ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
	if (linklen < 0) /* /proc not mounted */
		goto domount;
	/* compare the whole link target against "1" */
	if (strncmp(link, "1", linklen) != 0) {
		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
		goto domount;
	}
	/* the right proc is already mounted */
	return 0;

domount:
	if (mount("proc", path, "proc", 0, NULL))
		return -1;
	INFO("Mounted /proc in container for security transition");
	return 1;
}
3793
3794 int tmp_proc_mount(struct lxc_conf *lxc_conf)
3795 {
3796 int mounted;
3797
3798 if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0) {
3799 if (mount("proc", "/proc", "proc", 0, NULL)) {
3800 SYSERROR("Failed mounting /proc, proceeding");
3801 mounted = 0;
3802 } else
3803 mounted = 1;
3804 } else
3805 mounted = do_tmp_proc_mount(lxc_conf->rootfs.mount);
3806 if (mounted == -1) {
3807 SYSERROR("failed to mount /proc in the container.");
3808 return -1;
3809 } else if (mounted == 1) {
3810 lxc_conf->tmp_umount_proc = 1;
3811 }
3812 return 0;
3813 }
3814
3815 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3816 {
3817 if (lxc_conf->tmp_umount_proc == 1) {
3818 umount("/proc");
3819 lxc_conf->tmp_umount_proc = 0;
3820 }
3821 }
3822
/*
 * Terminate @word in place at its first space or tab, so that it holds a
 * single whitespace-free token.  A string without blanks is left unchanged.
 */
static void null_endofword(char *word)
{
	/* strcspn() yields the index of the first blank, or of the final NUL */
	word[strcspn(word, " \t")] = '\0';
}
3829
/*
 * Skip @nfields blank-terminated fields in @src.
 *
 * Each field is a run of non-blank characters followed by exactly one space
 * or tab, so two consecutive blanks delimit an empty field.  Returns a
 * pointer into @src just past the last skipped separator, or to the
 * terminating NUL if the string ends first.  Never returns NULL.
 */
static char *get_field(char *src, int nfields)
{
	char *cur = src;
	int skipped = 0;

	while (skipped < nfields) {
		/* jump to the blank (or NUL) that ends the current field */
		cur += strcspn(cur, " \t");
		if (*cur == '\0')
			break;
		cur++;
		skipped++;
	}

	return cur;
}
3847
/*
 * Walk /proc/self/mountinfo and, for every entry whose seventh field
 * contains "shared", remount the entry's fifth field (used as the mount
 * target) with MS_SLAVE.
 *
 * Best effort only: failing to open the file or to remount an entry is
 * logged and otherwise ignored.
 */
static void remount_all_slave(void)
{
	FILE *mountinfo = fopen("/proc/self/mountinfo", "r");
	char *buf = NULL;
	size_t cap = 0;

	if (mountinfo == NULL) {
		SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
		ERROR("Continuing container startup...");
		return;
	}

	while (getline(&buf, &cap, mountinfo) != -1) {
		/* fifth field of the line, then the field two further on */
		char *mntpoint = get_field(buf, 4);
		char *propagation = mntpoint ? get_field(mntpoint, 2) : NULL;

		if (propagation == NULL)
			continue;

		null_endofword(propagation);
		if (strstr(propagation, "shared") == NULL)
			continue;

		/* cut the target out of the line only once we know we need it */
		null_endofword(mntpoint);
		if (mount(NULL, mntpoint, NULL, MS_SLAVE, NULL) != 0) {
			SYSERROR("Failed to make %s rslave", mntpoint);
			ERROR("Continuing...");
		}
	}

	fclose(mountinfo);
	free(buf);
}
3882
3883 void lxc_execute_bind_init(struct lxc_conf *conf)
3884 {
3885 int ret;
3886 char path[PATH_MAX], destpath[PATH_MAX], *p;
3887
3888 /* If init exists in the container, don't bind mount a static one */
3889 p = choose_init(conf->rootfs.mount);
3890 if (p) {
3891 free(p);
3892 return;
3893 }
3894
3895 ret = snprintf(path, PATH_MAX, SBINDIR "/init.lxc.static");
3896 if (ret < 0 || ret >= PATH_MAX) {
3897 WARN("Path name too long searching for lxc.init.static");
3898 return;
3899 }
3900
3901 if (!file_exists(path)) {
3902 INFO("%s does not exist on host", path);
3903 return;
3904 }
3905
3906 ret = snprintf(destpath, PATH_MAX, "%s%s", conf->rootfs.mount, "/init.lxc.static");
3907 if (ret < 0 || ret >= PATH_MAX) {
3908 WARN("Path name too long for container's lxc.init.static");
3909 return;
3910 }
3911
3912 if (!file_exists(destpath)) {
3913 FILE * pathfile = fopen(destpath, "wb");
3914 if (!pathfile) {
3915 SYSERROR("Failed to create mount target '%s'", destpath);
3916 return;
3917 }
3918 fclose(pathfile);
3919 }
3920
3921 ret = mount(path, destpath, "none", MS_BIND, NULL);
3922 if (ret < 0)
3923 SYSERROR("Failed to bind lxc.init.static into container");
3924 INFO("lxc.init.static bound into container at %s", path);
3925 }
3926
3927 /*
3928 * This does the work of remounting / if it is shared, calling the
3929 * container pre-mount hooks, and mounting the rootfs.
3930 */
3931 int do_rootfs_setup(struct lxc_conf *conf, const char *name, const char *lxcpath)
3932 {
3933 if (conf->rootfs_setup) {
3934 /*
3935 * rootfs was set up in another namespace. bind-mount it
3936 * to give us a mount in our own ns so we can pivot_root to it
3937 */
3938 const char *path = conf->rootfs.mount;
3939 if (mount(path, path, "rootfs", MS_BIND, NULL) < 0) {
3940 ERROR("Failed to bind-mount container / onto itself");
3941 return false;
3942 }
3943 }
3944
3945 if (detect_ramfs_rootfs()) {
3946 if (chroot_into_slave(conf)) {
3947 ERROR("Failed to chroot into slave /");
3948 return -1;
3949 }
3950 }
3951
3952 remount_all_slave();
3953
3954 if (run_lxc_hooks(name, "pre-mount", conf, lxcpath, NULL)) {
3955 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3956 return -1;
3957 }
3958
3959 if (setup_rootfs(conf)) {
3960 ERROR("failed to setup rootfs for '%s'", name);
3961 return -1;
3962 }
3963
3964 conf->rootfs_setup = true;
3965 return 0;
3966 }
3967
3968 static bool verify_start_hooks(struct lxc_conf *conf)
3969 {
3970 struct lxc_list *it;
3971 char path[MAXPATHLEN];
3972 lxc_list_for_each(it, &conf->hooks[LXCHOOK_START]) {
3973 char *hookname = it->elem;
3974 struct stat st;
3975 int ret;
3976
3977 ret = snprintf(path, MAXPATHLEN, "%s%s",
3978 conf->rootfs.mount, hookname);
3979 if (ret < 0 || ret >= MAXPATHLEN)
3980 return false;
3981 ret = stat(path, &st);
3982 if (ret) {
3983 SYSERROR("Start hook %s not found in container rootfs",
3984 hookname);
3985 return false;
3986 }
3987 }
3988
3989 return true;
3990 }
3991
3992 int lxc_setup(struct lxc_handler *handler)
3993 {
3994 const char *name = handler->name;
3995 struct lxc_conf *lxc_conf = handler->conf;
3996 const char *lxcpath = handler->lxcpath;
3997 void *data = handler->data;
3998
3999 if (do_rootfs_setup(lxc_conf, name, lxcpath) < 0) {
4000 ERROR("Error setting up rootfs mount after spawn");
4001 return -1;
4002 }
4003
4004 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
4005 if (setup_utsname(lxc_conf->utsname)) {
4006 ERROR("failed to setup the utsname for '%s'", name);
4007 return -1;
4008 }
4009 }
4010
4011 if (setup_network(&lxc_conf->network)) {
4012 ERROR("failed to setup the network for '%s'", name);
4013 return -1;
4014 }
4015
4016 if (lxc_conf->autodev < 0) {
4017 lxc_conf->autodev = check_autodev(lxc_conf->rootfs.mount, data);
4018 }
4019
4020 if (lxc_conf->autodev > 0) {
4021 if (mount_autodev(name, lxc_conf->rootfs.mount, lxcpath)) {
4022 ERROR("failed to mount /dev in the container");
4023 return -1;
4024 }
4025 }
4026
4027 /* do automatic mounts (mainly /proc and /sys), but exclude
4028 * those that need to wait until other stuff has finished
4029 */
4030 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
4031 ERROR("failed to setup the automatic mounts for '%s'", name);
4032 return -1;
4033 }
4034
4035 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
4036 ERROR("failed to setup the mounts for '%s'", name);
4037 return -1;
4038 }
4039
4040 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
4041 ERROR("failed to setup the mount entries for '%s'", name);
4042 return -1;
4043 }
4044
4045 /* Make sure any start hooks are in the rootfs */
4046 if (!verify_start_hooks(lxc_conf))
4047 return -1;
4048
4049 if (lxc_conf->is_execute)
4050 lxc_execute_bind_init(lxc_conf);
4051
4052 /* now mount only cgroup, if wanted;
4053 * before, /sys could not have been mounted
4054 * (is either mounted automatically or via fstab entries)
4055 */
4056 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
4057 ERROR("failed to setup the automatic mounts for '%s'", name);
4058 return -1;
4059 }
4060
4061 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
4062 ERROR("failed to run mount hooks for container '%s'.", name);
4063 return -1;
4064 }
4065
4066 if (lxc_conf->autodev > 0) {
4067 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
4068 ERROR("failed to run autodev hooks for container '%s'.", name);
4069 return -1;
4070 }
4071 if (setup_autodev(lxc_conf->rootfs.mount)) {
4072 ERROR("failed to populate /dev in the container");
4073 return -1;
4074 }
4075 }
4076
4077 if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
4078 ERROR("failed to setup the console for '%s'", name);
4079 return -1;
4080 }
4081
4082 if (lxc_conf->kmsg) {
4083 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
4084 ERROR("failed to setup kmsg for '%s'", name);
4085 }
4086
4087 if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
4088 ERROR("failed to setup the ttys for '%s'", name);
4089 return -1;
4090 }
4091
4092 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
4093 ERROR("failed to setup /dev symlinks for '%s'", name);
4094 return -1;
4095 }
4096
4097 /* mount /proc if it's not already there */
4098 if (tmp_proc_mount(lxc_conf) < 0) {
4099 ERROR("failed to LSM mount proc for '%s'", name);
4100 return -1;
4101 }
4102
4103 if (setup_pivot_root(&lxc_conf->rootfs)) {
4104 ERROR("failed to set rootfs for '%s'", name);
4105 return -1;
4106 }
4107
4108 if (setup_pts(lxc_conf->pts)) {
4109 ERROR("failed to setup the new pts instance");
4110 return -1;
4111 }
4112
4113 if (setup_personality(lxc_conf->personality)) {
4114 ERROR("failed to setup personality");
4115 return -1;
4116 }
4117
4118 if (lxc_list_empty(&lxc_conf->id_map)) {
4119 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
4120 if (!lxc_list_empty(&lxc_conf->caps)) {
4121 ERROR("Simultaneously requested dropping and keeping caps");
4122 return -1;
4123 }
4124 if (dropcaps_except(&lxc_conf->keepcaps)) {
4125 ERROR("failed to keep requested caps");
4126 return -1;
4127 }
4128 } else if (setup_caps(&lxc_conf->caps)) {
4129 ERROR("failed to drop capabilities");
4130 return -1;
4131 }
4132 }
4133
4134 NOTICE("'%s' is setup.", name);
4135
4136 return 0;
4137 }
4138
4139 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
4140 const char *lxcpath, char *argv[])
4141 {
4142 int which = -1;
4143 struct lxc_list *it;
4144
4145 if (strcmp(hook, "pre-start") == 0)
4146 which = LXCHOOK_PRESTART;
4147 else if (strcmp(hook, "pre-mount") == 0)
4148 which = LXCHOOK_PREMOUNT;
4149 else if (strcmp(hook, "mount") == 0)
4150 which = LXCHOOK_MOUNT;
4151 else if (strcmp(hook, "autodev") == 0)
4152 which = LXCHOOK_AUTODEV;
4153 else if (strcmp(hook, "start") == 0)
4154 which = LXCHOOK_START;
4155 else if (strcmp(hook, "post-stop") == 0)
4156 which = LXCHOOK_POSTSTOP;
4157 else if (strcmp(hook, "clone") == 0)
4158 which = LXCHOOK_CLONE;
4159 else
4160 return -1;
4161 lxc_list_for_each(it, &conf->hooks[which]) {
4162 int ret;
4163 char *hookname = it->elem;
4164 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
4165 if (ret)
4166 return ret;
4167 }
4168 return 0;
4169 }
4170
4171 static void lxc_remove_nic(struct lxc_list *it)
4172 {
4173 struct lxc_netdev *netdev = it->elem;
4174 struct lxc_list *it2,*next;
4175
4176 lxc_list_del(it);
4177
4178 if (netdev->link)
4179 free(netdev->link);
4180 if (netdev->name)
4181 free(netdev->name);
4182 if (netdev->type == LXC_NET_VETH && netdev->priv.veth_attr.pair)
4183 free(netdev->priv.veth_attr.pair);
4184 if (netdev->upscript)
4185 free(netdev->upscript);
4186 if (netdev->hwaddr)
4187 free(netdev->hwaddr);
4188 if (netdev->mtu)
4189 free(netdev->mtu);
4190 if (netdev->ipv4_gateway)
4191 free(netdev->ipv4_gateway);
4192 if (netdev->ipv6_gateway)
4193 free(netdev->ipv6_gateway);
4194 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
4195 lxc_list_del(it2);
4196 free(it2->elem);
4197 free(it2);
4198 }
4199 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
4200 lxc_list_del(it2);
4201 free(it2->elem);
4202 free(it2);
4203 }
4204 free(netdev);
4205 free(it);
4206 }
4207
4208 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
4209 int lxc_clear_nic(struct lxc_conf *c, const char *key)
4210 {
4211 char *p1;
4212 int ret, idx, i;
4213 struct lxc_list *it;
4214 struct lxc_netdev *netdev;
4215
4216 p1 = index(key, '.');
4217 if (!p1 || *(p1+1) == '\0')
4218 p1 = NULL;
4219
4220 ret = sscanf(key, "%d", &idx);
4221 if (ret != 1) return -1;
4222 if (idx < 0)
4223 return -1;
4224
4225 i = 0;
4226 lxc_list_for_each(it, &c->network) {
4227 if (i == idx)
4228 break;
4229 i++;
4230 }
4231 if (i < idx) // we don't have that many nics defined
4232 return -1;
4233
4234 if (!it || !it->elem)
4235 return -1;
4236
4237 netdev = it->elem;
4238
4239 if (!p1) {
4240 lxc_remove_nic(it);
4241 } else if (strcmp(p1, ".ipv4") == 0) {
4242 struct lxc_list *it2,*next;
4243 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
4244 lxc_list_del(it2);
4245 free(it2->elem);
4246 free(it2);
4247 }
4248 } else if (strcmp(p1, ".ipv6") == 0) {
4249 struct lxc_list *it2,*next;
4250 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
4251 lxc_list_del(it2);
4252 free(it2->elem);
4253 free(it2);
4254 }
4255 } else if (strcmp(p1, ".link") == 0) {
4256 if (netdev->link) {
4257 free(netdev->link);
4258 netdev->link = NULL;
4259 }
4260 } else if (strcmp(p1, ".name") == 0) {
4261 if (netdev->name) {
4262 free(netdev->name);
4263 netdev->name = NULL;
4264 }
4265 } else if (strcmp(p1, ".script.up") == 0) {
4266 if (netdev->upscript) {
4267 free(netdev->upscript);
4268 netdev->upscript = NULL;
4269 }
4270 } else if (strcmp(p1, ".hwaddr") == 0) {
4271 if (netdev->hwaddr) {
4272 free(netdev->hwaddr);
4273 netdev->hwaddr = NULL;
4274 }
4275 } else if (strcmp(p1, ".mtu") == 0) {
4276 if (netdev->mtu) {
4277 free(netdev->mtu);
4278 netdev->mtu = NULL;
4279 }
4280 } else if (strcmp(p1, ".ipv4_gateway") == 0) {
4281 if (netdev->ipv4_gateway) {
4282 free(netdev->ipv4_gateway);
4283 netdev->ipv4_gateway = NULL;
4284 }
4285 } else if (strcmp(p1, ".ipv6_gateway") == 0) {
4286 if (netdev->ipv6_gateway) {
4287 free(netdev->ipv6_gateway);
4288 netdev->ipv6_gateway = NULL;
4289 }
4290 }
4291 else return -1;
4292
4293 return 0;
4294 }
4295
4296 int lxc_clear_config_network(struct lxc_conf *c)
4297 {
4298 struct lxc_list *it,*next;
4299 lxc_list_for_each_safe(it, &c->network, next) {
4300 lxc_remove_nic(it);
4301 }
4302 return 0;
4303 }
4304
4305 int lxc_clear_config_caps(struct lxc_conf *c)
4306 {
4307 struct lxc_list *it,*next;
4308
4309 lxc_list_for_each_safe(it, &c->caps, next) {
4310 lxc_list_del(it);
4311 free(it->elem);
4312 free(it);
4313 }
4314 return 0;
4315 }
4316
4317 static int lxc_free_idmap(struct lxc_list *id_map) {
4318 struct lxc_list *it, *next;
4319
4320 lxc_list_for_each_safe(it, id_map, next) {
4321 lxc_list_del(it);
4322 free(it->elem);
4323 free(it);
4324 }
4325 return 0;
4326 }
4327
4328 int lxc_clear_idmaps(struct lxc_conf *c)
4329 {
4330 return lxc_free_idmap(&c->id_map);
4331 }
4332
4333 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4334 {
4335 struct lxc_list *it,*next;
4336
4337 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4338 lxc_list_del(it);
4339 free(it->elem);
4340 free(it);
4341 }
4342 return 0;
4343 }
4344
4345 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4346 {
4347 struct lxc_list *it,*next;
4348 bool all = false;
4349 const char *k = key + 11;
4350
4351 if (strcmp(key, "lxc.cgroup") == 0)
4352 all = true;
4353
4354 lxc_list_for_each_safe(it, &c->cgroup, next) {
4355 struct lxc_cgroup *cg = it->elem;
4356 if (!all && strcmp(cg->subsystem, k) != 0)
4357 continue;
4358 lxc_list_del(it);
4359 free(cg->subsystem);
4360 free(cg->value);
4361 free(cg);
4362 free(it);
4363 }
4364 return 0;
4365 }
4366
4367 int lxc_clear_groups(struct lxc_conf *c)
4368 {
4369 struct lxc_list *it,*next;
4370
4371 lxc_list_for_each_safe(it, &c->groups, next) {
4372 lxc_list_del(it);
4373 free(it->elem);
4374 free(it);
4375 }
4376 return 0;
4377 }
4378
4379 int lxc_clear_mount_entries(struct lxc_conf *c)
4380 {
4381 struct lxc_list *it,*next;
4382
4383 lxc_list_for_each_safe(it, &c->mount_list, next) {
4384 lxc_list_del(it);
4385 free(it->elem);
4386 free(it);
4387 }
4388 return 0;
4389 }
4390
4391 int lxc_clear_automounts(struct lxc_conf *c)
4392 {
4393 c->auto_mounts = 0;
4394 return 0;
4395 }
4396
4397 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4398 {
4399 struct lxc_list *it,*next;
4400 bool all = false, done = false;
4401 const char *k = key + 9;
4402 int i;
4403
4404 if (strcmp(key, "lxc.hook") == 0)
4405 all = true;
4406
4407 for (i=0; i<NUM_LXC_HOOKS; i++) {
4408 if (all || strcmp(k, lxchook_names[i]) == 0) {
4409 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4410 lxc_list_del(it);
4411 free(it->elem);
4412 free(it);
4413 }
4414 done = true;
4415 }
4416 }
4417
4418 if (!done) {
4419 ERROR("Invalid hook key: %s", key);
4420 return -1;
4421 }
4422 return 0;
4423 }
4424
4425 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4426 {
4427 int i;
4428
4429 if (!conf->saved_nics)
4430 return;
4431 for (i=0; i < conf->num_savednics; i++)
4432 free(conf->saved_nics[i].orig_name);
4433 free(conf->saved_nics);
4434 }
4435
4436 static inline void lxc_clear_aliens(struct lxc_conf *conf)
4437 {
4438 struct lxc_list *it,*next;
4439
4440 lxc_list_for_each_safe(it, &conf->aliens, next) {
4441 lxc_list_del(it);
4442 free(it->elem);
4443 free(it);
4444 }
4445 }
4446
4447 static inline void lxc_clear_includes(struct lxc_conf *conf)
4448 {
4449 struct lxc_list *it,*next;
4450
4451 lxc_list_for_each_safe(it, &conf->includes, next) {
4452 lxc_list_del(it);
4453 free(it->elem);
4454 free(it);
4455 }
4456 }
4457
4458 void lxc_conf_free(struct lxc_conf *conf)
4459 {
4460 if (!conf)
4461 return;
4462 if (conf->console.path)
4463 free(conf->console.path);
4464 if (conf->rootfs.mount)
4465 free(conf->rootfs.mount);
4466 if (conf->rootfs.options)
4467 free(conf->rootfs.options);
4468 if (conf->rootfs.path)
4469 free(conf->rootfs.path);
4470 if (conf->rootfs.pivot)
4471 free(conf->rootfs.pivot);
4472 if (conf->logfile)
4473 free(conf->logfile);
4474 if (conf->utsname)
4475 free(conf->utsname);
4476 if (conf->ttydir)
4477 free(conf->ttydir);
4478 if (conf->fstab)
4479 free(conf->fstab);
4480 if (conf->rcfile)
4481 free(conf->rcfile);
4482 lxc_clear_config_network(conf);
4483 if (conf->lsm_aa_profile)
4484 free(conf->lsm_aa_profile);
4485 if (conf->lsm_se_context)
4486 free(conf->lsm_se_context);
4487 lxc_seccomp_free(conf);
4488 lxc_clear_config_caps(conf);
4489 lxc_clear_config_keepcaps(conf);
4490 lxc_clear_cgroups(conf, "lxc.cgroup");
4491 lxc_clear_hooks(conf, "lxc.hook");
4492 lxc_clear_mount_entries(conf);
4493 lxc_clear_saved_nics(conf);
4494 lxc_clear_idmaps(conf);
4495 lxc_clear_groups(conf);
4496 lxc_clear_includes(conf);
4497 lxc_clear_aliens(conf);
4498 free(conf);
4499 }
4500
/* Everything run_userns_fn() needs to run a callback in the cloned child. */
struct userns_fn_data {
	int (*fn)(void *);	/* callback to run */
	void *arg;		/* argument handed to fn */
	int p[2];		/* pipe: p[0] is read here, p[1] is closed here */
};

/*
 * Entry point of the process cloned by userns_exec_1().
 *
 * Closes its copy of the pipe's write end, blocks until exactly one byte
 * arrives on the read end, then runs the callback.
 *
 * Returns the callback's result, or -1 if the byte never arrives (for
 * example because the other end was closed).
 */
static int run_userns_fn(void *data)
{
	struct userns_fn_data *ctx = data;
	char token;
	ssize_t got;

	/* drop our write end so a close on the other side reads as EOF */
	close(ctx->p[1]);

	got = read(ctx->p[0], &token, 1);
	if (got != 1)
		return -1;

	close(ctx->p[0]);
	return ctx->fn(ctx->arg);
}
4519
4520 /*
4521 * Add ID_TYPE_UID/ID_TYPE_GID entries to an existing lxc_conf,
4522 * if they are not already there.
4523 */
4524 static struct lxc_list *idmap_add_id(struct lxc_conf *conf,
4525 uid_t uid, gid_t gid)
4526 {
4527 int hostuid_mapped = mapped_hostid(uid, conf, ID_TYPE_UID);
4528 int hostgid_mapped = mapped_hostid(gid, conf, ID_TYPE_GID);
4529 struct lxc_list *new = NULL, *tmp, *it, *next;
4530 struct id_map *entry;
4531
4532 new = malloc(sizeof(*new));
4533 if (!new) {
4534 ERROR("Out of memory building id map");
4535 return NULL;
4536 }
4537 lxc_list_init(new);
4538
4539 if (hostuid_mapped < 0) {
4540 hostuid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID);
4541 if (hostuid_mapped < 0)
4542 goto err;
4543 tmp = malloc(sizeof(*tmp));
4544 if (!tmp)
4545 goto err;
4546 entry = malloc(sizeof(*entry));
4547 if (!entry) {
4548 free(tmp);
4549 goto err;
4550 }
4551 tmp->elem = entry;
4552 entry->idtype = ID_TYPE_UID;
4553 entry->nsid = hostuid_mapped;
4554 entry->hostid = (unsigned long) uid;
4555 entry->range = 1;
4556 lxc_list_add_tail(new, tmp);
4557 }
4558 if (hostgid_mapped < 0) {
4559 hostgid_mapped = find_unmapped_nsuid(conf, ID_TYPE_GID);
4560 if (hostgid_mapped < 0)
4561 goto err;
4562 tmp = malloc(sizeof(*tmp));
4563 if (!tmp)
4564 goto err;
4565 entry = malloc(sizeof(*entry));
4566 if (!entry) {
4567 free(tmp);
4568 goto err;
4569 }
4570 tmp->elem = entry;
4571 entry->idtype = ID_TYPE_GID;
4572 entry->nsid = hostgid_mapped;
4573 entry->hostid = (unsigned long) gid;
4574 entry->range = 1;
4575 lxc_list_add_tail(new, tmp);
4576 }
4577 lxc_list_for_each_safe(it, &conf->id_map, next) {
4578 tmp = malloc(sizeof(*tmp));
4579 if (!tmp)
4580 goto err;
4581 entry = malloc(sizeof(*entry));
4582 if (!entry) {
4583 free(tmp);
4584 goto err;
4585 }
4586 memset(entry, 0, sizeof(*entry));
4587 memcpy(entry, it->elem, sizeof(*entry));
4588 tmp->elem = entry;
4589 lxc_list_add_tail(new, tmp);
4590 }
4591
4592 return new;
4593
4594 err:
4595 ERROR("Out of memory building a new uid/gid map");
4596 if (new)
4597 lxc_free_idmap(new);
4598 free(new);
4599 return NULL;
4600 }
4601
4602 /*
4603 * Run a function in a new user namespace.
4604 * The caller's euid/egid will be mapped in if it is not already.
4605 */
4606 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
4607 {
4608 int ret, pid;
4609 struct userns_fn_data d;
4610 char c = '1';
4611 int p[2];
4612 struct lxc_list *idmap;
4613
4614 ret = pipe(p);
4615 if (ret < 0) {
4616 SYSERROR("opening pipe");
4617 return -1;
4618 }
4619 d.fn = fn;
4620 d.arg = data;
4621 d.p[0] = p[0];
4622 d.p[1] = p[1];
4623 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4624 if (pid < 0)
4625 goto err;
4626 close(p[0]);
4627 p[0] = -1;
4628
4629 if ((idmap = idmap_add_id(conf, geteuid(), getegid())) == NULL) {
4630 ERROR("Error adding self to container uid/gid map");
4631 goto err;
4632 }
4633
4634 ret = lxc_map_ids(idmap, pid);
4635 lxc_free_idmap(idmap);
4636 free(idmap);
4637 if (ret) {
4638 ERROR("Error setting up child mappings");
4639 goto err;
4640 }
4641
4642 // kick the child
4643 if (write(p[1], &c, 1) != 1) {
4644 SYSERROR("writing to pipe to child");
4645 goto err;
4646 }
4647
4648 ret = wait_for_pid(pid);
4649
4650 close(p[1]);
4651 return ret;
4652
4653 err:
4654 if (p[0] != -1)
4655 close(p[0]);
4656 close(p[1]);
4657 return -1;
4658 }