]> git.proxmox.com Git - mirror_lxc.git/blob - src/lxc/conf.c
lxc: manually move NICs back to host after container stops
[mirror_lxc.git] / src / lxc / conf.c
1 /*
2 * lxc: linux Container library
3 *
4 * (C) Copyright IBM Corp. 2007, 2008
5 *
6 * Authors:
7 * Daniel Lezcano <daniel.lezcano at free.fr>
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 #include "config.h"
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <stdarg.h>
28 #include <errno.h>
29 #include <string.h>
30 #include <dirent.h>
31 #include <unistd.h>
32 #include <inttypes.h>
33 #include <sys/wait.h>
34 #include <sys/syscall.h>
35 #include <time.h>
36
37 #if HAVE_PTY_H
38 #include <pty.h>
39 #else
40 #include <../include/openpty.h>
41 #endif
42
43 #include <linux/loop.h>
44
45 #include <sys/types.h>
46 #include <sys/utsname.h>
47 #include <sys/param.h>
48 #include <sys/stat.h>
49 #include <sys/socket.h>
50 #include <sys/mount.h>
51 #include <sys/mman.h>
52 #include <sys/prctl.h>
53
54 #include <arpa/inet.h>
55 #include <fcntl.h>
56 #include <netinet/in.h>
57 #include <net/if.h>
58 #include <libgen.h>
59
60 #include "network.h"
61 #include "error.h"
62 #include "parse.h"
63 #include "utils.h"
64 #include "conf.h"
65 #include "log.h"
66 #include "caps.h" /* for lxc_caps_last_cap() */
67 #include "bdev.h"
68 #include "cgroup.h"
69 #include "lxclock.h"
70 #include "namespace.h"
71 #include "lsm/lsm.h"
72
73 #if HAVE_SYS_CAPABILITY_H
74 #include <sys/capability.h>
75 #endif
76
77 #if HAVE_SYS_PERSONALITY_H
78 #include <sys/personality.h>
79 #endif
80
81 #if IS_BIONIC
82 #include <../include/lxcmntent.h>
83 #else
84 #include <mntent.h>
85 #endif
86
87 #include "lxcseccomp.h"
88
89 lxc_log_define(lxc_conf, lxc);
90
91 #define MAXHWLEN 18
92 #define MAXINDEXLEN 20
93 #define MAXMTULEN 16
94 #define MAXLINELEN 128
95
96 #if HAVE_SYS_CAPABILITY_H
97 #ifndef CAP_SETFCAP
98 #define CAP_SETFCAP 31
99 #endif
100
101 #ifndef CAP_MAC_OVERRIDE
102 #define CAP_MAC_OVERRIDE 32
103 #endif
104
105 #ifndef CAP_MAC_ADMIN
106 #define CAP_MAC_ADMIN 33
107 #endif
108 #endif
109
110 #ifndef PR_CAPBSET_DROP
111 #define PR_CAPBSET_DROP 24
112 #endif
113
114 #ifndef LO_FLAGS_AUTOCLEAR
115 #define LO_FLAGS_AUTOCLEAR 4
116 #endif
117
/*
 * Define pivot_root() if missing from the C library.
 *
 * Without HAVE_PIVOT_ROOT a file-local wrapper is compiled: it issues the
 * raw system call when __NR_pivot_root is known and otherwise fails with
 * errno set to ENOSYS and a return value of -1.  With HAVE_PIVOT_ROOT only
 * an extern declaration is emitted.
 */
#ifndef HAVE_PIVOT_ROOT
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
132
/*
 * Define sethostname() if missing from the C library.
 *
 * The wrapper forwards to the raw system call when __NR_sethostname is
 * known; otherwise it fails with errno set to ENOSYS and returns -1.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char * name, size_t len)
{
#ifdef __NR_sethostname
	return syscall(__NR_sethostname, name, len);
#else
	errno = ENOSYS;
	return -1;
#endif
}
#endif
145
/*
 * Define __S_ISTYPE if missing from the C library.
 *
 * Evaluates to non-zero when the file-type bits of 'mode' (mode & S_IFMT)
 * are equal to 'mask'.
 */
#ifndef __S_ISTYPE
#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
#endif
150
/*
 * Names of the container hooks.
 * NOTE(review): the position of each name presumably matches the hook
 * index enumeration that defines NUM_LXC_HOOKS - confirm against conf.h.
 */
char *lxchook_names[NUM_LXC_HOOKS] = {
	"pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
153
/* Signature shared by the per-network-type callbacks stored in the
 * netdev_conf and netdev_deconf tables. */
typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *);

/*
 * One row of the mount option table.
 * NOTE(review): 'clear' looks like "this option removes 'flag' instead of
 * setting it" (e.g. "rw" carries MS_RDONLY with clear == 1) - the parser
 * that consumes the table is not visible here, confirm.
 */
struct mount_opt {
	char *name;	/* option keyword */
	int clear;
	int flag;	/* MS_* bit(s) associated with the keyword */
};

/* Associates a capability keyword with its CAP_* number. */
struct caps_opt {
	char *name;
	int value;
};
166
/* Per-type creation callbacks; only declared here. */
static int instanciate_veth(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_vlan(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_phys(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_empty(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_none(struct lxc_handler *, struct lxc_netdev *);

/* Dispatch table indexed by LXC_NET_* type, selecting the creation callback. */
static instanciate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = instanciate_veth,
	[LXC_NET_MACVLAN] = instanciate_macvlan,
	[LXC_NET_VLAN] = instanciate_vlan,
	[LXC_NET_PHYS] = instanciate_phys,
	[LXC_NET_EMPTY] = instanciate_empty,
	[LXC_NET_NONE] = instanciate_none,
};
182
/* Per-type shutdown callbacks; only declared here. */
static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);

/* Dispatch table indexed by LXC_NET_* type, selecting the shutdown callback.
 * It reuses the instanciate_cb function-pointer type. */
static instanciate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
	[LXC_NET_VETH] = shutdown_veth,
	[LXC_NET_MACVLAN] = shutdown_macvlan,
	[LXC_NET_VLAN] = shutdown_vlan,
	[LXC_NET_PHYS] = shutdown_phys,
	[LXC_NET_EMPTY] = shutdown_empty,
	[LXC_NET_NONE] = shutdown_none,
};
198
/*
 * Known mount option keywords and the MS_* flag each one refers to.
 * Columns are { name, clear, flag }; the table is terminated by a row
 * whose name is NULL.
 * NOTE(review): rows with clear == 1 are the "negative" spellings (rw,
 * suid, dev, exec, ...), so clear presumably means "drop the flag" -
 * confirm against the option parser.
 */
static struct mount_opt mount_opt[] = {
	{ "defaults", 0, 0 },
	{ "ro", 0, MS_RDONLY },
	{ "rw", 1, MS_RDONLY },
	{ "suid", 1, MS_NOSUID },
	{ "nosuid", 0, MS_NOSUID },
	{ "dev", 1, MS_NODEV },
	{ "nodev", 0, MS_NODEV },
	{ "exec", 1, MS_NOEXEC },
	{ "noexec", 0, MS_NOEXEC },
	{ "sync", 0, MS_SYNCHRONOUS },
	{ "async", 1, MS_SYNCHRONOUS },
	{ "dirsync", 0, MS_DIRSYNC },
	{ "remount", 0, MS_REMOUNT },
	{ "mand", 0, MS_MANDLOCK },
	{ "nomand", 1, MS_MANDLOCK },
	{ "atime", 1, MS_NOATIME },
	{ "noatime", 0, MS_NOATIME },
	{ "diratime", 1, MS_NODIRATIME },
	{ "nodiratime", 0, MS_NODIRATIME },
	{ "bind", 0, MS_BIND },
	{ "rbind", 0, MS_BIND|MS_REC },
	{ "relatime", 0, MS_RELATIME },
	{ "norelatime", 1, MS_RELATIME },
	{ "strictatime", 0, MS_STRICTATIME },
	{ "nostrictatime", 1, MS_STRICTATIME },
	{ NULL, 0, 0 },
};
227
/*
 * Capability keyword -> CAP_* number table.
 * Only populated when <sys/capability.h> is available; otherwise the table
 * is empty.  Entries for capabilities that may be missing from older
 * headers are guarded individually with #ifdef.  There is no terminating
 * sentinel row.
 */
#if HAVE_SYS_CAPABILITY_H
static struct caps_opt caps_opt[] = {
	{ "chown", CAP_CHOWN },
	{ "dac_override", CAP_DAC_OVERRIDE },
	{ "dac_read_search", CAP_DAC_READ_SEARCH },
	{ "fowner", CAP_FOWNER },
	{ "fsetid", CAP_FSETID },
	{ "kill", CAP_KILL },
	{ "setgid", CAP_SETGID },
	{ "setuid", CAP_SETUID },
	{ "setpcap", CAP_SETPCAP },
	{ "linux_immutable", CAP_LINUX_IMMUTABLE },
	{ "net_bind_service", CAP_NET_BIND_SERVICE },
	{ "net_broadcast", CAP_NET_BROADCAST },
	{ "net_admin", CAP_NET_ADMIN },
	{ "net_raw", CAP_NET_RAW },
	{ "ipc_lock", CAP_IPC_LOCK },
	{ "ipc_owner", CAP_IPC_OWNER },
	{ "sys_module", CAP_SYS_MODULE },
	{ "sys_rawio", CAP_SYS_RAWIO },
	{ "sys_chroot", CAP_SYS_CHROOT },
	{ "sys_ptrace", CAP_SYS_PTRACE },
	{ "sys_pacct", CAP_SYS_PACCT },
	{ "sys_admin", CAP_SYS_ADMIN },
	{ "sys_boot", CAP_SYS_BOOT },
	{ "sys_nice", CAP_SYS_NICE },
	{ "sys_resource", CAP_SYS_RESOURCE },
	{ "sys_time", CAP_SYS_TIME },
	{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
	{ "mknod", CAP_MKNOD },
	{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_WRITE
	{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
	{ "audit_control", CAP_AUDIT_CONTROL },
#endif
	{ "setfcap", CAP_SETFCAP },
	{ "mac_override", CAP_MAC_OVERRIDE },
	{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
	{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
	{ "wake_alarm", CAP_WAKE_ALARM },
#endif
};
#else
static struct caps_opt caps_opt[] = {};
#endif
278
279 static int run_buffer(char *buffer)
280 {
281 struct lxc_popen_FILE *f;
282 char *output;
283 int ret;
284
285 f = lxc_popen(buffer);
286 if (!f) {
287 SYSERROR("popen failed");
288 return -1;
289 }
290
291 output = malloc(LXC_LOG_BUFFER_SIZE);
292 if (!output) {
293 ERROR("failed to allocate memory for script output");
294 lxc_pclose(f);
295 return -1;
296 }
297
298 while(fgets(output, LXC_LOG_BUFFER_SIZE, f->f))
299 DEBUG("script output: %s", output);
300
301 free(output);
302
303 ret = lxc_pclose(f);
304 if (ret == -1) {
305 SYSERROR("Script exited on error");
306 return -1;
307 } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) {
308 ERROR("Script exited with status %d", WEXITSTATUS(ret));
309 return -1;
310 } else if (WIFSIGNALED(ret)) {
311 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret),
312 strsignal(WTERMSIG(ret)));
313 return -1;
314 }
315
316 return 0;
317 }
318
/*
 * Build the command line "<script> <name> <section> <hook> [args...]" and
 * execute it with run_buffer().
 *
 * name:    container name, passed as first argument to the script
 * section: configuration section, passed as second argument
 * script:  path of the script to run
 * hook:    hook name, passed as third argument
 * lxcpath: currently unused by this function
 * argsin:  optional NULL-terminated array of extra arguments (may be NULL)
 *
 * The command line lives on the heap rather than in an alloca() buffer:
 * its size is derived from caller-supplied strings and is only bounded by
 * INT_MAX, which a stack allocation cannot safely hold.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script_argv(const char *name, const char *section,
		      const char *script, const char *hook, const char *lxcpath,
		      char **argsin)
{
	int ret, i;
	int result = -1;
	char *buffer;
	size_t size = 0;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* one byte per argument for the separating blank */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two blanks between the three leading words plus the final NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t room = size - (size_t)ret;
		int rc;

		rc = snprintf(buffer + ret, room, " %s", argsin[i]);
		if (rc < 0 || (size_t)rc >= room) {
			ERROR("Script args too long");
			goto out;
		}
		ret += rc;
	}

	result = run_buffer(buffer);
out:
	free(buffer);
	return result;
}
368
/*
 * Build the command line "<script> <name> <section> [args...]" from a
 * NULL-terminated list of 'char *' variadic arguments and execute it with
 * run_buffer().
 *
 * name:    container name, passed as first argument to the script
 * section: configuration section, passed as second argument
 * script:  path of the script to run
 * ...:     extra 'char *' arguments; the list must end with a NULL pointer
 *
 * Every va_start() is paired with a va_end() on all paths, including the
 * "args too long" error path.  The command line lives on the heap rather
 * than in an alloca() buffer because its size is only bounded by INT_MAX.
 *
 * Returns the result of run_buffer(), or -1 if the command line could not
 * be assembled.
 */
static int run_script(const char *name, const char *section,
		      const char *script, ...)
{
	int ret;
	int result = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* first pass: measure (one extra byte per argument for the blank) */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	/* two blanks between the three leading words plus the final NUL */
	size += 3;

	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		goto out;
	}

	/* second pass: append the arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t room = size - (size_t)ret;
		int rc;

		rc = snprintf(buffer + ret, room, " %s", p);
		if (rc < 0 || (size_t)rc >= room) {
			ERROR("Script args too long");
			va_end(ap);
			goto out;
		}
		ret += rc;
	}
	va_end(ap);

	result = run_buffer(buffer);
out:
	free(buffer);
	return result;
}
420
/*
 * Per-line callback used while scanning a filesystem list: try to mount
 * the rootfs with the filesystem type named on the line.
 *
 * buffer: one line of the list; it is trimmed in place with the
 *         lxc_char_left_gc()/lxc_char_right_gc() helpers
 * data:   points to a struct holding rootfs, target and options
 *
 * Returns 1 once a mount succeeded, 0 when the line was skipped ("nodev"
 * entry) or the mount attempt failed, -1 when the mount options could not
 * be parsed.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	};
	struct cbarg *args = data;
	unsigned long flags;
	char *optdata;
	char *type;
	int mounted;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev") != NULL)
		return 0;

	type = buffer + lxc_char_left_gc(buffer, strlen(buffer));
	type[lxc_char_right_gc(type, strlen(type))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      args->rootfs, args->target, type);

	if (parse_mntopts(args->options, &flags, &optdata) < 0) {
		free(optdata);
		return -1;
	}

	if (mount(args->rootfs, args->target, type, flags, optdata) == 0) {
		INFO("mounted '%s' on '%s', with fstype '%s'",
		     args->rootfs, args->target, type);
		mounted = 1;
	} else {
		DEBUG("mount failed with error: %s", strerror(errno));
		mounted = 0;
	}

	free(optdata);
	return mounted;
}
461
462 static int mount_unknown_fs(const char *rootfs, const char *target,
463 const char *options)
464 {
465 int i;
466
467 struct cbarg {
468 const char *rootfs;
469 const char *target;
470 const char *options;
471 } cbarg = {
472 .rootfs = rootfs,
473 .target = target,
474 .options = options,
475 };
476
477 /*
478 * find the filesystem type with brute force:
479 * first we check with /etc/filesystems, in case the modules
480 * are auto-loaded and fall back to the supported kernel fs
481 */
482 char *fsfile[] = {
483 "/etc/filesystems",
484 "/proc/filesystems",
485 };
486
487 for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
488
489 int ret;
490
491 if (access(fsfile[i], F_OK))
492 continue;
493
494 ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
495 if (ret < 0) {
496 ERROR("failed to parse '%s'", fsfile[i]);
497 return -1;
498 }
499
500 if (ret)
501 return 0;
502 }
503
504 ERROR("failed to determine fs type for '%s'", rootfs);
505 return -1;
506 }
507
/*
 * Recursively bind-mount the directory 'rootfs' on 'target', adding the
 * flags and data string parsed from 'options'.
 *
 * Returns -1 when the options cannot be parsed, otherwise the return value
 * of mount(2).
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	unsigned long flags;
	char *data;
	int rc;

	rc = parse_mntopts(options, &flags, &data);
	if (rc < 0) {
		free(data);
		return -1;
	}

	flags |= MS_BIND | MS_REC;
	rc = mount(rootfs, target, "none", flags, data);
	free(data);

	return rc;
}
525
/*
 * Attach the regular file 'rootfs' to the loop device open on 'fd'.
 *
 * rootfs: path of the backing file, opened read-write
 * fd:     open descriptor of a free loop device
 * loinfo: status block; zeroed here and filled with LO_FLAGS_AUTOCLEAR
 *         before being handed to LOOP_SET_STATUS64
 *
 * If LOOP_SET_STATUS64 fails after LOOP_SET_FD succeeded, the backing file
 * is detached again with LOOP_CLR_FD: the autoclear flag was never applied
 * in that case, so the device would otherwise stay bound after 'fd' is
 * closed.
 *
 * Returns 0 on success, -1 on failure.  The backing file descriptor is
 * closed in both cases.
 */
static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
{
	int rfd;
	int ret = -1;

	rfd = open(rootfs, O_RDWR);
	if (rfd < 0) {
		SYSERROR("failed to open '%s'", rootfs);
		return -1;
	}

	memset(loinfo, 0, sizeof(*loinfo));

	loinfo->lo_flags = LO_FLAGS_AUTOCLEAR;

	if (ioctl(fd, LOOP_SET_FD, rfd)) {
		SYSERROR("failed to LOOP_SET_FD");
		goto out;
	}

	if (ioctl(fd, LOOP_SET_STATUS64, loinfo)) {
		SYSERROR("failed to LOOP_SET_STATUS64");
		/* undo LOOP_SET_FD so the loop device is not left bound */
		if (ioctl(fd, LOOP_CLR_FD, 0))
			SYSERROR("failed to LOOP_CLR_FD");
		goto out;
	}

	ret = 0;
out:
	close(rfd);

	return ret;
}
557
/*
 * Mount the image file 'rootfs' on 'target' through a loop device.
 *
 * /dev is scanned for entries whose name starts with "loop".  The first
 * one for which LOOP_GET_STATUS64 fails with ENXIO is treated as free: the
 * image is attached to it with setup_lodev() and then mounted with
 * mount_unknown_fs().  Only that first free device is tried.
 *
 * The directory is read with readdir() instead of the deprecated
 * readdir_r(); the DIR stream is private to this call.
 *
 * Returns 0 on success, -1 if /dev cannot be opened, no free loop device
 * is found, or attaching/mounting fails.
 */
static int mount_rootfs_file(const char *rootfs, const char *target,
			     const char *options)
{
	struct dirent *direntp;
	struct loop_info64 loinfo;
	int ret = -1, fd = -1, rc;
	DIR *dir;
	char path[MAXPATHLEN];

	dir = opendir("/dev");
	if (!dir) {
		SYSERROR("failed to open '/dev'");
		return -1;
	}

	while ((direntp = readdir(dir))) {

		/* also filters out "." and ".." */
		if (strncmp(direntp->d_name, "loop", 4))
			continue;

		rc = snprintf(path, MAXPATHLEN, "/dev/%s", direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN)
			continue;

		fd = open(path, O_RDWR);
		if (fd < 0)
			continue;

		/* a device that reports a status is already in use */
		if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) {
			close(fd);
			continue;
		}

		if (errno != ENXIO) {
			WARN("unexpected error for ioctl on '%s': %m",
			     direntp->d_name);
			close(fd);
			continue;
		}

		DEBUG("found '%s' free lodev", path);

		ret = setup_lodev(rootfs, fd, &loinfo);
		if (!ret)
			ret = mount_unknown_fs(path, target, options);
		close(fd);

		break;
	}

	if (closedir(dir))
		WARN("failed to close directory");

	return ret;
}
622
/*
 * Mount the block device 'rootfs' on 'target'.  The filesystem type is not
 * known, so the work is delegated to mount_unknown_fs(), whose return
 * value is passed through unchanged.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	return mount_unknown_fs(rootfs, target, options);
}
628
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep
 * it open for the duration of the container run, which prevents the
 * container from marking the underlying fs read-only on shutdown.  The
 * file is unlinked immediately so that no name pollution happens.
 *
 * Returns -1 on error.
 * Returns -2 if nothing needed to be pinned (no rootfs given, path cannot
 * be resolved, or it is not a directory).
 * Returns an open fd (>= 0) if the rootfs was pinned.
 */
int pin_rootfs(const char *rootfs)
{
	char resolved[MAXPATHLEN];
	char holdfile[MAXPATHLEN];
	struct stat st;
	int len, pinfd;

	if (!rootfs || rootfs[0] == '\0')
		return -2;

	if (realpath(rootfs, resolved) == NULL)
		return -2;

	if (access(resolved, F_OK) != 0 || stat(resolved, &st) != 0)
		return -1;

	if (!S_ISDIR(st.st_mode))
		return -2;

	len = snprintf(holdfile, MAXPATHLEN, "%s/lxc.hold", resolved);
	if (len >= MAXPATHLEN)
		return -1;

	pinfd = open(holdfile, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (pinfd >= 0)
		(void)unlink(holdfile);

	return pinfd;
}
671
672 static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
673 {
674 int r;
675 size_t i;
676 static struct {
677 int match_mask;
678 int match_flag;
679 const char *source;
680 const char *destination;
681 const char *fstype;
682 unsigned long flags;
683 const char *options;
684 } default_mounts[] = {
685 /* Read-only bind-mounting... In older kernels, doing that required
686 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
687 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
688 * kernel 2.6.26 onwards. However, this apparently does not work on
689 * kernel 3.8. Unfortunately, on that very same kernel, doing the
690 * same trick as above doesn't seem to work either, there one needs
691 * to ALSO specify MS_BIND for the remount, otherwise the entire
692 * fs is remounted read-only or the mount fails because it's busy...
693 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
694 * 2.6.32...
695 */
696 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
697 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
698 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
699 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
700 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
701 { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
702 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
703 { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
704 { 0, 0, NULL, NULL, NULL, 0, NULL }
705 };
706
707 for (i = 0; default_mounts[i].match_mask; i++) {
708 if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) {
709 char *source = NULL;
710 char *destination = NULL;
711 int saved_errno;
712
713 if (default_mounts[i].source) {
714 /* will act like strdup if %r is not present */
715 source = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].source);
716 if (!source) {
717 SYSERROR("memory allocation error");
718 return -1;
719 }
720 }
721 if (default_mounts[i].destination) {
722 /* will act like strdup if %r is not present */
723 destination = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].destination);
724 if (!destination) {
725 saved_errno = errno;
726 SYSERROR("memory allocation error");
727 free(source);
728 errno = saved_errno;
729 return -1;
730 }
731 }
732 r = mount(source, destination, default_mounts[i].fstype, default_mounts[i].flags, default_mounts[i].options);
733 saved_errno = errno;
734 if (r < 0)
735 SYSERROR("error mounting %s on %s", source, destination);
736 free(source);
737 free(destination);
738 if (r < 0) {
739 errno = saved_errno;
740 return -1;
741 }
742 }
743 }
744
745 if (flags & LXC_AUTO_CGROUP_MASK) {
746 if (!cgroup_mount(conf->rootfs.mount, handler,
747 flags & LXC_AUTO_CGROUP_MASK)) {
748 SYSERROR("error mounting /sys/fs/cgroup");
749 return -1;
750 }
751 }
752
753 return 0;
754 }
755
756 static int mount_rootfs(const char *rootfs, const char *target, const char *options)
757 {
758 char absrootfs[MAXPATHLEN];
759 struct stat s;
760 int i;
761
762 typedef int (*rootfs_cb)(const char *, const char *, const char *);
763
764 struct rootfs_type {
765 int type;
766 rootfs_cb cb;
767 } rtfs_type[] = {
768 { S_IFDIR, mount_rootfs_dir },
769 { S_IFBLK, mount_rootfs_block },
770 { S_IFREG, mount_rootfs_file },
771 };
772
773 if (!realpath(rootfs, absrootfs)) {
774 SYSERROR("failed to get real path for '%s'", rootfs);
775 return -1;
776 }
777
778 if (access(absrootfs, F_OK)) {
779 SYSERROR("'%s' is not accessible", absrootfs);
780 return -1;
781 }
782
783 if (stat(absrootfs, &s)) {
784 SYSERROR("failed to stat '%s'", absrootfs);
785 return -1;
786 }
787
788 for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) {
789
790 if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type))
791 continue;
792
793 return rtfs_type[i].cb(absrootfs, target, options);
794 }
795
796 ERROR("unsupported rootfs type for '%s'", absrootfs);
797 return -1;
798 }
799
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL 'utsname' means there is nothing to configure and is treated as
 * success.  Returns 0 on success, -1 if sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *hostname;

	if (utsname == NULL)
		return 0;

	hostname = utsname->nodename;
	if (sethostname(hostname, strlen(hostname)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", hostname);
		return -1;
	}

	INFO("'%s' hostname has been setup", hostname);

	return 0;
}
814
/* One symlink to create in the container's /dev. */
struct dev_symlinks {
	const char *oldpath;	/* link target */
	const char *name;	/* link name, relative to <rootfs>/dev */
};

/* Symlinks that setup_dev_symlinks() creates under <rootfs>/dev. */
static const struct dev_symlinks dev_symlinks[] = {
	{"/proc/self/fd", "fd"},
	{"/proc/self/fd/0", "stdin"},
	{"/proc/self/fd/1", "stdout"},
	{"/proc/self/fd/2", "stderr"},
};
826
827 static int setup_dev_symlinks(const struct lxc_rootfs *rootfs)
828 {
829 char path[MAXPATHLEN];
830 int ret,i;
831
832
833 for (i = 0; i < sizeof(dev_symlinks) / sizeof(dev_symlinks[0]); i++) {
834 const struct dev_symlinks *d = &dev_symlinks[i];
835 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, d->name);
836 if (ret < 0 || ret >= MAXPATHLEN)
837 return -1;
838 ret = symlink(d->oldpath, path);
839 if (ret && errno != EEXIST) {
840 SYSERROR("Error creating %s", path);
841 return -1;
842 }
843 }
844 return 0;
845 }
846
847 static int setup_tty(const struct lxc_rootfs *rootfs,
848 const struct lxc_tty_info *tty_info, char *ttydir)
849 {
850 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
851 int i, ret;
852
853 if (!rootfs->path)
854 return 0;
855
856 for (i = 0; i < tty_info->nbtty; i++) {
857
858 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
859
860 ret = snprintf(path, sizeof(path), "%s/dev/tty%d",
861 rootfs->mount, i + 1);
862 if (ret >= sizeof(path)) {
863 ERROR("pathname too long for ttys");
864 return -1;
865 }
866 if (ttydir) {
867 /* create dev/lxc/tty%d" */
868 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/tty%d",
869 rootfs->mount, ttydir, i + 1);
870 if (ret >= sizeof(lxcpath)) {
871 ERROR("pathname too long for ttys");
872 return -1;
873 }
874 ret = creat(lxcpath, 0660);
875 if (ret==-1 && errno != EEXIST) {
876 SYSERROR("error creating %s", lxcpath);
877 return -1;
878 }
879 if (ret >= 0)
880 close(ret);
881 ret = unlink(path);
882 if (ret && errno != ENOENT) {
883 SYSERROR("error unlinking %s", path);
884 return -1;
885 }
886
887 if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) {
888 WARN("failed to mount '%s'->'%s'",
889 pty_info->name, path);
890 continue;
891 }
892
893 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1);
894 if (ret >= sizeof(lxcpath)) {
895 ERROR("tty pathname too long");
896 return -1;
897 }
898 ret = symlink(lxcpath, path);
899 if (ret) {
900 SYSERROR("failed to create symlink for tty %d", i+1);
901 return -1;
902 }
903 } else {
904 /* If we populated /dev, then we need to create /dev/ttyN */
905 if (access(path, F_OK)) {
906 ret = creat(path, 0660);
907 if (ret==-1) {
908 SYSERROR("error creating %s", path);
909 /* this isn't fatal, continue */
910 } else {
911 close(ret);
912 }
913 }
914 if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
915 WARN("failed to mount '%s'->'%s'",
916 pty_info->name, path);
917 continue;
918 }
919 }
920 }
921
922 INFO("%d tty(s) has been setup", tty_info->nbtty);
923
924 return 0;
925 }
926
927 static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
928 {
929 struct lxc_list *mountlist, *listentry, *iterator;
930 char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL;
931 int found;
932 void **cbparm;
933
934 mountentry = buffer;
935 cbparm = (void **)data;
936
937 mountlist = cbparm[0];
938 pivotdir = cbparm[1];
939
940 /* parse entry, first field is mountname, ignore */
941 mountpoint = strtok_r(mountentry, " ", &saveptr);
942 if (!mountpoint)
943 return -1;
944
945 /* second field is mountpoint */
946 mountpoint = strtok_r(NULL, " ", &saveptr);
947 if (!mountpoint)
948 return -1;
949
950 /* only consider mountpoints below old root fs */
951 if (strncmp(mountpoint, pivotdir, strlen(pivotdir)))
952 return 0;
953
954 /* filter duplicate mountpoints */
955 found = 0;
956 lxc_list_for_each(iterator, mountlist) {
957 if (!strcmp(iterator->elem, mountpoint)) {
958 found = 1;
959 break;
960 }
961 }
962 if (found)
963 return 0;
964
965 /* add entry to list */
966 listentry = malloc(sizeof(*listentry));
967 if (!listentry) {
968 SYSERROR("malloc for mountpoint listentry failed");
969 return -1;
970 }
971
972 listentry->elem = strdup(mountpoint);
973 if (!listentry->elem) {
974 SYSERROR("strdup failed");
975 free(listentry);
976 return -1;
977 }
978 lxc_list_add_tail(mountlist, listentry);
979
980 return 0;
981 }
982
983 static int umount_oldrootfs(const char *oldrootfs)
984 {
985 char path[MAXPATHLEN];
986 void *cbparm[2];
987 struct lxc_list mountlist, *iterator, *next;
988 int ok, still_mounted, last_still_mounted;
989 int rc;
990
991 /* read and parse /proc/mounts in old root fs */
992 lxc_list_init(&mountlist);
993
994 /* oldrootfs is on the top tree directory now */
995 rc = snprintf(path, sizeof(path), "/%s", oldrootfs);
996 if (rc >= sizeof(path)) {
997 ERROR("rootfs name too long");
998 return -1;
999 }
1000 cbparm[0] = &mountlist;
1001
1002 cbparm[1] = strdup(path);
1003 if (!cbparm[1]) {
1004 SYSERROR("strdup failed");
1005 return -1;
1006 }
1007
1008 rc = snprintf(path, sizeof(path), "%s/proc/mounts", oldrootfs);
1009 if (rc >= sizeof(path)) {
1010 ERROR("container proc/mounts name too long");
1011 return -1;
1012 }
1013
1014 ok = lxc_file_for_each_line(path,
1015 setup_rootfs_pivot_root_cb, &cbparm);
1016 if (ok < 0) {
1017 SYSERROR("failed to read or parse mount list '%s'", path);
1018 return -1;
1019 }
1020
1021 /* umount filesystems until none left or list no longer shrinks */
1022 still_mounted = 0;
1023 do {
1024 last_still_mounted = still_mounted;
1025 still_mounted = 0;
1026
1027 lxc_list_for_each_safe(iterator, &mountlist, next) {
1028
1029 /* umount normally */
1030 if (!umount(iterator->elem)) {
1031 DEBUG("umounted '%s'", (char *)iterator->elem);
1032 lxc_list_del(iterator);
1033 continue;
1034 }
1035
1036 still_mounted++;
1037 }
1038
1039 } while (still_mounted > 0 && still_mounted != last_still_mounted);
1040
1041
1042 lxc_list_for_each(iterator, &mountlist) {
1043
1044 /* let's try a lazy umount */
1045 if (!umount2(iterator->elem, MNT_DETACH)) {
1046 INFO("lazy unmount of '%s'", (char *)iterator->elem);
1047 continue;
1048 }
1049
1050 /* be more brutal (nfs) */
1051 if (!umount2(iterator->elem, MNT_FORCE)) {
1052 INFO("forced unmount of '%s'", (char *)iterator->elem);
1053 continue;
1054 }
1055
1056 WARN("failed to unmount '%s'", (char *)iterator->elem);
1057 }
1058
1059 return 0;
1060 }
1061
/*
 * Make 'rootfs' the new root of the calling process with pivot_root().
 *
 * rootfs:   path of the new root
 * pivotdir: directory, relative to the new root, where the old root is
 *           parked; "lxc_putold" is used when NULL
 *
 * The park directory is created when missing.  After the pivot the old
 * root is unmounted with umount_oldrootfs(), and the park directory is
 * removed again if it was created here; a failure to remove it only
 * produces a warning.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir)
{
	char putold[MAXPATHLEN];
	int created = 0;
	int len;

	/* change into new root fs */
	if (chdir(rootfs) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		return -1;
	}

	if (pivotdir == NULL)
		pivotdir = "lxc_putold";

	/* compute the full path to pivotdir under rootfs */
	len = snprintf(putold, sizeof(putold), "%s/%s", rootfs, pivotdir);
	if ((size_t)len >= sizeof(putold)) {
		ERROR("pivot dir name too long");
		return -1;
	}

	if (access(putold, F_OK) != 0) {
		if (mkdir_p(putold, 0755) < 0) {
			SYSERROR("failed to create pivotdir '%s'", putold);
			return -1;
		}

		created = 1;
		DEBUG("created '%s' directory", putold);
	}

	DEBUG("mountpoint for old rootfs is '%s'", putold);

	/* pivot_root into our new root fs */
	if (pivot_root(".", putold) != 0) {
		SYSERROR("pivot_root syscall failed");
		return -1;
	}

	if (chdir("/") != 0) {
		SYSERROR("can't chdir to / after pivot_root");
		return -1;
	}

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	/* from here on the old root is addressed by its relative name */
	if (umount_oldrootfs(pivotdir) != 0)
		return -1;

	/* removing the temporary mount point is best effort */
	if (created && rmdir(pivotdir) != 0)
		WARN("can't remove mountpoint '%s': %m", pivotdir);

	return 0;
}
1121
/*
 * Check to see if a directory has something mounted on it and,
 * if it does, return the fstype.
 *
 * Code largely based on detect_shared_rootfs below
 *
 * dir:    directory to look up as a mountpoint in /proc/self/mounts
 * fstype: optional output buffer of at least MAX_FSTYPE_LEN bytes; may be
 *         NULL when only the count is of interest
 *
 * Returns: # of matching entries in /proc/self/mounts
 *	if != 0 fstype is filled with the last filesystem value.
 *	if == 0 no matches found, fstype unchanged.  This is also the
 *	result when dir is not an accessible directory or the mounts file
 *	cannot be opened.
 *
 * ToDo: Maybe return the mount options in another parameter...
 */

#define LINELEN 4096
#define MAX_FSTYPE_LEN 128
static int mount_check_fs( const char *dir, char *fstype )
{
	char buf[LINELEN], *p;
	struct stat s;
	FILE *f;
	int found_fs = 0;
	char *p2;

	DEBUG("entering mount_check_fs for %s", dir);

	if ( 0 != access(dir, F_OK) || 0 != stat(dir, &s) || 0 == S_ISDIR(s.st_mode) ) {
		return 0;
	}

	f = fopen("/proc/self/mounts", "r");
	if (!f)
		return 0;
	while (fgets(buf, LINELEN, f)) {
		/* skip the first field (mount source) */
		p = strchr(buf, ' ');
		if( !p )
			continue;
		*p = '\0';
		p2 = p + 1;

		/* second field: mountpoint */
		p = strchr(p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		/* Compare the directory in the entry to desired */
		if( strcmp( p2, dir ) ) {
			continue;
		}

		/* third field: filesystem type */
		p2 = p + 1;
		p = strchr( p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		++found_fs;

		if( fstype ) {
			/* truncates to MAX_FSTYPE_LEN - 1 and always terminates */
			snprintf( fstype, MAX_FSTYPE_LEN, "%s", p2 );
		}
	}

	fclose(f);

	/* fstype may be NULL, or an untouched caller buffer when nothing
	 * matched; never hand either of those to %s */
	DEBUG("mount_check_fs returning %d last %s", found_fs,
	      (found_fs && fstype) ? fstype : "(none)");

	return found_fs;
}
1191
1192 /*
1193 * Locate a devtmpfs mount (should be on /dev) and create a container
1194 * subdirectory on it which we can then bind mount to the container
1195 * /dev instead of mounting a tmpfs there.
1196 * If we fail, return NULL.
1197 * Else return the pointer to the name buffer with the string to
1198 * the devtmpfs subdirectory.
1199 */
1200
1201 static char *mk_devtmpfs(const char *name, char *path, const char *lxcpath)
1202 {
1203 int ret;
1204 struct stat s;
1205 char tmp_path[MAXPATHLEN];
1206 char fstype[MAX_FSTYPE_LEN];
1207 char *base_path = "/dev/.lxc";
1208 char *user_path = "/dev/.lxc/user";
1209 uint64_t hash;
1210
1211 if ( 0 != access(base_path, F_OK) || 0 != stat(base_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1212 /* This is just making /dev/.lxc it better work or we're done */
1213 ret = mkdir(base_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1214 if ( ret ) {
1215 SYSERROR( "Unable to create /dev/.lxc for autodev" );
1216 return NULL;
1217 }
1218 }
1219
1220 /*
1221 * Programmers notes:
1222 * We can not do mounts in this area of code that we want
1223 * to be visible in the host. Consequently, /dev/.lxc must
1224 * be set up earlier if we need a tmpfs mounted there.
1225 * That only affects the rare cases where autodev is enabled
1226 * for a container and devtmpfs is not mounted on /dev in the
1227 * host. In that case, we'll fall back to the old method
1228 * of mounting a tmpfs in the container and have no visibility
1229 * into the container /dev.
1230 */
1231 if( ! mount_check_fs( "/dev", fstype )
1232 || strcmp( "devtmpfs", fstype ) ) {
1233 /* Either /dev was not mounted or was not devtmpfs */
1234
1235 if ( ! mount_check_fs( "/dev/.lxc", NULL ) ) {
1236 /*
1237 * /dev/.lxc is not already mounted
1238 * Doing a mount here does no good, since
1239 * it's not visible in the host.
1240 */
1241
1242 ERROR("/dev/.lxc is not setup - taking fallback" );
1243 return NULL;
1244 }
1245 }
1246
1247 if ( 0 != access(user_path, F_OK) || 0 != stat(user_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1248 /*
1249 * This is making /dev/.lxc/user path for non-priv users.
1250 * If this doesn't work, we'll have to fall back in the
1251 * case of non-priv users. It's mode 1777 like /tmp.
1252 */
1253 ret = mkdir(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
1254 if ( ret ) {
1255 /* Issue an error but don't fail yet! */
1256 ERROR("Unable to create /dev/.lxc/user");
1257 }
1258 /* Umask tends to screw us up here */
1259 chmod(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX);
1260 }
1261
1262 /*
1263 * Since the container name must be unique within a given
1264 * lxcpath, we're going to use a hash of the path
1265 * /lxcpath/name as our hash name in /dev/.lxc/
1266 */
1267
1268 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s", lxcpath, name);
1269 if (ret < 0 || ret >= MAXPATHLEN)
1270 return NULL;
1271
1272 hash = fnv_64a_buf(tmp_path, ret, FNV1A_64_INIT);
1273
1274 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, base_path, name, hash);
1275 if (ret < 0 || ret >= MAXPATHLEN)
1276 return NULL;
1277
1278 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1279 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1280 if ( ret ) {
1281 /* Something must have failed with the base_path...
1282 * Maybe unpriv user. Try user_path now... */
1283 INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );
1284
1285 ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, user_path, name, hash);
1286 if (ret < 0 || ret >= MAXPATHLEN)
1287 return NULL;
1288
1289 if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) {
1290 ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
1291 if ( ret ) {
1292 ERROR("Container /dev setup in host /dev failed - taking fallback" );
1293 return NULL;
1294 }
1295 }
1296 }
1297 }
1298
1299 strcpy( path, tmp_path );
1300 return path;
1301 }
1302
1303
/*
 * Mount a /dev for the container on <root>/dev and make sure that
 * <root>/dev/pts exists.
 *
 * name:    container name
 * root:    path of the container's root filesystem mount
 * lxcpath: lxcpath the container lives in
 *
 * Three strategies, in order:
 *  1. mk_devtmpfs() yields a host directory: symlink
 *     <lxcpath>/<name>/rootfs.dev to it and bind mount it on <root>/dev.
 *  2. Something is already mounted on <lxcpath>/<name>/rootfs.dev:
 *     bind mount that on <root>/dev (lets an admin set it up by hand).
 *  3. Otherwise mount a fresh tmpfs on <root>/dev.
 *
 * Returns 0 on success, -1 on failure.
 *
 * Open question from the original author: do we want to add options for
 * max size of /dev and a file to specify which devices to create?
 */
static int mount_autodev(const char *name, char *root, const char *lxcpath)
{
	int ret;
	struct stat s;
	char path[MAXPATHLEN];
	char host_path[MAXPATHLEN];
	char devtmpfs_path[MAXPATHLEN];

	INFO("Mounting /dev under %s", root);

	/* snprintf() returning exactly MAXPATHLEN already means truncation. */
	ret = snprintf(host_path, MAXPATHLEN, "%s/%s/rootfs.dev", lxcpath, name);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	if (mk_devtmpfs(name, devtmpfs_path, lxcpath)) {
		/*
		 * Get rid of old links and directories.
		 * This could be either a symlink and we remove it,
		 * or an empty directory and we remove it,
		 * or non-existent and we don't care,
		 * or a non-empty directory, and we will then emit an error
		 * but we will not fail out the process.
		 */
		unlink(host_path);
		rmdir(host_path);
		ret = symlink(devtmpfs_path, host_path);
		if (ret < 0)
			SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path, devtmpfs_path);

		DEBUG("Bind mounting %s to %s", devtmpfs_path , path );
		ret = mount(devtmpfs_path, path, NULL, MS_BIND, 0);
	} else if (!mount_check_fs(host_path, NULL)) {
		/* Only mount a tmpfs here if there is no prepared mount. */
		DEBUG("Mounting tmpfs to %s", path);
		ret = mount("none", path, "tmpfs", 0, "size=100000,mode=755");
	} else {
		/* This allows someone to manually set up a mount */
		DEBUG("Bind mounting %s to %s", host_path, path );
		ret = mount(host_path, path, NULL, MS_BIND, 0);
	}
	if (ret) {
		SYSERROR("Failed to mount /dev at %s", root);
		return -1;
	}

	ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	/*
	 * If we are running on a devtmpfs mapping, dev/pts may already exist.
	 * If not, then create it and exit if that fails...
	 */
	if (access(path, F_OK) != 0 || stat(path, &s) != 0 || !S_ISDIR(s.st_mode)) {
		ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
		if (ret) {
			SYSERROR("Failed to create /dev/pts in container");
			return -1;
		}
	}

	INFO("Mounted /dev under %s", root);
	return 0;
}
1377
/* Description of one device node: name below /dev, mknod() mode, and
 * major/minor device numbers. */
struct lxc_devs {
	const char *name;
	mode_t mode;
	int maj;
	int min;
};

/* Device nodes listed for creation; consumed by setup_autodev(). */
static const struct lxc_devs lxc_devs[] = {
	{ .name = "null",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 3 },
	{ .name = "zero",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 5 },
	{ .name = "full",    .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 7 },
	{ .name = "urandom", .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 9 },
	{ .name = "random",  .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 1, .min = 8 },
	{ .name = "tty",     .mode = S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, .maj = 5, .min = 0 },
	{ .name = "console", .mode = S_IFCHR | S_IRUSR | S_IWUSR,           .maj = 5, .min = 1 },
};
1394
1395 static int setup_autodev(const char *root)
1396 {
1397 int ret;
1398 char path[MAXPATHLEN];
1399 int i;
1400 mode_t cmask;
1401
1402 INFO("Creating initial consoles under %s/dev", root);
1403
1404 ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
1405 if (ret < 0 || ret >= MAXPATHLEN) {
1406 ERROR("Error calculating container /dev location");
1407 return -1;
1408 }
1409
1410 INFO("Populating /dev under %s", root);
1411 cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
1412 for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
1413 const struct lxc_devs *d = &lxc_devs[i];
1414 ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name);
1415 if (ret < 0 || ret >= MAXPATHLEN)
1416 return -1;
1417 ret = mknod(path, d->mode, makedev(d->maj, d->min));
1418 if (ret && errno != EEXIST) {
1419 SYSERROR("Error creating %s", d->name);
1420 return -1;
1421 }
1422 }
1423 umask(cmask);
1424
1425 INFO("Populated /dev under %s", root);
1426 return 0;
1427 }
1428
1429 /*
1430 * I'll forgive you for asking whether all of this is needed :) The
1431 * answer is yes.
1432 * pivot_root will fail if the new root, the put_old dir, or the parent
1433 * of current->fs->root are MS_SHARED. (parent of current->fs_root may
1434 * or may not be current->fs_root - if we assumed it always was, we could
1435 * just mount --make-rslave /). So,
1436 * 1. mount a tiny tmpfs to be parent of current->fs->root.
1437 * 2. make that MS_SLAVE
1438 * 3. make a 'root' directory under that
1439 * 4. mount --rbind / under the $tinyroot/root.
1440 * 5. make that rslave
1441 * 6. chdir and chroot into $tinyroot/root
1442 * 7. $tinyroot will be unmounted by our parent in start.c
1443 */
1444 static int chroot_into_slave(struct lxc_conf *conf)
1445 {
1446 char path[MAXPATHLEN];
1447 const char *destpath = conf->rootfs.mount;
1448 int ret;
1449
1450 if (mount(destpath, destpath, NULL, MS_BIND, 0)) {
1451 SYSERROR("failed to mount %s bind", destpath);
1452 return -1;
1453 }
1454 if (mount("", destpath, NULL, MS_SLAVE, 0)) {
1455 SYSERROR("failed to make %s slave", destpath);
1456 return -1;
1457 }
1458 if (mount("none", destpath, "tmpfs", 0, "size=10000,mode=755")) {
1459 SYSERROR("Failed to mount tmpfs / at %s", destpath);
1460 return -1;
1461 }
1462 ret = snprintf(path, MAXPATHLEN, "%s/root", destpath);
1463 if (ret < 0 || ret >= MAXPATHLEN) {
1464 ERROR("out of memory making root path");
1465 return -1;
1466 }
1467 if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
1468 SYSERROR("Failed to create /dev/pts in container");
1469 return -1;
1470 }
1471 if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) {
1472 SYSERROR("Failed to rbind mount / to %s", path);
1473 return -1;
1474 }
1475 if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) {
1476 SYSERROR("Failed to make tmp-/ at %s rslave", path);
1477 return -1;
1478 }
1479 if (chroot(path)) {
1480 SYSERROR("Failed to chroot into tmp-/");
1481 return -1;
1482 }
1483 if (chdir("/")) {
1484 SYSERROR("Failed to chdir into tmp-/");
1485 return -1;
1486 }
1487 INFO("Chrooted into tmp-/ at %s", path);
1488 return 0;
1489 }
1490
1491 static int setup_rootfs(struct lxc_conf *conf)
1492 {
1493 const struct lxc_rootfs *rootfs = &conf->rootfs;
1494
1495 if (!rootfs->path) {
1496 if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
1497 SYSERROR("Failed to make / rslave");
1498 return -1;
1499 }
1500 return 0;
1501 }
1502
1503 if (access(rootfs->mount, F_OK)) {
1504 SYSERROR("failed to access to '%s', check it is present",
1505 rootfs->mount);
1506 return -1;
1507 }
1508
1509 if (detect_ramfs_rootfs()) {
1510 if (chroot_into_slave(conf)) {
1511 ERROR("Failed to chroot into slave /");
1512 return -1;
1513 }
1514 } else if (detect_shared_rootfs()) {
1515 if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) {
1516 SYSERROR("Failed to make / rslave");
1517 return -1;
1518 }
1519 }
1520
1521 // First try mounting rootfs using a bdev
1522 struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, rootfs->options);
1523 if (bdev && bdev->ops->mount(bdev) == 0) {
1524 bdev_put(bdev);
1525 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
1526 return 0;
1527 }
1528 if (bdev)
1529 bdev_put(bdev);
1530 if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) {
1531 ERROR("failed to mount rootfs");
1532 return -1;
1533 }
1534
1535 DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount);
1536
1537 return 0;
1538 }
1539
1540 static int setup_pivot_root(const struct lxc_rootfs *rootfs)
1541 {
1542 if (!rootfs->path)
1543 return 0;
1544
1545 if (setup_rootfs_pivot_root(rootfs->mount, rootfs->pivot)) {
1546 ERROR("failed to setup pivot root");
1547 return -1;
1548 }
1549
1550 return 0;
1551 }
1552
1553 static int setup_pts(int pts)
1554 {
1555 char target[PATH_MAX];
1556
1557 if (!pts)
1558 return 0;
1559
1560 if (!access("/dev/pts/ptmx", F_OK) && umount("/dev/pts")) {
1561 SYSERROR("failed to umount 'dev/pts'");
1562 return -1;
1563 }
1564
1565 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
1566 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
1567 SYSERROR("failed to mount a new instance of '/dev/pts'");
1568 return -1;
1569 }
1570
1571 if (access("/dev/ptmx", F_OK)) {
1572 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1573 goto out;
1574 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
1575 return -1;
1576 }
1577
1578 if (realpath("/dev/ptmx", target) && !strcmp(target, "/dev/pts/ptmx"))
1579 goto out;
1580
1581 /* fallback here, /dev/pts/ptmx exists just mount bind */
1582 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0)) {
1583 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
1584 return -1;
1585 }
1586
1587 INFO("created new pts instance");
1588
1589 out:
1590 return 0;
1591 }
1592
/*
 * Apply the requested execution personality to the calling process.
 *
 * persona: value passed to personality(); -1 means "leave unchanged".
 *
 * When HAVE_SYS_PERSONALITY_H is not set this function does nothing.
 * Returns 0 on success (or when nothing was done), -1 on failure.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1609
1610 static int setup_dev_console(const struct lxc_rootfs *rootfs,
1611 const struct lxc_console *console)
1612 {
1613 char path[MAXPATHLEN];
1614 struct stat s;
1615 int ret;
1616
1617 ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1618 if (ret >= sizeof(path)) {
1619 ERROR("console path too long");
1620 return -1;
1621 }
1622
1623 if (access(path, F_OK)) {
1624 WARN("rootfs specified but no console found at '%s'", path);
1625 return 0;
1626 }
1627
1628 if (console->master < 0) {
1629 INFO("no console");
1630 return 0;
1631 }
1632
1633 if (stat(path, &s)) {
1634 SYSERROR("failed to stat '%s'", path);
1635 return -1;
1636 }
1637
1638 if (chmod(console->name, s.st_mode)) {
1639 SYSERROR("failed to set mode '0%o' to '%s'",
1640 s.st_mode, console->name);
1641 return -1;
1642 }
1643
1644 if (mount(console->name, path, "none", MS_BIND, 0)) {
1645 ERROR("failed to mount '%s' on '%s'", console->name, path);
1646 return -1;
1647 }
1648
1649 INFO("console has been setup");
1650 return 0;
1651 }
1652
1653 static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
1654 const struct lxc_console *console,
1655 char *ttydir)
1656 {
1657 char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
1658 int ret;
1659
1660 /* create rootfs/dev/<ttydir> directory */
1661 ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount,
1662 ttydir);
1663 if (ret >= sizeof(path))
1664 return -1;
1665 ret = mkdir(path, 0755);
1666 if (ret && errno != EEXIST) {
1667 SYSERROR("failed with errno %d to create %s", errno, path);
1668 return -1;
1669 }
1670 INFO("created %s", path);
1671
1672 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console",
1673 rootfs->mount, ttydir);
1674 if (ret >= sizeof(lxcpath)) {
1675 ERROR("console path too long");
1676 return -1;
1677 }
1678
1679 snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
1680 ret = unlink(path);
1681 if (ret && errno != ENOENT) {
1682 SYSERROR("error unlinking %s", path);
1683 return -1;
1684 }
1685
1686 ret = creat(lxcpath, 0660);
1687 if (ret==-1 && errno != EEXIST) {
1688 SYSERROR("error %d creating %s", errno, lxcpath);
1689 return -1;
1690 }
1691 if (ret >= 0)
1692 close(ret);
1693
1694 if (console->master < 0) {
1695 INFO("no console");
1696 return 0;
1697 }
1698
1699 if (mount(console->name, lxcpath, "none", MS_BIND, 0)) {
1700 ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
1701 return -1;
1702 }
1703
1704 /* create symlink from rootfs/dev/console to 'lxc/console' */
1705 ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
1706 if (ret >= sizeof(lxcpath)) {
1707 ERROR("lxc/console path too long");
1708 return -1;
1709 }
1710 ret = symlink(lxcpath, path);
1711 if (ret) {
1712 SYSERROR("failed to create symlink for console");
1713 return -1;
1714 }
1715
1716 INFO("console has been setup on %s", lxcpath);
1717
1718 return 0;
1719 }
1720
1721 static int setup_console(const struct lxc_rootfs *rootfs,
1722 const struct lxc_console *console,
1723 char *ttydir)
1724 {
1725 /* We don't have a rootfs, /dev/console will be shared */
1726 if (!rootfs->path)
1727 return 0;
1728 if (!ttydir)
1729 return setup_dev_console(rootfs, console);
1730
1731 return setup_ttydir_console(rootfs, console, ttydir);
1732 }
1733
1734 static int setup_kmsg(const struct lxc_rootfs *rootfs,
1735 const struct lxc_console *console)
1736 {
1737 char kpath[MAXPATHLEN];
1738 int ret;
1739
1740 if (!rootfs->path)
1741 return 0;
1742 ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
1743 if (ret < 0 || ret >= sizeof(kpath))
1744 return -1;
1745
1746 ret = unlink(kpath);
1747 if (ret && errno != ENOENT) {
1748 SYSERROR("error unlinking %s", kpath);
1749 return -1;
1750 }
1751
1752 ret = symlink("console", kpath);
1753 if (ret) {
1754 SYSERROR("failed to create symlink for kmsg");
1755 return -1;
1756 }
1757
1758 return 0;
1759 }
1760
1761 static void parse_mntopt(char *opt, unsigned long *flags, char **data)
1762 {
1763 struct mount_opt *mo;
1764
1765 /* If opt is found in mount_opt, set or clear flags.
1766 * Otherwise append it to data. */
1767
1768 for (mo = &mount_opt[0]; mo->name != NULL; mo++) {
1769 if (!strncmp(opt, mo->name, strlen(mo->name))) {
1770 if (mo->clear)
1771 *flags &= ~mo->flag;
1772 else
1773 *flags |= mo->flag;
1774 return;
1775 }
1776 }
1777
1778 if (strlen(*data))
1779 strcat(*data, ",");
1780 strcat(*data, opt);
1781 }
1782
/*
 * Split a comma separated mount option string into mount flags and a
 * data string.
 *
 * mntopts:  option string, may be NULL
 * mntflags: receives the accumulated flags (0 when there are none)
 * mntdata:  receives a malloc()ed string with the remaining options, or
 *           NULL when there are none; the caller frees it
 *
 * Both outputs are reset before anything else happens, so *mntdata is
 * NULL on every failure path.
 *
 * Returns 0 on success, -1 when memory cannot be allocated.
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *data, *tok;
	char *saveptr = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy. */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	/* The data string can never be longer than the input. */
	data = malloc(strlen(copy) + 1);
	if (data == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	data[0] = '\0';

	tok = strtok_r(copy, ",", &saveptr);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &data);
		tok = strtok_r(NULL, ",", &saveptr);
	}

	if (data[0] != '\0')
		*mntdata = data;
	else
		free(data);

	free(copy);

	return 0;
}
1821
/*
 * Mount 'fsname' on 'target'.
 *
 * The first mount() call is made with MS_REMOUNT stripped from
 * 'mountflags'. If the flags contain MS_REMOUNT or MS_BIND, a second
 * mount() call with MS_REMOUNT added follows so the remaining options
 * are applied.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data)
{
	const int needs_remount = (mountflags & MS_REMOUNT) || (mountflags & MS_BIND);

	if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data) != 0) {
		SYSERROR("failed to mount '%s' on '%s'", fsname, target);
		return -1;
	}

	if (needs_remount) {
		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname, target);

		if (mount(fsname, target, fstype,
			  mountflags | MS_REMOUNT, data) != 0) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}
	}

	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);

	return 0;
}
1848
/*
 * Strip the first occurrence of each of 'create=dir', 'create=file' and
 * 'optional' from mntent->mnt_opts, editing the string in place.
 *
 * An occurrence followed by a comma is removed together with that comma.
 * An occurrence with no comma after it truncates the string at the
 * start of the occurrence (a preceding comma, if any, is left behind).
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(culled) / sizeof(culled[0]); idx++) {
		char *hit = strstr(mntent->mnt_opts, culled[idx]);
		char *comma;

		if (hit == NULL)
			continue;

		comma = strchr(hit, ',');
		if (comma != NULL)
			/* shift the rest of the list over the culled option */
			memmove(hit, comma + 1, strlen(comma + 1) + 1);
		else
			/* no more mntopts, so just chop it here */
			*hit = '\0';
	}
}
1873
/*
 * Mount one fstab-style entry when the container has no separate
 * rootfs; mntent->mnt_dir is used as the target unchanged.
 *
 * Honours these lxc-specific options, which are removed from
 * mntent->mnt_opts before the options are parsed:
 *   create=dir  - create the target directory first
 *   create=file - create the target file (and its parent directory)
 *                 first, if the target does not exist
 *   optional    - a failing mount is not an error
 *
 * Failing to create the target only produces a warning; the mount
 * attempt that follows decides the result.
 *
 * Returns 0 on success (or on a failed optional mount), -1 on failure.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathcopy = NULL;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(mntent->mnt_dir, 0755) < 0)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
	}

	if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so keep the allocated copy in its own
		 * variable and free that, never dirname()'s return value.
		 */
		pathcopy = strdup(mntent->mnt_dir);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}
		if (mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(mntent->mnt_dir, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	/* parse_mntopts() leaves mntdata NULL on failure: nothing to free. */
	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		return -1;

	ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
			  mntent->mnt_type, mntflags, mntdata);

	if (optional)
		ret = 0;

	free(mntdata);

	return ret;
}
1923
1924 static int mount_entry_on_absolute_rootfs(struct mntent *mntent,
1925 const struct lxc_rootfs *rootfs,
1926 const char *lxc_name)
1927 {
1928 char *aux;
1929 char path[MAXPATHLEN];
1930 unsigned long mntflags;
1931 char *mntdata;
1932 int r, ret = 0, offset;
1933 const char *lxcpath;
1934 FILE *pathfile = NULL;
1935 char *pathdirname = NULL;
1936 bool optional = hasmntopt(mntent, "optional") != NULL;
1937
1938 lxcpath = lxc_global_config_value("lxc.lxcpath");
1939 if (!lxcpath) {
1940 ERROR("Out of memory");
1941 return -1;
1942 }
1943
1944 /* if rootfs->path is a blockdev path, allow container fstab to
1945 * use $lxcpath/CN/rootfs as the target prefix */
1946 r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
1947 if (r < 0 || r >= MAXPATHLEN)
1948 goto skipvarlib;
1949
1950 aux = strstr(mntent->mnt_dir, path);
1951 if (aux) {
1952 offset = strlen(path);
1953 goto skipabs;
1954 }
1955
1956 skipvarlib:
1957 aux = strstr(mntent->mnt_dir, rootfs->path);
1958 if (!aux) {
1959 WARN("ignoring mount point '%s'", mntent->mnt_dir);
1960 goto out;
1961 }
1962 offset = strlen(rootfs->path);
1963
1964 skipabs:
1965
1966 r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
1967 aux + offset);
1968 if (r < 0 || r >= MAXPATHLEN) {
1969 WARN("pathnme too long for '%s'", mntent->mnt_dir);
1970 ret = -1;
1971 goto out;
1972 }
1973
1974 if (hasmntopt(mntent, "create=dir")) {
1975 if (mkdir_p(path, 0755) < 0) {
1976 WARN("Failed to create mount target '%s'", path);
1977 ret = -1;
1978 }
1979 }
1980
1981 if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
1982 pathdirname = strdup(path);
1983 pathdirname = dirname(pathdirname);
1984 if (mkdir_p(pathdirname, 0755) < 0) {
1985 WARN("Failed to create target directory");
1986 }
1987 pathfile = fopen(path, "wb");
1988 if (!pathfile) {
1989 WARN("Failed to create mount target '%s'", path);
1990 ret = -1;
1991 }
1992 else
1993 fclose(pathfile);
1994 }
1995 cull_mntent_opt(mntent);
1996
1997 if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
1998 free(mntdata);
1999 return -1;
2000 }
2001
2002 ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
2003 mntflags, mntdata);
2004
2005 free(mntdata);
2006
2007 if (optional)
2008 ret = 0;
2009
2010 out:
2011 free(pathdirname);
2012 return ret;
2013 }
2014
/*
 * Mount one fstab-style entry whose target is relative to the
 * container's root mount point: the target becomes
 * "<rootfs>/<mntent->mnt_dir>".
 *
 * Honours these lxc-specific options, which are removed from
 * mntent->mnt_opts before the options are parsed:
 *   create=dir  - create the target directory first
 *   create=file - create the target file (and its parent directory)
 *                 first, if the target does not exist
 *   optional    - a failing mount is not an error
 *
 * Failing to create the target only produces a warning; the mount
 * attempt that follows decides the result.
 *
 * Returns 0 on success (or on a failed optional mount), -1 on failure.
 */
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
					  const char *rootfs)
{
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathcopy = NULL;
	bool optional = hasmntopt(mntent, "optional") != NULL;

	/* relative to root mount point */
	ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so keep the allocated copy in its own
		 * variable and free that, never dirname()'s return value.
		 */
		pathcopy = strdup(path);
		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}
		if (mkdir_p(dirname(pathcopy), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	/* parse_mntopts() leaves mntdata NULL on failure: nothing to free. */
	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		return -1;

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata);

	if (optional)
		ret = 0;

	free(mntdata);

	return ret;
}
2072
2073 static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
2074 const char *lxc_name)
2075 {
2076 struct mntent mntent;
2077 char buf[4096];
2078 int ret = -1;
2079
2080 while (getmntent_r(file, &mntent, buf, sizeof(buf))) {
2081
2082 if (!rootfs->path) {
2083 if (mount_entry_on_systemfs(&mntent))
2084 goto out;
2085 continue;
2086 }
2087
2088 /* We have a separate root, mounts are relative to it */
2089 if (mntent.mnt_dir[0] != '/') {
2090 if (mount_entry_on_relative_rootfs(&mntent,
2091 rootfs->mount))
2092 goto out;
2093 continue;
2094 }
2095
2096 if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name))
2097 goto out;
2098 }
2099
2100 ret = 0;
2101
2102 INFO("mount points have been setup");
2103 out:
2104 return ret;
2105 }
2106
/*
 * Mount all entries of the fstab file 'fstab' for the container.
 * A NULL 'fstab' means there is nothing to mount.
 *
 * Returns 0 on success or when there is no fstab, -1 when the file
 * cannot be opened; otherwise the result of mount_file_entries().
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
	const char *lxc_name)
{
	FILE *entries;
	int rc;

	if (fstab == NULL)
		return 0;

	entries = setmntent(fstab, "r");
	if (entries == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, entries, lxc_name);
	endmntent(entries);

	return rc;
}
2127
2128 static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list *mount,
2129 const char *lxc_name)
2130 {
2131 FILE *file;
2132 struct lxc_list *iterator;
2133 char *mount_entry;
2134 int ret;
2135
2136 file = tmpfile();
2137 if (!file) {
2138 ERROR("tmpfile error: %m");
2139 return -1;
2140 }
2141
2142 lxc_list_for_each(iterator, mount) {
2143 mount_entry = iterator->elem;
2144 fprintf(file, "%s\n", mount_entry);
2145 }
2146
2147 rewind(file);
2148
2149 ret = mount_file_entries(rootfs, file, lxc_name);
2150
2151 fclose(file);
2152 return ret;
2153 }
2154
2155 static int setup_caps(struct lxc_list *caps)
2156 {
2157 struct lxc_list *iterator;
2158 char *drop_entry;
2159 char *ptr;
2160 int i, capid;
2161
2162 lxc_list_for_each(iterator, caps) {
2163
2164 drop_entry = iterator->elem;
2165
2166 capid = -1;
2167
2168 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2169
2170 if (strcmp(drop_entry, caps_opt[i].name))
2171 continue;
2172
2173 capid = caps_opt[i].value;
2174 break;
2175 }
2176
2177 if (capid < 0) {
2178 /* try to see if it's numeric, so the user may specify
2179 * capabilities that the running kernel knows about but
2180 * we don't */
2181 errno = 0;
2182 capid = strtol(drop_entry, &ptr, 10);
2183 if (!ptr || *ptr != '\0' || errno != 0)
2184 /* not a valid number */
2185 capid = -1;
2186 else if (capid > lxc_caps_last_cap())
2187 /* we have a number but it's not a valid
2188 * capability */
2189 capid = -1;
2190 }
2191
2192 if (capid < 0) {
2193 ERROR("unknown capability %s", drop_entry);
2194 return -1;
2195 }
2196
2197 DEBUG("drop capability '%s' (%d)", drop_entry, capid);
2198
2199 if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
2200 SYSERROR("failed to remove %s capability", drop_entry);
2201 return -1;
2202 }
2203
2204 }
2205
2206 DEBUG("capabilities have been setup");
2207
2208 return 0;
2209 }
2210
2211 static int dropcaps_except(struct lxc_list *caps)
2212 {
2213 struct lxc_list *iterator;
2214 char *keep_entry;
2215 char *ptr;
2216 int i, capid;
2217 int numcaps = lxc_caps_last_cap() + 1;
2218 INFO("found %d capabilities", numcaps);
2219
2220 if (numcaps <= 0 || numcaps > 200)
2221 return -1;
2222
2223 // caplist[i] is 1 if we keep capability i
2224 int *caplist = alloca(numcaps * sizeof(int));
2225 memset(caplist, 0, numcaps * sizeof(int));
2226
2227 lxc_list_for_each(iterator, caps) {
2228
2229 keep_entry = iterator->elem;
2230
2231 capid = -1;
2232
2233 for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {
2234
2235 if (strcmp(keep_entry, caps_opt[i].name))
2236 continue;
2237
2238 capid = caps_opt[i].value;
2239 break;
2240 }
2241
2242 if (capid < 0) {
2243 /* try to see if it's numeric, so the user may specify
2244 * capabilities that the running kernel knows about but
2245 * we don't */
2246 capid = strtol(keep_entry, &ptr, 10);
2247 if (!ptr || *ptr != '\0' ||
2248 capid == INT_MIN || capid == INT_MAX)
2249 /* not a valid number */
2250 capid = -1;
2251 else if (capid > lxc_caps_last_cap())
2252 /* we have a number but it's not a valid
2253 * capability */
2254 capid = -1;
2255 }
2256
2257 if (capid < 0) {
2258 ERROR("unknown capability %s", keep_entry);
2259 return -1;
2260 }
2261
2262 DEBUG("drop capability '%s' (%d)", keep_entry, capid);
2263
2264 caplist[capid] = 1;
2265 }
2266 for (i=0; i<numcaps; i++) {
2267 if (caplist[i])
2268 continue;
2269 if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
2270 SYSERROR("failed to remove capability %d", i);
2271 return -1;
2272 }
2273 }
2274
2275 DEBUG("capabilities have been setup");
2276
2277 return 0;
2278 }
2279
2280 static int setup_hw_addr(char *hwaddr, const char *ifname)
2281 {
2282 struct sockaddr sockaddr;
2283 struct ifreq ifr;
2284 int ret, fd;
2285
2286 ret = lxc_convert_mac(hwaddr, &sockaddr);
2287 if (ret) {
2288 ERROR("mac address '%s' conversion failed : %s",
2289 hwaddr, strerror(-ret));
2290 return -1;
2291 }
2292
2293 memcpy(ifr.ifr_name, ifname, IFNAMSIZ);
2294 ifr.ifr_name[IFNAMSIZ-1] = '\0';
2295 memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));
2296
2297 fd = socket(AF_INET, SOCK_DGRAM, 0);
2298 if (fd < 0) {
2299 ERROR("socket failure : %s", strerror(errno));
2300 return -1;
2301 }
2302
2303 ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
2304 close(fd);
2305 if (ret)
2306 ERROR("ioctl failure : %s", strerror(errno));
2307
2308 DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);
2309
2310 return ret;
2311 }
2312
2313 static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
2314 {
2315 struct lxc_list *iterator;
2316 struct lxc_inetdev *inetdev;
2317 int err;
2318
2319 lxc_list_for_each(iterator, ip) {
2320
2321 inetdev = iterator->elem;
2322
2323 err = lxc_ipv4_addr_add(ifindex, &inetdev->addr,
2324 &inetdev->bcast, inetdev->prefix);
2325 if (err) {
2326 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2327 ifindex, strerror(-err));
2328 return -1;
2329 }
2330 }
2331
2332 return 0;
2333 }
2334
2335 static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
2336 {
2337 struct lxc_list *iterator;
2338 struct lxc_inet6dev *inet6dev;
2339 int err;
2340
2341 lxc_list_for_each(iterator, ip) {
2342
2343 inet6dev = iterator->elem;
2344
2345 err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr,
2346 &inet6dev->mcast, &inet6dev->acast,
2347 inet6dev->prefix);
2348 if (err) {
2349 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2350 ifindex, strerror(-err));
2351 return -1;
2352 }
2353 }
2354
2355 return 0;
2356 }
2357
2358 static int setup_netdev(struct lxc_netdev *netdev)
2359 {
2360 char ifname[IFNAMSIZ];
2361 char *current_ifname = ifname;
2362 int err;
2363
2364 /* empty network namespace */
2365 if (!netdev->ifindex) {
2366 if (netdev->flags & IFF_UP) {
2367 err = lxc_netdev_up("lo");
2368 if (err) {
2369 ERROR("failed to set the loopback up : %s",
2370 strerror(-err));
2371 return -1;
2372 }
2373 }
2374 if (netdev->type != LXC_NET_VETH)
2375 return 0;
2376 netdev->ifindex = if_nametoindex(netdev->name);
2377 }
2378
2379 /* get the new ifindex in case of physical netdev */
2380 if (netdev->type == LXC_NET_PHYS) {
2381 if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
2382 ERROR("failed to get ifindex for %s",
2383 netdev->link);
2384 return -1;
2385 }
2386 }
2387
2388 /* retrieve the name of the interface */
2389 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2390 ERROR("no interface corresponding to index '%d'",
2391 netdev->ifindex);
2392 return -1;
2393 }
2394
2395 /* default: let the system to choose one interface name */
2396 if (!netdev->name)
2397 netdev->name = netdev->type == LXC_NET_PHYS ?
2398 netdev->link : "eth%d";
2399
2400 /* rename the interface name */
2401 if (strcmp(ifname, netdev->name) != 0) {
2402 err = lxc_netdev_rename_by_name(ifname, netdev->name);
2403 if (err) {
2404 ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
2405 strerror(-err));
2406 return -1;
2407 }
2408 }
2409
2410 /* Re-read the name of the interface because its name has changed
2411 * and would be automatically allocated by the system
2412 */
2413 if (!if_indextoname(netdev->ifindex, current_ifname)) {
2414 ERROR("no interface corresponding to index '%d'",
2415 netdev->ifindex);
2416 return -1;
2417 }
2418
2419 /* set a mac address */
2420 if (netdev->hwaddr) {
2421 if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
2422 ERROR("failed to setup hw address for '%s'",
2423 current_ifname);
2424 return -1;
2425 }
2426 }
2427
2428 /* setup ipv4 addresses on the interface */
2429 if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
2430 ERROR("failed to setup ip addresses for '%s'",
2431 ifname);
2432 return -1;
2433 }
2434
2435 /* setup ipv6 addresses on the interface */
2436 if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
2437 ERROR("failed to setup ipv6 addresses for '%s'",
2438 ifname);
2439 return -1;
2440 }
2441
2442 /* set the network device up */
2443 if (netdev->flags & IFF_UP) {
2444 int err;
2445
2446 err = lxc_netdev_up(current_ifname);
2447 if (err) {
2448 ERROR("failed to set '%s' up : %s", current_ifname,
2449 strerror(-err));
2450 return -1;
2451 }
2452
2453 /* the network is up, make the loopback up too */
2454 err = lxc_netdev_up("lo");
2455 if (err) {
2456 ERROR("failed to set the loopback up : %s",
2457 strerror(-err));
2458 return -1;
2459 }
2460 }
2461
2462 /* We can only set up the default routes after bringing
2463 * up the interface, sine bringing up the interface adds
2464 * the link-local routes and we can't add a default
2465 * route if the gateway is not reachable. */
2466
2467 /* setup ipv4 gateway on the interface */
2468 if (netdev->ipv4_gateway) {
2469 if (!(netdev->flags & IFF_UP)) {
2470 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
2471 return -1;
2472 }
2473
2474 if (lxc_list_empty(&netdev->ipv4)) {
2475 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
2476 return -1;
2477 }
2478
2479 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2480 if (err) {
2481 err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
2482 if (err) {
2483 ERROR("failed to add ipv4 dest for '%s': %s",
2484 ifname, strerror(-err));
2485 }
2486
2487 err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
2488 if (err) {
2489 ERROR("failed to setup ipv4 gateway for '%s': %s",
2490 ifname, strerror(-err));
2491 if (netdev->ipv4_gateway_auto) {
2492 char buf[INET_ADDRSTRLEN];
2493 inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
2494 ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
2495 }
2496 return -1;
2497 }
2498 }
2499 }
2500
2501 /* setup ipv6 gateway on the interface */
2502 if (netdev->ipv6_gateway) {
2503 if (!(netdev->flags & IFF_UP)) {
2504 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
2505 return -1;
2506 }
2507
2508 if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
2509 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
2510 return -1;
2511 }
2512
2513 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2514 if (err) {
2515 err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
2516 if (err) {
2517 ERROR("failed to add ipv6 dest for '%s': %s",
2518 ifname, strerror(-err));
2519 }
2520
2521 err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
2522 if (err) {
2523 ERROR("failed to setup ipv6 gateway for '%s': %s",
2524 ifname, strerror(-err));
2525 if (netdev->ipv6_gateway_auto) {
2526 char buf[INET6_ADDRSTRLEN];
2527 inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
2528 ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
2529 }
2530 return -1;
2531 }
2532 }
2533 }
2534
2535 DEBUG("'%s' has been setup", current_ifname);
2536
2537 return 0;
2538 }
2539
2540 static int setup_network(struct lxc_list *network)
2541 {
2542 struct lxc_list *iterator;
2543 struct lxc_netdev *netdev;
2544
2545 lxc_list_for_each(iterator, network) {
2546
2547 netdev = iterator->elem;
2548
2549 if (setup_netdev(netdev)) {
2550 ERROR("failed to setup netdev");
2551 return -1;
2552 }
2553 }
2554
2555 if (!lxc_list_empty(network))
2556 INFO("network has been setup");
2557
2558 return 0;
2559 }
2560
2561 /* try to move physical nics to the init netns */
2562 void restore_phys_nics_to_netns(int netnsfd, struct lxc_conf *conf)
2563 {
2564 int i, ret, oldfd;
2565 char path[MAXPATHLEN];
2566
2567 if (netnsfd < 0)
2568 return;
2569
2570 ret = snprintf(path, MAXPATHLEN, "/proc/self/ns/net");
2571 if (ret < 0 || ret >= MAXPATHLEN) {
2572 WARN("Failed to open monitor netns fd");
2573 return;
2574 }
2575 if ((oldfd = open(path, O_RDONLY)) < 0) {
2576 SYSERROR("Failed to open monitor netns fd");
2577 return;
2578 }
2579 if (setns(netnsfd, 0) != 0) {
2580 SYSERROR("Failed to enter container netns to reset nics");
2581 close(oldfd);
2582 return;
2583 }
2584 for (i=0; i<conf->num_savednics; i++) {
2585 struct saved_nic *s = &conf->saved_nics[i];
2586 if (lxc_netdev_move_by_index(s->ifindex, 1))
2587 WARN("Error moving nic index:%d back to host netns",
2588 s->ifindex);
2589 }
2590 if (setns(oldfd, 0) != 0)
2591 SYSERROR("Failed to re-enter monitor's netns");
2592 close(oldfd);
2593 }
2594
2595 void lxc_rename_phys_nics_on_shutdown(int netnsfd, struct lxc_conf *conf)
2596 {
2597 int i;
2598
2599 if (conf->num_savednics == 0)
2600 return;
2601
2602 INFO("running to reset %d nic names", conf->num_savednics);
2603 restore_phys_nics_to_netns(netnsfd, conf);
2604 for (i=0; i<conf->num_savednics; i++) {
2605 struct saved_nic *s = &conf->saved_nics[i];
2606 INFO("resetting nic %d to %s", s->ifindex, s->orig_name);
2607 lxc_netdev_rename_by_index(s->ifindex, s->orig_name);
2608 free(s->orig_name);
2609 }
2610 conf->num_savednics = 0;
2611 }
2612
2613 static char *default_rootfs_mount = LXCROOTFSMOUNT;
2614
2615 struct lxc_conf *lxc_conf_init(void)
2616 {
2617 struct lxc_conf *new;
2618 int i;
2619
2620 new = malloc(sizeof(*new));
2621 if (!new) {
2622 ERROR("lxc_conf_init : %m");
2623 return NULL;
2624 }
2625 memset(new, 0, sizeof(*new));
2626
2627 new->loglevel = LXC_LOG_PRIORITY_NOTSET;
2628 new->personality = -1;
2629 new->autodev = -1;
2630 new->console.log_path = NULL;
2631 new->console.log_fd = -1;
2632 new->console.path = NULL;
2633 new->console.peer = -1;
2634 new->console.peerpty.busy = -1;
2635 new->console.peerpty.master = -1;
2636 new->console.peerpty.slave = -1;
2637 new->console.master = -1;
2638 new->console.slave = -1;
2639 new->console.name[0] = '\0';
2640 new->maincmd_fd = -1;
2641 new->rootfs.mount = strdup(default_rootfs_mount);
2642 if (!new->rootfs.mount) {
2643 ERROR("lxc_conf_init : %m");
2644 free(new);
2645 return NULL;
2646 }
2647 new->kmsg = 1;
2648 lxc_list_init(&new->cgroup);
2649 lxc_list_init(&new->network);
2650 lxc_list_init(&new->mount_list);
2651 lxc_list_init(&new->caps);
2652 lxc_list_init(&new->keepcaps);
2653 lxc_list_init(&new->id_map);
2654 for (i=0; i<NUM_LXC_HOOKS; i++)
2655 lxc_list_init(&new->hooks[i]);
2656 lxc_list_init(&new->groups);
2657 new->lsm_aa_profile = NULL;
2658 new->lsm_se_context = NULL;
2659 new->tmp_umount_proc = 0;
2660
2661 for (i = 0; i < LXC_NS_MAX; i++)
2662 new->inherit_ns_fd[i] = -1;
2663
2664 return new;
2665 }
2666
2667 static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2668 {
2669 char veth1buf[IFNAMSIZ], *veth1;
2670 char veth2buf[IFNAMSIZ], *veth2;
2671 int err;
2672
2673 if (netdev->priv.veth_attr.pair)
2674 veth1 = netdev->priv.veth_attr.pair;
2675 else {
2676 err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
2677 if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
2678 ERROR("veth1 name too long");
2679 return -1;
2680 }
2681 veth1 = lxc_mkifname(veth1buf);
2682 if (!veth1) {
2683 ERROR("failed to allocate a temporary name");
2684 return -1;
2685 }
2686 /* store away for deconf */
2687 memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ);
2688 }
2689
2690 snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
2691 veth2 = lxc_mkifname(veth2buf);
2692 if (!veth2) {
2693 ERROR("failed to allocate a temporary name");
2694 goto out_delete;
2695 }
2696
2697 err = lxc_veth_create(veth1, veth2);
2698 if (err) {
2699 ERROR("failed to create %s-%s : %s", veth1, veth2,
2700 strerror(-err));
2701 goto out_delete;
2702 }
2703
2704 /* changing the high byte of the mac address to 0xfe, the bridge interface
2705 * will always keep the host's mac address and not take the mac address
2706 * of a container */
2707 err = setup_private_host_hw_addr(veth1);
2708 if (err) {
2709 ERROR("failed to change mac address of host interface '%s' : %s",
2710 veth1, strerror(-err));
2711 goto out_delete;
2712 }
2713
2714 if (netdev->mtu) {
2715 err = lxc_netdev_set_mtu(veth1, atoi(netdev->mtu));
2716 if (!err)
2717 err = lxc_netdev_set_mtu(veth2, atoi(netdev->mtu));
2718 if (err) {
2719 ERROR("failed to set mtu '%s' for %s-%s : %s",
2720 netdev->mtu, veth1, veth2, strerror(-err));
2721 goto out_delete;
2722 }
2723 }
2724
2725 if (netdev->link) {
2726 err = lxc_bridge_attach(netdev->link, veth1);
2727 if (err) {
2728 ERROR("failed to attach '%s' to the bridge '%s' : %s",
2729 veth1, netdev->link, strerror(-err));
2730 goto out_delete;
2731 }
2732 }
2733
2734 netdev->ifindex = if_nametoindex(veth2);
2735 if (!netdev->ifindex) {
2736 ERROR("failed to retrieve the index for %s", veth2);
2737 goto out_delete;
2738 }
2739
2740 err = lxc_netdev_up(veth1);
2741 if (err) {
2742 ERROR("failed to set %s up : %s", veth1, strerror(-err));
2743 goto out_delete;
2744 }
2745
2746 if (netdev->upscript) {
2747 err = run_script(handler->name, "net", netdev->upscript, "up",
2748 "veth", veth1, (char*) NULL);
2749 if (err)
2750 goto out_delete;
2751 }
2752
2753 DEBUG("instanciated veth '%s/%s', index is '%d'",
2754 veth1, veth2, netdev->ifindex);
2755
2756 return 0;
2757
2758 out_delete:
2759 lxc_netdev_delete_by_name(veth1);
2760 if (!netdev->priv.veth_attr.pair && veth1)
2761 free(veth1);
2762 if(veth2)
2763 free(veth2);
2764 return -1;
2765 }
2766
2767 static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
2768 {
2769 char *veth1;
2770 int err;
2771
2772 if (netdev->priv.veth_attr.pair)
2773 veth1 = netdev->priv.veth_attr.pair;
2774 else
2775 veth1 = netdev->priv.veth_attr.veth1;
2776
2777 if (netdev->downscript) {
2778 err = run_script(handler->name, "net", netdev->downscript,
2779 "down", "veth", veth1, (char*) NULL);
2780 if (err)
2781 return -1;
2782 }
2783 return 0;
2784 }
2785
2786 static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2787 {
2788 char peerbuf[IFNAMSIZ], *peer;
2789 int err;
2790
2791 if (!netdev->link) {
2792 ERROR("no link specified for macvlan netdev");
2793 return -1;
2794 }
2795
2796 err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
2797 if (err >= sizeof(peerbuf))
2798 return -1;
2799
2800 peer = lxc_mkifname(peerbuf);
2801 if (!peer) {
2802 ERROR("failed to make a temporary name");
2803 return -1;
2804 }
2805
2806 err = lxc_macvlan_create(netdev->link, peer,
2807 netdev->priv.macvlan_attr.mode);
2808 if (err) {
2809 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2810 peer, netdev->link, strerror(-err));
2811 goto out;
2812 }
2813
2814 netdev->ifindex = if_nametoindex(peer);
2815 if (!netdev->ifindex) {
2816 ERROR("failed to retrieve the index for %s", peer);
2817 goto out;
2818 }
2819
2820 if (netdev->upscript) {
2821 err = run_script(handler->name, "net", netdev->upscript, "up",
2822 "macvlan", netdev->link, (char*) NULL);
2823 if (err)
2824 goto out;
2825 }
2826
2827 DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
2828 peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);
2829
2830 return 0;
2831 out:
2832 lxc_netdev_delete_by_name(peer);
2833 free(peer);
2834 return -1;
2835 }
2836
2837 static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2838 {
2839 int err;
2840
2841 if (netdev->downscript) {
2842 err = run_script(handler->name, "net", netdev->downscript,
2843 "down", "macvlan", netdev->link,
2844 (char*) NULL);
2845 if (err)
2846 return -1;
2847 }
2848 return 0;
2849 }
2850
2851 /* XXX: merge with instanciate_macvlan */
2852 static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
2853 {
2854 char peer[IFNAMSIZ];
2855 int err;
2856
2857 if (!netdev->link) {
2858 ERROR("no link specified for vlan netdev");
2859 return -1;
2860 }
2861
2862 err = snprintf(peer, sizeof(peer), "vlan%d", netdev->priv.vlan_attr.vid);
2863 if (err >= sizeof(peer)) {
2864 ERROR("peer name too long");
2865 return -1;
2866 }
2867
2868 err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
2869 if (err) {
2870 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2871 peer, netdev->link, strerror(-err));
2872 return -1;
2873 }
2874
2875 netdev->ifindex = if_nametoindex(peer);
2876 if (!netdev->ifindex) {
2877 ERROR("failed to retrieve the ifindex for %s", peer);
2878 lxc_netdev_delete_by_name(peer);
2879 return -1;
2880 }
2881
2882 DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000",
2883 netdev->ifindex);
2884
2885 return 0;
2886 }
2887
/* shutdown_vlan: nothing to undo for a vlan device; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2892
2893 static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2894 {
2895 if (!netdev->link) {
2896 ERROR("no link specified for the physical interface");
2897 return -1;
2898 }
2899
2900 netdev->ifindex = if_nametoindex(netdev->link);
2901 if (!netdev->ifindex) {
2902 ERROR("failed to retrieve the index for %s", netdev->link);
2903 return -1;
2904 }
2905
2906 if (netdev->upscript) {
2907 int err;
2908 err = run_script(handler->name, "net", netdev->upscript,
2909 "up", "phys", netdev->link, (char*) NULL);
2910 if (err)
2911 return -1;
2912 }
2913
2914 return 0;
2915 }
2916
2917 static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
2918 {
2919 int err;
2920
2921 if (netdev->downscript) {
2922 err = run_script(handler->name, "net", netdev->downscript,
2923 "down", "phys", netdev->link, (char*) NULL);
2924 if (err)
2925 return -1;
2926 }
2927 return 0;
2928 }
2929
2930 static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
2931 {
2932 netdev->ifindex = 0;
2933 return 0;
2934 }
2935
2936 static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2937 {
2938 netdev->ifindex = 0;
2939 if (netdev->upscript) {
2940 int err;
2941 err = run_script(handler->name, "net", netdev->upscript,
2942 "up", "empty", (char*) NULL);
2943 if (err)
2944 return -1;
2945 }
2946 return 0;
2947 }
2948
2949 static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
2950 {
2951 int err;
2952
2953 if (netdev->downscript) {
2954 err = run_script(handler->name, "net", netdev->downscript,
2955 "down", "empty", (char*) NULL);
2956 if (err)
2957 return -1;
2958 }
2959 return 0;
2960 }
2961
/* shutdown_none: nothing to undo for a "none" entry; always returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
2966
2967 int lxc_requests_empty_network(struct lxc_handler *handler)
2968 {
2969 struct lxc_list *network = &handler->conf->network;
2970 struct lxc_list *iterator;
2971 struct lxc_netdev *netdev;
2972 bool found_none = false, found_nic = false;
2973
2974 if (lxc_list_empty(network))
2975 return 0;
2976
2977 lxc_list_for_each(iterator, network) {
2978
2979 netdev = iterator->elem;
2980
2981 if (netdev->type == LXC_NET_NONE)
2982 found_none = true;
2983 else
2984 found_nic = true;
2985 }
2986 if (found_none && !found_nic)
2987 return 1;
2988 return 0;
2989 }
2990
2991 int lxc_create_network(struct lxc_handler *handler)
2992 {
2993 struct lxc_list *network = &handler->conf->network;
2994 struct lxc_list *iterator;
2995 struct lxc_netdev *netdev;
2996 int am_root = (getuid() == 0);
2997
2998 if (!am_root)
2999 return 0;
3000
3001 lxc_list_for_each(iterator, network) {
3002
3003 netdev = iterator->elem;
3004
3005 if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) {
3006 ERROR("invalid network configuration type '%d'",
3007 netdev->type);
3008 return -1;
3009 }
3010
3011 if (netdev_conf[netdev->type](handler, netdev)) {
3012 ERROR("failed to create netdev");
3013 return -1;
3014 }
3015
3016 }
3017
3018 return 0;
3019 }
3020
3021 void lxc_delete_network(struct lxc_handler *handler)
3022 {
3023 struct lxc_list *network = &handler->conf->network;
3024 struct lxc_list *iterator;
3025 struct lxc_netdev *netdev;
3026
3027 lxc_list_for_each(iterator, network) {
3028 netdev = iterator->elem;
3029
3030 if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) {
3031 if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link))
3032 WARN("failed to rename to the initial name the " \
3033 "netdev '%s'", netdev->link);
3034 continue;
3035 }
3036
3037 if (netdev_deconf[netdev->type](handler, netdev)) {
3038 WARN("failed to destroy netdev");
3039 }
3040
3041 /* Recent kernel remove the virtual interfaces when the network
3042 * namespace is destroyed but in case we did not moved the
3043 * interface to the network namespace, we have to destroy it
3044 */
3045 if (netdev->ifindex != 0 &&
3046 lxc_netdev_delete_by_index(netdev->ifindex))
3047 WARN("failed to remove interface '%s'", netdev->name);
3048 }
3049 }
3050
3051 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3052
3053 /* lxc-user-nic returns "interface_name:interface_name\n" */
3054 #define MAX_BUFFER_SIZE IFNAMSIZ*2 + 2
3055 static int unpriv_assign_nic(struct lxc_netdev *netdev, pid_t pid)
3056 {
3057 pid_t child;
3058 int bytes, pipefd[2];
3059 char *token, *saveptr = NULL;
3060 char buffer[MAX_BUFFER_SIZE];
3061
3062 if (netdev->type != LXC_NET_VETH) {
3063 ERROR("nic type %d not support for unprivileged use",
3064 netdev->type);
3065 return -1;
3066 }
3067
3068 if(pipe(pipefd) < 0) {
3069 SYSERROR("pipe failed");
3070 return -1;
3071 }
3072
3073 if ((child = fork()) < 0) {
3074 SYSERROR("fork");
3075 close(pipefd[0]);
3076 close(pipefd[1]);
3077 return -1;
3078 }
3079
3080 if (child == 0) { // child
3081 /* close the read-end of the pipe */
3082 close(pipefd[0]);
3083 /* redirect the stdout to write-end of the pipe */
3084 dup2(pipefd[1], STDOUT_FILENO);
3085 /* close the write-end of the pipe */
3086 close(pipefd[1]);
3087
3088 // Call lxc-user-nic pid type bridge
3089 char pidstr[20];
3090 char *args[] = {LXC_USERNIC_PATH, pidstr, "veth", netdev->link, netdev->name, NULL };
3091 snprintf(pidstr, 19, "%lu", (unsigned long) pid);
3092 pidstr[19] = '\0';
3093 execvp(args[0], args);
3094 SYSERROR("execvp lxc-user-nic");
3095 exit(1);
3096 }
3097
3098 /* close the write-end of the pipe */
3099 close(pipefd[1]);
3100
3101 bytes = read(pipefd[0], &buffer, MAX_BUFFER_SIZE);
3102 if (bytes < 0) {
3103 SYSERROR("read failed");
3104 }
3105 buffer[bytes - 1] = '\0';
3106
3107 if (wait_for_pid(child) != 0) {
3108 close(pipefd[0]);
3109 return -1;
3110 }
3111
3112 /* close the read-end of the pipe */
3113 close(pipefd[0]);
3114
3115 /* fill netdev->name field */
3116 token = strtok_r(buffer, ":", &saveptr);
3117 if (!token)
3118 return -1;
3119 netdev->name = malloc(IFNAMSIZ+1);
3120 if (!netdev->name) {
3121 ERROR("Out of memory");
3122 return -1;
3123 }
3124 memset(netdev->name, 0, IFNAMSIZ+1);
3125 strncpy(netdev->name, token, IFNAMSIZ);
3126
3127 /* fill netdev->veth_attr.pair field */
3128 token = strtok_r(NULL, ":", &saveptr);
3129 if (!token)
3130 return -1;
3131 netdev->priv.veth_attr.pair = strdup(token);
3132 if (!netdev->priv.veth_attr.pair) {
3133 ERROR("Out of memory");
3134 return -1;
3135 }
3136
3137 return 0;
3138 }
3139
3140 int lxc_assign_network(struct lxc_list *network, pid_t pid)
3141 {
3142 struct lxc_list *iterator;
3143 struct lxc_netdev *netdev;
3144 int am_root = (getuid() == 0);
3145 int err;
3146
3147 lxc_list_for_each(iterator, network) {
3148
3149 netdev = iterator->elem;
3150
3151 if (netdev->type == LXC_NET_VETH && !am_root) {
3152 if (unpriv_assign_nic(netdev, pid))
3153 return -1;
3154 // lxc-user-nic has moved the nic to the new ns.
3155 // unpriv_assign_nic() fills in netdev->name.
3156 // netdev->ifindex will be filed in at setup_netdev.
3157 continue;
3158 }
3159
3160 /* empty network namespace, nothing to move */
3161 if (!netdev->ifindex)
3162 continue;
3163
3164 err = lxc_netdev_move_by_index(netdev->ifindex, pid);
3165 if (err) {
3166 ERROR("failed to move '%s' to the container : %s",
3167 netdev->link, strerror(-err));
3168 return -1;
3169 }
3170
3171 DEBUG("move '%s' to '%d'", netdev->name, pid);
3172 }
3173
3174 return 0;
3175 }
3176
3177 static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
3178 size_t buf_size)
3179 {
3180 char path[PATH_MAX];
3181 int ret, closeret;
3182 FILE *f;
3183
3184 ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
3185 if (ret < 0 || ret >= PATH_MAX) {
3186 fprintf(stderr, "%s: path name too long\n", __func__);
3187 return -E2BIG;
3188 }
3189 f = fopen(path, "w");
3190 if (!f) {
3191 perror("open");
3192 return -EINVAL;
3193 }
3194 ret = fwrite(buf, buf_size, 1, f);
3195 if (ret < 0)
3196 SYSERROR("writing id mapping");
3197 closeret = fclose(f);
3198 if (closeret)
3199 SYSERROR("writing id mapping");
3200 return ret < 0 ? ret : closeret;
3201 }
3202
3203 int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
3204 {
3205 struct lxc_list *iterator;
3206 struct id_map *map;
3207 int ret = 0;
3208 enum idtype type;
3209 char *buf = NULL, *pos;
3210 int use_shadow = (on_path("newuidmap") && on_path("newuidmap"));
3211
3212 if (!use_shadow && geteuid()) {
3213 ERROR("Missing newuidmap/newgidmap");
3214 return -1;
3215 }
3216
3217 for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
3218 int left, fill;
3219 int had_entry = 0;
3220 if (!buf) {
3221 buf = pos = malloc(4096);
3222 if (!buf)
3223 return -ENOMEM;
3224 }
3225 pos = buf;
3226 if (use_shadow)
3227 pos += sprintf(buf, "new%cidmap %d",
3228 type == ID_TYPE_UID ? 'u' : 'g',
3229 pid);
3230
3231 lxc_list_for_each(iterator, idmap) {
3232 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
3233 map = iterator->elem;
3234 if (map->idtype != type)
3235 continue;
3236
3237 had_entry = 1;
3238 left = 4096 - (pos - buf);
3239 fill = snprintf(pos, left, "%s%lu %lu %lu%s",
3240 use_shadow ? " " : "",
3241 map->nsid, map->hostid, map->range,
3242 use_shadow ? "" : "\n");
3243 if (fill <= 0 || fill >= left)
3244 SYSERROR("snprintf failed, too many mappings");
3245 pos += fill;
3246 }
3247 if (!had_entry)
3248 continue;
3249
3250 if (!use_shadow) {
3251 ret = write_id_mapping(type, pid, buf, pos-buf);
3252 } else {
3253 left = 4096 - (pos - buf);
3254 fill = snprintf(pos, left, "\n");
3255 if (fill <= 0 || fill >= left)
3256 SYSERROR("snprintf failed, too many mappings");
3257 pos += fill;
3258 ret = system(buf);
3259 }
3260
3261 if (ret)
3262 break;
3263 }
3264
3265 if (buf)
3266 free(buf);
3267 return ret;
3268 }
3269
3270 /*
3271 * return the host uid to which the container root is mapped in *val.
3272 * Return true if id was found, false otherwise.
3273 */
3274 bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
3275 unsigned long *val)
3276 {
3277 struct lxc_list *it;
3278 struct id_map *map;
3279
3280 lxc_list_for_each(it, &conf->id_map) {
3281 map = it->elem;
3282 if (map->idtype != ID_TYPE_UID)
3283 continue;
3284 if (map->nsid != 0)
3285 continue;
3286 *val = map->hostid;
3287 return true;
3288 }
3289 return false;
3290 }
3291
3292 int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
3293 {
3294 struct lxc_list *it;
3295 struct id_map *map;
3296 lxc_list_for_each(it, &conf->id_map) {
3297 map = it->elem;
3298 if (map->idtype != idtype)
3299 continue;
3300 if (id >= map->hostid && id < map->hostid + map->range)
3301 return (id - map->hostid) + map->nsid;
3302 }
3303 return -1;
3304 }
3305
3306 int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
3307 {
3308 struct lxc_list *it;
3309 struct id_map *map;
3310 unsigned int freeid = 0;
3311 again:
3312 lxc_list_for_each(it, &conf->id_map) {
3313 map = it->elem;
3314 if (map->idtype != idtype)
3315 continue;
3316 if (freeid >= map->nsid && freeid < map->nsid + map->range) {
3317 freeid = map->nsid + map->range;
3318 goto again;
3319 }
3320 }
3321 return freeid;
3322 }
3323
3324 int lxc_find_gateway_addresses(struct lxc_handler *handler)
3325 {
3326 struct lxc_list *network = &handler->conf->network;
3327 struct lxc_list *iterator;
3328 struct lxc_netdev *netdev;
3329 int link_index;
3330
3331 lxc_list_for_each(iterator, network) {
3332 netdev = iterator->elem;
3333
3334 if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto)
3335 continue;
3336
3337 if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) {
3338 ERROR("gateway = auto only supported for "
3339 "veth and macvlan");
3340 return -1;
3341 }
3342
3343 if (!netdev->link) {
3344 ERROR("gateway = auto needs a link interface");
3345 return -1;
3346 }
3347
3348 link_index = if_nametoindex(netdev->link);
3349 if (!link_index)
3350 return -EINVAL;
3351
3352 if (netdev->ipv4_gateway_auto) {
3353 if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) {
3354 ERROR("failed to automatically find ipv4 gateway "
3355 "address from link interface '%s'", netdev->link);
3356 return -1;
3357 }
3358 }
3359
3360 if (netdev->ipv6_gateway_auto) {
3361 if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) {
3362 ERROR("failed to automatically find ipv6 gateway "
3363 "address from link interface '%s'", netdev->link);
3364 return -1;
3365 }
3366 }
3367 }
3368
3369 return 0;
3370 }
3371
3372 int lxc_create_tty(const char *name, struct lxc_conf *conf)
3373 {
3374 struct lxc_tty_info *tty_info = &conf->tty_info;
3375 int i, ret;
3376
3377 /* no tty in the configuration */
3378 if (!conf->tty)
3379 return 0;
3380
3381 tty_info->pty_info =
3382 malloc(sizeof(*tty_info->pty_info)*conf->tty);
3383 if (!tty_info->pty_info) {
3384 SYSERROR("failed to allocate pty_info");
3385 return -1;
3386 }
3387
3388 for (i = 0; i < conf->tty; i++) {
3389
3390 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3391
3392 process_lock();
3393 ret = openpty(&pty_info->master, &pty_info->slave,
3394 pty_info->name, NULL, NULL);
3395 process_unlock();
3396 if (ret) {
3397 SYSERROR("failed to create pty #%d", i);
3398 tty_info->nbtty = i;
3399 lxc_delete_tty(tty_info);
3400 return -1;
3401 }
3402
3403 DEBUG("allocated pty '%s' (%d/%d)",
3404 pty_info->name, pty_info->master, pty_info->slave);
3405
3406 /* Prevent leaking the file descriptors to the container */
3407 fcntl(pty_info->master, F_SETFD, FD_CLOEXEC);
3408 fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC);
3409
3410 pty_info->busy = 0;
3411 }
3412
3413 tty_info->nbtty = conf->tty;
3414
3415 INFO("tty's configured");
3416
3417 return 0;
3418 }
3419
3420 void lxc_delete_tty(struct lxc_tty_info *tty_info)
3421 {
3422 int i;
3423
3424 for (i = 0; i < tty_info->nbtty; i++) {
3425 struct lxc_pty_info *pty_info = &tty_info->pty_info[i];
3426
3427 close(pty_info->master);
3428 close(pty_info->slave);
3429 }
3430
3431 free(tty_info->pty_info);
3432 tty_info->nbtty = 0;
3433 }
3434
3435 /*
3436 * chown_mapped_root: for an unprivileged user with uid X to chown a dir
3437 * to subuid Y, he needs to run chown as root in a userns where
3438 * nsid 0 is mapped to hostuid Y, and nsid Y is mapped to hostuid
3439 * X. That way, the container root is privileged with respect to
3440 * hostuid X, allowing him to do the chown.
3441 */
3442 int chown_mapped_root(char *path, struct lxc_conf *conf)
3443 {
3444 uid_t rootid;
3445 pid_t pid;
3446 unsigned long val;
3447 char *chownpath = path;
3448
3449 if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
3450 ERROR("No mapping for container root");
3451 return -1;
3452 }
3453 rootid = (uid_t) val;
3454
3455 /*
3456 * In case of overlay, we want only the writeable layer
3457 * to be chowned
3458 */
3459 if (strncmp(path, "overlayfs:", 10) == 0 || strncmp(path, "aufs:", 5) == 0) {
3460 chownpath = strchr(path, ':');
3461 if (!chownpath) {
3462 ERROR("Bad overlay path: %s", path);
3463 return -1;
3464 }
3465 chownpath = strchr(chownpath+1, ':');
3466 if (!chownpath) {
3467 ERROR("Bad overlay path: %s", path);
3468 return -1;
3469 }
3470 chownpath++;
3471 }
3472 path = chownpath;
3473 if (geteuid() == 0) {
3474 if (chown(path, rootid, -1) < 0) {
3475 ERROR("Error chowning %s", path);
3476 return -1;
3477 }
3478 return 0;
3479 }
3480
3481 if (rootid == geteuid()) {
3482 // nothing to do
3483 INFO("%s: container root is our uid; no need to chown" ,__func__);
3484 return 0;
3485 }
3486
3487 pid = fork();
3488 if (pid < 0) {
3489 SYSERROR("Failed forking");
3490 return -1;
3491 }
3492 if (!pid) {
3493 int hostuid = geteuid(), ret;
3494 char map1[100], map2[100], map3[100];
3495 char *args[] = {"lxc-usernsexec", "-m", map1, "-m", map2, "-m",
3496 map3, "--", "chown", "0", path, NULL};
3497
3498 // "u:0:rootid:1"
3499 ret = snprintf(map1, 100, "u:0:%d:1", rootid);
3500 if (ret < 0 || ret >= 100) {
3501 ERROR("Error uid printing map string");
3502 return -1;
3503 }
3504
3505 // "u:hostuid:hostuid:1"
3506 ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid);
3507 if (ret < 0 || ret >= 100) {
3508 ERROR("Error uid printing map string");
3509 return -1;
3510 }
3511
3512 // "g:0:hostgid:1"
3513 ret = snprintf(map3, 100, "g:0:%d:1", getgid());
3514 if (ret < 0 || ret >= 100) {
3515 ERROR("Error uid printing map string");
3516 return -1;
3517 }
3518
3519 ret = execvp("lxc-usernsexec", args);
3520 SYSERROR("Failed executing usernsexec");
3521 exit(1);
3522 }
3523 return wait_for_pid(pid);
3524 }
3525
3526 int ttys_shift_ids(struct lxc_conf *c)
3527 {
3528 int i;
3529
3530 if (lxc_list_empty(&c->id_map))
3531 return 0;
3532
3533 for (i = 0; i < c->tty_info.nbtty; i++) {
3534 struct lxc_pty_info *pty_info = &c->tty_info.pty_info[i];
3535
3536 if (chown_mapped_root(pty_info->name, c) < 0) {
3537 ERROR("Failed to chown %s", pty_info->name);
3538 return -1;
3539 }
3540 }
3541
3542 if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
3543 ERROR("Failed to chown %s", c->console.name);
3544 return -1;
3545 }
3546
3547 return 0;
3548 }
3549
3550 /*
3551 * This routine is called when the configuration does not already specify a value
3552 * for autodev (mounting a file system on /dev and populating it in a container).
 * If a hard override value has not been specified, then we try to apply some
3554 * heuristics to determine if we should switch to autodev mode.
3555 *
3556 * For instance, if the container has an /etc/systemd/system directory then it
3557 * is probably running systemd as the init process and it needs the autodev
 * mount to prevent it from mounting devtmpfs on /dev on its own causing conflicts
3559 * in the host.
3560 *
3561 * We may also want to enable autodev if the host has devtmpfs mounted on its
 * /dev as this then enables us to use subdirectories under /dev for the container
3563 * /dev directories and we can fake udev devices.
3564 */
/* Argument block read by check_autodev(): argv[0] is the command to run. */
struct start_args {
	char *const *argv;
};

/* Upper bound on the number of symlinks check_autodev() follows. */
#define MAX_SYMLINK_DEPTH 32

/*
 * check_autodev: heuristic deciding whether autodev should be enabled.
 *
 * @rootfs : root file system of the container
 * @data   : optional struct start_args; argv[0] replaces the default
 *           command "/sbin/init"
 *
 * The command path is checked for the substring "systemd"; it is then
 * resolved as a symlink below the canonical @rootfs and the check is
 * repeated on each link target, for at most MAX_SYMLINK_DEPTH links.
 *
 * Returns 1 if autodev should be used, 0 if it is not required and -2
 * when no decision can be made.
 */
static int check_autodev( const char *rootfs, void *data )
{
	struct start_args *arg = data;
	int ret;
	int loop_count = 0;
	struct stat s;
	char absrootfs[MAXPATHLEN];
	char path[MAXPATHLEN];
	char abs_path[MAXPATHLEN];
	char *command = "/sbin/init";

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if( arg && arg->argv[0] ) {
		command = arg->argv[0];
		DEBUG("Set exec command to %s", command );
	}

	/* bounded copy that, unlike strncpy(), always NUL-terminates */
	snprintf( path, sizeof(path), "%s", command );

	/* NOTE(review): this tests the path as seen from the caller's root,
	 * not below absrootfs - confirm this is intended */
	if ( 0 != access(path, F_OK) || 0 != stat(path, &s) )
		return -2;

	/* Dereference down the symlink merry path testing as we go. */
	/* If anything references systemd in the path - set autodev! */
	/* Renormalize to the rootfs before each dereference */
	/* Relative symlinks should fall out in the wash even with .. */
	while( 1 ) {
		if ( strstr( path, "systemd" ) ) {
			INFO("Container with systemd init detected - enabling autodev!");
			return 1;
		}

		ret = snprintf(abs_path, sizeof(abs_path), "%s/%s", absrootfs, path);
		/* a result >= the buffer size means the path was truncated */
		if (ret < 0 || (size_t)ret >= sizeof(abs_path))
			return -2;

		ret = readlink( abs_path, path, MAXPATHLEN-1 );

		if ( ( ret <= 0 ) || ( ++loop_count > MAX_SYMLINK_DEPTH ) ) {
			break; /* Break out for other tests */
		}
		/* readlink() does not terminate the string itself */
		path[ret] = '\0';
	}

	/*
	 * Add future checks here.
	 * Return positive if we should go autodev
	 * Return 0 if we should NOT go autodev
	 * Return negative if we encounter an error or can not determine...
	 */

	/* All else fails, we don't need autodev */
	INFO("Autodev not required.");
	return 0;
}
3631
/*
 * do_tmp_proc_mount: Mount /proc inside container if not already
 * mounted
 *
 * @rootfs : the rootfs where proc should be mounted
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, ret;

	ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (ret < 0 || ret >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	/*
	 * readlink() does not NUL-terminate its result. Zero the buffer and
	 * hand out one byte less than its size so 'link' is always a valid
	 * string, even for an unexpectedly long target.
	 */
	memset(link, 0, sizeof(link));
	linklen = readlink(path, link, sizeof(link) - 1);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);

	/* shorter than the "%s/proc/self" string that fit above */
	ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
	if (linklen < 0) /* /proc not mounted */
		goto domount;

	/*
	 * Only a /proc whose "self" link reads "1" is accepted (presumably
	 * because the caller is pid 1 of its pid namespace - confirm).
	 */
	if (strcmp(link, "1") != 0) {
		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
		goto domount;
	}

	/* the right proc is already mounted */
	return 0;

domount:
	if (mount("proc", path, "proc", 0, NULL))
		return -1;
	INFO("Mounted /proc in container for security transition");
	return 1;
}
3673
3674 int tmp_proc_mount(struct lxc_conf *lxc_conf)
3675 {
3676 int mounted;
3677
3678 if (lxc_conf->rootfs.path == NULL || strlen(lxc_conf->rootfs.path) == 0) {
3679 if (mount("proc", "/proc", "proc", 0, NULL)) {
3680 SYSERROR("Failed mounting /proc, proceeding");
3681 mounted = 0;
3682 } else
3683 mounted = 1;
3684 } else
3685 mounted = do_tmp_proc_mount(lxc_conf->rootfs.mount);
3686 if (mounted == -1) {
3687 SYSERROR("failed to mount /proc in the container.");
3688 return -1;
3689 } else if (mounted == 1) {
3690 lxc_conf->tmp_umount_proc = 1;
3691 }
3692 return 0;
3693 }
3694
3695 void tmp_proc_unmount(struct lxc_conf *lxc_conf)
3696 {
3697 if (lxc_conf->tmp_umount_proc == 1) {
3698 umount("/proc");
3699 lxc_conf->tmp_umount_proc = 0;
3700 }
3701 }
3702
3703 int lxc_setup(struct lxc_handler *handler)
3704 {
3705 const char *name = handler->name;
3706 struct lxc_conf *lxc_conf = handler->conf;
3707 const char *lxcpath = handler->lxcpath;
3708 void *data = handler->data;
3709
3710 if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
3711 if (setup_utsname(lxc_conf->utsname)) {
3712 ERROR("failed to setup the utsname for '%s'", name);
3713 return -1;
3714 }
3715 }
3716
3717 if (setup_network(&lxc_conf->network)) {
3718 ERROR("failed to setup the network for '%s'", name);
3719 return -1;
3720 }
3721
3722 if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) {
3723 ERROR("failed to run pre-mount hooks for container '%s'.", name);
3724 return -1;
3725 }
3726
3727 if (setup_rootfs(lxc_conf)) {
3728 ERROR("failed to setup rootfs for '%s'", name);
3729 return -1;
3730 }
3731
3732 if (lxc_conf->autodev < 0) {
3733 lxc_conf->autodev = check_autodev(lxc_conf->rootfs.mount, data);
3734 }
3735
3736 if (lxc_conf->autodev > 0) {
3737 if (mount_autodev(name, lxc_conf->rootfs.mount, lxcpath)) {
3738 ERROR("failed to mount /dev in the container");
3739 return -1;
3740 }
3741 }
3742
3743 /* do automatic mounts (mainly /proc and /sys), but exclude
3744 * those that need to wait until other stuff has finished
3745 */
3746 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
3747 ERROR("failed to setup the automatic mounts for '%s'", name);
3748 return -1;
3749 }
3750
3751 if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
3752 ERROR("failed to setup the mounts for '%s'", name);
3753 return -1;
3754 }
3755
3756 if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
3757 ERROR("failed to setup the mount entries for '%s'", name);
3758 return -1;
3759 }
3760
3761 /* now mount only cgroup, if wanted;
3762 * before, /sys could not have been mounted
3763 * (is either mounted automatically or via fstab entries)
3764 */
3765 if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
3766 ERROR("failed to setup the automatic mounts for '%s'", name);
3767 return -1;
3768 }
3769
3770 if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
3771 ERROR("failed to run mount hooks for container '%s'.", name);
3772 return -1;
3773 }
3774
3775 if (lxc_conf->autodev > 0) {
3776 if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
3777 ERROR("failed to run autodev hooks for container '%s'.", name);
3778 return -1;
3779 }
3780 if (setup_autodev(lxc_conf->rootfs.mount)) {
3781 ERROR("failed to populate /dev in the container");
3782 return -1;
3783 }
3784 }
3785
3786 if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
3787 ERROR("failed to setup the console for '%s'", name);
3788 return -1;
3789 }
3790
3791 if (lxc_conf->kmsg) {
3792 if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
3793 ERROR("failed to setup kmsg for '%s'", name);
3794 }
3795
3796 if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
3797 ERROR("failed to setup the ttys for '%s'", name);
3798 return -1;
3799 }
3800
3801 if (!lxc_conf->is_execute && setup_dev_symlinks(&lxc_conf->rootfs)) {
3802 ERROR("failed to setup /dev symlinks for '%s'", name);
3803 return -1;
3804 }
3805
3806 /* mount /proc if it's not already there */
3807 if (tmp_proc_mount(lxc_conf) < 0) {
3808 ERROR("failed to LSM mount proc for '%s'", name);
3809 return -1;
3810 }
3811
3812 if (setup_pivot_root(&lxc_conf->rootfs)) {
3813 ERROR("failed to set rootfs for '%s'", name);
3814 return -1;
3815 }
3816
3817 if (setup_pts(lxc_conf->pts)) {
3818 ERROR("failed to setup the new pts instance");
3819 return -1;
3820 }
3821
3822 if (setup_personality(lxc_conf->personality)) {
3823 ERROR("failed to setup personality");
3824 return -1;
3825 }
3826
3827 if (lxc_list_empty(&lxc_conf->id_map)) {
3828 if (!lxc_list_empty(&lxc_conf->keepcaps)) {
3829 if (!lxc_list_empty(&lxc_conf->caps)) {
3830 ERROR("Simultaneously requested dropping and keeping caps");
3831 return -1;
3832 }
3833 if (dropcaps_except(&lxc_conf->keepcaps)) {
3834 ERROR("failed to keep requested caps");
3835 return -1;
3836 }
3837 } else if (setup_caps(&lxc_conf->caps)) {
3838 ERROR("failed to drop capabilities");
3839 return -1;
3840 }
3841 }
3842
3843 NOTICE("'%s' is setup.", name);
3844
3845 return 0;
3846 }
3847
3848 int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
3849 const char *lxcpath, char *argv[])
3850 {
3851 int which = -1;
3852 struct lxc_list *it;
3853
3854 if (strcmp(hook, "pre-start") == 0)
3855 which = LXCHOOK_PRESTART;
3856 else if (strcmp(hook, "pre-mount") == 0)
3857 which = LXCHOOK_PREMOUNT;
3858 else if (strcmp(hook, "mount") == 0)
3859 which = LXCHOOK_MOUNT;
3860 else if (strcmp(hook, "autodev") == 0)
3861 which = LXCHOOK_AUTODEV;
3862 else if (strcmp(hook, "start") == 0)
3863 which = LXCHOOK_START;
3864 else if (strcmp(hook, "post-stop") == 0)
3865 which = LXCHOOK_POSTSTOP;
3866 else if (strcmp(hook, "clone") == 0)
3867 which = LXCHOOK_CLONE;
3868 else
3869 return -1;
3870 lxc_list_for_each(it, &conf->hooks[which]) {
3871 int ret;
3872 char *hookname = it->elem;
3873 ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv);
3874 if (ret)
3875 return ret;
3876 }
3877 return 0;
3878 }
3879
3880 static void lxc_remove_nic(struct lxc_list *it)
3881 {
3882 struct lxc_netdev *netdev = it->elem;
3883 struct lxc_list *it2,*next;
3884
3885 lxc_list_del(it);
3886
3887 if (netdev->link)
3888 free(netdev->link);
3889 if (netdev->name)
3890 free(netdev->name);
3891 if (netdev->type == LXC_NET_VETH && netdev->priv.veth_attr.pair)
3892 free(netdev->priv.veth_attr.pair);
3893 if (netdev->upscript)
3894 free(netdev->upscript);
3895 if (netdev->hwaddr)
3896 free(netdev->hwaddr);
3897 if (netdev->mtu)
3898 free(netdev->mtu);
3899 if (netdev->ipv4_gateway)
3900 free(netdev->ipv4_gateway);
3901 if (netdev->ipv6_gateway)
3902 free(netdev->ipv6_gateway);
3903 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
3904 lxc_list_del(it2);
3905 free(it2->elem);
3906 free(it2);
3907 }
3908 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
3909 lxc_list_del(it2);
3910 free(it2->elem);
3911 free(it2);
3912 }
3913 free(netdev);
3914 free(it);
3915 }
3916
3917 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
3918 int lxc_clear_nic(struct lxc_conf *c, const char *key)
3919 {
3920 char *p1;
3921 int ret, idx, i;
3922 struct lxc_list *it;
3923 struct lxc_netdev *netdev;
3924
3925 p1 = index(key, '.');
3926 if (!p1 || *(p1+1) == '\0')
3927 p1 = NULL;
3928
3929 ret = sscanf(key, "%d", &idx);
3930 if (ret != 1) return -1;
3931 if (idx < 0)
3932 return -1;
3933
3934 i = 0;
3935 lxc_list_for_each(it, &c->network) {
3936 if (i == idx)
3937 break;
3938 i++;
3939 }
3940 if (i < idx) // we don't have that many nics defined
3941 return -1;
3942
3943 if (!it || !it->elem)
3944 return -1;
3945
3946 netdev = it->elem;
3947
3948 if (!p1) {
3949 lxc_remove_nic(it);
3950 } else if (strcmp(p1, ".ipv4") == 0) {
3951 struct lxc_list *it2,*next;
3952 lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
3953 lxc_list_del(it2);
3954 free(it2->elem);
3955 free(it2);
3956 }
3957 } else if (strcmp(p1, ".ipv6") == 0) {
3958 struct lxc_list *it2,*next;
3959 lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
3960 lxc_list_del(it2);
3961 free(it2->elem);
3962 free(it2);
3963 }
3964 } else if (strcmp(p1, ".link") == 0) {
3965 if (netdev->link) {
3966 free(netdev->link);
3967 netdev->link = NULL;
3968 }
3969 } else if (strcmp(p1, ".name") == 0) {
3970 if (netdev->name) {
3971 free(netdev->name);
3972 netdev->name = NULL;
3973 }
3974 } else if (strcmp(p1, ".script.up") == 0) {
3975 if (netdev->upscript) {
3976 free(netdev->upscript);
3977 netdev->upscript = NULL;
3978 }
3979 } else if (strcmp(p1, ".hwaddr") == 0) {
3980 if (netdev->hwaddr) {
3981 free(netdev->hwaddr);
3982 netdev->hwaddr = NULL;
3983 }
3984 } else if (strcmp(p1, ".mtu") == 0) {
3985 if (netdev->mtu) {
3986 free(netdev->mtu);
3987 netdev->mtu = NULL;
3988 }
3989 } else if (strcmp(p1, ".ipv4_gateway") == 0) {
3990 if (netdev->ipv4_gateway) {
3991 free(netdev->ipv4_gateway);
3992 netdev->ipv4_gateway = NULL;
3993 }
3994 } else if (strcmp(p1, ".ipv6_gateway") == 0) {
3995 if (netdev->ipv6_gateway) {
3996 free(netdev->ipv6_gateway);
3997 netdev->ipv6_gateway = NULL;
3998 }
3999 }
4000 else return -1;
4001
4002 return 0;
4003 }
4004
4005 int lxc_clear_config_network(struct lxc_conf *c)
4006 {
4007 struct lxc_list *it,*next;
4008 lxc_list_for_each_safe(it, &c->network, next) {
4009 lxc_remove_nic(it);
4010 }
4011 return 0;
4012 }
4013
4014 int lxc_clear_config_caps(struct lxc_conf *c)
4015 {
4016 struct lxc_list *it,*next;
4017
4018 lxc_list_for_each_safe(it, &c->caps, next) {
4019 lxc_list_del(it);
4020 free(it->elem);
4021 free(it);
4022 }
4023 return 0;
4024 }
4025
4026 static int lxc_free_idmap(struct lxc_list *id_map) {
4027 struct lxc_list *it, *next;
4028
4029 lxc_list_for_each_safe(it, id_map, next) {
4030 lxc_list_del(it);
4031 free(it->elem);
4032 free(it);
4033 }
4034 return 0;
4035 }
4036
4037 int lxc_clear_idmaps(struct lxc_conf *c)
4038 {
4039 return lxc_free_idmap(&c->id_map);
4040 }
4041
4042 int lxc_clear_config_keepcaps(struct lxc_conf *c)
4043 {
4044 struct lxc_list *it,*next;
4045
4046 lxc_list_for_each_safe(it, &c->keepcaps, next) {
4047 lxc_list_del(it);
4048 free(it->elem);
4049 free(it);
4050 }
4051 return 0;
4052 }
4053
4054 int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
4055 {
4056 struct lxc_list *it,*next;
4057 bool all = false;
4058 const char *k = key + 11;
4059
4060 if (strcmp(key, "lxc.cgroup") == 0)
4061 all = true;
4062
4063 lxc_list_for_each_safe(it, &c->cgroup, next) {
4064 struct lxc_cgroup *cg = it->elem;
4065 if (!all && strcmp(cg->subsystem, k) != 0)
4066 continue;
4067 lxc_list_del(it);
4068 free(cg->subsystem);
4069 free(cg->value);
4070 free(cg);
4071 free(it);
4072 }
4073 return 0;
4074 }
4075
4076 int lxc_clear_groups(struct lxc_conf *c)
4077 {
4078 struct lxc_list *it,*next;
4079
4080 lxc_list_for_each_safe(it, &c->groups, next) {
4081 lxc_list_del(it);
4082 free(it->elem);
4083 free(it);
4084 }
4085 return 0;
4086 }
4087
4088 int lxc_clear_mount_entries(struct lxc_conf *c)
4089 {
4090 struct lxc_list *it,*next;
4091
4092 lxc_list_for_each_safe(it, &c->mount_list, next) {
4093 lxc_list_del(it);
4094 free(it->elem);
4095 free(it);
4096 }
4097 return 0;
4098 }
4099
4100 int lxc_clear_hooks(struct lxc_conf *c, const char *key)
4101 {
4102 struct lxc_list *it,*next;
4103 bool all = false, done = false;
4104 const char *k = key + 9;
4105 int i;
4106
4107 if (strcmp(key, "lxc.hook") == 0)
4108 all = true;
4109
4110 for (i=0; i<NUM_LXC_HOOKS; i++) {
4111 if (all || strcmp(k, lxchook_names[i]) == 0) {
4112 lxc_list_for_each_safe(it, &c->hooks[i], next) {
4113 lxc_list_del(it);
4114 free(it->elem);
4115 free(it);
4116 }
4117 done = true;
4118 }
4119 }
4120
4121 if (!done) {
4122 ERROR("Invalid hook key: %s", key);
4123 return -1;
4124 }
4125 return 0;
4126 }
4127
4128 static void lxc_clear_saved_nics(struct lxc_conf *conf)
4129 {
4130 int i;
4131
4132 if (!conf->saved_nics)
4133 return;
4134 for (i=0; i < conf->num_savednics; i++)
4135 free(conf->saved_nics[i].orig_name);
4136 free(conf->saved_nics);
4137 }
4138
4139 void lxc_conf_free(struct lxc_conf *conf)
4140 {
4141 if (!conf)
4142 return;
4143 if (conf->console.path)
4144 free(conf->console.path);
4145 if (conf->rootfs.mount)
4146 free(conf->rootfs.mount);
4147 if (conf->rootfs.options)
4148 free(conf->rootfs.options);
4149 if (conf->rootfs.path)
4150 free(conf->rootfs.path);
4151 if (conf->rootfs.pivot)
4152 free(conf->rootfs.pivot);
4153 if (conf->logfile)
4154 free(conf->logfile);
4155 if (conf->utsname)
4156 free(conf->utsname);
4157 if (conf->ttydir)
4158 free(conf->ttydir);
4159 if (conf->fstab)
4160 free(conf->fstab);
4161 if (conf->rcfile)
4162 free(conf->rcfile);
4163 lxc_clear_config_network(conf);
4164 if (conf->lsm_aa_profile)
4165 free(conf->lsm_aa_profile);
4166 if (conf->lsm_se_context)
4167 free(conf->lsm_se_context);
4168 lxc_seccomp_free(conf);
4169 lxc_clear_config_caps(conf);
4170 lxc_clear_config_keepcaps(conf);
4171 lxc_clear_cgroups(conf, "lxc.cgroup");
4172 lxc_clear_hooks(conf, "lxc.hook");
4173 lxc_clear_mount_entries(conf);
4174 lxc_clear_saved_nics(conf);
4175 lxc_clear_idmaps(conf);
4176 lxc_clear_groups(conf);
4177 free(conf);
4178 }
4179
/* What run_userns_fn() needs: the callback to run and the start-signal pipe. */
struct userns_fn_data {
	int (*fn)(void *);	/* callback invoked once the start byte arrived */
	void *arg;		/* argument handed to fn */
	int p[2];		/* pipe fds: p[0] is read from, p[1] is closed, by run_userns_fn() */
};
4185
4186 static int run_userns_fn(void *data)
4187 {
4188 struct userns_fn_data *d = data;
4189 char c;
4190 // we're not sharing with the parent any more, if it was a thread
4191
4192 close(d->p[1]);
4193 if (read(d->p[0], &c, 1) != 1)
4194 return -1;
4195 close(d->p[0]);
4196 return d->fn(d->arg);
4197 }
4198
4199 /*
4200 * Add a ID_TYPE_UID entry to an existing lxc_conf, if it is not
4201 * alread there.
4202 * We may want to generalize this to do gids as well as uids, but right now
4203 * it's not necessary.
4204 */
4205 static struct lxc_list *idmap_add_id(struct lxc_conf *conf, uid_t uid)
4206 {
4207 int hostid_mapped = mapped_hostid(uid, conf, ID_TYPE_UID);
4208 struct lxc_list *new = NULL, *tmp, *it, *next;
4209 struct id_map *entry;
4210
4211 new = malloc(sizeof(*new));
4212 if (!new) {
4213 ERROR("Out of memory building id map");
4214 return NULL;
4215 }
4216 lxc_list_init(new);
4217
4218 if (hostid_mapped < 0) {
4219 hostid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID);
4220 if (hostid_mapped < 0)
4221 goto err;
4222 tmp = malloc(sizeof(*tmp));
4223 if (!tmp)
4224 goto err;
4225 entry = malloc(sizeof(*entry));
4226 if (!entry) {
4227 free(tmp);
4228 goto err;
4229 }
4230 tmp->elem = entry;
4231 entry->idtype = ID_TYPE_UID;
4232 entry->nsid = hostid_mapped;
4233 entry->hostid = (unsigned long)uid;
4234 entry->range = 1;
4235 lxc_list_add_tail(new, tmp);
4236 }
4237 lxc_list_for_each_safe(it, &conf->id_map, next) {
4238 tmp = malloc(sizeof(*tmp));
4239 if (!tmp)
4240 goto err;
4241 entry = malloc(sizeof(*entry));
4242 if (!entry) {
4243 free(tmp);
4244 goto err;
4245 }
4246 memset(entry, 0, sizeof(*entry));
4247 memcpy(entry, it->elem, sizeof(*entry));
4248 tmp->elem = entry;
4249 lxc_list_add_tail(new, tmp);
4250 }
4251
4252 return new;
4253
4254 err:
4255 ERROR("Out of memory building a new uid map");
4256 if (new)
4257 lxc_free_idmap(new);
4258 free(new);
4259 return NULL;
4260 }
4261
4262 /*
4263 * Run a function in a new user namespace.
4264 * The caller's euid will be mapped in if it is not already.
4265 */
4266 int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
4267 {
4268 int ret, pid;
4269 struct userns_fn_data d;
4270 char c = '1';
4271 int p[2];
4272 struct lxc_list *idmap;
4273
4274 ret = pipe(p);
4275 if (ret < 0) {
4276 SYSERROR("opening pipe");
4277 return -1;
4278 }
4279 d.fn = fn;
4280 d.arg = data;
4281 d.p[0] = p[0];
4282 d.p[1] = p[1];
4283 pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
4284 if (pid < 0)
4285 goto err;
4286 close(p[0]);
4287 p[0] = -1;
4288
4289 if ((idmap = idmap_add_id(conf, geteuid())) == NULL) {
4290 ERROR("Error adding self to container uid map");
4291 goto err;
4292 }
4293
4294 ret = lxc_map_ids(idmap, pid);
4295 lxc_free_idmap(idmap);
4296 free(idmap);
4297 if (ret) {
4298 ERROR("Error setting up child mappings");
4299 goto err;
4300 }
4301
4302 // kick the child
4303 if (write(p[1], &c, 1) != 1) {
4304 SYSERROR("writing to pipe to child");
4305 goto err;
4306 }
4307
4308 ret = wait_for_pid(pid);
4309
4310 close(p[1]);
4311 return ret;
4312
4313 err:
4314 if (p[0] != -1)
4315 close(p[0]);
4316 close(p[1]);
4317 return -1;
4318 }