2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include <sys/syscall.h>
40 #include <../include/openpty.h>
43 #include <linux/loop.h>
45 #include <sys/types.h>
46 #include <sys/utsname.h>
47 #include <sys/param.h>
49 #include <sys/socket.h>
50 #include <sys/mount.h>
52 #include <sys/prctl.h>
54 #include <arpa/inet.h>
56 #include <netinet/in.h>
66 #include "caps.h" /* for lxc_caps_last_cap() */
70 #include "namespace.h"
73 #if HAVE_SYS_CAPABILITY_H
74 #include <sys/capability.h>
77 #if HAVE_SYS_PERSONALITY_H
78 #include <sys/personality.h>
82 #include <../include/lxcmntent.h>
87 #include "lxcseccomp.h"
89 lxc_log_define(lxc_conf
, lxc
);
92 #define MAXINDEXLEN 20
94 #define MAXLINELEN 128
96 #if HAVE_SYS_CAPABILITY_H
98 #define CAP_SETFCAP 31
101 #ifndef CAP_MAC_OVERRIDE
102 #define CAP_MAC_OVERRIDE 32
105 #ifndef CAP_MAC_ADMIN
106 #define CAP_MAC_ADMIN 33
110 #ifndef PR_CAPBSET_DROP
111 #define PR_CAPBSET_DROP 24
114 #ifndef LO_FLAGS_AUTOCLEAR
115 #define LO_FLAGS_AUTOCLEAR 4
118 /* Define pivot_root() if missing from the C library */
119 #ifndef HAVE_PIVOT_ROOT
120 static int pivot_root(const char * new_root
, const char * put_old
)
122 #ifdef __NR_pivot_root
123 return syscall(__NR_pivot_root
, new_root
, put_old
);
130 extern int pivot_root(const char * new_root
, const char * put_old
);
133 /* Define sethostname() if missing from the C library */
134 #ifndef HAVE_SETHOSTNAME
135 static int sethostname(const char * name
, size_t len
)
137 #ifdef __NR_sethostname
138 return syscall(__NR_sethostname
, name
, len
);
146 /* Define __S_ISTYPE if missing from the C library */
148 #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
/*
 * Human-readable names of the container hook points.  The table is sized by
 * NUM_LXC_HOOKS and seven names are listed, in this order: pre-start,
 * pre-mount, mount, autodev, start, post-stop, clone.
 * NOTE(review): presumably indexed by the hook enumeration declared in the
 * header, so the order here must match it -- confirm against that header.
 */
151 char *lxchook_names
[NUM_LXC_HOOKS
] = {
152 "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
/*
 * Callback signature shared by the per-network-type handlers: it takes the
 * container handler and the network device description and returns an int.
 * Both the netdev_conf[] (instanciate_*) and netdev_deconf[] (shutdown_*)
 * dispatch tables in this file are arrays of this type.
 * NOTE(review): the return convention (0 on success, non-zero on failure) is
 * assumed from the rest of the file -- confirm at the call sites.
 */
154 typedef int (*instanciate_cb
)(struct lxc_handler
*, struct lxc_netdev
*);
167 static int instanciate_veth(struct lxc_handler
*, struct lxc_netdev
*);
168 static int instanciate_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
169 static int instanciate_vlan(struct lxc_handler
*, struct lxc_netdev
*);
170 static int instanciate_phys(struct lxc_handler
*, struct lxc_netdev
*);
171 static int instanciate_empty(struct lxc_handler
*, struct lxc_netdev
*);
172 static int instanciate_none(struct lxc_handler
*, struct lxc_netdev
*);
174 static instanciate_cb netdev_conf
[LXC_NET_MAXCONFTYPE
+ 1] = {
175 [LXC_NET_VETH
] = instanciate_veth
,
176 [LXC_NET_MACVLAN
] = instanciate_macvlan
,
177 [LXC_NET_VLAN
] = instanciate_vlan
,
178 [LXC_NET_PHYS
] = instanciate_phys
,
179 [LXC_NET_EMPTY
] = instanciate_empty
,
180 [LXC_NET_NONE
] = instanciate_none
,
183 static int shutdown_veth(struct lxc_handler
*, struct lxc_netdev
*);
184 static int shutdown_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
185 static int shutdown_vlan(struct lxc_handler
*, struct lxc_netdev
*);
186 static int shutdown_phys(struct lxc_handler
*, struct lxc_netdev
*);
187 static int shutdown_empty(struct lxc_handler
*, struct lxc_netdev
*);
188 static int shutdown_none(struct lxc_handler
*, struct lxc_netdev
*);
190 static instanciate_cb netdev_deconf
[LXC_NET_MAXCONFTYPE
+ 1] = {
191 [LXC_NET_VETH
] = shutdown_veth
,
192 [LXC_NET_MACVLAN
] = shutdown_macvlan
,
193 [LXC_NET_VLAN
] = shutdown_vlan
,
194 [LXC_NET_PHYS
] = shutdown_phys
,
195 [LXC_NET_EMPTY
] = shutdown_empty
,
196 [LXC_NET_NONE
] = shutdown_none
,
199 static struct mount_opt mount_opt
[] = {
200 { "defaults", 0, 0 },
201 { "ro", 0, MS_RDONLY
},
202 { "rw", 1, MS_RDONLY
},
203 { "suid", 1, MS_NOSUID
},
204 { "nosuid", 0, MS_NOSUID
},
205 { "dev", 1, MS_NODEV
},
206 { "nodev", 0, MS_NODEV
},
207 { "exec", 1, MS_NOEXEC
},
208 { "noexec", 0, MS_NOEXEC
},
209 { "sync", 0, MS_SYNCHRONOUS
},
210 { "async", 1, MS_SYNCHRONOUS
},
211 { "dirsync", 0, MS_DIRSYNC
},
212 { "remount", 0, MS_REMOUNT
},
213 { "mand", 0, MS_MANDLOCK
},
214 { "nomand", 1, MS_MANDLOCK
},
215 { "atime", 1, MS_NOATIME
},
216 { "noatime", 0, MS_NOATIME
},
217 { "diratime", 1, MS_NODIRATIME
},
218 { "nodiratime", 0, MS_NODIRATIME
},
219 { "bind", 0, MS_BIND
},
220 { "rbind", 0, MS_BIND
|MS_REC
},
221 { "relatime", 0, MS_RELATIME
},
222 { "norelatime", 1, MS_RELATIME
},
223 { "strictatime", 0, MS_STRICTATIME
},
224 { "nostrictatime", 1, MS_STRICTATIME
},
228 #if HAVE_SYS_CAPABILITY_H
229 static struct caps_opt caps_opt
[] = {
230 { "chown", CAP_CHOWN
},
231 { "dac_override", CAP_DAC_OVERRIDE
},
232 { "dac_read_search", CAP_DAC_READ_SEARCH
},
233 { "fowner", CAP_FOWNER
},
234 { "fsetid", CAP_FSETID
},
235 { "kill", CAP_KILL
},
236 { "setgid", CAP_SETGID
},
237 { "setuid", CAP_SETUID
},
238 { "setpcap", CAP_SETPCAP
},
239 { "linux_immutable", CAP_LINUX_IMMUTABLE
},
240 { "net_bind_service", CAP_NET_BIND_SERVICE
},
241 { "net_broadcast", CAP_NET_BROADCAST
},
242 { "net_admin", CAP_NET_ADMIN
},
243 { "net_raw", CAP_NET_RAW
},
244 { "ipc_lock", CAP_IPC_LOCK
},
245 { "ipc_owner", CAP_IPC_OWNER
},
246 { "sys_module", CAP_SYS_MODULE
},
247 { "sys_rawio", CAP_SYS_RAWIO
},
248 { "sys_chroot", CAP_SYS_CHROOT
},
249 { "sys_ptrace", CAP_SYS_PTRACE
},
250 { "sys_pacct", CAP_SYS_PACCT
},
251 { "sys_admin", CAP_SYS_ADMIN
},
252 { "sys_boot", CAP_SYS_BOOT
},
253 { "sys_nice", CAP_SYS_NICE
},
254 { "sys_resource", CAP_SYS_RESOURCE
},
255 { "sys_time", CAP_SYS_TIME
},
256 { "sys_tty_config", CAP_SYS_TTY_CONFIG
},
257 { "mknod", CAP_MKNOD
},
258 { "lease", CAP_LEASE
},
259 #ifdef CAP_AUDIT_WRITE
260 { "audit_write", CAP_AUDIT_WRITE
},
262 #ifdef CAP_AUDIT_CONTROL
263 { "audit_control", CAP_AUDIT_CONTROL
},
265 { "setfcap", CAP_SETFCAP
},
266 { "mac_override", CAP_MAC_OVERRIDE
},
267 { "mac_admin", CAP_MAC_ADMIN
},
269 { "syslog", CAP_SYSLOG
},
271 #ifdef CAP_WAKE_ALARM
272 { "wake_alarm", CAP_WAKE_ALARM
},
276 static struct caps_opt caps_opt
[] = {};
279 static int run_buffer(char *buffer
)
281 struct lxc_popen_FILE
*f
;
285 f
= lxc_popen(buffer
);
287 SYSERROR("popen failed");
291 output
= malloc(LXC_LOG_BUFFER_SIZE
);
293 ERROR("failed to allocate memory for script output");
298 while(fgets(output
, LXC_LOG_BUFFER_SIZE
, f
->f
))
299 DEBUG("script output: %s", output
);
305 SYSERROR("Script exited on error");
307 } else if (WIFEXITED(ret
) && WEXITSTATUS(ret
) != 0) {
308 ERROR("Script exited with status %d", WEXITSTATUS(ret
));
310 } else if (WIFSIGNALED(ret
)) {
311 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret
),
312 strsignal(WTERMSIG(ret
)));
319 static int run_script_argv(const char *name
, const char *section
,
320 const char *script
, const char *hook
, const char *lxcpath
,
327 INFO("Executing script '%s' for container '%s', config section '%s'",
328 script
, name
, section
);
330 for (i
=0; argsin
&& argsin
[i
]; i
++)
331 size
+= strlen(argsin
[i
]) + 1;
333 size
+= strlen(hook
) + 1;
335 size
+= strlen(script
);
336 size
+= strlen(name
);
337 size
+= strlen(section
);
343 buffer
= alloca(size
);
345 ERROR("failed to allocate memory");
349 ret
= snprintf(buffer
, size
, "%s %s %s %s", script
, name
, section
, hook
);
350 if (ret
< 0 || ret
>= size
) {
351 ERROR("Script name too long");
355 for (i
=0; argsin
&& argsin
[i
]; i
++) {
358 rc
= snprintf(buffer
+ ret
, len
, " %s", argsin
[i
]);
359 if (rc
< 0 || rc
>= len
) {
360 ERROR("Script args too long");
366 return run_buffer(buffer
);
369 static int run_script(const char *name
, const char *section
,
370 const char *script
, ...)
377 INFO("Executing script '%s' for container '%s', config section '%s'",
378 script
, name
, section
);
380 va_start(ap
, script
);
381 while ((p
= va_arg(ap
, char *)))
382 size
+= strlen(p
) + 1;
385 size
+= strlen(script
);
386 size
+= strlen(name
);
387 size
+= strlen(section
);
393 buffer
= alloca(size
);
395 ERROR("failed to allocate memory");
399 ret
= snprintf(buffer
, size
, "%s %s %s", script
, name
, section
);
400 if (ret
< 0 || ret
>= size
) {
401 ERROR("Script name too long");
405 va_start(ap
, script
);
406 while ((p
= va_arg(ap
, char *))) {
409 rc
= snprintf(buffer
+ ret
, len
, " %s", p
);
410 if (rc
< 0 || rc
>= len
) {
411 ERROR("Script args too long");
418 return run_buffer(buffer
);
421 static int find_fstype_cb(char* buffer
, void *data
)
429 unsigned long mntflags
;
433 /* we don't try 'nodev' entries */
434 if (strstr(buffer
, "nodev"))
438 fstype
+= lxc_char_left_gc(fstype
, strlen(fstype
));
439 fstype
[lxc_char_right_gc(fstype
, strlen(fstype
))] = '\0';
441 DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
442 cbarg
->rootfs
, cbarg
->target
, fstype
);
444 if (parse_mntopts(cbarg
->options
, &mntflags
, &mntdata
) < 0) {
449 if (mount(cbarg
->rootfs
, cbarg
->target
, fstype
, mntflags
, mntdata
)) {
450 DEBUG("mount failed with error: %s", strerror(errno
));
456 INFO("mounted '%s' on '%s', with fstype '%s'",
457 cbarg
->rootfs
, cbarg
->target
, fstype
);
462 static int mount_unknown_fs(const char *rootfs
, const char *target
,
478 * find the filesystem type with brute force:
479 * first we check with /etc/filesystems, in case the modules
480 * are auto-loaded and fall back to the supported kernel fs
487 for (i
= 0; i
< sizeof(fsfile
)/sizeof(fsfile
[0]); i
++) {
491 if (access(fsfile
[i
], F_OK
))
494 ret
= lxc_file_for_each_line(fsfile
[i
], find_fstype_cb
, &cbarg
);
496 ERROR("failed to parse '%s'", fsfile
[i
]);
504 ERROR("failed to determine fs type for '%s'", rootfs
);
508 static int mount_rootfs_dir(const char *rootfs
, const char *target
,
511 unsigned long mntflags
;
515 if (parse_mntopts(options
, &mntflags
, &mntdata
) < 0) {
520 ret
= mount(rootfs
, target
, "none", MS_BIND
| MS_REC
| mntflags
, mntdata
);
526 static int setup_lodev(const char *rootfs
, int fd
, struct loop_info64
*loinfo
)
531 rfd
= open(rootfs
, O_RDWR
);
533 SYSERROR("failed to open '%s'", rootfs
);
537 memset(loinfo
, 0, sizeof(*loinfo
));
539 loinfo
->lo_flags
= LO_FLAGS_AUTOCLEAR
;
541 if (ioctl(fd
, LOOP_SET_FD
, rfd
)) {
542 SYSERROR("failed to LOOP_SET_FD");
546 if (ioctl(fd
, LOOP_SET_STATUS64
, loinfo
)) {
547 SYSERROR("failed to LOOP_SET_STATUS64");
558 static int mount_rootfs_file(const char *rootfs
, const char *target
,
561 struct dirent dirent
, *direntp
;
562 struct loop_info64 loinfo
;
563 int ret
= -1, fd
= -1, rc
;
565 char path
[MAXPATHLEN
];
567 dir
= opendir("/dev");
569 SYSERROR("failed to open '/dev'");
573 while (!readdir_r(dir
, &dirent
, &direntp
)) {
578 if (!strcmp(direntp
->d_name
, "."))
581 if (!strcmp(direntp
->d_name
, ".."))
584 if (strncmp(direntp
->d_name
, "loop", 4))
587 rc
= snprintf(path
, MAXPATHLEN
, "/dev/%s", direntp
->d_name
);
588 if (rc
< 0 || rc
>= MAXPATHLEN
)
591 fd
= open(path
, O_RDWR
);
595 if (ioctl(fd
, LOOP_GET_STATUS64
, &loinfo
) == 0) {
600 if (errno
!= ENXIO
) {
601 WARN("unexpected error for ioctl on '%s': %m",
607 DEBUG("found '%s' free lodev", path
);
609 ret
= setup_lodev(rootfs
, fd
, &loinfo
);
611 ret
= mount_unknown_fs(path
, target
, options
);
618 WARN("failed to close directory");
623 static int mount_rootfs_block(const char *rootfs
, const char *target
,
626 return mount_unknown_fs(rootfs
, target
, options
);
631 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
632 * the duration of the container run, to prevent the container from marking
633 * the underlying fs readonly on shutdown. unlink the file immediately so
634 * no name pollution happens
635 * return -1 on error.
636 * return -2 if nothing needed to be pinned.
637 * return an open fd (>=0) if we pinned it.
639 int pin_rootfs(const char *rootfs
)
641 char absrootfs
[MAXPATHLEN
];
642 char absrootfspin
[MAXPATHLEN
];
646 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
649 if (!realpath(rootfs
, absrootfs
))
652 if (access(absrootfs
, F_OK
))
655 if (stat(absrootfs
, &s
))
658 if (!S_ISDIR(s
.st_mode
))
661 ret
= snprintf(absrootfspin
, MAXPATHLEN
, "%s/lxc.hold", absrootfs
);
662 if (ret
>= MAXPATHLEN
)
665 fd
= open(absrootfspin
, O_CREAT
| O_RDWR
, S_IWUSR
|S_IRUSR
);
668 (void)unlink(absrootfspin
);
672 static int lxc_mount_auto_mounts(struct lxc_conf
*conf
, int flags
, struct lxc_handler
*handler
)
680 const char *destination
;
684 } default_mounts
[] = {
685 /* Read-only bind-mounting... In older kernels, doing that required
686 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
687 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
688 * kernel 2.6.26 onwards. However, this apparently does not work on
689 * kernel 3.8. Unfortunately, on that very same kernel, doing the
690 * same trick as above doesn't seem to work either, there one needs
691 * to ALSO specify MS_BIND for the remount, otherwise the entire
692 * fs is remounted read-only or the mount fails because it's busy...
693 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
696 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
697 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys", "%r/proc/sys", NULL
, MS_BIND
, NULL
},
698 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
699 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL
, MS_BIND
, NULL
},
700 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sysrq-trigger", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
701 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_RW
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
702 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RW
, "sysfs", "%r/sys", "sysfs", 0, NULL
},
703 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RO
, "sysfs", "%r/sys", "sysfs", MS_RDONLY
, NULL
},
704 { 0, 0, NULL
, NULL
, NULL
, 0, NULL
}
707 for (i
= 0; default_mounts
[i
].match_mask
; i
++) {
708 if ((flags
& default_mounts
[i
].match_mask
) == default_mounts
[i
].match_flag
) {
710 char *destination
= NULL
;
713 if (default_mounts
[i
].source
) {
714 /* will act like strdup if %r is not present */
715 source
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].source
);
717 SYSERROR("memory allocation error");
721 if (default_mounts
[i
].destination
) {
722 /* will act like strdup if %r is not present */
723 destination
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].destination
);
726 SYSERROR("memory allocation error");
732 r
= mount(source
, destination
, default_mounts
[i
].fstype
, default_mounts
[i
].flags
, default_mounts
[i
].options
);
735 SYSERROR("error mounting %s on %s", source
, destination
);
745 if (flags
& LXC_AUTO_CGROUP_MASK
) {
746 if (!cgroup_mount(conf
->rootfs
.mount
, handler
,
747 flags
& LXC_AUTO_CGROUP_MASK
)) {
748 SYSERROR("error mounting /sys/fs/cgroup");
756 static int mount_rootfs(const char *rootfs
, const char *target
, const char *options
)
758 char absrootfs
[MAXPATHLEN
];
762 typedef int (*rootfs_cb
)(const char *, const char *, const char *);
768 { S_IFDIR
, mount_rootfs_dir
},
769 { S_IFBLK
, mount_rootfs_block
},
770 { S_IFREG
, mount_rootfs_file
},
773 if (!realpath(rootfs
, absrootfs
)) {
774 SYSERROR("failed to get real path for '%s'", rootfs
);
778 if (access(absrootfs
, F_OK
)) {
779 SYSERROR("'%s' is not accessible", absrootfs
);
783 if (stat(absrootfs
, &s
)) {
784 SYSERROR("failed to stat '%s'", absrootfs
);
788 for (i
= 0; i
< sizeof(rtfs_type
)/sizeof(rtfs_type
[0]); i
++) {
790 if (!__S_ISTYPE(s
.st_mode
, rtfs_type
[i
].type
))
793 return rtfs_type
[i
].cb(absrootfs
, target
, options
);
796 ERROR("unsupported rootfs type for '%s'", absrootfs
);
800 static int setup_utsname(struct utsname
*utsname
)
805 if (sethostname(utsname
->nodename
, strlen(utsname
->nodename
))) {
806 SYSERROR("failed to set the hostname to '%s'", utsname
->nodename
);
810 INFO("'%s' hostname has been setup", utsname
->nodename
);
815 struct dev_symlinks
{
820 static const struct dev_symlinks dev_symlinks
[] = {
821 {"/proc/self/fd", "fd"},
822 {"/proc/self/fd/0", "stdin"},
823 {"/proc/self/fd/1", "stdout"},
824 {"/proc/self/fd/2", "stderr"},
827 static int setup_dev_symlinks(const struct lxc_rootfs
*rootfs
)
829 char path
[MAXPATHLEN
];
833 for (i
= 0; i
< sizeof(dev_symlinks
) / sizeof(dev_symlinks
[0]); i
++) {
834 const struct dev_symlinks
*d
= &dev_symlinks
[i
];
835 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
, d
->name
);
836 if (ret
< 0 || ret
>= MAXPATHLEN
)
838 ret
= symlink(d
->oldpath
, path
);
839 if (ret
&& errno
!= EEXIST
) {
840 SYSERROR("Error creating %s", path
);
847 static int setup_tty(const struct lxc_rootfs
*rootfs
,
848 const struct lxc_tty_info
*tty_info
, char *ttydir
)
850 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
856 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
858 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
860 ret
= snprintf(path
, sizeof(path
), "%s/dev/tty%d",
861 rootfs
->mount
, i
+ 1);
862 if (ret
>= sizeof(path
)) {
863 ERROR("pathname too long for ttys");
867 /* create dev/<ttydir>/tty%d (e.g. dev/lxc/tty%d) */
868 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/tty%d",
869 rootfs
->mount
, ttydir
, i
+ 1);
870 if (ret
>= sizeof(lxcpath
)) {
871 ERROR("pathname too long for ttys");
874 ret
= creat(lxcpath
, 0660);
875 if (ret
==-1 && errno
!= EEXIST
) {
876 SYSERROR("error creating %s", lxcpath
);
882 if (ret
&& errno
!= ENOENT
) {
883 SYSERROR("error unlinking %s", path
);
887 if (mount(pty_info
->name
, lxcpath
, "none", MS_BIND
, 0)) {
888 WARN("failed to mount '%s'->'%s'",
889 pty_info
->name
, path
);
893 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/tty%d", ttydir
, i
+1);
894 if (ret
>= sizeof(lxcpath
)) {
895 ERROR("tty pathname too long");
898 ret
= symlink(lxcpath
, path
);
900 SYSERROR("failed to create symlink for tty %d", i
+1);
904 /* If we populated /dev, then we need to create /dev/ttyN */
905 if (access(path
, F_OK
)) {
906 ret
= creat(path
, 0660);
908 SYSERROR("error creating %s", path
);
909 /* this isn't fatal, continue */
914 if (mount(pty_info
->name
, path
, "none", MS_BIND
, 0)) {
915 WARN("failed to mount '%s'->'%s'",
916 pty_info
->name
, path
);
922 INFO("%d tty(s) has been setup", tty_info
->nbtty
);
927 static int setup_rootfs_pivot_root_cb(char *buffer
, void *data
)
929 struct lxc_list
*mountlist
, *listentry
, *iterator
;
930 char *pivotdir
, *mountpoint
, *mountentry
, *saveptr
= NULL
;
935 cbparm
= (void **)data
;
937 mountlist
= cbparm
[0];
938 pivotdir
= cbparm
[1];
940 /* parse entry, first field is mountname, ignore */
941 mountpoint
= strtok_r(mountentry
, " ", &saveptr
);
945 /* second field is mountpoint */
946 mountpoint
= strtok_r(NULL
, " ", &saveptr
);
950 /* only consider mountpoints below old root fs */
951 if (strncmp(mountpoint
, pivotdir
, strlen(pivotdir
)))
954 /* filter duplicate mountpoints */
956 lxc_list_for_each(iterator
, mountlist
) {
957 if (!strcmp(iterator
->elem
, mountpoint
)) {
965 /* add entry to list */
966 listentry
= malloc(sizeof(*listentry
));
968 SYSERROR("malloc for mountpoint listentry failed");
972 listentry
->elem
= strdup(mountpoint
);
973 if (!listentry
->elem
) {
974 SYSERROR("strdup failed");
978 lxc_list_add_tail(mountlist
, listentry
);
983 static int umount_oldrootfs(const char *oldrootfs
)
985 char path
[MAXPATHLEN
];
987 struct lxc_list mountlist
, *iterator
, *next
;
988 int ok
, still_mounted
, last_still_mounted
;
991 /* read and parse /proc/mounts in old root fs */
992 lxc_list_init(&mountlist
);
994 /* oldrootfs is on the top tree directory now */
995 rc
= snprintf(path
, sizeof(path
), "/%s", oldrootfs
);
996 if (rc
>= sizeof(path
)) {
997 ERROR("rootfs name too long");
1000 cbparm
[0] = &mountlist
;
1002 cbparm
[1] = strdup(path
);
1004 SYSERROR("strdup failed");
1008 rc
= snprintf(path
, sizeof(path
), "%s/proc/mounts", oldrootfs
);
1009 if (rc
>= sizeof(path
)) {
1010 ERROR("container proc/mounts name too long");
1014 ok
= lxc_file_for_each_line(path
,
1015 setup_rootfs_pivot_root_cb
, &cbparm
);
1017 SYSERROR("failed to read or parse mount list '%s'", path
);
1021 /* umount filesystems until none left or list no longer shrinks */
1024 last_still_mounted
= still_mounted
;
1027 lxc_list_for_each_safe(iterator
, &mountlist
, next
) {
1029 /* umount normally */
1030 if (!umount(iterator
->elem
)) {
1031 DEBUG("umounted '%s'", (char *)iterator
->elem
);
1032 lxc_list_del(iterator
);
1039 } while (still_mounted
> 0 && still_mounted
!= last_still_mounted
);
1042 lxc_list_for_each(iterator
, &mountlist
) {
1044 /* let's try a lazy umount */
1045 if (!umount2(iterator
->elem
, MNT_DETACH
)) {
1046 INFO("lazy unmount of '%s'", (char *)iterator
->elem
);
1050 /* be more brutal (nfs) */
1051 if (!umount2(iterator
->elem
, MNT_FORCE
)) {
1052 INFO("forced unmount of '%s'", (char *)iterator
->elem
);
1056 WARN("failed to unmount '%s'", (char *)iterator
->elem
);
1062 static int setup_rootfs_pivot_root(const char *rootfs
, const char *pivotdir
)
1064 char path
[MAXPATHLEN
];
1065 int remove_pivotdir
= 0;
1068 /* change into new root fs */
1069 if (chdir(rootfs
)) {
1070 SYSERROR("can't chdir to new rootfs '%s'", rootfs
);
1075 pivotdir
= "lxc_putold";
1077 /* compute the full path to pivotdir under rootfs */
1078 rc
= snprintf(path
, sizeof(path
), "%s/%s", rootfs
, pivotdir
);
1079 if (rc
>= sizeof(path
)) {
1080 ERROR("pivot dir name too long");
1084 if (access(path
, F_OK
)) {
1086 if (mkdir_p(path
, 0755) < 0) {
1087 SYSERROR("failed to create pivotdir '%s'", path
);
1091 remove_pivotdir
= 1;
1092 DEBUG("created '%s' directory", path
);
1095 DEBUG("mountpoint for old rootfs is '%s'", path
);
1097 /* pivot_root into our new root fs */
1098 if (pivot_root(".", path
)) {
1099 SYSERROR("pivot_root syscall failed");
1104 SYSERROR("can't chdir to / after pivot_root");
1108 DEBUG("pivot_root syscall to '%s' successful", rootfs
);
1110 /* we switch from absolute path to relative path */
1111 if (umount_oldrootfs(pivotdir
))
1114 /* remove temporary mount point, we don't consider the removing
1116 if (remove_pivotdir
&& rmdir(pivotdir
))
1117 WARN("can't remove mountpoint '%s': %m", pivotdir
);
1123 * Check to see if a directory has something mounted on it and,
1124 * if it does, return the fstype.
1126 * Code largely based on detect_shared_rootfs below
1128 * Returns: # of matching entries in /proc/self/mounts
1129 * if != 0 fstype is filled with the last filesystem value.
1130 * if == 0 no matches found, fstype unchanged.
1132 * ToDo: Maybe return the mount options in another parameter...
/*
 * LINELEN: size of the buffer mount_check_fs() uses to read one line of
 * /proc/self/mounts with fgets().
 * MAX_FSTYPE_LEN: size of the fstype buffer; mount_check_fs() copies at most
 * MAX_FSTYPE_LEN - 1 characters into it and then writes a terminating NUL at
 * index MAX_FSTYPE_LEN - 1, so a caller-supplied buffer must be at least
 * this large.
 */
1135 #define LINELEN 4096
1136 #define MAX_FSTYPE_LEN 128
1137 static int mount_check_fs( const char *dir
, char *fstype
)
1139 char buf
[LINELEN
], *p
;
1145 DEBUG("entering mount_check_fs for %s", dir
);
1147 if ( 0 != access(dir
, F_OK
) || 0 != stat(dir
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1151 f
= fopen("/proc/self/mounts", "r");
1154 while (fgets(buf
, LINELEN
, f
)) {
1155 p
= index(buf
, ' ');
1166 /* Compare the directory in the entry to desired */
1167 if( strcmp( p2
, dir
) ) {
1172 p
= index( p2
, ' ');
1180 strncpy( fstype
, p2
, MAX_FSTYPE_LEN
- 1 );
1181 fstype
[ MAX_FSTYPE_LEN
- 1 ] = '\0';
1187 DEBUG("mount_check_fs returning %d last %s", found_fs
, fstype
);
1193 * Locate a devtmpfs mount (should be on /dev) and create a container
1194 * subdirectory on it which we can then bind mount to the container
1195 * /dev instead of mounting a tmpfs there.
1196 * If we fail, return NULL.
1197 * Else return the pointer to the name buffer with the string to
1198 * the devtmpfs subdirectory.
1201 static char *mk_devtmpfs(const char *name
, char *path
, const char *lxcpath
)
1205 char tmp_path
[MAXPATHLEN
];
1206 char fstype
[MAX_FSTYPE_LEN
];
1207 char *base_path
= "/dev/.lxc";
1208 char *user_path
= "/dev/.lxc/user";
1211 if ( 0 != access(base_path
, F_OK
) || 0 != stat(base_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1212 /* This is just making /dev/.lxc; it had better work or we're done */
1213 ret
= mkdir(base_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1215 SYSERROR( "Unable to create /dev/.lxc for autodev" );
1221 * Programmers notes:
1222 * We can not do mounts in this area of code that we want
1223 * to be visible in the host. Consequently, /dev/.lxc must
1224 * be set up earlier if we need a tmpfs mounted there.
1225 * That only affects the rare cases where autodev is enabled
1226 * for a container and devtmpfs is not mounted on /dev in the
1227 * host. In that case, we'll fall back to the old method
1228 * of mounting a tmpfs in the container and have no visibility
1229 * into the container /dev.
1231 if( ! mount_check_fs( "/dev", fstype
)
1232 || strcmp( "devtmpfs", fstype
) ) {
1233 /* Either /dev was not mounted or was not devtmpfs */
1235 if ( ! mount_check_fs( "/dev/.lxc", NULL
) ) {
1237 * /dev/.lxc is not already mounted
1238 * Doing a mount here does no good, since
1239 * it's not visible in the host.
1242 ERROR("/dev/.lxc is not setup - taking fallback" );
1247 if ( 0 != access(user_path
, F_OK
) || 0 != stat(user_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1249 * This is making /dev/.lxc/user path for non-priv users.
1250 * If this doesn't work, we'll have to fall back in the
1251 * case of non-priv users. It's mode 1777 like /tmp.
1253 ret
= mkdir(user_path
, S_IRWXU
| S_IRWXG
| S_IRWXO
| S_ISVTX
);
1255 /* Issue an error but don't fail yet! */
1256 ERROR("Unable to create /dev/.lxc/user");
1258 /* Umask tends to screw us up here */
1259 chmod(user_path
, S_IRWXU
| S_IRWXG
| S_IRWXO
| S_ISVTX
);
1263 * Since the container name must be unique within a given
1264 * lxcpath, we're going to use a hash of the path
1265 * /lxcpath/name as our hash name in /dev/.lxc/
1268 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s", lxcpath
, name
);
1269 if (ret
< 0 || ret
>= MAXPATHLEN
)
1272 hash
= fnv_64a_buf(tmp_path
, ret
, FNV1A_64_INIT
);
1274 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s.%016" PRIx64
, base_path
, name
, hash
);
1275 if (ret
< 0 || ret
>= MAXPATHLEN
)
1278 if ( 0 != access(tmp_path
, F_OK
) || 0 != stat(tmp_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1279 ret
= mkdir(tmp_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1281 /* Something must have failed with the base_path...
1282 * Maybe unpriv user. Try user_path now... */
1283 INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );
1285 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s.%016" PRIx64
, user_path
, name
, hash
);
1286 if (ret
< 0 || ret
>= MAXPATHLEN
)
1289 if ( 0 != access(tmp_path
, F_OK
) || 0 != stat(tmp_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1290 ret
= mkdir(tmp_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1292 ERROR("Container /dev setup in host /dev failed - taking fallback" );
1299 strcpy( path
, tmp_path
);
1305 * Do we want to add options for max size of /dev and a file to
1306 * specify which devices to create?
1308 static int mount_autodev(const char *name
, char *root
, const char *lxcpath
)
1312 char path
[MAXPATHLEN
];
1313 char host_path
[MAXPATHLEN
];
1314 char devtmpfs_path
[MAXPATHLEN
];
1316 INFO("Mounting /dev under %s", root
);
1318 ret
= snprintf(host_path
, MAXPATHLEN
, "%s/%s/rootfs.dev", lxcpath
, name
);
1319 if (ret
< 0 || ret
> MAXPATHLEN
)
1322 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", root
);
1323 if (ret
< 0 || ret
> MAXPATHLEN
)
1326 if (mk_devtmpfs( name
, devtmpfs_path
, lxcpath
) ) {
1328 * Get rid of old links and directories.
1329 * This could be either a symlink, and we remove it,
1330 * or an empty directory, and we remove it,
1331 * or non-existent, and we don't care,
1332 * or a non-empty directory, and we will then emit an error
1333 * but we will not fail out the process.
1335 unlink( host_path
);
1337 ret
= symlink(devtmpfs_path
, host_path
);
1340 SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path
, devtmpfs_path
);
1342 DEBUG("Bind mounting %s to %s", devtmpfs_path
, path
);
1343 ret
= mount(devtmpfs_path
, path
, NULL
, MS_BIND
, 0 );
1345 /* Only mount a tmpfs here if we don't already have a mount */
1346 if ( ! mount_check_fs( host_path
, NULL
) ) {
1347 DEBUG("Mounting tmpfs to %s", host_path
);
1348 ret
= mount("none", path
, "tmpfs", 0, "size=100000,mode=755");
1350 /* This allows someone to manually set up a mount */
1351 DEBUG("Bind mounting %s to %s", host_path
, path
);
1352 ret
= mount(host_path
, path
, NULL
, MS_BIND
, 0 );
1356 SYSERROR("Failed to mount /dev at %s", root
);
1359 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/pts", root
);
1360 if (ret
< 0 || ret
>= MAXPATHLEN
)
1363 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1364 * If not, then create it and exit if that fails...
1366 if ( 0 != access(path
, F_OK
) || 0 != stat(path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1367 ret
= mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1369 SYSERROR("Failed to create /dev/pts in container");
1374 INFO("Mounted /dev under %s", root
);
1385 static const struct lxc_devs lxc_devs
[] = {
1386 { "null", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 3 },
1387 { "zero", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 5 },
1388 { "full", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 7 },
1389 { "urandom", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 9 },
1390 { "random", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 8 },
1391 { "tty", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 5, 0 },
1392 { "console", S_IFCHR
| S_IRUSR
| S_IWUSR
, 5, 1 },
1395 static int setup_autodev(const char *root
)
1398 char path
[MAXPATHLEN
];
1402 INFO("Creating initial consoles under %s/dev", root
);
1404 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", root
);
1405 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1406 ERROR("Error calculating container /dev location");
1410 INFO("Populating /dev under %s", root
);
1411 cmask
= umask(S_IXUSR
| S_IXGRP
| S_IXOTH
);
1412 for (i
= 0; i
< sizeof(lxc_devs
) / sizeof(lxc_devs
[0]); i
++) {
1413 const struct lxc_devs
*d
= &lxc_devs
[i
];
1414 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/%s", root
, d
->name
);
1415 if (ret
< 0 || ret
>= MAXPATHLEN
)
1417 ret
= mknod(path
, d
->mode
, makedev(d
->maj
, d
->min
));
1418 if (ret
&& errno
!= EEXIST
) {
1419 SYSERROR("Error creating %s", d
->name
);
1425 INFO("Populated /dev under %s", root
);
1430 * I'll forgive you for asking whether all of this is needed :) The
1432 * pivot_root will fail if the new root, the put_old dir, or the parent
1433 * of current->fs->root are MS_SHARED. (parent of current->fs_root may
1434 * or may not be current->fs_root - if we assumed it always was, we could
1435 * just mount --make-rslave /). So,
1436 * 1. mount a tiny tmpfs to be parent of current->fs->root.
1437 * 2. make that MS_SLAVE
1438 * 3. make a 'root' directory under that
1439 * 4. mount --rbind / under the $tinyroot/root.
1440 * 5. make that rslave
1441 * 6. chdir and chroot into $tinyroot/root
1442 * 7. $tinyroot will be unmounted by our parent in start.c
1444 static int chroot_into_slave(struct lxc_conf
*conf
)
1446 char path
[MAXPATHLEN
];
1447 const char *destpath
= conf
->rootfs
.mount
;
1450 if (mount(destpath
, destpath
, NULL
, MS_BIND
, 0)) {
1451 SYSERROR("failed to mount %s bind", destpath
);
1454 if (mount("", destpath
, NULL
, MS_SLAVE
, 0)) {
1455 SYSERROR("failed to make %s slave", destpath
);
1458 if (mount("none", destpath
, "tmpfs", 0, "size=10000,mode=755")) {
1459 SYSERROR("Failed to mount tmpfs / at %s", destpath
);
1462 ret
= snprintf(path
, MAXPATHLEN
, "%s/root", destpath
);
1463 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1464 ERROR("out of memory making root path");
1467 if (mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
)) {
1468 SYSERROR("Failed to create /dev/pts in container");
1471 if (mount("/", path
, NULL
, MS_BIND
|MS_REC
, 0)) {
1472 SYSERROR("Failed to rbind mount / to %s", path
);
1475 if (mount("", destpath
, NULL
, MS_SLAVE
|MS_REC
, 0)) {
1476 SYSERROR("Failed to make tmp-/ at %s rslave", path
);
1480 SYSERROR("Failed to chroot into tmp-/");
1484 SYSERROR("Failed to chdir into tmp-/");
1487 INFO("Chrooted into tmp-/ at %s", path
);
/*
 * setup_rootfs - prepare mount propagation and mount the container rootfs.
 *
 * Without a configured rootfs->path, "/" is made recursively slave
 * (MS_SLAVE|MS_REC).  Otherwise rootfs->mount must exist (access F_OK);
 * chroot_into_slave() is used when detect_ramfs_rootfs() reports true, else
 * "/" is made rslave when detect_shared_rootfs() reports true.  The rootfs is
 * then mounted through a bdev (bdev_init() + bdev->ops->mount()) first, and
 * mount_rootfs() is also called -- presumably only as the fallback when the
 * bdev path did not succeed (the connecting lines are not in this listing).
 */
1491 static int setup_rootfs(struct lxc_conf
*conf
)
1493 const struct lxc_rootfs
*rootfs
= &conf
->rootfs
;
1495 if (!rootfs
->path
) {
1496 if (mount("", "/", NULL
, MS_SLAVE
|MS_REC
, 0)) {
1497 SYSERROR("Failed to make / rslave");
1503 if (access(rootfs
->mount
, F_OK
)) {
1504 SYSERROR("failed to access to '%s', check it is present",
1509 if (detect_ramfs_rootfs()) {
1510 if (chroot_into_slave(conf
)) {
1511 ERROR("Failed to chroot into slave /");
1514 } else if (detect_shared_rootfs()) {
1515 if (mount("", "/", NULL
, MS_SLAVE
|MS_REC
, 0)) {
1516 SYSERROR("Failed to make / rslave");
1521 // First try mounting rootfs using a bdev
1522 struct bdev
*bdev
= bdev_init(rootfs
->path
, rootfs
->mount
, rootfs
->options
);
1523 if (bdev
&& bdev
->ops
->mount(bdev
) == 0) {
1525 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
1530 if (mount_rootfs(rootfs
->path
, rootfs
->mount
, rootfs
->options
)) {
1531 ERROR("failed to mount rootfs");
1535 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
/*
 * setup_pivot_root - thin wrapper that calls setup_rootfs_pivot_root() with
 * rootfs->mount and rootfs->pivot and logs an error when that call reports
 * failure.  Any precondition checks are not visible in this listing.
 */
1540 static int setup_pivot_root(const struct lxc_rootfs
*rootfs
)
1545 if (setup_rootfs_pivot_root(rootfs
->mount
, rootfs
->pivot
)) {
1546 ERROR("failed to setup pivot root");
/*
 * setup_pts - give the container its own devpts instance.
 *
 * If /dev/pts/ptmx is accessible, /dev/pts is unmounted first.  A new devpts
 * instance is then mounted on /dev/pts with
 * "newinstance,ptmxmode=0666,mode=0620,gid=5".  /dev/ptmx is made to refer
 * to /dev/pts/ptmx: by symlink when /dev/ptmx does not exist, otherwise by a
 * bind mount (presumably skipped when realpath() shows /dev/ptmx already
 * resolves to /dev/pts/ptmx -- the branch target is not in this listing).
 */
1553 static int setup_pts(int pts
)
1555 char target
[PATH_MAX
];
1560 if (!access("/dev/pts/ptmx", F_OK
) && umount("/dev/pts")) {
1561 SYSERROR("failed to umount 'dev/pts'");
1565 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL
,
1566 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
1567 SYSERROR("failed to mount a new instance of '/dev/pts'");
1571 if (access("/dev/ptmx", F_OK
)) {
1572 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1574 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
1578 if (realpath("/dev/ptmx", target
) && !strcmp(target
, "/dev/pts/ptmx"))
1581 /* fallback here, /dev/pts/ptmx exists just mount bind */
1582 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND
, 0)) {
1583 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
1587 INFO("created new pts instance");
/*
 * setup_personality - when built with HAVE_SYS_PERSONALITY_H, apply `persona`
 * with personality(); a negative return is logged with SYSERROR, success is
 * logged with INFO.  Handling of special persona values is not visible here.
 */
1593 static int setup_personality(int persona
)
1595 #if HAVE_SYS_PERSONALITY_H
1599 if (personality(persona
) < 0) {
1600 SYSERROR("failed to set personality to '0x%x'", persona
);
1604 INFO("set personality to '0x%x'", persona
);
/*
 * setup_dev_console - bind-mount the container console over
 * <rootfs->mount>/dev/console.
 *
 * Visible steps: build the path (a truncated path is rejected), warn when the
 * file does not exist, check console->master < 0, stat() the existing node,
 * chmod() console->name to the same st_mode, then bind-mount console->name
 * onto the path.
 *
 * NOTE(review): the snprintf result is only compared with sizeof(path); a
 * negative return is not tested separately in the visible lines.
 */
1610 static int setup_dev_console(const struct lxc_rootfs
*rootfs
,
1611 const struct lxc_console
*console
)
1613 char path
[MAXPATHLEN
];
1617 ret
= snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1618 if (ret
>= sizeof(path
)) {
1619 ERROR("console path too long");
1623 if (access(path
, F_OK
)) {
1624 WARN("rootfs specified but no console found at '%s'", path
);
1628 if (console
->master
< 0) {
1633 if (stat(path
, &s
)) {
1634 SYSERROR("failed to stat '%s'", path
);
1638 if (chmod(console
->name
, s
.st_mode
)) {
1639 SYSERROR("failed to set mode '0%o' to '%s'",
1640 s
.st_mode
, console
->name
);
1644 if (mount(console
->name
, path
, "none", MS_BIND
, 0)) {
1645 ERROR("failed to mount '%s' on '%s'", console
->name
, path
);
1649 INFO("console has been setup");
/*
 * setup_ttydir_console - place the console under <rootfs>/dev/<ttydir>/console
 * and make <rootfs>/dev/console a symlink to it.
 *
 * Visible steps: mkdir <rootfs>/dev/<ttydir> with mode 0755 (EEXIST is
 * tolerated), build <rootfs>/dev/<ttydir>/console, report an unlink error on
 * <rootfs>/dev/console unless errno is ENOENT, creat() the new console file
 * with mode 0660 (EEXIST tolerated), check console->master < 0, bind-mount
 * console->name onto the new file, and finally symlink "<ttydir>/console" at
 * <rootfs>/dev/console.
 *
 * NOTE(review): the snprintf() that rebuilds "<rootfs>/dev/console" is not
 * checked for truncation in the visible lines.
 */
1653 static int setup_ttydir_console(const struct lxc_rootfs
*rootfs
,
1654 const struct lxc_console
*console
,
1657 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
1660 /* create rootfs/dev/<ttydir> directory */
1661 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
,
1663 if (ret
>= sizeof(path
))
1665 ret
= mkdir(path
, 0755);
1666 if (ret
&& errno
!= EEXIST
) {
1667 SYSERROR("failed with errno %d to create %s", errno
, path
);
1670 INFO("created %s", path
);
1672 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/console",
1673 rootfs
->mount
, ttydir
);
1674 if (ret
>= sizeof(lxcpath
)) {
1675 ERROR("console path too long");
1679 snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1681 if (ret
&& errno
!= ENOENT
) {
1682 SYSERROR("error unlinking %s", path
);
1686 ret
= creat(lxcpath
, 0660);
1687 if (ret
==-1 && errno
!= EEXIST
) {
1688 SYSERROR("error %d creating %s", errno
, lxcpath
);
1694 if (console
->master
< 0) {
1699 if (mount(console
->name
, lxcpath
, "none", MS_BIND
, 0)) {
1700 ERROR("failed to mount '%s' on '%s'", console
->name
, lxcpath
);
1704 /* create symlink from rootfs/dev/console to 'lxc/console' */
1705 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/console", ttydir
);
1706 if (ret
>= sizeof(lxcpath
)) {
1707 ERROR("lxc/console path too long");
1710 ret
= symlink(lxcpath
, path
);
1712 SYSERROR("failed to create symlink for console");
1716 INFO("console has been setup on %s", lxcpath
);
/*
 * setup_console - dispatch console setup to setup_dev_console() or
 * setup_ttydir_console().  Per the inline comment, nothing needs mounting
 * when there is no rootfs because /dev/console is then shared.
 * The conditions selecting each branch are not visible in this listing.
 */
1721 static int setup_console(const struct lxc_rootfs
*rootfs
,
1722 const struct lxc_console
*console
,
1725 /* We don't have a rootfs, /dev/console will be shared */
1729 return setup_dev_console(rootfs
, console
);
1731 return setup_ttydir_console(rootfs
, console
, ttydir
);
/*
 * setup_kmsg - replace <rootfs->mount>/dev/kmsg with a symlink to "console".
 *
 * Builds the kmsg path (error or truncation is rejected), unlinks whatever is
 * there (ENOENT is tolerated) and creates the relative symlink "console".
 * The `console` argument is not used in the visible lines.
 */
1734 static int setup_kmsg(const struct lxc_rootfs
*rootfs
,
1735 const struct lxc_console
*console
)
1737 char kpath
[MAXPATHLEN
];
1742 ret
= snprintf(kpath
, sizeof(kpath
), "%s/dev/kmsg", rootfs
->mount
);
1743 if (ret
< 0 || ret
>= sizeof(kpath
))
1746 ret
= unlink(kpath
);
1747 if (ret
&& errno
!= ENOENT
) {
1748 SYSERROR("error unlinking %s", kpath
);
1752 ret
= symlink("console", kpath
);
1754 SYSERROR("failed to create symlink for kmsg");
/*
 * parse_mntopt - classify one mount option token.
 *
 * Walks the NULL-name-terminated mount_opt table and compares `opt` with each
 * entry using strncmp() over the entry name's length, i.e. a prefix match.
 * Per the inline comment a matching entry sets or clears bits in *flags and
 * anything else is appended to *data; only the clearing statement is visible
 * in this listing.
 */
1761 static void parse_mntopt(char *opt
, unsigned long *flags
, char **data
)
1763 struct mount_opt
*mo
;
1765 /* If opt is found in mount_opt, set or clear flags.
1766 * Otherwise append it to data. */
1768 for (mo
= &mount_opt
[0]; mo
->name
!= NULL
; mo
++) {
1769 if (!strncmp(opt
, mo
->name
, strlen(mo
->name
))) {
1771 *flags
&= ~mo
->flag
;
/*
 * parse_mntopts - split a comma separated mount option string.
 *
 * Works on a strdup() copy of `mntopts`, allocates a data buffer of
 * strlen(copy) + 1 bytes, and feeds every strtok_r() token to parse_mntopt()
 * together with `mntflags` and the data buffer.
 * How the buffers are released or handed back to the caller is not visible
 * in this listing.
 */
1783 int parse_mntopts(const char *mntopts
, unsigned long *mntflags
,
1787 char *p
, *saveptr
= NULL
;
1795 s
= strdup(mntopts
);
1797 SYSERROR("failed to allocate memory");
1801 data
= malloc(strlen(s
) + 1);
1803 SYSERROR("failed to allocate memory");
1809 for (p
= strtok_r(s
, ",", &saveptr
); p
!= NULL
;
1810 p
= strtok_r(NULL
, ",", &saveptr
))
1811 parse_mntopt(p
, mntflags
, &data
);
/*
 * mount_entry - mount `fsname` on `target`.
 *
 * The first mount() call is made with MS_REMOUNT masked out of `mountflags`.
 * When MS_REMOUNT or MS_BIND was requested, a second mount() with MS_REMOUNT
 * added is issued so that the requested options take effect (see the DEBUG
 * message "to respect bind or remount options").
 */
1822 static int mount_entry(const char *fsname
, const char *target
,
1823 const char *fstype
, unsigned long mountflags
,
1826 if (mount(fsname
, target
, fstype
, mountflags
& ~MS_REMOUNT
, data
)) {
1827 SYSERROR("failed to mount '%s' on '%s'", fsname
, target
);
1831 if ((mountflags
& MS_REMOUNT
) || (mountflags
& MS_BIND
)) {
1833 DEBUG("remounting %s on %s to respect bind or remount options",
1836 if (mount(fsname
, target
, fstype
,
1837 mountflags
| MS_REMOUNT
, data
)) {
1838 SYSERROR("failed to mount '%s' on '%s'",
1844 DEBUG("mounted '%s' on '%s', type '%s'", fsname
, target
, fstype
);
/*
 * cull_mntent_opt - strip lxc-only options from mntent->mnt_opts in place.
 *
 * For every entry of the NULL-terminated `list`, the option is located with
 * strstr(); if a ',' follows it the remainder of the string is moved over it
 * with memmove(), and per the inline comment the string is chopped at that
 * point when no further option follows.
 * NOTE(review): strstr() matches substrings, so an option that merely
 * contains one of the listed words would also be hit -- confirm acceptable.
 */
1850 * Remove 'optional', 'create=dir', and 'create=file' from mntopt
1852 static void cull_mntent_opt(struct mntent
*mntent
)
1856 char *list
[] = {"create=dir",
1861 for (i
=0; list
[i
]; i
++) {
1862 if (!(p
= strstr(mntent
->mnt_opts
, list
[i
])))
1864 p2
= strchr(p
, ',');
1866 /* no more mntopts, so just chop it here */
1870 memmove(p
, p2
+1, strlen(p2
+1)+1);
/*
 * mount_entry_on_systemfs - mount an fstab entry using mnt_dir as given
 * (no rootfs prefix is applied).
 *
 * Before mounting: "create=dir" creates mnt_dir with mkdir_p(0755);
 * "create=file" (only when mnt_dir does not exist) creates the parent
 * directory with mkdir_p(0755) and then the file itself with fopen("wb").
 * The lxc-only options are then removed with cull_mntent_opt(), the rest is
 * parsed with parse_mntopts() and passed to mount_entry().
 * The "optional" option is detected via hasmntopt(); how it affects the
 * result is not visible in this listing.
 *
 * NOTE(review): the strdup() result is passed to dirname() without a visible
 * NULL check.
 */
1874 static inline int mount_entry_on_systemfs(struct mntent
*mntent
)
1876 unsigned long mntflags
;
1879 FILE *pathfile
= NULL
;
1880 char* pathdirname
= NULL
;
1881 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1883 if (hasmntopt(mntent
, "create=dir")) {
1884 if (mkdir_p(mntent
->mnt_dir
, 0755) < 0) {
1885 WARN("Failed to create mount target '%s'", mntent
->mnt_dir
);
1890 if (hasmntopt(mntent
, "create=file") && access(mntent
->mnt_dir
, F_OK
)) {
1891 pathdirname
= strdup(mntent
->mnt_dir
);
1892 pathdirname
= dirname(pathdirname
);
1893 if (mkdir_p(pathdirname
, 0755) < 0) {
1894 WARN("Failed to create target directory");
1896 pathfile
= fopen(mntent
->mnt_dir
, "wb");
1898 WARN("Failed to create mount target '%s'", mntent
->mnt_dir
);
1905 cull_mntent_opt(mntent
);
1907 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
1912 ret
= mount_entry(mntent
->mnt_fsname
, mntent
->mnt_dir
,
1913 mntent
->mnt_type
, mntflags
, mntdata
);
/*
 * mount_entry_on_absolute_rootfs - mount an fstab entry whose mnt_dir is an
 * absolute path that points inside the container rootfs.
 *
 * mnt_dir is searched (strstr) for "<lxc.lxcpath>/<lxc_name>/rootfs" and for
 * rootfs->path; the matched prefix length is kept in `offset` and the target
 * is rebuilt below rootfs->mount.  A mount point matching neither prefix is
 * ignored with a warning.  "create=dir" / "create=file" are honoured on the
 * rebuilt path in the same way as in the other mount_entry_on_* helpers, then
 * the options are culled, parsed and handed to mount_entry().
 *
 * NOTE(review): the strdup() result is passed to dirname() without a visible
 * NULL check.
 */
1924 static int mount_entry_on_absolute_rootfs(struct mntent
*mntent
,
1925 const struct lxc_rootfs
*rootfs
,
1926 const char *lxc_name
)
1929 char path
[MAXPATHLEN
];
1930 unsigned long mntflags
;
1932 int r
, ret
= 0, offset
;
1933 const char *lxcpath
;
1934 FILE *pathfile
= NULL
;
1935 char *pathdirname
= NULL
;
1936 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1938 lxcpath
= lxc_global_config_value("lxc.lxcpath");
1940 ERROR("Out of memory");
1944 /* if rootfs->path is a blockdev path, allow container fstab to
1945 * use $lxcpath/CN/rootfs as the target prefix */
1946 r
= snprintf(path
, MAXPATHLEN
, "%s/%s/rootfs", lxcpath
, lxc_name
);
1947 if (r
< 0 || r
>= MAXPATHLEN
)
1950 aux
= strstr(mntent
->mnt_dir
, path
);
1952 offset
= strlen(path
);
1957 aux
= strstr(mntent
->mnt_dir
, rootfs
->path
);
1959 WARN("ignoring mount point '%s'", mntent
->mnt_dir
);
1962 offset
= strlen(rootfs
->path
);
1966 r
= snprintf(path
, MAXPATHLEN
, "%s/%s", rootfs
->mount
,
1968 if (r
< 0 || r
>= MAXPATHLEN
) {
1969 WARN("pathnme too long for '%s'", mntent
->mnt_dir
);
1974 if (hasmntopt(mntent
, "create=dir")) {
1975 if (mkdir_p(path
, 0755) < 0) {
1976 WARN("Failed to create mount target '%s'", path
);
1981 if (hasmntopt(mntent
, "create=file") && access(path
, F_OK
)) {
1982 pathdirname
= strdup(path
);
1983 pathdirname
= dirname(pathdirname
);
1984 if (mkdir_p(pathdirname
, 0755) < 0) {
1985 WARN("Failed to create target directory");
1987 pathfile
= fopen(path
, "wb");
1989 WARN("Failed to create mount target '%s'", path
);
1995 cull_mntent_opt(mntent
);
1997 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
2002 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
,
/*
 * mount_entry_on_relative_rootfs - mount an fstab entry whose mnt_dir is
 * relative to the container root.
 *
 * The target is "<rootfs>/<mnt_dir>" (a truncated path is rejected).
 * "create=dir" creates the target with mkdir_p(0755); "create=file" (only
 * when the target does not exist) creates its parent directory and then the
 * file via fopen("wb").  The options are then culled, parsed and handed to
 * mount_entry().
 *
 * NOTE(review): the strdup() result is passed to dirname() without a visible
 * NULL check.
 */
2015 static int mount_entry_on_relative_rootfs(struct mntent
*mntent
,
2018 char path
[MAXPATHLEN
];
2019 unsigned long mntflags
;
2022 FILE *pathfile
= NULL
;
2023 char *pathdirname
= NULL
;
2024 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
2026 /* relative to root mount point */
2027 ret
= snprintf(path
, sizeof(path
), "%s/%s", rootfs
, mntent
->mnt_dir
);
2028 if (ret
>= sizeof(path
)) {
2029 ERROR("path name too long");
2033 if (hasmntopt(mntent
, "create=dir")) {
2034 if (mkdir_p(path
, 0755) < 0) {
2035 WARN("Failed to create mount target '%s'", path
);
2040 if (hasmntopt(mntent
, "create=file") && access(path
, F_OK
)) {
2041 pathdirname
= strdup(path
);
2042 pathdirname
= dirname(pathdirname
);
2043 if (mkdir_p(pathdirname
, 0755) < 0) {
2044 WARN("Failed to create target directory");
2046 pathfile
= fopen(path
, "wb");
2048 WARN("Failed to create mount target '%s'", path
);
2054 cull_mntent_opt(mntent
);
2056 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
2061 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
,
/*
 * mount_file_entries - mount every entry read from an fstab-format stream.
 *
 * Entries are read with getmntent_r() and dispatched:
 *   - no rootfs->path:               mount_entry_on_systemfs()
 *   - mnt_dir not starting with '/': mount_entry_on_relative_rootfs()
 *   - otherwise:                     mount_entry_on_absolute_rootfs()
 */
2073 static int mount_file_entries(const struct lxc_rootfs
*rootfs
, FILE *file
,
2074 const char *lxc_name
)
2076 struct mntent mntent
;
2080 while (getmntent_r(file
, &mntent
, buf
, sizeof(buf
))) {
2082 if (!rootfs
->path
) {
2083 if (mount_entry_on_systemfs(&mntent
))
2088 /* We have a separate root, mounts are relative to it */
2089 if (mntent
.mnt_dir
[0] != '/') {
2090 if (mount_entry_on_relative_rootfs(&mntent
,
2096 if (mount_entry_on_absolute_rootfs(&mntent
, rootfs
, lxc_name
))
2102 INFO("mount points have been setup");
/*
 * setup_mount - process the container fstab file.
 *
 * Opens `fstab` read-only with setmntent() (a failure is reported with
 * SYSERROR) and hands the stream to mount_file_entries().
 */
2107 static int setup_mount(const struct lxc_rootfs
*rootfs
, const char *fstab
,
2108 const char *lxc_name
)
2116 file
= setmntent(fstab
, "r");
2118 SYSERROR("failed to use '%s'", fstab
);
2122 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
/*
 * setup_mount_entries - process the mount entries given in the configuration.
 *
 * Each element of the `mount` list is written, one per line, to a temporary
 * file (see the "tmpfile error" message), which is then handed to
 * mount_file_entries() so that both sources share one parser.
 */
2128 static int setup_mount_entries(const struct lxc_rootfs
*rootfs
, struct lxc_list
*mount
,
2129 const char *lxc_name
)
2132 struct lxc_list
*iterator
;
2138 ERROR("tmpfile error: %m");
2142 lxc_list_for_each(iterator
, mount
) {
2143 mount_entry
= iterator
->elem
;
2144 fprintf(file
, "%s\n", mount_entry
);
2149 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
/*
 * setup_caps - drop every capability named in `caps` from the bounding set.
 *
 * Each list element is first looked up by name in caps_opt.  Entries not
 * found there are parsed as a decimal number with strtol(); a parse error or
 * a value above lxc_caps_last_cap() makes the entry unknown, which is
 * reported with ERROR.  Known capabilities are dropped with
 * prctl(PR_CAPBSET_DROP, capid).
 */
2155 static int setup_caps(struct lxc_list
*caps
)
2157 struct lxc_list
*iterator
;
2162 lxc_list_for_each(iterator
, caps
) {
2164 drop_entry
= iterator
->elem
;
2168 for (i
= 0; i
< sizeof(caps_opt
)/sizeof(caps_opt
[0]); i
++) {
2170 if (strcmp(drop_entry
, caps_opt
[i
].name
))
2173 capid
= caps_opt
[i
].value
;
2178 /* try to see if it's numeric, so the user may specify
2179 * capabilities that the running kernel knows about but
2182 capid
= strtol(drop_entry
, &ptr
, 10);
2183 if (!ptr
|| *ptr
!= '\0' || errno
!= 0)
2184 /* not a valid number */
2186 else if (capid
> lxc_caps_last_cap())
2187 /* we have a number but it's not a valid
2193 ERROR("unknown capability %s", drop_entry
);
2197 DEBUG("drop capability '%s' (%d)", drop_entry
, capid
);
2199 if (prctl(PR_CAPBSET_DROP
, capid
, 0, 0, 0)) {
2200 SYSERROR("failed to remove %s capability", drop_entry
);
2206 DEBUG("capabilities have been setup");
2211 static int dropcaps_except(struct lxc_list
*caps
)
2213 struct lxc_list
*iterator
;
2217 int numcaps
= lxc_caps_last_cap() + 1;
2218 INFO("found %d capabilities", numcaps
);
2220 if (numcaps
<= 0 || numcaps
> 200)
2223 // caplist[i] is 1 if we keep capability i
2224 int *caplist
= alloca(numcaps
* sizeof(int));
2225 memset(caplist
, 0, numcaps
* sizeof(int));
2227 lxc_list_for_each(iterator
, caps
) {
2229 keep_entry
= iterator
->elem
;
2233 for (i
= 0; i
< sizeof(caps_opt
)/sizeof(caps_opt
[0]); i
++) {
2235 if (strcmp(keep_entry
, caps_opt
[i
].name
))
2238 capid
= caps_opt
[i
].value
;
2243 /* try to see if it's numeric, so the user may specify
2244 * capabilities that the running kernel knows about but
2246 capid
= strtol(keep_entry
, &ptr
, 10);
2247 if (!ptr
|| *ptr
!= '\0' ||
2248 capid
== INT_MIN
|| capid
== INT_MAX
)
2249 /* not a valid number */
2251 else if (capid
> lxc_caps_last_cap())
2252 /* we have a number but it's not a valid
2258 ERROR("unknown capability %s", keep_entry
);
2262 DEBUG("drop capability '%s' (%d)", keep_entry
, capid
);
2266 for (i
=0; i
<numcaps
; i
++) {
2269 if (prctl(PR_CAPBSET_DROP
, i
, 0, 0, 0)) {
2270 SYSERROR("failed to remove capability %d", i
);
2275 DEBUG("capabilities have been setup");
/*
 * setup_hw_addr - set the MAC address of interface `ifname`.
 *
 * `hwaddr` is converted with lxc_convert_mac() into a struct sockaddr, which
 * is copied into ifr.ifr_hwaddr; the request is issued with
 * ioctl(SIOCSIFHWADDR) on an AF_INET/SOCK_DGRAM socket.
 *
 * NOTE(review): memcpy() reads IFNAMSIZ bytes from `ifname` regardless of the
 * string length -- confirm that every caller passes a buffer of at least
 * IFNAMSIZ bytes.
 */
2280 static int setup_hw_addr(char *hwaddr
, const char *ifname
)
2282 struct sockaddr sockaddr
;
2286 ret
= lxc_convert_mac(hwaddr
, &sockaddr
);
2288 ERROR("mac address '%s' conversion failed : %s",
2289 hwaddr
, strerror(-ret
));
2293 memcpy(ifr
.ifr_name
, ifname
, IFNAMSIZ
);
2294 ifr
.ifr_name
[IFNAMSIZ
-1] = '\0';
2295 memcpy((char *) &ifr
.ifr_hwaddr
, (char *) &sockaddr
, sizeof(sockaddr
));
2297 fd
= socket(AF_INET
, SOCK_DGRAM
, 0);
2299 ERROR("socket failure : %s", strerror(errno
));
2303 ret
= ioctl(fd
, SIOCSIFHWADDR
, &ifr
);
2306 ERROR("ioctl failure : %s", strerror(errno
));
2308 DEBUG("mac address '%s' on '%s' has been setup", hwaddr
, ifr
.ifr_name
);
/*
 * setup_ipv4_addr - add every IPv4 address in list `ip` to interface
 * `ifindex`.
 *
 * Each element is a struct lxc_inetdev; its addr, bcast and prefix are passed
 * to lxc_ipv4_addr_add().  The error message formats the call's return value
 * with strerror(-err).
 */
2313 static int setup_ipv4_addr(struct lxc_list
*ip
, int ifindex
)
2315 struct lxc_list
*iterator
;
2316 struct lxc_inetdev
*inetdev
;
2319 lxc_list_for_each(iterator
, ip
) {
2321 inetdev
= iterator
->elem
;
2323 err
= lxc_ipv4_addr_add(ifindex
, &inetdev
->addr
,
2324 &inetdev
->bcast
, inetdev
->prefix
);
2326 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2327 ifindex
, strerror(-err
));
/*
 * setup_ipv6_addr - add every IPv6 address in list `ip` to interface
 * `ifindex`.
 *
 * Each element is a struct lxc_inet6dev; its addr, mcast and acast (plus
 * arguments not visible in this listing) are passed to lxc_ipv6_addr_add().
 * The error message formats the call's return value with strerror(-err).
 */
2335 static int setup_ipv6_addr(struct lxc_list
*ip
, int ifindex
)
2337 struct lxc_list
*iterator
;
2338 struct lxc_inet6dev
*inet6dev
;
2341 lxc_list_for_each(iterator
, ip
) {
2343 inet6dev
= iterator
->elem
;
2345 err
= lxc_ipv6_addr_add(ifindex
, &inet6dev
->addr
,
2346 &inet6dev
->mcast
, &inet6dev
->acast
,
2349 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2350 ifindex
, strerror(-err
));
/*
 * setup_netdev - configure one network device from inside the container.
 *
 * Visible sequence:
 *   - ifindex == 0 ("empty network namespace"): optionally bring up "lo";
 *     for non-veth types look the index up from netdev->name.
 *   - physical devices: re-read the ifindex from netdev->link.
 *   - resolve the current interface name, pick a default target name
 *     (netdev->link for physical devices, "eth%d" otherwise), rename the
 *     interface if needed and re-read the resulting name.
 *   - set the MAC address, IPv4 and IPv6 addresses.
 *   - if IFF_UP is requested, bring up the interface and "lo".
 *   - add the IPv4 / IPv6 default gateways; this needs IFF_UP and, except
 *     for a link-local IPv6 gateway, at least one configured address.
 *     lxc_ipv*_gateway_add() is visibly called twice with
 *     lxc_ipv*_dest_add() in between, presumably as a retry after adding a
 *     route to the gateway (the connecting conditions are not in this
 *     listing).
 */
2358 static int setup_netdev(struct lxc_netdev
*netdev
)
2360 char ifname
[IFNAMSIZ
];
2361 char *current_ifname
= ifname
;
2364 /* empty network namespace */
2365 if (!netdev
->ifindex
) {
2366 if (netdev
->flags
& IFF_UP
) {
2367 err
= lxc_netdev_up("lo");
2369 ERROR("failed to set the loopback up : %s",
2374 if (netdev
->type
!= LXC_NET_VETH
)
2376 netdev
->ifindex
= if_nametoindex(netdev
->name
);
2379 /* get the new ifindex in case of physical netdev */
2380 if (netdev
->type
== LXC_NET_PHYS
) {
2381 if (!(netdev
->ifindex
= if_nametoindex(netdev
->link
))) {
2382 ERROR("failed to get ifindex for %s",
2388 /* retrieve the name of the interface */
2389 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2390 ERROR("no interface corresponding to index '%d'",
2395 /* default: let the system to choose one interface name */
2397 netdev
->name
= netdev
->type
== LXC_NET_PHYS
?
2398 netdev
->link
: "eth%d";
2400 /* rename the interface name */
2401 if (strcmp(ifname
, netdev
->name
) != 0) {
2402 err
= lxc_netdev_rename_by_name(ifname
, netdev
->name
);
2404 ERROR("failed to rename %s->%s : %s", ifname
, netdev
->name
,
2410 /* Re-read the name of the interface because its name has changed
2411 * and would be automatically allocated by the system
2413 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2414 ERROR("no interface corresponding to index '%d'",
2419 /* set a mac address */
2420 if (netdev
->hwaddr
) {
2421 if (setup_hw_addr(netdev
->hwaddr
, current_ifname
)) {
2422 ERROR("failed to setup hw address for '%s'",
2428 /* setup ipv4 addresses on the interface */
2429 if (setup_ipv4_addr(&netdev
->ipv4
, netdev
->ifindex
)) {
2430 ERROR("failed to setup ip addresses for '%s'",
2435 /* setup ipv6 addresses on the interface */
2436 if (setup_ipv6_addr(&netdev
->ipv6
, netdev
->ifindex
)) {
2437 ERROR("failed to setup ipv6 addresses for '%s'",
2442 /* set the network device up */
2443 if (netdev
->flags
& IFF_UP
) {
2446 err
= lxc_netdev_up(current_ifname
);
2448 ERROR("failed to set '%s' up : %s", current_ifname
,
2453 /* the network is up, make the loopback up too */
2454 err
= lxc_netdev_up("lo");
2456 ERROR("failed to set the loopback up : %s",
2462 /* We can only set up the default routes after bringing
2463 * up the interface, since bringing up the interface adds
2464 * the link-local routes and we can't add a default
2465 * route if the gateway is not reachable. */
2467 /* setup ipv4 gateway on the interface */
2468 if (netdev
->ipv4_gateway
) {
2469 if (!(netdev
->flags
& IFF_UP
)) {
2470 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname
);
2474 if (lxc_list_empty(&netdev
->ipv4
)) {
2475 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname
);
2479 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2481 err
= lxc_ipv4_dest_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2483 ERROR("failed to add ipv4 dest for '%s': %s",
2484 ifname
, strerror(-err
));
2487 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2489 ERROR("failed to setup ipv4 gateway for '%s': %s",
2490 ifname
, strerror(-err
));
2491 if (netdev
->ipv4_gateway_auto
) {
2492 char buf
[INET_ADDRSTRLEN
];
2493 inet_ntop(AF_INET
, netdev
->ipv4_gateway
, buf
, sizeof(buf
));
2494 ERROR("tried to set autodetected ipv4 gateway '%s'", buf
);
2501 /* setup ipv6 gateway on the interface */
2502 if (netdev
->ipv6_gateway
) {
2503 if (!(netdev
->flags
& IFF_UP
)) {
2504 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname
);
2508 if (lxc_list_empty(&netdev
->ipv6
) && !IN6_IS_ADDR_LINKLOCAL(netdev
->ipv6_gateway
)) {
2509 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname
);
2513 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2515 err
= lxc_ipv6_dest_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2517 ERROR("failed to add ipv6 dest for '%s': %s",
2518 ifname
, strerror(-err
));
2521 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2523 ERROR("failed to setup ipv6 gateway for '%s': %s",
2524 ifname
, strerror(-err
));
2525 if (netdev
->ipv6_gateway_auto
) {
2526 char buf
[INET6_ADDRSTRLEN
];
2527 inet_ntop(AF_INET6
, netdev
->ipv6_gateway
, buf
, sizeof(buf
));
2528 ERROR("tried to set autodetected ipv6 gateway '%s'", buf
);
2535 DEBUG("'%s' has been setup", current_ifname
);
/*
 * setup_network - run setup_netdev() on every device in the `network` list.
 *
 * A failing device is reported with ERROR; the INFO message is only emitted
 * when the list is not empty.
 */
2540 static int setup_network(struct lxc_list
*network
)
2542 struct lxc_list
*iterator
;
2543 struct lxc_netdev
*netdev
;
2545 lxc_list_for_each(iterator
, network
) {
2547 netdev
= iterator
->elem
;
2549 if (setup_netdev(netdev
)) {
2550 ERROR("failed to setup netdev");
2555 if (!lxc_list_empty(network
))
2556 INFO("network has been setup");
/*
 * restore_phys_nics_to_netns - move saved physical NICs out of the container
 * network namespace.
 *
 * Opens /proc/self/ns/net to remember the caller's namespace, enters the
 * namespace referred to by `netnsfd` with setns(), calls
 * lxc_netdev_move_by_index(ifindex, 1) for every entry of conf->saved_nics
 * (a failure only produces a warning), then switches back with setns() on
 * the saved descriptor.
 */
2561 /* try to move physical nics to the init netns */
2562 void restore_phys_nics_to_netns(int netnsfd
, struct lxc_conf
*conf
)
2565 char path
[MAXPATHLEN
];
2570 ret
= snprintf(path
, MAXPATHLEN
, "/proc/self/ns/net");
2571 if (ret
< 0 || ret
>= MAXPATHLEN
) {
2572 WARN("Failed to open monitor netns fd");
2575 if ((oldfd
= open(path
, O_RDONLY
)) < 0) {
2576 SYSERROR("Failed to open monitor netns fd");
2579 if (setns(netnsfd
, 0) != 0) {
2580 SYSERROR("Failed to enter container netns to reset nics");
2584 for (i
=0; i
<conf
->num_savednics
; i
++) {
2585 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2586 if (lxc_netdev_move_by_index(s
->ifindex
, 1))
2587 WARN("Error moving nic index:%d back to host netns",
2590 if (setns(oldfd
, 0) != 0)
2591 SYSERROR("Failed to re-enter monitor's netns");
/*
 * lxc_rename_phys_nics_on_shutdown - give saved physical NICs their original
 * names back.
 *
 * Checks conf->num_savednics == 0 first (presumably returning early).
 * Otherwise the NICs are moved back with restore_phys_nics_to_netns(), each
 * one is renamed to its saved orig_name with lxc_netdev_rename_by_index()
 * (the result is not checked in the visible lines), and num_savednics is
 * reset to 0.
 */
2595 void lxc_rename_phys_nics_on_shutdown(int netnsfd
, struct lxc_conf
*conf
)
2599 if (conf
->num_savednics
== 0)
2602 INFO("running to reset %d nic names", conf
->num_savednics
);
2603 restore_phys_nics_to_netns(netnsfd
, conf
);
2604 for (i
=0; i
<conf
->num_savednics
; i
++) {
2605 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2606 INFO("resetting nic %d to %s", s
->ifindex
, s
->orig_name
);
2607 lxc_netdev_rename_by_index(s
->ifindex
, s
->orig_name
);
2610 conf
->num_savednics
= 0;
/*
 * default_rootfs_mount / lxc_conf_init - allocate a container configuration
 * with default values.
 *
 * The structure is malloc()ed and zeroed, then: loglevel is set to
 * LXC_LOG_PRIORITY_NOTSET, personality to -1, all console descriptors and
 * maincmd_fd to -1, rootfs.mount to a strdup() of default_rootfs_mount
 * (LXCROOTFSMOUNT), every list head is initialised, and every
 * inherit_ns_fd[] slot is set to -1.
 * Allocation failures are logged; the cleanup and return statements are not
 * visible in this listing.
 */
2613 static char *default_rootfs_mount
= LXCROOTFSMOUNT
;
2615 struct lxc_conf
*lxc_conf_init(void)
2617 struct lxc_conf
*new;
2620 new = malloc(sizeof(*new));
2622 ERROR("lxc_conf_init : %m");
2625 memset(new, 0, sizeof(*new));
2627 new->loglevel
= LXC_LOG_PRIORITY_NOTSET
;
2628 new->personality
= -1;
2630 new->console
.log_path
= NULL
;
2631 new->console
.log_fd
= -1;
2632 new->console
.path
= NULL
;
2633 new->console
.peer
= -1;
2634 new->console
.peerpty
.busy
= -1;
2635 new->console
.peerpty
.master
= -1;
2636 new->console
.peerpty
.slave
= -1;
2637 new->console
.master
= -1;
2638 new->console
.slave
= -1;
2639 new->console
.name
[0] = '\0';
2640 new->maincmd_fd
= -1;
2641 new->rootfs
.mount
= strdup(default_rootfs_mount
);
2642 if (!new->rootfs
.mount
) {
2643 ERROR("lxc_conf_init : %m");
2648 lxc_list_init(&new->cgroup
);
2649 lxc_list_init(&new->network
);
2650 lxc_list_init(&new->mount_list
);
2651 lxc_list_init(&new->caps
);
2652 lxc_list_init(&new->keepcaps
);
2653 lxc_list_init(&new->id_map
);
2654 for (i
=0; i
<NUM_LXC_HOOKS
; i
++)
2655 lxc_list_init(&new->hooks
[i
]);
2656 lxc_list_init(&new->groups
);
2657 new->lsm_aa_profile
= NULL
;
2658 new->lsm_se_context
= NULL
;
2659 new->tmp_umount_proc
= 0;
2661 for (i
= 0; i
< LXC_NS_MAX
; i
++)
2662 new->inherit_ns_fd
[i
] = -1;
/*
 * instanciate_veth - create the veth pair for `netdev` on the host side.
 *
 * veth1 is the configured pair name (netdev->priv.veth_attr.pair) or a
 * generated "vethXXXXXX" name from lxc_mkifname(), which is also stored in
 * veth_attr.veth1 "for deconf"; veth2 is always generated.  After
 * lxc_veth_create() the host end gets a private MAC address
 * (setup_private_host_hw_addr), both ends get the configured MTU, veth1 is
 * attached to the bridge netdev->link, netdev->ifindex is set from veth2,
 * veth1 is brought up and the optional upscript is run with
 * "up" "veth" <veth1>.  The error path deletes veth1 by name.
 *
 * NOTE(review): memcpy() into veth_attr.veth1 reads IFNAMSIZ bytes from the
 * generated name -- confirm lxc_mkifname() returns a buffer at least that
 * large.
 * NOTE(review): the MTU is parsed with atoi(), which cannot report errors.
 */
2667 static int instanciate_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2669 char veth1buf
[IFNAMSIZ
], *veth1
;
2670 char veth2buf
[IFNAMSIZ
], *veth2
;
2673 if (netdev
->priv
.veth_attr
.pair
)
2674 veth1
= netdev
->priv
.veth_attr
.pair
;
2676 err
= snprintf(veth1buf
, sizeof(veth1buf
), "vethXXXXXX");
2677 if (err
>= sizeof(veth1buf
)) { /* can't *really* happen, but... */
2678 ERROR("veth1 name too long");
2681 veth1
= lxc_mkifname(veth1buf
);
2683 ERROR("failed to allocate a temporary name");
2686 /* store away for deconf */
2687 memcpy(netdev
->priv
.veth_attr
.veth1
, veth1
, IFNAMSIZ
);
2690 snprintf(veth2buf
, sizeof(veth2buf
), "vethXXXXXX");
2691 veth2
= lxc_mkifname(veth2buf
);
2693 ERROR("failed to allocate a temporary name");
2697 err
= lxc_veth_create(veth1
, veth2
);
2699 ERROR("failed to create %s-%s : %s", veth1
, veth2
,
2704 /* changing the high byte of the mac address to 0xfe, the bridge interface
2705 * will always keep the host's mac address and not take the mac address
2707 err
= setup_private_host_hw_addr(veth1
);
2709 ERROR("failed to change mac address of host interface '%s' : %s",
2710 veth1
, strerror(-err
));
2715 err
= lxc_netdev_set_mtu(veth1
, atoi(netdev
->mtu
));
2717 err
= lxc_netdev_set_mtu(veth2
, atoi(netdev
->mtu
));
2719 ERROR("failed to set mtu '%s' for %s-%s : %s",
2720 netdev
->mtu
, veth1
, veth2
, strerror(-err
));
2726 err
= lxc_bridge_attach(netdev
->link
, veth1
);
2728 ERROR("failed to attach '%s' to the bridge '%s' : %s",
2729 veth1
, netdev
->link
, strerror(-err
));
2734 netdev
->ifindex
= if_nametoindex(veth2
);
2735 if (!netdev
->ifindex
) {
2736 ERROR("failed to retrieve the index for %s", veth2
);
2740 err
= lxc_netdev_up(veth1
);
2742 ERROR("failed to set %s up : %s", veth1
, strerror(-err
));
2746 if (netdev
->upscript
) {
2747 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2748 "veth", veth1
, (char*) NULL
);
2753 DEBUG("instanciated veth '%s/%s', index is '%d'",
2754 veth1
, veth2
, netdev
->ifindex
);
2759 lxc_netdev_delete_by_name(veth1
);
2760 if (!netdev
->priv
.veth_attr
.pair
&& veth1
)
/*
 * shutdown_veth - run the configured downscript for a veth device.
 *
 * The host-side name is the configured pair name when set, otherwise the
 * name saved in veth_attr.veth1.  The script is invoked through run_script()
 * with the arguments "down" "veth" <veth1>.
 */
2767 static int shutdown_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2772 if (netdev
->priv
.veth_attr
.pair
)
2773 veth1
= netdev
->priv
.veth_attr
.pair
;
2775 veth1
= netdev
->priv
.veth_attr
.veth1
;
2777 if (netdev
->downscript
) {
2778 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2779 "down", "veth", veth1
, (char*) NULL
);
/*
 * instanciate_macvlan - create a macvlan interface on top of netdev->link.
 *
 * Requires netdev->link.  A temporary name is generated from "mcXXXXXX" with
 * lxc_mkifname(), the interface is created with lxc_macvlan_create() using
 * the configured macvlan mode, netdev->ifindex is looked up from the new
 * name, and the optional upscript is run with "up" "macvlan" <link>.
 * The error path deletes the interface by name.
 */
2786 static int instanciate_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2788 char peerbuf
[IFNAMSIZ
], *peer
;
2791 if (!netdev
->link
) {
2792 ERROR("no link specified for macvlan netdev");
2796 err
= snprintf(peerbuf
, sizeof(peerbuf
), "mcXXXXXX");
2797 if (err
>= sizeof(peerbuf
))
2800 peer
= lxc_mkifname(peerbuf
);
2802 ERROR("failed to make a temporary name");
2806 err
= lxc_macvlan_create(netdev
->link
, peer
,
2807 netdev
->priv
.macvlan_attr
.mode
);
2809 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2810 peer
, netdev
->link
, strerror(-err
));
2814 netdev
->ifindex
= if_nametoindex(peer
);
2815 if (!netdev
->ifindex
) {
2816 ERROR("failed to retrieve the index for %s", peer
);
2820 if (netdev
->upscript
) {
2821 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2822 "macvlan", netdev
->link
, (char*) NULL
);
2827 DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
2828 peer
, netdev
->ifindex
, netdev
->priv
.macvlan_attr
.mode
);
2832 lxc_netdev_delete_by_name(peer
);
/*
 * shutdown_macvlan - run the configured downscript for a macvlan device via
 * run_script() with the arguments "down" "macvlan" <netdev->link>.
 */
2837 static int shutdown_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2841 if (netdev
->downscript
) {
2842 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2843 "down", "macvlan", netdev
->link
,
2851 /* XXX: merge with instanciate_macvlan */
2852 static int instanciate_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2854 char peer
[IFNAMSIZ
];
2857 if (!netdev
->link
) {
2858 ERROR("no link specified for vlan netdev");
2862 err
= snprintf(peer
, sizeof(peer
), "vlan%d", netdev
->priv
.vlan_attr
.vid
);
2863 if (err
>= sizeof(peer
)) {
2864 ERROR("peer name too long");
2868 err
= lxc_vlan_create(netdev
->link
, peer
, netdev
->priv
.vlan_attr
.vid
);
2870 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2871 peer
, netdev
->link
, strerror(-err
));
2875 netdev
->ifindex
= if_nametoindex(peer
);
2876 if (!netdev
->ifindex
) {
2877 ERROR("failed to retrieve the ifindex for %s", peer
);
2878 lxc_netdev_delete_by_name(peer
);
2882 DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000",
/*
 * shutdown_vlan - teardown hook for VLAN devices.
 * NOTE(review): the function body is not visible in this listing.
 */
2888 static int shutdown_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
/*
 * instanciate_phys - prepare a physical interface for the container.
 *
 * Requires netdev->link, looks up its index with if_nametoindex() into
 * netdev->ifindex, and runs the optional upscript with
 * "up" "phys" <netdev->link>.
 */
2893 static int instanciate_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2895 if (!netdev
->link
) {
2896 ERROR("no link specified for the physical interface");
2900 netdev
->ifindex
= if_nametoindex(netdev
->link
);
2901 if (!netdev
->ifindex
) {
2902 ERROR("failed to retrieve the index for %s", netdev
->link
);
2906 if (netdev
->upscript
) {
2908 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2909 "up", "phys", netdev
->link
, (char*) NULL
);
/*
 * shutdown_phys - run the configured downscript for a physical device via
 * run_script() with the arguments "down" "phys" <netdev->link>.
 */
2917 static int shutdown_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2921 if (netdev
->downscript
) {
2922 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2923 "down", "phys", netdev
->link
, (char*) NULL
);
/*
 * instanciate_none - "none" network type: there is no interface to create,
 * so netdev->ifindex is simply set to 0.
 */
2930 static int instanciate_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2932 netdev
->ifindex
= 0;
/*
 * instanciate_empty - "empty" network type: sets netdev->ifindex to 0 and
 * runs the optional upscript with the arguments "up" "empty".
 */
2936 static int instanciate_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2938 netdev
->ifindex
= 0;
2939 if (netdev
->upscript
) {
2941 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2942 "up", "empty", (char*) NULL
);
/*
 * shutdown_empty - run the configured downscript for an "empty" network via
 * run_script() with the arguments "down" "empty".
 */
2949 static int shutdown_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2953 if (netdev
->downscript
) {
2954 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2955 "down", "empty", (char*) NULL
);
/*
 * shutdown_none - teardown hook for the "none" network type.
 * NOTE(review): the function body is not visible in this listing.
 */
2962 static int shutdown_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
/*
 * lxc_requests_empty_network - inspect handler->conf->network for the "none"
 * network type.
 *
 * Tracks two flags while walking the list, found_none (set for
 * LXC_NET_NONE entries) and found_nic, and finally tests
 * (found_none && !found_nic).  An empty list is handled separately up front.
 * The values returned in each case are not visible in this listing.
 */
2967 int lxc_requests_empty_network(struct lxc_handler
*handler
)
2969 struct lxc_list
*network
= &handler
->conf
->network
;
2970 struct lxc_list
*iterator
;
2971 struct lxc_netdev
*netdev
;
2972 bool found_none
= false, found_nic
= false;
2974 if (lxc_list_empty(network
))
2977 lxc_list_for_each(iterator
, network
) {
2979 netdev
= iterator
->elem
;
2981 if (netdev
->type
== LXC_NET_NONE
)
2986 if (found_none
&& !found_nic
)
/*
 * lxc_create_network - instantiate every configured network device.
 *
 * For each element of handler->conf->network the type is range checked
 * (0 .. LXC_NET_MAXCONFTYPE) and the matching netdev_conf[] callback is
 * invoked with (handler, netdev).  am_root records whether the caller runs
 * as uid 0; how it is used is not visible in this listing.
 */
2991 int lxc_create_network(struct lxc_handler
*handler
)
2993 struct lxc_list
*network
= &handler
->conf
->network
;
2994 struct lxc_list
*iterator
;
2995 struct lxc_netdev
*netdev
;
2996 int am_root
= (getuid() == 0);
3001 lxc_list_for_each(iterator
, network
) {
3003 netdev
= iterator
->elem
;
3005 if (netdev
->type
< 0 || netdev
->type
> LXC_NET_MAXCONFTYPE
) {
3006 ERROR("invalid network configuration type '%d'",
3011 if (netdev_conf
[netdev
->type
](handler
, netdev
)) {
3012 ERROR("failed to create netdev");
/*
 * lxc_delete_network - tear down every configured network device.
 *
 * Physical devices with a known ifindex are renamed back to netdev->link.
 * For devices handled further, the matching netdev_deconf[] callback is run
 * and the interface is deleted by index when one is recorded.  All failures
 * only produce warnings.
 */
3021 void lxc_delete_network(struct lxc_handler
*handler
)
3023 struct lxc_list
*network
= &handler
->conf
->network
;
3024 struct lxc_list
*iterator
;
3025 struct lxc_netdev
*netdev
;
3027 lxc_list_for_each(iterator
, network
) {
3028 netdev
= iterator
->elem
;
3030 if (netdev
->ifindex
!= 0 && netdev
->type
== LXC_NET_PHYS
) {
3031 if (lxc_netdev_rename_by_index(netdev
->ifindex
, netdev
->link
))
3032 WARN("failed to rename to the initial name the " \
3033 "netdev '%s'", netdev
->link
);
3037 if (netdev_deconf
[netdev
->type
](handler
, netdev
)) {
3038 WARN("failed to destroy netdev");
3041 /* Recent kernels remove the virtual interfaces when the network
3042 * namespace is destroyed but in case we did not move the
3043 * interface to the network namespace, we have to destroy it
3045 if (netdev
->ifindex
!= 0 &&
3046 lxc_netdev_delete_by_index(netdev
->ifindex
))
3047 WARN("failed to remove interface '%s'", netdev
->name
);
/* Location of the setuid helper that creates veth pairs for unprivileged users. */
#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"

/*
 * lxc-user-nic returns "interface_name:interface_name\n".
 * The replacement list is parenthesized so the macro expands safely inside
 * larger expressions (e.g. 2 * MAX_BUFFER_SIZE or MAX_BUFFER_SIZE - 1).
 */
#define MAX_BUFFER_SIZE (IFNAMSIZ * 2 + 2)
3055 static int unpriv_assign_nic(struct lxc_netdev
*netdev
, pid_t pid
)
3058 int bytes
, pipefd
[2];
3059 char *token
, *saveptr
= NULL
;
3060 char buffer
[MAX_BUFFER_SIZE
];
3062 if (netdev
->type
!= LXC_NET_VETH
) {
3063 ERROR("nic type %d not support for unprivileged use",
3068 if(pipe(pipefd
) < 0) {
3069 SYSERROR("pipe failed");
3073 if ((child
= fork()) < 0) {
3080 if (child
== 0) { // child
3081 /* close the read-end of the pipe */
3083 /* redirect the stdout to write-end of the pipe */
3084 dup2(pipefd
[1], STDOUT_FILENO
);
3085 /* close the write-end of the pipe */
3088 // Call lxc-user-nic pid type bridge
3090 char *args
[] = {LXC_USERNIC_PATH
, pidstr
, "veth", netdev
->link
, netdev
->name
, NULL
};
3091 snprintf(pidstr
, 19, "%lu", (unsigned long) pid
);
3093 execvp(args
[0], args
);
3094 SYSERROR("execvp lxc-user-nic");
3098 /* close the write-end of the pipe */
3101 bytes
= read(pipefd
[0], &buffer
, MAX_BUFFER_SIZE
);
3103 SYSERROR("read failed");
3105 buffer
[bytes
- 1] = '\0';
3107 if (wait_for_pid(child
) != 0) {
3112 /* close the read-end of the pipe */
3115 /* fill netdev->name field */
3116 token
= strtok_r(buffer
, ":", &saveptr
);
3119 netdev
->name
= malloc(IFNAMSIZ
+1);
3120 if (!netdev
->name
) {
3121 ERROR("Out of memory");
3124 memset(netdev
->name
, 0, IFNAMSIZ
+1);
3125 strncpy(netdev
->name
, token
, IFNAMSIZ
);
3127 /* fill netdev->veth_attr.pair field */
3128 token
= strtok_r(NULL
, ":", &saveptr
);
3131 netdev
->priv
.veth_attr
.pair
= strdup(token
);
3132 if (!netdev
->priv
.veth_attr
.pair
) {
3133 ERROR("Out of memory");
3140 int lxc_assign_network(struct lxc_list
*network
, pid_t pid
)
3142 struct lxc_list
*iterator
;
3143 struct lxc_netdev
*netdev
;
3144 int am_root
= (getuid() == 0);
3147 lxc_list_for_each(iterator
, network
) {
3149 netdev
= iterator
->elem
;
3151 if (netdev
->type
== LXC_NET_VETH
&& !am_root
) {
3152 if (unpriv_assign_nic(netdev
, pid
))
3154 // lxc-user-nic has moved the nic to the new ns.
3155 // unpriv_assign_nic() fills in netdev->name.
3156 // netdev->ifindex will be filed in at setup_netdev.
3160 /* empty network namespace, nothing to move */
3161 if (!netdev
->ifindex
)
3164 err
= lxc_netdev_move_by_index(netdev
->ifindex
, pid
);
3166 ERROR("failed to move '%s' to the container : %s",
3167 netdev
->link
, strerror(-err
));
3171 DEBUG("move '%s' to '%d'", netdev
->name
, pid
);
3177 static int write_id_mapping(enum idtype idtype
, pid_t pid
, const char *buf
,
3180 char path
[PATH_MAX
];
3184 ret
= snprintf(path
, PATH_MAX
, "/proc/%d/%cid_map", pid
, idtype
== ID_TYPE_UID
? 'u' : 'g');
3185 if (ret
< 0 || ret
>= PATH_MAX
) {
3186 fprintf(stderr
, "%s: path name too long\n", __func__
);
3189 f
= fopen(path
, "w");
3194 ret
= fwrite(buf
, buf_size
, 1, f
);
3196 SYSERROR("writing id mapping");
3197 closeret
= fclose(f
);
3199 SYSERROR("writing id mapping");
3200 return ret
< 0 ? ret
: closeret
;
3203 int lxc_map_ids(struct lxc_list
*idmap
, pid_t pid
)
3205 struct lxc_list
*iterator
;
3209 char *buf
= NULL
, *pos
;
3210 int use_shadow
= (on_path("newuidmap") && on_path("newuidmap"));
3212 if (!use_shadow
&& geteuid()) {
3213 ERROR("Missing newuidmap/newgidmap");
3217 for(type
= ID_TYPE_UID
; type
<= ID_TYPE_GID
; type
++) {
3221 buf
= pos
= malloc(4096);
3227 pos
+= sprintf(buf
, "new%cidmap %d",
3228 type
== ID_TYPE_UID
? 'u' : 'g',
3231 lxc_list_for_each(iterator
, idmap
) {
3232 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
3233 map
= iterator
->elem
;
3234 if (map
->idtype
!= type
)
3238 left
= 4096 - (pos
- buf
);
3239 fill
= snprintf(pos
, left
, "%s%lu %lu %lu%s",
3240 use_shadow
? " " : "",
3241 map
->nsid
, map
->hostid
, map
->range
,
3242 use_shadow
? "" : "\n");
3243 if (fill
<= 0 || fill
>= left
)
3244 SYSERROR("snprintf failed, too many mappings");
3251 ret
= write_id_mapping(type
, pid
, buf
, pos
-buf
);
3253 left
= 4096 - (pos
- buf
);
3254 fill
= snprintf(pos
, left
, "\n");
3255 if (fill
<= 0 || fill
>= left
)
3256 SYSERROR("snprintf failed, too many mappings");
3271 * return the host uid to which the container root is mapped in *val.
3272 * Return true if id was found, false otherwise.
3274 bool get_mapped_rootid(struct lxc_conf
*conf
, enum idtype idtype
,
3277 struct lxc_list
*it
;
3280 lxc_list_for_each(it
, &conf
->id_map
) {
3282 if (map
->idtype
!= ID_TYPE_UID
)
3292 int mapped_hostid(unsigned id
, struct lxc_conf
*conf
, enum idtype idtype
)
3294 struct lxc_list
*it
;
3296 lxc_list_for_each(it
, &conf
->id_map
) {
3298 if (map
->idtype
!= idtype
)
3300 if (id
>= map
->hostid
&& id
< map
->hostid
+ map
->range
)
3301 return (id
- map
->hostid
) + map
->nsid
;
3306 int find_unmapped_nsuid(struct lxc_conf
*conf
, enum idtype idtype
)
3308 struct lxc_list
*it
;
3310 unsigned int freeid
= 0;
3312 lxc_list_for_each(it
, &conf
->id_map
) {
3314 if (map
->idtype
!= idtype
)
3316 if (freeid
>= map
->nsid
&& freeid
< map
->nsid
+ map
->range
) {
3317 freeid
= map
->nsid
+ map
->range
;
3324 int lxc_find_gateway_addresses(struct lxc_handler
*handler
)
3326 struct lxc_list
*network
= &handler
->conf
->network
;
3327 struct lxc_list
*iterator
;
3328 struct lxc_netdev
*netdev
;
3331 lxc_list_for_each(iterator
, network
) {
3332 netdev
= iterator
->elem
;
3334 if (!netdev
->ipv4_gateway_auto
&& !netdev
->ipv6_gateway_auto
)
3337 if (netdev
->type
!= LXC_NET_VETH
&& netdev
->type
!= LXC_NET_MACVLAN
) {
3338 ERROR("gateway = auto only supported for "
3339 "veth and macvlan");
3343 if (!netdev
->link
) {
3344 ERROR("gateway = auto needs a link interface");
3348 link_index
= if_nametoindex(netdev
->link
);
3352 if (netdev
->ipv4_gateway_auto
) {
3353 if (lxc_ipv4_addr_get(link_index
, &netdev
->ipv4_gateway
)) {
3354 ERROR("failed to automatically find ipv4 gateway "
3355 "address from link interface '%s'", netdev
->link
);
3360 if (netdev
->ipv6_gateway_auto
) {
3361 if (lxc_ipv6_addr_get(link_index
, &netdev
->ipv6_gateway
)) {
3362 ERROR("failed to automatically find ipv6 gateway "
3363 "address from link interface '%s'", netdev
->link
);
3372 int lxc_create_tty(const char *name
, struct lxc_conf
*conf
)
3374 struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
3377 /* no tty in the configuration */
3381 tty_info
->pty_info
=
3382 malloc(sizeof(*tty_info
->pty_info
)*conf
->tty
);
3383 if (!tty_info
->pty_info
) {
3384 SYSERROR("failed to allocate pty_info");
3388 for (i
= 0; i
< conf
->tty
; i
++) {
3390 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3393 ret
= openpty(&pty_info
->master
, &pty_info
->slave
,
3394 pty_info
->name
, NULL
, NULL
);
3397 SYSERROR("failed to create pty #%d", i
);
3398 tty_info
->nbtty
= i
;
3399 lxc_delete_tty(tty_info
);
3403 DEBUG("allocated pty '%s' (%d/%d)",
3404 pty_info
->name
, pty_info
->master
, pty_info
->slave
);
3406 /* Prevent leaking the file descriptors to the container */
3407 fcntl(pty_info
->master
, F_SETFD
, FD_CLOEXEC
);
3408 fcntl(pty_info
->slave
, F_SETFD
, FD_CLOEXEC
);
3413 tty_info
->nbtty
= conf
->tty
;
3415 INFO("tty's configured");
3420 void lxc_delete_tty(struct lxc_tty_info
*tty_info
)
3424 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
3425 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3427 close(pty_info
->master
);
3428 close(pty_info
->slave
);
3431 free(tty_info
->pty_info
);
3432 tty_info
->nbtty
= 0;
3436 * chown_mapped_root: for an unprivileged user with uid X to chown a dir
3437 * to subuid Y, he needs to run chown as root in a userns where
3438 * nsid 0 is mapped to hostuid Y, and nsid Y is mapped to hostuid
3439 * X. That way, the container root is privileged with respect to
3440 * hostuid X, allowing him to do the chown.
3442 int chown_mapped_root(char *path
, struct lxc_conf
*conf
)
3447 char *chownpath
= path
;
3449 if (!get_mapped_rootid(conf
, ID_TYPE_UID
, &val
)) {
3450 ERROR("No mapping for container root");
3453 rootid
= (uid_t
) val
;
3456 * In case of overlay, we want only the writeable layer
3459 if (strncmp(path
, "overlayfs:", 10) == 0 || strncmp(path
, "aufs:", 5) == 0) {
3460 chownpath
= strchr(path
, ':');
3462 ERROR("Bad overlay path: %s", path
);
3465 chownpath
= strchr(chownpath
+1, ':');
3467 ERROR("Bad overlay path: %s", path
);
3473 if (geteuid() == 0) {
3474 if (chown(path
, rootid
, -1) < 0) {
3475 ERROR("Error chowning %s", path
);
3481 if (rootid
== geteuid()) {
3483 INFO("%s: container root is our uid; no need to chown" ,__func__
);
3489 SYSERROR("Failed forking");
3493 int hostuid
= geteuid(), ret
;
3494 char map1
[100], map2
[100], map3
[100];
3495 char *args
[] = {"lxc-usernsexec", "-m", map1
, "-m", map2
, "-m",
3496 map3
, "--", "chown", "0", path
, NULL
};
3499 ret
= snprintf(map1
, 100, "u:0:%d:1", rootid
);
3500 if (ret
< 0 || ret
>= 100) {
3501 ERROR("Error uid printing map string");
3505 // "u:hostuid:hostuid:1"
3506 ret
= snprintf(map2
, 100, "u:%d:%d:1", hostuid
, hostuid
);
3507 if (ret
< 0 || ret
>= 100) {
3508 ERROR("Error uid printing map string");
3513 ret
= snprintf(map3
, 100, "g:0:%d:1", getgid());
3514 if (ret
< 0 || ret
>= 100) {
3515 ERROR("Error uid printing map string");
3519 ret
= execvp("lxc-usernsexec", args
);
3520 SYSERROR("Failed executing usernsexec");
3523 return wait_for_pid(pid
);
3526 int ttys_shift_ids(struct lxc_conf
*c
)
3530 if (lxc_list_empty(&c
->id_map
))
3533 for (i
= 0; i
< c
->tty_info
.nbtty
; i
++) {
3534 struct lxc_pty_info
*pty_info
= &c
->tty_info
.pty_info
[i
];
3536 if (chown_mapped_root(pty_info
->name
, c
) < 0) {
3537 ERROR("Failed to chown %s", pty_info
->name
);
3542 if (strcmp(c
->console
.name
, "") !=0 && chown_mapped_root(c
->console
.name
, c
) < 0) {
3543 ERROR("Failed to chown %s", c
->console
.name
);
3551 * This routine is called when the configuration does not already specify a value
3552 * for autodev (mounting a file system on /dev and populating it in a container).
3553 * If a hard override value has not be specified, then we try to apply some
3554 * heuristics to determine if we should switch to autodev mode.
3556 * For instance, if the container has an /etc/systemd/system directory then it
3557 * is probably running systemd as the init process and it needs the autodev
3558 * mount to prevent it from mounting devtmpfs on /dev on it's own causing conflicts
3561 * We may also want to enable autodev if the host has devtmpfs mounted on its
3562 * /dev as this then enable us to use subdirectories under /dev for the container
3563 * /dev directories and we can fake udev devices.
3569 #define MAX_SYMLINK_DEPTH 32
3571 static int check_autodev( const char *rootfs
, void *data
)
3573 struct start_args
*arg
= data
;
3577 char absrootfs
[MAXPATHLEN
];
3578 char path
[MAXPATHLEN
];
3579 char abs_path
[MAXPATHLEN
];
3580 char *command
= "/sbin/init";
3582 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
3585 if (!realpath(rootfs
, absrootfs
))
3588 if( arg
&& arg
->argv
[0] ) {
3589 command
= arg
->argv
[0];
3590 DEBUG("Set exec command to %s", command
);
3593 strncpy( path
, command
, MAXPATHLEN
-1 );
3595 if ( 0 != access(path
, F_OK
) || 0 != stat(path
, &s
) )
3598 /* Dereference down the symlink merry path testing as we go. */
3599 /* If anything references systemd in the path - set autodev! */
3600 /* Renormalize to the rootfs before each dereference */
3601 /* Relative symlinks should fall out in the wash even with .. */
3603 if ( strstr( path
, "systemd" ) ) {
3604 INFO("Container with systemd init detected - enabling autodev!");
3608 ret
= snprintf(abs_path
, MAXPATHLEN
-1, "%s/%s", absrootfs
, path
);
3609 if (ret
< 0 || ret
> MAXPATHLEN
)
3612 ret
= readlink( abs_path
, path
, MAXPATHLEN
-1 );
3614 if ( ( ret
<= 0 ) || ( ++loop_count
> MAX_SYMLINK_DEPTH
) ) {
3615 break; /* Break out for other tests */
3621 * Add future checks here.
3622 * Return positive if we should go autodev
3623 * Return 0 if we should NOT go autodev
3624 * Return negative if we encounter an error or can not determine...
3627 /* All else fails, we don't need autodev */
3628 INFO("Autodev not required.");
/*
 * do_tmp_proc_mount: Mount /proc inside container if not already
 * mounted
 *
 * @rootfs : the rootfs where proc should be mounted
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, ret;

	ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (ret < 0 || ret >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	/*
	 * readlink() does not terminate its result: read one byte less
	 * than the zeroed buffer holds so it can be logged as a string.
	 */
	memset(link, 0, sizeof(link));
	linklen = readlink(path, link, sizeof(link) - 1);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);

	/* shorter than the path that was just built, so this cannot truncate */
	ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
	if (linklen < 0) /* /proc not mounted */
		goto domount;
	/* can't be longer than rootfs/proc/1 */
	if (strncmp(link, "1", linklen) != 0) {
		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
		goto domount;
	}
	/* the right proc is already mounted */
	return 0;

domount:
	if (mount("proc", path, "proc", 0, NULL))
		return -1;
	INFO("Mounted /proc in container for security transition");
	return 1;
}
3674 int tmp_proc_mount(struct lxc_conf
*lxc_conf
)
3678 if (lxc_conf
->rootfs
.path
== NULL
|| strlen(lxc_conf
->rootfs
.path
) == 0) {
3679 if (mount("proc", "/proc", "proc", 0, NULL
)) {
3680 SYSERROR("Failed mounting /proc, proceeding");
3685 mounted
= do_tmp_proc_mount(lxc_conf
->rootfs
.mount
);
3686 if (mounted
== -1) {
3687 SYSERROR("failed to mount /proc in the container.");
3689 } else if (mounted
== 1) {
3690 lxc_conf
->tmp_umount_proc
= 1;
3695 void tmp_proc_unmount(struct lxc_conf
*lxc_conf
)
3697 if (lxc_conf
->tmp_umount_proc
== 1) {
3699 lxc_conf
->tmp_umount_proc
= 0;
3703 int lxc_setup(struct lxc_handler
*handler
)
3705 const char *name
= handler
->name
;
3706 struct lxc_conf
*lxc_conf
= handler
->conf
;
3707 const char *lxcpath
= handler
->lxcpath
;
3708 void *data
= handler
->data
;
3710 if (lxc_conf
->inherit_ns_fd
[LXC_NS_UTS
] == -1) {
3711 if (setup_utsname(lxc_conf
->utsname
)) {
3712 ERROR("failed to setup the utsname for '%s'", name
);
3717 if (setup_network(&lxc_conf
->network
)) {
3718 ERROR("failed to setup the network for '%s'", name
);
3722 if (run_lxc_hooks(name
, "pre-mount", lxc_conf
, lxcpath
, NULL
)) {
3723 ERROR("failed to run pre-mount hooks for container '%s'.", name
);
3727 if (setup_rootfs(lxc_conf
)) {
3728 ERROR("failed to setup rootfs for '%s'", name
);
3732 if (lxc_conf
->autodev
< 0) {
3733 lxc_conf
->autodev
= check_autodev(lxc_conf
->rootfs
.mount
, data
);
3736 if (lxc_conf
->autodev
> 0) {
3737 if (mount_autodev(name
, lxc_conf
->rootfs
.mount
, lxcpath
)) {
3738 ERROR("failed to mount /dev in the container");
3743 /* do automatic mounts (mainly /proc and /sys), but exclude
3744 * those that need to wait until other stuff has finished
3746 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& ~LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
3747 ERROR("failed to setup the automatic mounts for '%s'", name
);
3751 if (setup_mount(&lxc_conf
->rootfs
, lxc_conf
->fstab
, name
)) {
3752 ERROR("failed to setup the mounts for '%s'", name
);
3756 if (!lxc_list_empty(&lxc_conf
->mount_list
) && setup_mount_entries(&lxc_conf
->rootfs
, &lxc_conf
->mount_list
, name
)) {
3757 ERROR("failed to setup the mount entries for '%s'", name
);
3761 /* now mount only cgroup, if wanted;
3762 * before, /sys could not have been mounted
3763 * (is either mounted automatically or via fstab entries)
3765 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
3766 ERROR("failed to setup the automatic mounts for '%s'", name
);
3770 if (run_lxc_hooks(name
, "mount", lxc_conf
, lxcpath
, NULL
)) {
3771 ERROR("failed to run mount hooks for container '%s'.", name
);
3775 if (lxc_conf
->autodev
> 0) {
3776 if (run_lxc_hooks(name
, "autodev", lxc_conf
, lxcpath
, NULL
)) {
3777 ERROR("failed to run autodev hooks for container '%s'.", name
);
3780 if (setup_autodev(lxc_conf
->rootfs
.mount
)) {
3781 ERROR("failed to populate /dev in the container");
3786 if (!lxc_conf
->is_execute
&& setup_console(&lxc_conf
->rootfs
, &lxc_conf
->console
, lxc_conf
->ttydir
)) {
3787 ERROR("failed to setup the console for '%s'", name
);
3791 if (lxc_conf
->kmsg
) {
3792 if (setup_kmsg(&lxc_conf
->rootfs
, &lxc_conf
->console
)) // don't fail
3793 ERROR("failed to setup kmsg for '%s'", name
);
3796 if (!lxc_conf
->is_execute
&& setup_tty(&lxc_conf
->rootfs
, &lxc_conf
->tty_info
, lxc_conf
->ttydir
)) {
3797 ERROR("failed to setup the ttys for '%s'", name
);
3801 if (!lxc_conf
->is_execute
&& setup_dev_symlinks(&lxc_conf
->rootfs
)) {
3802 ERROR("failed to setup /dev symlinks for '%s'", name
);
3806 /* mount /proc if it's not already there */
3807 if (tmp_proc_mount(lxc_conf
) < 0) {
3808 ERROR("failed to LSM mount proc for '%s'", name
);
3812 if (setup_pivot_root(&lxc_conf
->rootfs
)) {
3813 ERROR("failed to set rootfs for '%s'", name
);
3817 if (setup_pts(lxc_conf
->pts
)) {
3818 ERROR("failed to setup the new pts instance");
3822 if (setup_personality(lxc_conf
->personality
)) {
3823 ERROR("failed to setup personality");
3827 if (lxc_list_empty(&lxc_conf
->id_map
)) {
3828 if (!lxc_list_empty(&lxc_conf
->keepcaps
)) {
3829 if (!lxc_list_empty(&lxc_conf
->caps
)) {
3830 ERROR("Simultaneously requested dropping and keeping caps");
3833 if (dropcaps_except(&lxc_conf
->keepcaps
)) {
3834 ERROR("failed to keep requested caps");
3837 } else if (setup_caps(&lxc_conf
->caps
)) {
3838 ERROR("failed to drop capabilities");
3843 NOTICE("'%s' is setup.", name
);
3848 int run_lxc_hooks(const char *name
, char *hook
, struct lxc_conf
*conf
,
3849 const char *lxcpath
, char *argv
[])
3852 struct lxc_list
*it
;
3854 if (strcmp(hook
, "pre-start") == 0)
3855 which
= LXCHOOK_PRESTART
;
3856 else if (strcmp(hook
, "pre-mount") == 0)
3857 which
= LXCHOOK_PREMOUNT
;
3858 else if (strcmp(hook
, "mount") == 0)
3859 which
= LXCHOOK_MOUNT
;
3860 else if (strcmp(hook
, "autodev") == 0)
3861 which
= LXCHOOK_AUTODEV
;
3862 else if (strcmp(hook
, "start") == 0)
3863 which
= LXCHOOK_START
;
3864 else if (strcmp(hook
, "post-stop") == 0)
3865 which
= LXCHOOK_POSTSTOP
;
3866 else if (strcmp(hook
, "clone") == 0)
3867 which
= LXCHOOK_CLONE
;
3870 lxc_list_for_each(it
, &conf
->hooks
[which
]) {
3872 char *hookname
= it
->elem
;
3873 ret
= run_script_argv(name
, "lxc", hookname
, hook
, lxcpath
, argv
);
3880 static void lxc_remove_nic(struct lxc_list
*it
)
3882 struct lxc_netdev
*netdev
= it
->elem
;
3883 struct lxc_list
*it2
,*next
;
3891 if (netdev
->type
== LXC_NET_VETH
&& netdev
->priv
.veth_attr
.pair
)
3892 free(netdev
->priv
.veth_attr
.pair
);
3893 if (netdev
->upscript
)
3894 free(netdev
->upscript
);
3896 free(netdev
->hwaddr
);
3899 if (netdev
->ipv4_gateway
)
3900 free(netdev
->ipv4_gateway
);
3901 if (netdev
->ipv6_gateway
)
3902 free(netdev
->ipv6_gateway
);
3903 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
3908 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
3917 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
3918 int lxc_clear_nic(struct lxc_conf
*c
, const char *key
)
3922 struct lxc_list
*it
;
3923 struct lxc_netdev
*netdev
;
3925 p1
= index(key
, '.');
3926 if (!p1
|| *(p1
+1) == '\0')
3929 ret
= sscanf(key
, "%d", &idx
);
3930 if (ret
!= 1) return -1;
3935 lxc_list_for_each(it
, &c
->network
) {
3940 if (i
< idx
) // we don't have that many nics defined
3943 if (!it
|| !it
->elem
)
3950 } else if (strcmp(p1
, ".ipv4") == 0) {
3951 struct lxc_list
*it2
,*next
;
3952 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
3957 } else if (strcmp(p1
, ".ipv6") == 0) {
3958 struct lxc_list
*it2
,*next
;
3959 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
3964 } else if (strcmp(p1
, ".link") == 0) {
3967 netdev
->link
= NULL
;
3969 } else if (strcmp(p1
, ".name") == 0) {
3972 netdev
->name
= NULL
;
3974 } else if (strcmp(p1
, ".script.up") == 0) {
3975 if (netdev
->upscript
) {
3976 free(netdev
->upscript
);
3977 netdev
->upscript
= NULL
;
3979 } else if (strcmp(p1
, ".hwaddr") == 0) {
3980 if (netdev
->hwaddr
) {
3981 free(netdev
->hwaddr
);
3982 netdev
->hwaddr
= NULL
;
3984 } else if (strcmp(p1
, ".mtu") == 0) {
3989 } else if (strcmp(p1
, ".ipv4_gateway") == 0) {
3990 if (netdev
->ipv4_gateway
) {
3991 free(netdev
->ipv4_gateway
);
3992 netdev
->ipv4_gateway
= NULL
;
3994 } else if (strcmp(p1
, ".ipv6_gateway") == 0) {
3995 if (netdev
->ipv6_gateway
) {
3996 free(netdev
->ipv6_gateway
);
3997 netdev
->ipv6_gateway
= NULL
;
4005 int lxc_clear_config_network(struct lxc_conf
*c
)
4007 struct lxc_list
*it
,*next
;
4008 lxc_list_for_each_safe(it
, &c
->network
, next
) {
4014 int lxc_clear_config_caps(struct lxc_conf
*c
)
4016 struct lxc_list
*it
,*next
;
4018 lxc_list_for_each_safe(it
, &c
->caps
, next
) {
4026 static int lxc_free_idmap(struct lxc_list
*id_map
) {
4027 struct lxc_list
*it
, *next
;
4029 lxc_list_for_each_safe(it
, id_map
, next
) {
4037 int lxc_clear_idmaps(struct lxc_conf
*c
)
4039 return lxc_free_idmap(&c
->id_map
);
4042 int lxc_clear_config_keepcaps(struct lxc_conf
*c
)
4044 struct lxc_list
*it
,*next
;
4046 lxc_list_for_each_safe(it
, &c
->keepcaps
, next
) {
4054 int lxc_clear_cgroups(struct lxc_conf
*c
, const char *key
)
4056 struct lxc_list
*it
,*next
;
4058 const char *k
= key
+ 11;
4060 if (strcmp(key
, "lxc.cgroup") == 0)
4063 lxc_list_for_each_safe(it
, &c
->cgroup
, next
) {
4064 struct lxc_cgroup
*cg
= it
->elem
;
4065 if (!all
&& strcmp(cg
->subsystem
, k
) != 0)
4068 free(cg
->subsystem
);
4076 int lxc_clear_groups(struct lxc_conf
*c
)
4078 struct lxc_list
*it
,*next
;
4080 lxc_list_for_each_safe(it
, &c
->groups
, next
) {
4088 int lxc_clear_mount_entries(struct lxc_conf
*c
)
4090 struct lxc_list
*it
,*next
;
4092 lxc_list_for_each_safe(it
, &c
->mount_list
, next
) {
4100 int lxc_clear_hooks(struct lxc_conf
*c
, const char *key
)
4102 struct lxc_list
*it
,*next
;
4103 bool all
= false, done
= false;
4104 const char *k
= key
+ 9;
4107 if (strcmp(key
, "lxc.hook") == 0)
4110 for (i
=0; i
<NUM_LXC_HOOKS
; i
++) {
4111 if (all
|| strcmp(k
, lxchook_names
[i
]) == 0) {
4112 lxc_list_for_each_safe(it
, &c
->hooks
[i
], next
) {
4122 ERROR("Invalid hook key: %s", key
);
4128 static void lxc_clear_saved_nics(struct lxc_conf
*conf
)
4132 if (!conf
->saved_nics
)
4134 for (i
=0; i
< conf
->num_savednics
; i
++)
4135 free(conf
->saved_nics
[i
].orig_name
);
4136 free(conf
->saved_nics
);
4139 void lxc_conf_free(struct lxc_conf
*conf
)
4143 if (conf
->console
.path
)
4144 free(conf
->console
.path
);
4145 if (conf
->rootfs
.mount
)
4146 free(conf
->rootfs
.mount
);
4147 if (conf
->rootfs
.options
)
4148 free(conf
->rootfs
.options
);
4149 if (conf
->rootfs
.path
)
4150 free(conf
->rootfs
.path
);
4151 if (conf
->rootfs
.pivot
)
4152 free(conf
->rootfs
.pivot
);
4154 free(conf
->logfile
);
4156 free(conf
->utsname
);
4163 lxc_clear_config_network(conf
);
4164 if (conf
->lsm_aa_profile
)
4165 free(conf
->lsm_aa_profile
);
4166 if (conf
->lsm_se_context
)
4167 free(conf
->lsm_se_context
);
4168 lxc_seccomp_free(conf
);
4169 lxc_clear_config_caps(conf
);
4170 lxc_clear_config_keepcaps(conf
);
4171 lxc_clear_cgroups(conf
, "lxc.cgroup");
4172 lxc_clear_hooks(conf
, "lxc.hook");
4173 lxc_clear_mount_entries(conf
);
4174 lxc_clear_saved_nics(conf
);
4175 lxc_clear_idmaps(conf
);
4176 lxc_clear_groups(conf
);
4180 struct userns_fn_data
{
4186 static int run_userns_fn(void *data
)
4188 struct userns_fn_data
*d
= data
;
4190 // we're not sharing with the parent any more, if it was a thread
4193 if (read(d
->p
[0], &c
, 1) != 1)
4196 return d
->fn(d
->arg
);
4200 * Add a ID_TYPE_UID entry to an existing lxc_conf, if it is not
4202 * We may want to generalize this to do gids as well as uids, but right now
4203 * it's not necessary.
4205 static struct lxc_list
*idmap_add_id(struct lxc_conf
*conf
, uid_t uid
)
4207 int hostid_mapped
= mapped_hostid(uid
, conf
, ID_TYPE_UID
);
4208 struct lxc_list
*new = NULL
, *tmp
, *it
, *next
;
4209 struct id_map
*entry
;
4211 new = malloc(sizeof(*new));
4213 ERROR("Out of memory building id map");
4218 if (hostid_mapped
< 0) {
4219 hostid_mapped
= find_unmapped_nsuid(conf
, ID_TYPE_UID
);
4220 if (hostid_mapped
< 0)
4222 tmp
= malloc(sizeof(*tmp
));
4225 entry
= malloc(sizeof(*entry
));
4231 entry
->idtype
= ID_TYPE_UID
;
4232 entry
->nsid
= hostid_mapped
;
4233 entry
->hostid
= (unsigned long)uid
;
4235 lxc_list_add_tail(new, tmp
);
4237 lxc_list_for_each_safe(it
, &conf
->id_map
, next
) {
4238 tmp
= malloc(sizeof(*tmp
));
4241 entry
= malloc(sizeof(*entry
));
4246 memset(entry
, 0, sizeof(*entry
));
4247 memcpy(entry
, it
->elem
, sizeof(*entry
));
4249 lxc_list_add_tail(new, tmp
);
4255 ERROR("Out of memory building a new uid map");
4257 lxc_free_idmap(new);
4263 * Run a function in a new user namespace.
4264 * The caller's euid will be mapped in if it is not already.
4266 int userns_exec_1(struct lxc_conf
*conf
, int (*fn
)(void *), void *data
)
4269 struct userns_fn_data d
;
4272 struct lxc_list
*idmap
;
4276 SYSERROR("opening pipe");
4283 pid
= lxc_clone(run_userns_fn
, &d
, CLONE_NEWUSER
);
4289 if ((idmap
= idmap_add_id(conf
, geteuid())) == NULL
) {
4290 ERROR("Error adding self to container uid map");
4294 ret
= lxc_map_ids(idmap
, pid
);
4295 lxc_free_idmap(idmap
);
4298 ERROR("Error setting up child mappings");
4303 if (write(p
[1], &c
, 1) != 1) {
4304 SYSERROR("writing to pipe to child");
4308 ret
= wait_for_pid(pid
);