2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include <sys/syscall.h>
35 #include <sys/types.h>
40 #include <sys/statvfs.h>
46 #include <../include/openpty.h>
49 #include <linux/loop.h>
51 #include <sys/types.h>
52 #include <sys/utsname.h>
53 #include <sys/param.h>
55 #include <sys/socket.h>
56 #include <sys/mount.h>
58 #include <sys/prctl.h>
60 #include <arpa/inet.h>
62 #include <netinet/in.h>
73 #include "caps.h" /* for lxc_caps_last_cap() */
77 #include "namespace.h"
80 #if HAVE_SYS_CAPABILITY_H
81 #include <sys/capability.h>
84 #if HAVE_SYS_PERSONALITY_H
85 #include <sys/personality.h>
89 #include <../include/lxcmntent.h>
94 #include "lxcseccomp.h"
96 lxc_log_define(lxc_conf
, lxc
);
100 #if HAVE_SYS_CAPABILITY_H
102 #define CAP_SETFCAP 31
105 #ifndef CAP_MAC_OVERRIDE
106 #define CAP_MAC_OVERRIDE 32
109 #ifndef CAP_MAC_ADMIN
110 #define CAP_MAC_ADMIN 33
114 #ifndef PR_CAPBSET_DROP
115 #define PR_CAPBSET_DROP 24
118 #ifndef LO_FLAGS_AUTOCLEAR
119 #define LO_FLAGS_AUTOCLEAR 4
122 /* needed for cgroup automount checks, regardless of whether we
123 * have included linux/capability.h or not */
124 #ifndef CAP_SYS_ADMIN
125 #define CAP_SYS_ADMIN 21
128 /* Define pivot_root() if missing from the C library */
129 #ifndef HAVE_PIVOT_ROOT
130 static int pivot_root(const char * new_root
, const char * put_old
)
132 #ifdef __NR_pivot_root
133 return syscall(__NR_pivot_root
, new_root
, put_old
);
140 extern int pivot_root(const char * new_root
, const char * put_old
);
143 /* Define sethostname() if missing from the C library */
144 #ifndef HAVE_SETHOSTNAME
145 static int sethostname(const char * name
, size_t len
)
147 #ifdef __NR_sethostname
148 return syscall(__NR_sethostname
, name
, len
);
156 /* Define __S_ISTYPE if missing from the C library */
158 #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
162 #define MS_PRIVATE (1<<18)
165 char *lxchook_names
[NUM_LXC_HOOKS
] = {
166 "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
168 typedef int (*instantiate_cb
)(struct lxc_handler
*, struct lxc_netdev
*);
181 /* Declare this here, since we don't want to reshuffle the whole file. */
182 static int in_caplist(int cap
, struct lxc_list
*caps
);
184 static int instantiate_veth(struct lxc_handler
*, struct lxc_netdev
*);
185 static int instantiate_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
186 static int instantiate_vlan(struct lxc_handler
*, struct lxc_netdev
*);
187 static int instantiate_phys(struct lxc_handler
*, struct lxc_netdev
*);
188 static int instantiate_empty(struct lxc_handler
*, struct lxc_netdev
*);
189 static int instantiate_none(struct lxc_handler
*, struct lxc_netdev
*);
191 static instantiate_cb netdev_conf
[LXC_NET_MAXCONFTYPE
+ 1] = {
192 [LXC_NET_VETH
] = instantiate_veth
,
193 [LXC_NET_MACVLAN
] = instantiate_macvlan
,
194 [LXC_NET_VLAN
] = instantiate_vlan
,
195 [LXC_NET_PHYS
] = instantiate_phys
,
196 [LXC_NET_EMPTY
] = instantiate_empty
,
197 [LXC_NET_NONE
] = instantiate_none
,
200 static int shutdown_veth(struct lxc_handler
*, struct lxc_netdev
*);
201 static int shutdown_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
202 static int shutdown_vlan(struct lxc_handler
*, struct lxc_netdev
*);
203 static int shutdown_phys(struct lxc_handler
*, struct lxc_netdev
*);
204 static int shutdown_empty(struct lxc_handler
*, struct lxc_netdev
*);
205 static int shutdown_none(struct lxc_handler
*, struct lxc_netdev
*);
207 static instantiate_cb netdev_deconf
[LXC_NET_MAXCONFTYPE
+ 1] = {
208 [LXC_NET_VETH
] = shutdown_veth
,
209 [LXC_NET_MACVLAN
] = shutdown_macvlan
,
210 [LXC_NET_VLAN
] = shutdown_vlan
,
211 [LXC_NET_PHYS
] = shutdown_phys
,
212 [LXC_NET_EMPTY
] = shutdown_empty
,
213 [LXC_NET_NONE
] = shutdown_none
,
216 static struct mount_opt mount_opt
[] = {
217 { "defaults", 0, 0 },
218 { "ro", 0, MS_RDONLY
},
219 { "rw", 1, MS_RDONLY
},
220 { "suid", 1, MS_NOSUID
},
221 { "nosuid", 0, MS_NOSUID
},
222 { "dev", 1, MS_NODEV
},
223 { "nodev", 0, MS_NODEV
},
224 { "exec", 1, MS_NOEXEC
},
225 { "noexec", 0, MS_NOEXEC
},
226 { "sync", 0, MS_SYNCHRONOUS
},
227 { "async", 1, MS_SYNCHRONOUS
},
228 { "dirsync", 0, MS_DIRSYNC
},
229 { "remount", 0, MS_REMOUNT
},
230 { "mand", 0, MS_MANDLOCK
},
231 { "nomand", 1, MS_MANDLOCK
},
232 { "atime", 1, MS_NOATIME
},
233 { "noatime", 0, MS_NOATIME
},
234 { "diratime", 1, MS_NODIRATIME
},
235 { "nodiratime", 0, MS_NODIRATIME
},
236 { "bind", 0, MS_BIND
},
237 { "rbind", 0, MS_BIND
|MS_REC
},
238 { "relatime", 0, MS_RELATIME
},
239 { "norelatime", 1, MS_RELATIME
},
240 { "strictatime", 0, MS_STRICTATIME
},
241 { "nostrictatime", 1, MS_STRICTATIME
},
245 #if HAVE_SYS_CAPABILITY_H
246 static struct caps_opt caps_opt
[] = {
247 { "chown", CAP_CHOWN
},
248 { "dac_override", CAP_DAC_OVERRIDE
},
249 { "dac_read_search", CAP_DAC_READ_SEARCH
},
250 { "fowner", CAP_FOWNER
},
251 { "fsetid", CAP_FSETID
},
252 { "kill", CAP_KILL
},
253 { "setgid", CAP_SETGID
},
254 { "setuid", CAP_SETUID
},
255 { "setpcap", CAP_SETPCAP
},
256 { "linux_immutable", CAP_LINUX_IMMUTABLE
},
257 { "net_bind_service", CAP_NET_BIND_SERVICE
},
258 { "net_broadcast", CAP_NET_BROADCAST
},
259 { "net_admin", CAP_NET_ADMIN
},
260 { "net_raw", CAP_NET_RAW
},
261 { "ipc_lock", CAP_IPC_LOCK
},
262 { "ipc_owner", CAP_IPC_OWNER
},
263 { "sys_module", CAP_SYS_MODULE
},
264 { "sys_rawio", CAP_SYS_RAWIO
},
265 { "sys_chroot", CAP_SYS_CHROOT
},
266 { "sys_ptrace", CAP_SYS_PTRACE
},
267 { "sys_pacct", CAP_SYS_PACCT
},
268 { "sys_admin", CAP_SYS_ADMIN
},
269 { "sys_boot", CAP_SYS_BOOT
},
270 { "sys_nice", CAP_SYS_NICE
},
271 { "sys_resource", CAP_SYS_RESOURCE
},
272 { "sys_time", CAP_SYS_TIME
},
273 { "sys_tty_config", CAP_SYS_TTY_CONFIG
},
274 { "mknod", CAP_MKNOD
},
275 { "lease", CAP_LEASE
},
276 #ifdef CAP_AUDIT_WRITE
277 { "audit_write", CAP_AUDIT_WRITE
},
279 #ifdef CAP_AUDIT_CONTROL
280 { "audit_control", CAP_AUDIT_CONTROL
},
282 { "setfcap", CAP_SETFCAP
},
283 { "mac_override", CAP_MAC_OVERRIDE
},
284 { "mac_admin", CAP_MAC_ADMIN
},
286 { "syslog", CAP_SYSLOG
},
288 #ifdef CAP_WAKE_ALARM
289 { "wake_alarm", CAP_WAKE_ALARM
},
293 static struct caps_opt caps_opt
[] = {};
296 static int run_buffer(char *buffer
)
298 struct lxc_popen_FILE
*f
;
302 f
= lxc_popen(buffer
);
304 SYSERROR("popen failed");
308 output
= malloc(LXC_LOG_BUFFER_SIZE
);
310 ERROR("failed to allocate memory for script output");
315 while(fgets(output
, LXC_LOG_BUFFER_SIZE
, f
->f
))
316 DEBUG("script output: %s", output
);
322 SYSERROR("Script exited on error");
324 } else if (WIFEXITED(ret
) && WEXITSTATUS(ret
) != 0) {
325 ERROR("Script exited with status %d", WEXITSTATUS(ret
));
327 } else if (WIFSIGNALED(ret
)) {
328 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret
),
329 strsignal(WTERMSIG(ret
)));
336 static int run_script_argv(const char *name
, const char *section
,
337 const char *script
, const char *hook
, const char *lxcpath
,
344 INFO("Executing script '%s' for container '%s', config section '%s'",
345 script
, name
, section
);
347 for (i
=0; argsin
&& argsin
[i
]; i
++)
348 size
+= strlen(argsin
[i
]) + 1;
350 size
+= strlen(hook
) + 1;
352 size
+= strlen(script
);
353 size
+= strlen(name
);
354 size
+= strlen(section
);
360 buffer
= alloca(size
);
362 ERROR("failed to allocate memory");
366 ret
= snprintf(buffer
, size
, "%s %s %s %s", script
, name
, section
, hook
);
367 if (ret
< 0 || ret
>= size
) {
368 ERROR("Script name too long");
372 for (i
=0; argsin
&& argsin
[i
]; i
++) {
375 rc
= snprintf(buffer
+ ret
, len
, " %s", argsin
[i
]);
376 if (rc
< 0 || rc
>= len
) {
377 ERROR("Script args too long");
383 return run_buffer(buffer
);
386 static int run_script(const char *name
, const char *section
,
387 const char *script
, ...)
394 INFO("Executing script '%s' for container '%s', config section '%s'",
395 script
, name
, section
);
397 va_start(ap
, script
);
398 while ((p
= va_arg(ap
, char *)))
399 size
+= strlen(p
) + 1;
402 size
+= strlen(script
);
403 size
+= strlen(name
);
404 size
+= strlen(section
);
410 buffer
= alloca(size
);
412 ERROR("failed to allocate memory");
416 ret
= snprintf(buffer
, size
, "%s %s %s", script
, name
, section
);
417 if (ret
< 0 || ret
>= size
) {
418 ERROR("Script name too long");
422 va_start(ap
, script
);
423 while ((p
= va_arg(ap
, char *))) {
426 rc
= snprintf(buffer
+ ret
, len
, " %s", p
);
427 if (rc
< 0 || rc
>= len
) {
428 ERROR("Script args too long");
435 return run_buffer(buffer
);
438 static int find_fstype_cb(char* buffer
, void *data
)
446 unsigned long mntflags
;
450 /* we don't try 'nodev' entries */
451 if (strstr(buffer
, "nodev"))
455 fstype
+= lxc_char_left_gc(fstype
, strlen(fstype
));
456 fstype
[lxc_char_right_gc(fstype
, strlen(fstype
))] = '\0';
458 /* ignore blank line and comment */
459 if (fstype
[0] == '\0' || fstype
[0] == '#')
462 DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
463 cbarg
->rootfs
, cbarg
->target
, fstype
);
465 if (parse_mntopts(cbarg
->options
, &mntflags
, &mntdata
) < 0) {
470 if (mount(cbarg
->rootfs
, cbarg
->target
, fstype
, mntflags
, mntdata
)) {
471 DEBUG("mount failed with error: %s", strerror(errno
));
477 INFO("mounted '%s' on '%s', with fstype '%s'",
478 cbarg
->rootfs
, cbarg
->target
, fstype
);
483 static int mount_unknown_fs(const char *rootfs
, const char *target
,
499 * find the filesystem type with brute force:
500 * first we check with /etc/filesystems, in case the modules
501 * are auto-loaded and fall back to the supported kernel fs
508 for (i
= 0; i
< sizeof(fsfile
)/sizeof(fsfile
[0]); i
++) {
512 if (access(fsfile
[i
], F_OK
))
515 ret
= lxc_file_for_each_line(fsfile
[i
], find_fstype_cb
, &cbarg
);
517 ERROR("failed to parse '%s'", fsfile
[i
]);
525 ERROR("failed to determine fs type for '%s'", rootfs
);
529 static int mount_rootfs_dir(const char *rootfs
, const char *target
,
532 unsigned long mntflags
;
536 if (parse_mntopts(options
, &mntflags
, &mntdata
) < 0) {
541 ret
= mount(rootfs
, target
, "none", MS_BIND
| MS_REC
| mntflags
, mntdata
);
547 static int setup_lodev(const char *rootfs
, int fd
, struct loop_info64
*loinfo
)
552 rfd
= open(rootfs
, O_RDWR
);
554 SYSERROR("failed to open '%s'", rootfs
);
558 memset(loinfo
, 0, sizeof(*loinfo
));
560 loinfo
->lo_flags
= LO_FLAGS_AUTOCLEAR
;
562 if (ioctl(fd
, LOOP_SET_FD
, rfd
)) {
563 SYSERROR("failed to LOOP_SET_FD");
567 if (ioctl(fd
, LOOP_SET_STATUS64
, loinfo
)) {
568 SYSERROR("failed to LOOP_SET_STATUS64");
579 static int mount_rootfs_file(const char *rootfs
, const char *target
,
582 struct dirent dirent
, *direntp
;
583 struct loop_info64 loinfo
;
584 int ret
= -1, fd
= -1, rc
;
586 char path
[MAXPATHLEN
];
588 dir
= opendir("/dev");
590 SYSERROR("failed to open '/dev'");
594 while (!readdir_r(dir
, &dirent
, &direntp
)) {
599 if (!strcmp(direntp
->d_name
, "."))
602 if (!strcmp(direntp
->d_name
, ".."))
605 if (strncmp(direntp
->d_name
, "loop", 4))
608 rc
= snprintf(path
, MAXPATHLEN
, "/dev/%s", direntp
->d_name
);
609 if (rc
< 0 || rc
>= MAXPATHLEN
)
612 fd
= open(path
, O_RDWR
);
616 if (ioctl(fd
, LOOP_GET_STATUS64
, &loinfo
) == 0) {
621 if (errno
!= ENXIO
) {
622 WARN("unexpected error for ioctl on '%s': %m",
628 DEBUG("found '%s' free lodev", path
);
630 ret
= setup_lodev(rootfs
, fd
, &loinfo
);
632 ret
= mount_unknown_fs(path
, target
, options
);
639 WARN("failed to close directory");
644 static int mount_rootfs_block(const char *rootfs
, const char *target
,
647 return mount_unknown_fs(rootfs
, target
, options
);
652 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
653 * the duration of the container run, to prevent the container from marking
654 * the underlying fs readonly on shutdown. unlink the file immediately so
655 * no name pollution happens.
656 * return -1 on error.
657 * return -2 if nothing needed to be pinned.
658 * return an open fd (>=0) if we pinned it.
660 int pin_rootfs(const char *rootfs
)
662 char absrootfs
[MAXPATHLEN
];
663 char absrootfspin
[MAXPATHLEN
];
667 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
670 if (!realpath(rootfs
, absrootfs
))
673 if (access(absrootfs
, F_OK
))
676 if (stat(absrootfs
, &s
))
679 if (!S_ISDIR(s
.st_mode
))
682 ret
= snprintf(absrootfspin
, MAXPATHLEN
, "%s/lxc.hold", absrootfs
);
683 if (ret
>= MAXPATHLEN
)
686 fd
= open(absrootfspin
, O_CREAT
| O_RDWR
, S_IWUSR
|S_IRUSR
);
689 (void)unlink(absrootfspin
);
694 * If we are asking to remount something, make sure that any
695 * NOEXEC etc are honored.
697 static unsigned long add_required_remount_flags(const char *s
, const char *d
,
702 unsigned long required_flags
= 0;
704 if (!(flags
& MS_REMOUNT
))
712 if (statvfs(s
, &sb
) < 0)
715 if (sb
.f_flag
& MS_NOSUID
)
716 required_flags
|= MS_NOSUID
;
717 if (sb
.f_flag
& MS_NODEV
)
718 required_flags
|= MS_NODEV
;
719 if (sb
.f_flag
& MS_RDONLY
)
720 required_flags
|= MS_RDONLY
;
721 if (sb
.f_flag
& MS_NOEXEC
)
722 required_flags
|= MS_NOEXEC
;
724 return flags
| required_flags
;
730 static int lxc_mount_auto_mounts(struct lxc_conf
*conf
, int flags
, struct lxc_handler
*handler
)
738 const char *destination
;
742 } default_mounts
[] = {
743 /* Read-only bind-mounting... In older kernels, doing that required
744 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
745 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
746 * kernel 2.6.26 onwards. However, this apparently does not work on
747 * kernel 3.8. Unfortunately, on that very same kernel, doing the
748 * same trick as above doesn't seem to work either, there one needs
749 * to ALSO specify MS_BIND for the remount, otherwise the entire
750 * fs is remounted read-only or the mount fails because it's busy...
751 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
754 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
755 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys/net", "%r/proc/net", NULL
, MS_BIND
, NULL
},
756 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys", "%r/proc/sys", NULL
, MS_BIND
, NULL
},
757 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
758 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/net", "%r/proc/sys/net", NULL
, MS_MOVE
, NULL
},
759 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL
, MS_BIND
, NULL
},
760 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sysrq-trigger", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
761 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_RW
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
762 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RW
, "sysfs", "%r/sys", "sysfs", 0, NULL
},
763 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RO
, "sysfs", "%r/sys", "sysfs", MS_RDONLY
, NULL
},
764 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "sysfs", "%r/sys", "sysfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
765 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "%r/sys", "%r/sys", NULL
, MS_BIND
, NULL
},
766 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, NULL
, "%r/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
767 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL
},
768 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL
, MS_BIND
, NULL
},
769 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, NULL
, "%r/sys/devices/virtual/net", NULL
, MS_REMOUNT
|MS_BIND
|MS_NOSUID
|MS_NODEV
|MS_NOEXEC
, NULL
},
770 { 0, 0, NULL
, NULL
, NULL
, 0, NULL
}
773 for (i
= 0; default_mounts
[i
].match_mask
; i
++) {
774 if ((flags
& default_mounts
[i
].match_mask
) == default_mounts
[i
].match_flag
) {
776 char *destination
= NULL
;
778 unsigned long mflags
;
780 if (default_mounts
[i
].source
) {
781 /* will act like strdup if %r is not present */
782 source
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].source
);
784 SYSERROR("memory allocation error");
788 if (default_mounts
[i
].destination
) {
789 /* will act like strdup if %r is not present */
790 destination
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].destination
);
793 SYSERROR("memory allocation error");
799 mflags
= add_required_remount_flags(source
, destination
,
800 default_mounts
[i
].flags
);
801 r
= mount(source
, destination
, default_mounts
[i
].fstype
, mflags
, default_mounts
[i
].options
);
803 if (r
< 0 && errno
== ENOENT
) {
804 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source
, destination
);
808 SYSERROR("error mounting %s on %s flags %lu", source
, destination
, mflags
);
819 if (flags
& LXC_AUTO_CGROUP_MASK
) {
822 cg_flags
= flags
& LXC_AUTO_CGROUP_MASK
;
823 /* If the type of cgroup mount was not specified, it depends on the
824 * container's capabilities as to what makes sense: if we have
825 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
826 * anyway, so we may as well default to read-write; then the admin
827 * will not be given a false sense of security. (And if they really
828 * want mixed r/o r/w, then they can explicitly specify :mixed.)
829 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
830 * :mixed, because then the container can't remount it read-write. */
831 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
|| cg_flags
== LXC_AUTO_CGROUP_FULL_NOSPEC
) {
832 int has_sys_admin
= 0;
833 if (!lxc_list_empty(&conf
->keepcaps
)) {
834 has_sys_admin
= in_caplist(CAP_SYS_ADMIN
, &conf
->keepcaps
);
836 has_sys_admin
= !in_caplist(CAP_SYS_ADMIN
, &conf
->caps
);
838 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
) {
839 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_RW
: LXC_AUTO_CGROUP_MIXED
;
841 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_FULL_RW
: LXC_AUTO_CGROUP_FULL_MIXED
;
845 if (!cgroup_mount(conf
->rootfs
.mount
, handler
, cg_flags
)) {
846 SYSERROR("error mounting /sys/fs/cgroup");
854 static int mount_rootfs(const char *rootfs
, const char *target
, const char *options
)
856 char absrootfs
[MAXPATHLEN
];
860 typedef int (*rootfs_cb
)(const char *, const char *, const char *);
866 { S_IFDIR
, mount_rootfs_dir
},
867 { S_IFBLK
, mount_rootfs_block
},
868 { S_IFREG
, mount_rootfs_file
},
871 if (!realpath(rootfs
, absrootfs
)) {
872 SYSERROR("failed to get real path for '%s'", rootfs
);
876 if (access(absrootfs
, F_OK
)) {
877 SYSERROR("'%s' is not accessible", absrootfs
);
881 if (stat(absrootfs
, &s
)) {
882 SYSERROR("failed to stat '%s'", absrootfs
);
886 for (i
= 0; i
< sizeof(rtfs_type
)/sizeof(rtfs_type
[0]); i
++) {
888 if (!__S_ISTYPE(s
.st_mode
, rtfs_type
[i
].type
))
891 return rtfs_type
[i
].cb(absrootfs
, target
, options
);
894 ERROR("unsupported rootfs type for '%s'", absrootfs
);
898 static int setup_utsname(struct utsname
*utsname
)
903 if (sethostname(utsname
->nodename
, strlen(utsname
->nodename
))) {
904 SYSERROR("failed to set the hostname to '%s'", utsname
->nodename
);
908 INFO("'%s' hostname has been setup", utsname
->nodename
);
913 struct dev_symlinks
{
918 static const struct dev_symlinks dev_symlinks
[] = {
919 {"/proc/self/fd", "fd"},
920 {"/proc/self/fd/0", "stdin"},
921 {"/proc/self/fd/1", "stdout"},
922 {"/proc/self/fd/2", "stderr"},
925 static int setup_dev_symlinks(const struct lxc_rootfs
*rootfs
)
927 char path
[MAXPATHLEN
];
932 for (i
= 0; i
< sizeof(dev_symlinks
) / sizeof(dev_symlinks
[0]); i
++) {
933 const struct dev_symlinks
*d
= &dev_symlinks
[i
];
934 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
, d
->name
);
935 if (ret
< 0 || ret
>= MAXPATHLEN
)
939 * Stat the path first. If we don't get an error
940 * accept it as is and don't try to create it
942 if (!stat(path
, &s
)) {
946 ret
= symlink(d
->oldpath
, path
);
948 if (ret
&& errno
!= EEXIST
) {
949 if ( errno
== EROFS
) {
950 WARN("Warning: Read Only file system while creating %s", path
);
952 SYSERROR("Error creating %s", path
);
961 * Build a space-separated list of ptys to pass to systemd.
963 static bool append_ptyname(char **pp
, char *name
)
968 *pp
= malloc(strlen(name
) + strlen("container_ttys=") + 1);
971 sprintf(*pp
, "container_ttys=%s", name
);
974 p
= realloc(*pp
, strlen(*pp
) + strlen(name
) + 2);
983 static int setup_tty(struct lxc_conf
*conf
)
985 const struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
986 char *ttydir
= conf
->ttydir
;
987 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
990 if (!conf
->rootfs
.path
)
993 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
995 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
997 ret
= snprintf(path
, sizeof(path
), "/dev/tty%d", i
+ 1);
998 if (ret
>= sizeof(path
)) {
999 ERROR("pathname too long for ttys");
1003 /* create dev/lxc/tty%d */
1004 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "/dev/%s/tty%d", ttydir
, i
+ 1);
1005 if (ret
>= sizeof(lxcpath
)) {
1006 ERROR("pathname too long for ttys");
1009 ret
= creat(lxcpath
, 0660);
1010 if (ret
==-1 && errno
!= EEXIST
) {
1011 SYSERROR("error creating %s", lxcpath
);
1017 if (ret
&& errno
!= ENOENT
) {
1018 SYSERROR("error unlinking %s", path
);
1022 if (mount(pty_info
->name
, lxcpath
, "none", MS_BIND
, 0)) {
1023 WARN("failed to mount '%s'->'%s'",
1024 pty_info
->name
, path
);
1028 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/tty%d", ttydir
, i
+1);
1029 if (ret
>= sizeof(lxcpath
)) {
1030 ERROR("tty pathname too long");
1033 ret
= symlink(lxcpath
, path
);
1035 SYSERROR("failed to create symlink for tty %d", i
+1);
1039 /* If we populated /dev, then we need to create /dev/ttyN */
1040 if (access(path
, F_OK
)) {
1041 ret
= creat(path
, 0660);
1043 SYSERROR("error creating %s", path
);
1044 /* this isn't fatal, continue */
1049 if (mount(pty_info
->name
, path
, "none", MS_BIND
, 0)) {
1050 SYSERROR("failed to mount '%s'->'%s'", pty_info
->name
, path
);
1054 if (!append_ptyname(&conf
->pty_names
, pty_info
->name
)) {
1055 ERROR("Error setting up container_ttys string");
1060 INFO("%d tty(s) has been setup", tty_info
->nbtty
);
1066 static int setup_rootfs_pivot_root(const char *rootfs
, const char *pivotdir
)
1068 int oldroot
= -1, newroot
= -1;
1070 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
);
1072 SYSERROR("Error opening old-/ for fchdir");
1075 newroot
= open(rootfs
, O_DIRECTORY
| O_RDONLY
);
1077 SYSERROR("Error opening new-/ for fchdir");
1081 /* change into new root fs */
1082 if (fchdir(newroot
)) {
1083 SYSERROR("can't chdir to new rootfs '%s'", rootfs
);
1087 /* pivot_root into our new root fs */
1088 if (pivot_root(".", ".")) {
1089 SYSERROR("pivot_root syscall failed");
1094 * at this point the old-root is mounted on top of our new-root
1095 * To unmount it we must not be chdir'd into it, so escape back
1098 if (fchdir(oldroot
) < 0) {
1099 SYSERROR("Error entering oldroot");
1102 if (umount2(".", MNT_DETACH
) < 0) {
1103 SYSERROR("Error detaching old root");
1107 if (fchdir(newroot
) < 0) {
1108 SYSERROR("Error re-entering newroot");
1115 DEBUG("pivot_root syscall to '%s' successful", rootfs
);
1128 * Just create a path for /dev under $lxcpath/$name and in rootfs
1129 * If we hit an error, log it but don't fail yet.
1131 static int mount_autodev(const char *name
, char *root
, const char *lxcpath
)
1137 INFO("Mounting /dev under %s", root
);
1139 /* $(root) + "/dev/pts" + '\0' */
1140 clen
= strlen(root
) + 9;
1141 path
= alloca(clen
);
1143 ret
= snprintf(path
, clen
, "%s/dev", root
);
1144 if (ret
< 0 || ret
>= clen
)
1147 if (!dir_exists(path
)) {
1148 WARN("No /dev on container rootfs.");
1149 WARN("Proceeding without autodev setup");
1153 if (mount("none", path
, "tmpfs", 0, "size=100000,mode=755")) {
1154 SYSERROR("Failed mounting tmpfs onto %s\n", path
);
1158 INFO("Mounted tmpfs onto %s", path
);
1160 ret
= snprintf(path
, clen
, "%s/dev/pts", root
);
1161 if (ret
< 0 || ret
>= clen
)
1165 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1166 * If not, then create it and exit if that fails...
1168 if (!dir_exists(path
)) {
1169 ret
= mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1171 SYSERROR("Failed to create /dev/pts in container");
1176 INFO("Mounted /dev under %s", root
);
1187 static const struct lxc_devs lxc_devs
[] = {
1188 { "null", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 3 },
1189 { "zero", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 5 },
1190 { "full", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 7 },
1191 { "urandom", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 9 },
1192 { "random", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 8 },
1193 { "tty", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 5, 0 },
1194 { "console", S_IFCHR
| S_IRUSR
| S_IWUSR
, 5, 1 },
1197 static int fill_autodev(const char *root
)
1200 char path
[MAXPATHLEN
];
1204 INFO("Creating initial consoles under %s/dev", root
);
1206 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", root
);
1207 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1208 ERROR("Error calculating container /dev location");
1212 if (!dir_exists(path
)) // ignore, just don't try to fill in
1215 INFO("Populating /dev under %s", root
);
1216 cmask
= umask(S_IXUSR
| S_IXGRP
| S_IXOTH
);
1217 for (i
= 0; i
< sizeof(lxc_devs
) / sizeof(lxc_devs
[0]); i
++) {
1218 const struct lxc_devs
*d
= &lxc_devs
[i
];
1219 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/%s", root
, d
->name
);
1220 if (ret
< 0 || ret
>= MAXPATHLEN
)
1222 ret
= mknod(path
, d
->mode
, makedev(d
->maj
, d
->min
));
1223 if (ret
&& errno
!= EEXIST
) {
1224 char hostpath
[MAXPATHLEN
];
1227 // Unprivileged containers cannot create devices, so
1228 // bind mount the device from the host
1229 ret
= snprintf(hostpath
, MAXPATHLEN
, "/dev/%s", d
->name
);
1230 if (ret
< 0 || ret
>= MAXPATHLEN
)
1232 pathfile
= fopen(path
, "wb");
1234 SYSERROR("Failed to create device mount target '%s'", path
);
1238 if (mount(hostpath
, path
, 0, MS_BIND
, NULL
) != 0) {
1239 SYSERROR("Failed bind mounting device %s from host into container",
1247 INFO("Populated /dev under %s", root
);
1251 static int setup_rootfs(struct lxc_conf
*conf
)
1253 const struct lxc_rootfs
*rootfs
= &conf
->rootfs
;
1255 if (!rootfs
->path
) {
1256 if (mount("", "/", NULL
, MS_SLAVE
|MS_REC
, 0)) {
1257 SYSERROR("Failed to make / rslave");
1263 if (access(rootfs
->mount
, F_OK
)) {
1264 SYSERROR("failed to access to '%s', check it is present",
1269 // First try mounting rootfs using a bdev
1270 struct bdev
*bdev
= bdev_init(conf
, rootfs
->path
, rootfs
->mount
, rootfs
->options
);
1271 if (bdev
&& bdev
->ops
->mount(bdev
) == 0) {
1273 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
1278 if (mount_rootfs(rootfs
->path
, rootfs
->mount
, rootfs
->options
)) {
1279 ERROR("failed to mount rootfs");
1283 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
1288 int prepare_ramfs_root(char *root
)
1290 char buf
[LINELEN
], *p
;
1291 char nroot
[PATH_MAX
];
1296 if (realpath(root
, nroot
) == NULL
)
1299 if (chdir("/") == -1)
1303 * We could use here MS_MOVE, but in userns this mount is
1304 * locked and can't be moved.
1306 if (mount(root
, "/", NULL
, MS_REC
| MS_BIND
, NULL
)) {
1307 SYSERROR("Failed to move %s into /", root
);
1311 if (mount(NULL
, "/", NULL
, MS_REC
| MS_PRIVATE
, NULL
)) {
1312 SYSERROR("Failed to make . rprivate");
1317 * The following code cleans up inherited mounts which are not
1320 * The mountinfo file does not show all mounts if a few mount points have been
1321 * unmounted between read operations from the mountinfo. So we need to
1322 * read mountinfo a few times.
1324 * This loop can be skipped if a container uses userns, because all
1325 * inherited mounts are locked and we should live with all this trash.
1330 f
= fopen("./proc/self/mountinfo", "r");
1332 SYSERROR("Unable to open /proc/self/mountinfo");
1335 while (fgets(buf
, LINELEN
, f
)) {
1336 for (p
= buf
, i
=0; p
&& i
< 4; i
++)
1337 p
= strchr(p
+1, ' ');
1340 p2
= strchr(p
+1, ' ');
1347 if (strcmp(p
+ 1, "/") == 0)
1349 if (strcmp(p
+ 1, "/proc") == 0)
1352 if (umount2(p
, MNT_DETACH
) == 0)
1360 if (umount2("./proc", MNT_DETACH
)) {
1361 SYSERROR("Unable to umount /proc");
1365 /* It is weird, but chdir("..") moves us in a new root */
1366 if (chdir("..") == -1) {
1367 SYSERROR("Unable to change working directory");
1371 if (chroot(".") == -1) {
1372 SYSERROR("Unable to chroot");
1379 static int setup_pivot_root(const struct lxc_rootfs
*rootfs
)
1384 if (detect_ramfs_rootfs()) {
1385 if (prepare_ramfs_root(rootfs
->mount
))
1387 } else if (setup_rootfs_pivot_root(rootfs
->mount
, rootfs
->pivot
)) {
1388 ERROR("failed to setup pivot root");
1395 static int setup_pts(int pts
)
1397 char target
[PATH_MAX
];
1402 if (!access("/dev/pts/ptmx", F_OK
) && umount("/dev/pts")) {
1403 SYSERROR("failed to umount 'dev/pts'");
1407 if (mkdir("/dev/pts", 0755)) {
1408 if ( errno
!= EEXIST
) {
1409 SYSERROR("failed to create '/dev/pts'");
1414 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL
,
1415 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
1416 SYSERROR("failed to mount a new instance of '/dev/pts'");
1420 if (access("/dev/ptmx", F_OK
)) {
1421 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1423 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
1427 if (realpath("/dev/ptmx", target
) && !strcmp(target
, "/dev/pts/ptmx"))
1430 /* fallback here, /dev/pts/ptmx exists just mount bind */
1431 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND
, 0)) {
1432 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
1436 INFO("created new pts instance");
1442 static int setup_personality(int persona
)
1444 #if HAVE_SYS_PERSONALITY_H
1448 if (personality(persona
) < 0) {
1449 SYSERROR("failed to set personality to '0x%x'", persona
);
1453 INFO("set personality to '0x%x'", persona
);
1459 static int setup_dev_console(const struct lxc_rootfs
*rootfs
,
1460 const struct lxc_console
*console
)
1462 char path
[MAXPATHLEN
];
1466 ret
= snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1467 if (ret
>= sizeof(path
)) {
1468 ERROR("console path too long");
1472 if (access(path
, F_OK
)) {
1473 WARN("rootfs specified but no console found at '%s'", path
);
1477 if (console
->master
< 0) {
1482 if (stat(path
, &s
)) {
1483 SYSERROR("failed to stat '%s'", path
);
1487 if (chmod(console
->name
, s
.st_mode
)) {
1488 SYSERROR("failed to set mode '0%o' to '%s'",
1489 s
.st_mode
, console
->name
);
1493 if (mount(console
->name
, path
, "none", MS_BIND
, 0)) {
1494 ERROR("failed to mount '%s' on '%s'", console
->name
, path
);
1498 INFO("console has been setup");
1502 static int setup_ttydir_console(const struct lxc_rootfs
*rootfs
,
1503 const struct lxc_console
*console
,
1506 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
1509 /* create rootfs/dev/<ttydir> directory */
1510 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
,
1512 if (ret
>= sizeof(path
))
1514 ret
= mkdir(path
, 0755);
1515 if (ret
&& errno
!= EEXIST
) {
1516 SYSERROR("failed with errno %d to create %s", errno
, path
);
1519 INFO("created %s", path
);
1521 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/console",
1522 rootfs
->mount
, ttydir
);
1523 if (ret
>= sizeof(lxcpath
)) {
1524 ERROR("console path too long");
1528 snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1530 if (ret
&& errno
!= ENOENT
) {
1531 SYSERROR("error unlinking %s", path
);
1535 ret
= creat(lxcpath
, 0660);
1536 if (ret
==-1 && errno
!= EEXIST
) {
1537 SYSERROR("error %d creating %s", errno
, lxcpath
);
1543 if (console
->master
< 0) {
1548 if (mount(console
->name
, lxcpath
, "none", MS_BIND
, 0)) {
1549 ERROR("failed to mount '%s' on '%s'", console
->name
, lxcpath
);
1553 /* create symlink from rootfs/dev/console to 'lxc/console' */
1554 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/console", ttydir
);
1555 if (ret
>= sizeof(lxcpath
)) {
1556 ERROR("lxc/console path too long");
1559 ret
= symlink(lxcpath
, path
);
1561 SYSERROR("failed to create symlink for console");
1565 INFO("console has been setup on %s", lxcpath
);
1570 static int setup_console(const struct lxc_rootfs
*rootfs
,
1571 const struct lxc_console
*console
,
1574 /* We don't have a rootfs, /dev/console will be shared */
1578 return setup_dev_console(rootfs
, console
);
1580 return setup_ttydir_console(rootfs
, console
, ttydir
);
1583 static int setup_kmsg(const struct lxc_rootfs
*rootfs
,
1584 const struct lxc_console
*console
)
1586 char kpath
[MAXPATHLEN
];
1591 ret
= snprintf(kpath
, sizeof(kpath
), "%s/dev/kmsg", rootfs
->mount
);
1592 if (ret
< 0 || ret
>= sizeof(kpath
))
1595 ret
= unlink(kpath
);
1596 if (ret
&& errno
!= ENOENT
) {
1597 SYSERROR("error unlinking %s", kpath
);
1601 ret
= symlink("console", kpath
);
1603 SYSERROR("failed to create symlink for kmsg");
1610 static void parse_mntopt(char *opt
, unsigned long *flags
, char **data
)
1612 struct mount_opt
*mo
;
1614 /* If opt is found in mount_opt, set or clear flags.
1615 * Otherwise append it to data. */
1617 for (mo
= &mount_opt
[0]; mo
->name
!= NULL
; mo
++) {
1618 if (!strncmp(opt
, mo
->name
, strlen(mo
->name
))) {
1620 *flags
&= ~mo
->flag
;
1632 int parse_mntopts(const char *mntopts
, unsigned long *mntflags
,
1636 char *p
, *saveptr
= NULL
;
1644 s
= strdup(mntopts
);
1646 SYSERROR("failed to allocate memory");
1650 data
= malloc(strlen(s
) + 1);
1652 SYSERROR("failed to allocate memory");
1658 for (p
= strtok_r(s
, ",", &saveptr
); p
!= NULL
;
1659 p
= strtok_r(NULL
, ",", &saveptr
))
1660 parse_mntopt(p
, mntflags
, &data
);
/*
 * Cut @word at its first blank.
 *
 * Scans forward to the first space or tab (or the terminating NUL) and
 * stores a NUL there, so that @word holds only its leading token.  A string
 * that contains no blank is left unchanged.  @word is modified in place.
 */
static void null_endofword(char *word)
{
	while (*word && *word != ' ' && *word != '\t')
		word++;
	*word = '\0';
}
/*
 * Skip @nfields blank-separated fields in @src.
 *
 * Each step advances past the run of non-blank characters and then past
 * exactly one separator (space or tab).  Consecutive blanks are therefore
 * NOT collapsed: every single blank counts as one field boundary.
 *
 * Returns a pointer into @src at the start of field number @nfields
 * (0-based), or a pointer to the terminating NUL when @src has fewer
 * fields than requested.  @src itself is not modified.
 */
static char *get_field(char *src, int nfields)
{
	char *p = src;
	int i;

	for (i = 0; i < nfields; i++) {
		while (*p && *p != ' ' && *p != '\t')
			p++;
		/* ran out of input: stay on the NUL instead of stepping over it */
		if (!*p)
			break;
		p++;
	}
	return p;
}
1696 static int mount_entry(const char *fsname
, const char *target
,
1697 const char *fstype
, unsigned long mountflags
,
1698 const char *data
, int optional
)
1704 if (mount(fsname
, target
, fstype
, mountflags
& ~MS_REMOUNT
, data
)) {
1706 INFO("failed to mount '%s' on '%s' (optional): %s", fsname
,
1707 target
, strerror(errno
));
1711 SYSERROR("failed to mount '%s' on '%s'", fsname
, target
);
1716 if ((mountflags
& MS_REMOUNT
) || (mountflags
& MS_BIND
)) {
1717 DEBUG("remounting %s on %s to respect bind or remount options",
1718 fsname
? fsname
: "(none)", target
? target
: "(none)");
1719 unsigned long rqd_flags
= 0;
1720 if (mountflags
& MS_RDONLY
)
1721 rqd_flags
|= MS_RDONLY
;
1723 if (statvfs(fsname
, &sb
) == 0) {
1724 unsigned long required_flags
= rqd_flags
;
1725 if (sb
.f_flag
& MS_NOSUID
)
1726 required_flags
|= MS_NOSUID
;
1727 if (sb
.f_flag
& MS_NODEV
)
1728 required_flags
|= MS_NODEV
;
1729 if (sb
.f_flag
& MS_RDONLY
)
1730 required_flags
|= MS_RDONLY
;
1731 if (sb
.f_flag
& MS_NOEXEC
)
1732 required_flags
|= MS_NOEXEC
;
1733 DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname
, sb
.f_flag
, required_flags
);
1735 * If this was a bind mount request, and required_flags
1736 * does not have any flags which are not already in
1737 * mountflags, then skip the remount
1739 if (!(mountflags
& MS_REMOUNT
)) {
1740 if (!(required_flags
& ~mountflags
) && rqd_flags
== 0) {
1741 DEBUG("mountflags already was %lu, skipping remount",
1746 mountflags
|= required_flags
;
1750 if (mount(fsname
, target
, fstype
,
1751 mountflags
| MS_REMOUNT
, data
)) {
1753 INFO("failed to mount '%s' on '%s' (optional): %s",
1754 fsname
, target
, strerror(errno
));
1758 SYSERROR("failed to mount '%s' on '%s'",
1768 DEBUG("mounted '%s' on '%s', type '%s'", fsname
, target
, fstype
);
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntent->mnt_opts.
 *
 * These options are consumed by lxc itself and must not reach mount(2).
 * The option string is edited in place.  Only the first occurrence of each
 * keyword is removed, and the match is a plain substring search.
 *
 * When the removed keyword is the last option in the list, the comma that
 * preceded it is removed as well, so "ro,optional" becomes "ro" rather
 * than "ro,".
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const list[] = {
		"create=dir",
		"create=file",
		"optional",
		NULL
	};
	int i;

	for (i = 0; list[i]; i++) {
		char *p, *p2;

		p = strstr(mntent->mnt_opts, list[i]);
		if (!p)
			continue;

		p2 = strchr(p, ',');
		if (p2) {
			/* more options follow: slide them (and the NUL) down */
			memmove(p, p2 + 1, strlen(p2 + 1) + 1);
			continue;
		}

		/* no more mntopts, so just chop it here; also drop the
		 * separator that introduced this option, if there is one */
		if (p > mntent->mnt_opts && p[-1] == ',')
			p--;
		*p = '\0';
	}
}
/*
 * Mount one fstab-style entry for a container that has no separate rootfs:
 * mntent->mnt_dir is used as the mount target as-is.
 *
 * Honours the lxc-private options 'optional', 'create=dir' and
 * 'create=file' before stripping them from the option string.  Failure to
 * pre-create the target is only warned about; the mount attempt itself
 * decides the result.
 *
 * Returns the result of mount_entry(), or -1 if the options cannot be
 * parsed.
 */
static inline int mount_entry_on_systemfs(struct mntent *mntent)
{
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathdirname = NULL;
	/* must be sampled before cull_mntent_opt() removes the keyword */
	bool optional = hasmntopt(mntent, "optional") != NULL;

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(mntent->mnt_dir, 0755) < 0)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
	}

	if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * that is not the start of the buffer (or not into the buffer
		 * at all), so keep the strdup() result for free() and never
		 * free what dirname() returned.
		 */
		pathdirname = strdup(mntent->mnt_dir);
		if (!pathdirname || mkdir_p(dirname(pathdirname), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathdirname);

		pathfile = fopen(mntent->mnt_dir, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
		else
			fclose(pathfile);
	}

	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
			  mntent->mnt_type, mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
1845 static int mount_entry_on_absolute_rootfs(struct mntent
*mntent
,
1846 const struct lxc_rootfs
*rootfs
,
1847 const char *lxc_name
)
1850 char path
[MAXPATHLEN
];
1851 unsigned long mntflags
;
1853 int r
, ret
= 0, offset
;
1854 const char *lxcpath
;
1855 FILE *pathfile
= NULL
;
1856 char *pathdirname
= NULL
;
1857 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1859 lxcpath
= lxc_global_config_value("lxc.lxcpath");
1861 ERROR("Out of memory");
1865 /* if rootfs->path is a blockdev path, allow container fstab to
1866 * use $lxcpath/CN/rootfs as the target prefix */
1867 r
= snprintf(path
, MAXPATHLEN
, "%s/%s/rootfs", lxcpath
, lxc_name
);
1868 if (r
< 0 || r
>= MAXPATHLEN
)
1871 aux
= strstr(mntent
->mnt_dir
, path
);
1873 offset
= strlen(path
);
1878 aux
= strstr(mntent
->mnt_dir
, rootfs
->path
);
1880 WARN("ignoring mount point '%s'", mntent
->mnt_dir
);
1883 offset
= strlen(rootfs
->path
);
1887 r
= snprintf(path
, MAXPATHLEN
, "%s/%s", rootfs
->mount
,
1889 if (r
< 0 || r
>= MAXPATHLEN
) {
1890 WARN("pathnme too long for '%s'", mntent
->mnt_dir
);
1895 if (hasmntopt(mntent
, "create=dir")) {
1896 if (mkdir_p(path
, 0755) < 0) {
1897 WARN("Failed to create mount target '%s'", path
);
1902 if (hasmntopt(mntent
, "create=file") && access(path
, F_OK
)) {
1903 pathdirname
= strdup(path
);
1904 pathdirname
= dirname(pathdirname
);
1905 if (mkdir_p(pathdirname
, 0755) < 0) {
1906 WARN("Failed to create target directory");
1908 pathfile
= fopen(path
, "wb");
1910 WARN("Failed to create mount target '%s'", path
);
1916 cull_mntent_opt(mntent
);
1918 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
1923 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
,
1924 mntflags
, mntdata
, optional
);
/*
 * Mount one fstab-style entry whose target is relative: the target becomes
 * "<rootfs>/<mntent->mnt_dir>".
 *
 * Honours the lxc-private options 'optional', 'create=dir' and
 * 'create=file' before stripping them from the option string.  Failure to
 * pre-create the target is only warned about; the mount attempt itself
 * decides the result.
 *
 * Returns the result of mount_entry(), or -1 if the path does not fit or
 * the options cannot be parsed.
 */
static int mount_entry_on_relative_rootfs(struct mntent *mntent,
					  const char *rootfs)
{
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata;
	int ret;
	FILE *pathfile = NULL;
	char *pathdirname = NULL;
	/* must be sampled before cull_mntent_opt() removes the keyword */
	bool optional = hasmntopt(mntent, "optional") != NULL;

	/* relative to root mount point */
	ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/*
		 * dirname() may modify its argument and may return a pointer
		 * that is not the start of the buffer (or not into the buffer
		 * at all), so keep the strdup() result for free() and never
		 * free what dirname() returned.
		 */
		pathdirname = strdup(path);
		if (!pathdirname || mkdir_p(dirname(pathdirname), 0755) < 0)
			WARN("Failed to create target directory");
		free(pathdirname);

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}
	cull_mntent_opt(mntent);

	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata, optional);

	free(mntdata);

	return ret;
}
1988 static int mount_file_entries(const struct lxc_rootfs
*rootfs
, FILE *file
,
1989 const char *lxc_name
)
1991 struct mntent mntent
;
1995 while (getmntent_r(file
, &mntent
, buf
, sizeof(buf
))) {
1997 if (!rootfs
->path
) {
1998 if (mount_entry_on_systemfs(&mntent
))
2003 /* We have a separate root, mounts are relative to it */
2004 if (mntent
.mnt_dir
[0] != '/') {
2005 if (mount_entry_on_relative_rootfs(&mntent
,
2011 if (mount_entry_on_absolute_rootfs(&mntent
, rootfs
, lxc_name
))
2017 INFO("mount points have been setup");
2022 static int setup_mount(const struct lxc_rootfs
*rootfs
, const char *fstab
,
2023 const char *lxc_name
)
2031 file
= setmntent(fstab
, "r");
2033 SYSERROR("failed to use '%s'", fstab
);
2037 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
2043 FILE *write_mount_file(struct lxc_list
*mount
)
2046 struct lxc_list
*iterator
;
2051 ERROR("tmpfile error: %m");
2055 lxc_list_for_each(iterator
, mount
) {
2056 mount_entry
= iterator
->elem
;
2057 fprintf(file
, "%s\n", mount_entry
);
2064 static int setup_mount_entries(const struct lxc_rootfs
*rootfs
, struct lxc_list
*mount
,
2065 const char *lxc_name
)
2070 file
= write_mount_file(mount
);
2074 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
2080 static int parse_cap(const char *cap
)
2085 if (!strcmp(cap
, "none"))
2088 for (i
= 0; i
< sizeof(caps_opt
)/sizeof(caps_opt
[0]); i
++) {
2090 if (strcmp(cap
, caps_opt
[i
].name
))
2093 capid
= caps_opt
[i
].value
;
2098 /* try to see if it's numeric, so the user may specify
2099 * capabilities that the running kernel knows about but
2102 capid
= strtol(cap
, &ptr
, 10);
2103 if (!ptr
|| *ptr
!= '\0' || errno
!= 0)
2104 /* not a valid number */
2106 else if (capid
> lxc_caps_last_cap())
2107 /* we have a number but it's not a valid
2115 int in_caplist(int cap
, struct lxc_list
*caps
)
2117 struct lxc_list
*iterator
;
2120 lxc_list_for_each(iterator
, caps
) {
2121 capid
= parse_cap(iterator
->elem
);
2129 static int setup_caps(struct lxc_list
*caps
)
2131 struct lxc_list
*iterator
;
2135 lxc_list_for_each(iterator
, caps
) {
2137 drop_entry
= iterator
->elem
;
2139 capid
= parse_cap(drop_entry
);
2142 ERROR("unknown capability %s", drop_entry
);
2146 DEBUG("drop capability '%s' (%d)", drop_entry
, capid
);
2148 if (prctl(PR_CAPBSET_DROP
, capid
, 0, 0, 0)) {
2149 SYSERROR("failed to remove %s capability", drop_entry
);
2155 DEBUG("capabilities have been setup");
2160 static int dropcaps_except(struct lxc_list
*caps
)
2162 struct lxc_list
*iterator
;
2165 int numcaps
= lxc_caps_last_cap() + 1;
2166 INFO("found %d capabilities", numcaps
);
2168 if (numcaps
<= 0 || numcaps
> 200)
2171 // caplist[i] is 1 if we keep capability i
2172 int *caplist
= alloca(numcaps
* sizeof(int));
2173 memset(caplist
, 0, numcaps
* sizeof(int));
2175 lxc_list_for_each(iterator
, caps
) {
2177 keep_entry
= iterator
->elem
;
2179 capid
= parse_cap(keep_entry
);
2185 ERROR("unknown capability %s", keep_entry
);
2189 DEBUG("keep capability '%s' (%d)", keep_entry
, capid
);
2193 for (i
=0; i
<numcaps
; i
++) {
2196 if (prctl(PR_CAPBSET_DROP
, i
, 0, 0, 0)) {
2197 SYSERROR("failed to remove capability %d", i
);
2202 DEBUG("capabilities have been setup");
2207 static int setup_hw_addr(char *hwaddr
, const char *ifname
)
2209 struct sockaddr sockaddr
;
2213 ret
= lxc_convert_mac(hwaddr
, &sockaddr
);
2215 ERROR("mac address '%s' conversion failed : %s",
2216 hwaddr
, strerror(-ret
));
2220 memcpy(ifr
.ifr_name
, ifname
, IFNAMSIZ
);
2221 ifr
.ifr_name
[IFNAMSIZ
-1] = '\0';
2222 memcpy((char *) &ifr
.ifr_hwaddr
, (char *) &sockaddr
, sizeof(sockaddr
));
2224 fd
= socket(AF_INET
, SOCK_DGRAM
, 0);
2226 ERROR("socket failure : %s", strerror(errno
));
2230 ret
= ioctl(fd
, SIOCSIFHWADDR
, &ifr
);
2233 ERROR("ioctl failure : %s", strerror(errno
));
2235 DEBUG("mac address '%s' on '%s' has been setup", hwaddr
, ifr
.ifr_name
);
2240 static int setup_ipv4_addr(struct lxc_list
*ip
, int ifindex
)
2242 struct lxc_list
*iterator
;
2243 struct lxc_inetdev
*inetdev
;
2246 lxc_list_for_each(iterator
, ip
) {
2248 inetdev
= iterator
->elem
;
2250 err
= lxc_ipv4_addr_add(ifindex
, &inetdev
->addr
,
2251 &inetdev
->bcast
, inetdev
->prefix
);
2253 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2254 ifindex
, strerror(-err
));
2262 static int setup_ipv6_addr(struct lxc_list
*ip
, int ifindex
)
2264 struct lxc_list
*iterator
;
2265 struct lxc_inet6dev
*inet6dev
;
2268 lxc_list_for_each(iterator
, ip
) {
2270 inet6dev
= iterator
->elem
;
2272 err
= lxc_ipv6_addr_add(ifindex
, &inet6dev
->addr
,
2273 &inet6dev
->mcast
, &inet6dev
->acast
,
2276 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2277 ifindex
, strerror(-err
));
2285 static int setup_netdev(struct lxc_netdev
*netdev
)
2287 char ifname
[IFNAMSIZ
];
2288 char *current_ifname
= ifname
;
2291 /* empty network namespace */
2292 if (!netdev
->ifindex
) {
2293 if (netdev
->flags
& IFF_UP
) {
2294 err
= lxc_netdev_up("lo");
2296 ERROR("failed to set the loopback up : %s",
2301 if (netdev
->type
!= LXC_NET_VETH
)
2303 netdev
->ifindex
= if_nametoindex(netdev
->name
);
2306 /* get the new ifindex in case of physical netdev */
2307 if (netdev
->type
== LXC_NET_PHYS
) {
2308 if (!(netdev
->ifindex
= if_nametoindex(netdev
->link
))) {
2309 ERROR("failed to get ifindex for %s",
2315 /* retrieve the name of the interface */
2316 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2317 ERROR("no interface corresponding to index '%d'",
2322 /* default: let the system to choose one interface name */
2324 netdev
->name
= netdev
->type
== LXC_NET_PHYS
?
2325 netdev
->link
: "eth%d";
2327 /* rename the interface name */
2328 if (strcmp(ifname
, netdev
->name
) != 0) {
2329 err
= lxc_netdev_rename_by_name(ifname
, netdev
->name
);
2331 ERROR("failed to rename %s->%s : %s", ifname
, netdev
->name
,
2337 /* Re-read the name of the interface because its name has changed
2338 * and would be automatically allocated by the system
2340 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2341 ERROR("no interface corresponding to index '%d'",
2346 /* set a mac address */
2347 if (netdev
->hwaddr
) {
2348 if (setup_hw_addr(netdev
->hwaddr
, current_ifname
)) {
2349 ERROR("failed to setup hw address for '%s'",
2355 /* setup ipv4 addresses on the interface */
2356 if (setup_ipv4_addr(&netdev
->ipv4
, netdev
->ifindex
)) {
2357 ERROR("failed to setup ip addresses for '%s'",
2362 /* setup ipv6 addresses on the interface */
2363 if (setup_ipv6_addr(&netdev
->ipv6
, netdev
->ifindex
)) {
2364 ERROR("failed to setup ipv6 addresses for '%s'",
2369 /* set the network device up */
2370 if (netdev
->flags
& IFF_UP
) {
2373 err
= lxc_netdev_up(current_ifname
);
2375 ERROR("failed to set '%s' up : %s", current_ifname
,
2380 /* the network is up, make the loopback up too */
2381 err
= lxc_netdev_up("lo");
2383 ERROR("failed to set the loopback up : %s",
2389 /* We can only set up the default routes after bringing
2390 * up the interface, since bringing up the interface adds
2391 * the link-local routes and we can't add a default
2392 * route if the gateway is not reachable. */
2394 /* setup ipv4 gateway on the interface */
2395 if (netdev
->ipv4_gateway
) {
2396 if (!(netdev
->flags
& IFF_UP
)) {
2397 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname
);
2401 if (lxc_list_empty(&netdev
->ipv4
)) {
2402 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname
);
2406 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2408 err
= lxc_ipv4_dest_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2410 ERROR("failed to add ipv4 dest for '%s': %s",
2411 ifname
, strerror(-err
));
2414 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2416 ERROR("failed to setup ipv4 gateway for '%s': %s",
2417 ifname
, strerror(-err
));
2418 if (netdev
->ipv4_gateway_auto
) {
2419 char buf
[INET_ADDRSTRLEN
];
2420 inet_ntop(AF_INET
, netdev
->ipv4_gateway
, buf
, sizeof(buf
));
2421 ERROR("tried to set autodetected ipv4 gateway '%s'", buf
);
2428 /* setup ipv6 gateway on the interface */
2429 if (netdev
->ipv6_gateway
) {
2430 if (!(netdev
->flags
& IFF_UP
)) {
2431 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname
);
2435 if (lxc_list_empty(&netdev
->ipv6
) && !IN6_IS_ADDR_LINKLOCAL(netdev
->ipv6_gateway
)) {
2436 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname
);
2440 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2442 err
= lxc_ipv6_dest_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2444 ERROR("failed to add ipv6 dest for '%s': %s",
2445 ifname
, strerror(-err
));
2448 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2450 ERROR("failed to setup ipv6 gateway for '%s': %s",
2451 ifname
, strerror(-err
));
2452 if (netdev
->ipv6_gateway_auto
) {
2453 char buf
[INET6_ADDRSTRLEN
];
2454 inet_ntop(AF_INET6
, netdev
->ipv6_gateway
, buf
, sizeof(buf
));
2455 ERROR("tried to set autodetected ipv6 gateway '%s'", buf
);
2462 DEBUG("'%s' has been setup", current_ifname
);
2467 static int setup_network(struct lxc_list
*network
)
2469 struct lxc_list
*iterator
;
2470 struct lxc_netdev
*netdev
;
2472 lxc_list_for_each(iterator
, network
) {
2474 netdev
= iterator
->elem
;
2476 if (setup_netdev(netdev
)) {
2477 ERROR("failed to setup netdev");
2482 if (!lxc_list_empty(network
))
2483 INFO("network has been setup");
2488 /* try to move physical nics to the init netns */
2489 void restore_phys_nics_to_netns(int netnsfd
, struct lxc_conf
*conf
)
2492 char path
[MAXPATHLEN
];
2497 ret
= snprintf(path
, MAXPATHLEN
, "/proc/self/ns/net");
2498 if (ret
< 0 || ret
>= MAXPATHLEN
) {
2499 WARN("Failed to open monitor netns fd");
2502 if ((oldfd
= open(path
, O_RDONLY
)) < 0) {
2503 SYSERROR("Failed to open monitor netns fd");
2506 if (setns(netnsfd
, 0) != 0) {
2507 SYSERROR("Failed to enter container netns to reset nics");
2511 for (i
=0; i
<conf
->num_savednics
; i
++) {
2512 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2513 if (lxc_netdev_move_by_index(s
->ifindex
, 1, NULL
))
2514 WARN("Error moving nic index:%d back to host netns",
2517 if (setns(oldfd
, 0) != 0)
2518 SYSERROR("Failed to re-enter monitor's netns");
2522 void lxc_rename_phys_nics_on_shutdown(int netnsfd
, struct lxc_conf
*conf
)
2526 if (conf
->num_savednics
== 0)
2529 INFO("running to reset %d nic names", conf
->num_savednics
);
2530 restore_phys_nics_to_netns(netnsfd
, conf
);
2531 for (i
=0; i
<conf
->num_savednics
; i
++) {
2532 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2533 INFO("resetting nic %d to %s", s
->ifindex
, s
->orig_name
);
2534 lxc_netdev_rename_by_index(s
->ifindex
, s
->orig_name
);
2537 conf
->num_savednics
= 0;
2540 static char *default_rootfs_mount
= LXCROOTFSMOUNT
;
2542 struct lxc_conf
*lxc_conf_init(void)
2544 struct lxc_conf
*new;
2547 new = malloc(sizeof(*new));
2549 ERROR("lxc_conf_init : %m");
2552 memset(new, 0, sizeof(*new));
2554 new->loglevel
= LXC_LOG_PRIORITY_NOTSET
;
2555 new->personality
= -1;
2557 new->console
.log_path
= NULL
;
2558 new->console
.log_fd
= -1;
2559 new->console
.path
= NULL
;
2560 new->console
.peer
= -1;
2561 new->console
.peerpty
.busy
= -1;
2562 new->console
.peerpty
.master
= -1;
2563 new->console
.peerpty
.slave
= -1;
2564 new->console
.master
= -1;
2565 new->console
.slave
= -1;
2566 new->console
.name
[0] = '\0';
2567 new->maincmd_fd
= -1;
2569 new->rootfs
.mount
= strdup(default_rootfs_mount
);
2570 if (!new->rootfs
.mount
) {
2571 ERROR("lxc_conf_init : %m");
2576 lxc_list_init(&new->cgroup
);
2577 lxc_list_init(&new->network
);
2578 lxc_list_init(&new->mount_list
);
2579 lxc_list_init(&new->caps
);
2580 lxc_list_init(&new->keepcaps
);
2581 lxc_list_init(&new->id_map
);
2582 lxc_list_init(&new->includes
);
2583 lxc_list_init(&new->aliens
);
2584 lxc_list_init(&new->environment
);
2585 for (i
=0; i
<NUM_LXC_HOOKS
; i
++)
2586 lxc_list_init(&new->hooks
[i
]);
2587 lxc_list_init(&new->groups
);
2588 new->lsm_aa_profile
= NULL
;
2589 new->lsm_se_context
= NULL
;
2590 new->tmp_umount_proc
= 0;
2592 for (i
= 0; i
< LXC_NS_MAX
; i
++)
2593 new->inherit_ns_fd
[i
] = -1;
2598 static int instantiate_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2600 char veth1buf
[IFNAMSIZ
], *veth1
;
2601 char veth2buf
[IFNAMSIZ
], *veth2
;
2604 if (netdev
->priv
.veth_attr
.pair
)
2605 veth1
= netdev
->priv
.veth_attr
.pair
;
2607 err
= snprintf(veth1buf
, sizeof(veth1buf
), "vethXXXXXX");
2608 if (err
>= sizeof(veth1buf
)) { /* can't *really* happen, but... */
2609 ERROR("veth1 name too long");
2612 veth1
= lxc_mkifname(veth1buf
);
2614 ERROR("failed to allocate a temporary name");
2617 /* store away for deconf */
2618 memcpy(netdev
->priv
.veth_attr
.veth1
, veth1
, IFNAMSIZ
);
2621 snprintf(veth2buf
, sizeof(veth2buf
), "vethXXXXXX");
2622 veth2
= lxc_mkifname(veth2buf
);
2624 ERROR("failed to allocate a temporary name");
2628 err
= lxc_veth_create(veth1
, veth2
);
2630 ERROR("failed to create veth pair (%s and %s): %s", veth1
, veth2
,
2635 /* changing the high byte of the mac address to 0xfe, the bridge interface
2636 * will always keep the host's mac address and not take the mac address
2638 err
= setup_private_host_hw_addr(veth1
);
2640 ERROR("failed to change mac address of host interface '%s': %s",
2641 veth1
, strerror(-err
));
2646 err
= lxc_netdev_set_mtu(veth1
, atoi(netdev
->mtu
));
2648 err
= lxc_netdev_set_mtu(veth2
, atoi(netdev
->mtu
));
2650 ERROR("failed to set mtu '%s' for veth pair (%s and %s): %s",
2651 netdev
->mtu
, veth1
, veth2
, strerror(-err
));
2657 err
= lxc_bridge_attach(netdev
->link
, veth1
);
2659 ERROR("failed to attach '%s' to the bridge '%s': %s",
2660 veth1
, netdev
->link
, strerror(-err
));
2665 netdev
->ifindex
= if_nametoindex(veth2
);
2666 if (!netdev
->ifindex
) {
2667 ERROR("failed to retrieve the index for %s", veth2
);
2671 err
= lxc_netdev_up(veth1
);
2673 ERROR("failed to set %s up : %s", veth1
, strerror(-err
));
2677 if (netdev
->upscript
) {
2678 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2679 "veth", veth1
, (char*) NULL
);
2684 DEBUG("instantiated veth '%s/%s', index is '%d'",
2685 veth1
, veth2
, netdev
->ifindex
);
2690 lxc_netdev_delete_by_name(veth1
);
2691 if (!netdev
->priv
.veth_attr
.pair
)
2697 static int shutdown_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2702 if (netdev
->priv
.veth_attr
.pair
)
2703 veth1
= netdev
->priv
.veth_attr
.pair
;
2705 veth1
= netdev
->priv
.veth_attr
.veth1
;
2707 if (netdev
->downscript
) {
2708 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2709 "down", "veth", veth1
, (char*) NULL
);
2716 static int instantiate_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2718 char peerbuf
[IFNAMSIZ
], *peer
;
2721 if (!netdev
->link
) {
2722 ERROR("no link specified for macvlan netdev");
2726 err
= snprintf(peerbuf
, sizeof(peerbuf
), "mcXXXXXX");
2727 if (err
>= sizeof(peerbuf
))
2730 peer
= lxc_mkifname(peerbuf
);
2732 ERROR("failed to make a temporary name");
2736 err
= lxc_macvlan_create(netdev
->link
, peer
,
2737 netdev
->priv
.macvlan_attr
.mode
);
2739 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2740 peer
, netdev
->link
, strerror(-err
));
2744 netdev
->ifindex
= if_nametoindex(peer
);
2745 if (!netdev
->ifindex
) {
2746 ERROR("failed to retrieve the index for %s", peer
);
2750 if (netdev
->upscript
) {
2751 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2752 "macvlan", netdev
->link
, (char*) NULL
);
2757 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2758 peer
, netdev
->ifindex
, netdev
->priv
.macvlan_attr
.mode
);
2762 lxc_netdev_delete_by_name(peer
);
2767 static int shutdown_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2771 if (netdev
->downscript
) {
2772 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2773 "down", "macvlan", netdev
->link
,
2781 /* XXX: merge with instantiate_macvlan */
2782 static int instantiate_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2784 char peer
[IFNAMSIZ
];
2786 static uint16_t vlan_cntr
= 0;
2788 if (!netdev
->link
) {
2789 ERROR("no link specified for vlan netdev");
2793 err
= snprintf(peer
, sizeof(peer
), "vlan%d-%d", netdev
->priv
.vlan_attr
.vid
, vlan_cntr
++);
2794 if (err
>= sizeof(peer
)) {
2795 ERROR("peer name too long");
2799 err
= lxc_vlan_create(netdev
->link
, peer
, netdev
->priv
.vlan_attr
.vid
);
2801 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2802 peer
, netdev
->link
, strerror(-err
));
2806 netdev
->ifindex
= if_nametoindex(peer
);
2807 if (!netdev
->ifindex
) {
2808 ERROR("failed to retrieve the ifindex for %s", peer
);
2809 lxc_netdev_delete_by_name(peer
);
2813 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
2819 static int shutdown_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2824 static int instantiate_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2826 if (!netdev
->link
) {
2827 ERROR("no link specified for the physical interface");
2831 netdev
->ifindex
= if_nametoindex(netdev
->link
);
2832 if (!netdev
->ifindex
) {
2833 ERROR("failed to retrieve the index for %s", netdev
->link
);
2837 if (netdev
->upscript
) {
2839 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2840 "up", "phys", netdev
->link
, (char*) NULL
);
2848 static int shutdown_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2852 if (netdev
->downscript
) {
2853 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2854 "down", "phys", netdev
->link
, (char*) NULL
);
2861 static int instantiate_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2863 netdev
->ifindex
= 0;
2867 static int instantiate_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2869 netdev
->ifindex
= 0;
2870 if (netdev
->upscript
) {
2872 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2873 "up", "empty", (char*) NULL
);
2880 static int shutdown_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2884 if (netdev
->downscript
) {
2885 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2886 "down", "empty", (char*) NULL
);
2893 static int shutdown_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2898 int lxc_requests_empty_network(struct lxc_handler
*handler
)
2900 struct lxc_list
*network
= &handler
->conf
->network
;
2901 struct lxc_list
*iterator
;
2902 struct lxc_netdev
*netdev
;
2903 bool found_none
= false, found_nic
= false;
2905 if (lxc_list_empty(network
))
2908 lxc_list_for_each(iterator
, network
) {
2910 netdev
= iterator
->elem
;
2912 if (netdev
->type
== LXC_NET_NONE
)
2917 if (found_none
&& !found_nic
)
2922 int lxc_create_network(struct lxc_handler
*handler
)
2924 struct lxc_list
*network
= &handler
->conf
->network
;
2925 struct lxc_list
*iterator
;
2926 struct lxc_netdev
*netdev
;
2927 int am_root
= (getuid() == 0);
2932 lxc_list_for_each(iterator
, network
) {
2934 netdev
= iterator
->elem
;
2936 if (netdev
->type
< 0 || netdev
->type
> LXC_NET_MAXCONFTYPE
) {
2937 ERROR("invalid network configuration type '%d'",
2942 if (netdev_conf
[netdev
->type
](handler
, netdev
)) {
2943 ERROR("failed to create netdev");
2952 void lxc_delete_network(struct lxc_handler
*handler
)
2954 struct lxc_list
*network
= &handler
->conf
->network
;
2955 struct lxc_list
*iterator
;
2956 struct lxc_netdev
*netdev
;
2958 lxc_list_for_each(iterator
, network
) {
2959 netdev
= iterator
->elem
;
2961 if (netdev
->ifindex
!= 0 && netdev
->type
== LXC_NET_PHYS
) {
2962 if (lxc_netdev_rename_by_index(netdev
->ifindex
, netdev
->link
))
2963 WARN("failed to rename to the initial name the " \
2964 "netdev '%s'", netdev
->link
);
2968 if (netdev_deconf
[netdev
->type
](handler
, netdev
)) {
2969 WARN("failed to destroy netdev");
2972 /* Recent kernel remove the virtual interfaces when the network
2973 * namespace is destroyed but in case we did not moved the
2974 * interface to the network namespace, we have to destroy it
2976 if (netdev
->ifindex
!= 0 &&
2977 lxc_netdev_delete_by_index(netdev
->ifindex
))
2978 WARN("failed to remove interface '%s'", netdev
->name
);
2982 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
2984 /* lxc-user-nic returns "interface_name:interface_name\n" */
2985 #define MAX_BUFFER_SIZE IFNAMSIZ*2 + 2
2986 static int unpriv_assign_nic(struct lxc_netdev
*netdev
, pid_t pid
)
2989 int bytes
, pipefd
[2];
2990 char *token
, *saveptr
= NULL
;
2991 char buffer
[MAX_BUFFER_SIZE
];
2992 char netdev_link
[IFNAMSIZ
+1];
2994 if (netdev
->type
!= LXC_NET_VETH
) {
2995 ERROR("nic type %d not support for unprivileged use",
3000 if(pipe(pipefd
) < 0) {
3001 SYSERROR("pipe failed");
3005 if ((child
= fork()) < 0) {
3012 if (child
== 0) { // child
3013 /* close the read-end of the pipe */
3015 /* redirect the stdout to write-end of the pipe */
3016 dup2(pipefd
[1], STDOUT_FILENO
);
3017 /* close the write-end of the pipe */
3020 // Call lxc-user-nic pid type bridge
3023 strncpy(netdev_link
, netdev
->link
, IFNAMSIZ
);
3025 strncpy(netdev_link
, "none", IFNAMSIZ
);
3027 char *args
[] = {LXC_USERNIC_PATH
, pidstr
, "veth", netdev_link
, netdev
->name
, NULL
};
3028 snprintf(pidstr
, 19, "%lu", (unsigned long) pid
);
3030 execvp(args
[0], args
);
3031 SYSERROR("execvp lxc-user-nic");
3035 /* close the write-end of the pipe */
3038 bytes
= read(pipefd
[0], &buffer
, MAX_BUFFER_SIZE
);
3040 SYSERROR("read failed");
3042 buffer
[bytes
- 1] = '\0';
3044 if (wait_for_pid(child
) != 0) {
3049 /* close the read-end of the pipe */
3052 /* fill netdev->name field */
3053 token
= strtok_r(buffer
, ":", &saveptr
);
3056 netdev
->name
= malloc(IFNAMSIZ
+1);
3057 if (!netdev
->name
) {
3058 ERROR("Out of memory");
3061 memset(netdev
->name
, 0, IFNAMSIZ
+1);
3062 strncpy(netdev
->name
, token
, IFNAMSIZ
);
3064 /* fill netdev->veth_attr.pair field */
3065 token
= strtok_r(NULL
, ":", &saveptr
);
3068 netdev
->priv
.veth_attr
.pair
= strdup(token
);
3069 if (!netdev
->priv
.veth_attr
.pair
) {
3070 ERROR("Out of memory");
3077 int lxc_assign_network(struct lxc_list
*network
, pid_t pid
)
3079 struct lxc_list
*iterator
;
3080 struct lxc_netdev
*netdev
;
3081 int am_root
= (getuid() == 0);
3084 lxc_list_for_each(iterator
, network
) {
3086 netdev
= iterator
->elem
;
3088 if (netdev
->type
== LXC_NET_VETH
&& !am_root
) {
3089 if (unpriv_assign_nic(netdev
, pid
))
3091 // lxc-user-nic has moved the nic to the new ns.
3092 // unpriv_assign_nic() fills in netdev->name.
3093 // netdev->ifindex will be filled in at setup_netdev.
3097 /* empty network namespace, nothing to move */
3098 if (!netdev
->ifindex
)
3101 err
= lxc_netdev_move_by_index(netdev
->ifindex
, pid
, NULL
);
3103 ERROR("failed to move '%s' to the container : %s",
3104 netdev
->link
, strerror(-err
));
3108 DEBUG("move '%s' to '%d'", netdev
->name
, pid
);
3114 static int write_id_mapping(enum idtype idtype
, pid_t pid
, const char *buf
,
3117 char path
[PATH_MAX
];
3121 ret
= snprintf(path
, PATH_MAX
, "/proc/%d/%cid_map", pid
, idtype
== ID_TYPE_UID
? 'u' : 'g');
3122 if (ret
< 0 || ret
>= PATH_MAX
) {
3123 fprintf(stderr
, "%s: path name too long\n", __func__
);
3126 f
= fopen(path
, "w");
3131 ret
= fwrite(buf
, buf_size
, 1, f
);
3133 SYSERROR("writing id mapping");
3134 closeret
= fclose(f
);
3136 SYSERROR("writing id mapping");
3137 return ret
< 0 ? ret
: closeret
;
3140 int lxc_map_ids(struct lxc_list
*idmap
, pid_t pid
)
3142 struct lxc_list
*iterator
;
3144 int ret
= 0, use_shadow
= 0;
3146 char *buf
= NULL
, *pos
, *cmdpath
= NULL
;
3149 * If newuidmap exists, that is, if shadow is handing out subuid
3150 * ranges, then insist that root also reserve ranges in subuid. This
3151 * will protected it by preventing another user from being handed the
3154 cmdpath
= on_path("newuidmap", NULL
);
3160 if (!use_shadow
&& geteuid()) {
3161 ERROR("Missing newuidmap/newgidmap");
3165 for(type
= ID_TYPE_UID
; type
<= ID_TYPE_GID
; type
++) {
3169 buf
= pos
= malloc(4096);
3175 pos
+= sprintf(buf
, "new%cidmap %d",
3176 type
== ID_TYPE_UID
? 'u' : 'g',
3179 lxc_list_for_each(iterator
, idmap
) {
3180 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
3181 map
= iterator
->elem
;
3182 if (map
->idtype
!= type
)
3186 left
= 4096 - (pos
- buf
);
3187 fill
= snprintf(pos
, left
, "%s%lu %lu %lu%s",
3188 use_shadow
? " " : "",
3189 map
->nsid
, map
->hostid
, map
->range
,
3190 use_shadow
? "" : "\n");
3191 if (fill
<= 0 || fill
>= left
)
3192 SYSERROR("snprintf failed, too many mappings");
3199 ret
= write_id_mapping(type
, pid
, buf
, pos
-buf
);
3201 left
= 4096 - (pos
- buf
);
3202 fill
= snprintf(pos
, left
, "\n");
3203 if (fill
<= 0 || fill
>= left
)
3204 SYSERROR("snprintf failed, too many mappings");
3218 * return the host uid/gid to which the container root is mapped in
3220 * Return true if id was found, false otherwise.
/*
 * get_mapped_rootid: scan conf->id_map, skipping every entry whose idtype
 * differs from @idtype.
 * NOTE(review): the rest of the parameter list, the test that selects the
 * entry and the return statements are not visible in this excerpt; callers
 * in this file pass the address of a result variable as third argument.
 */
3222 bool get_mapped_rootid(struct lxc_conf
*conf
, enum idtype idtype
,
3225 struct lxc_list
*it
;
3228 lxc_list_for_each(it
, &conf
->id_map
) {
3230 if (map
->idtype
!= idtype
)
3240 int mapped_hostid(unsigned id
, struct lxc_conf
*conf
, enum idtype idtype
)
3242 struct lxc_list
*it
;
3244 lxc_list_for_each(it
, &conf
->id_map
) {
3246 if (map
->idtype
!= idtype
)
3248 if (id
>= map
->hostid
&& id
< map
->hostid
+ map
->range
)
3249 return (id
- map
->hostid
) + map
->nsid
;
/*
 * find_unmapped_nsuid: look for a namespace id that no mapping of type
 * @idtype in conf->id_map covers.
 *
 * The candidate starts at 0; whenever it falls inside an entry's
 * [nsid, nsid + range) interval it is bumped to the first id after it.
 * NOTE(review): whether the list is rescanned after a bump, and the
 * return statement, are not visible in this excerpt.
 */
3254 int find_unmapped_nsuid(struct lxc_conf
*conf
, enum idtype idtype
)
3256 struct lxc_list
*it
;
3258 unsigned int freeid
= 0;
3260 lxc_list_for_each(it
, &conf
->id_map
) {
3262 if (map
->idtype
!= idtype
)
3264 if (freeid
>= map
->nsid
&& freeid
< map
->nsid
+ map
->range
) {
3265 freeid
= map
->nsid
+ map
->range
;
/*
 * lxc_find_gateway_addresses: for every network device of the handler's
 * configuration that asks for an automatic ipv4 and/or ipv6 gateway, look
 * the address up on the device's link interface.
 *
 * Automatic gateways are rejected for device types other than veth and
 * macvlan and for devices without a link. The link is resolved with
 * if_nametoindex() and the addresses are fetched into
 * netdev->ipv4_gateway / netdev->ipv6_gateway.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3272 int lxc_find_gateway_addresses(struct lxc_handler
*handler
)
3274 struct lxc_list
*network
= &handler
->conf
->network
;
3275 struct lxc_list
*iterator
;
3276 struct lxc_netdev
*netdev
;
3279 lxc_list_for_each(iterator
, network
) {
3280 netdev
= iterator
->elem
;
3282 if (!netdev
->ipv4_gateway_auto
&& !netdev
->ipv6_gateway_auto
)
3285 if (netdev
->type
!= LXC_NET_VETH
&& netdev
->type
!= LXC_NET_MACVLAN
) {
3286 ERROR("gateway = auto only supported for "
3287 "veth and macvlan");
3291 if (!netdev
->link
) {
3292 ERROR("gateway = auto needs a link interface");
3296 link_index
= if_nametoindex(netdev
->link
);
3300 if (netdev
->ipv4_gateway_auto
) {
3301 if (lxc_ipv4_addr_get(link_index
, &netdev
->ipv4_gateway
)) {
3302 ERROR("failed to automatically find ipv4 gateway "
3303 "address from link interface '%s'", netdev
->link
);
3308 if (netdev
->ipv6_gateway_auto
) {
3309 if (lxc_ipv6_addr_get(link_index
, &netdev
->ipv6_gateway
)) {
3310 ERROR("failed to automatically find ipv6 gateway "
3311 "address from link interface '%s'", netdev
->link
);
/*
 * lxc_create_tty: allocate conf->tty pty pairs and record them in
 * conf->tty_info.
 *
 * Every pair is opened with openpty() and both descriptors get
 * FD_CLOEXEC. When a pty cannot be created, the pairs opened so far are
 * released through lxc_delete_tty(). On completion nbtty is set to
 * conf->tty.
 * NOTE(review): the fcntl() results are ignored, and the return statements
 * are not visible in this excerpt.
 */
3320 int lxc_create_tty(const char *name
, struct lxc_conf
*conf
)
3322 struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
3325 /* no tty in the configuration */
3329 tty_info
->pty_info
=
3330 malloc(sizeof(*tty_info
->pty_info
)*conf
->tty
);
3331 if (!tty_info
->pty_info
) {
3332 SYSERROR("failed to allocate pty_info");
3336 for (i
= 0; i
< conf
->tty
; i
++) {
3338 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3341 ret
= openpty(&pty_info
->master
, &pty_info
->slave
,
3342 pty_info
->name
, NULL
, NULL
);
3345 SYSERROR("failed to create pty #%d", i
);
3346 tty_info
->nbtty
= i
;
3347 lxc_delete_tty(tty_info
);
3351 DEBUG("allocated pty '%s' (%d/%d)",
3352 pty_info
->name
, pty_info
->master
, pty_info
->slave
);
3354 /* Prevent leaking the file descriptors to the container */
3355 fcntl(pty_info
->master
, F_SETFD
, FD_CLOEXEC
);
3356 fcntl(pty_info
->slave
, F_SETFD
, FD_CLOEXEC
);
3361 tty_info
->nbtty
= conf
->tty
;
3363 INFO("tty's configured");
3368 void lxc_delete_tty(struct lxc_tty_info
*tty_info
)
3372 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
3373 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3375 close(pty_info
->master
);
3376 close(pty_info
->slave
);
3379 free(tty_info
->pty_info
);
3380 tty_info
->nbtty
= 0;
3384 * chown_mapped_root: for an unprivileged user with uid/gid X to
3385 * chown a dir to subuid/subgid Y, he needs to run chown as root
3386 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3387 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3388 * root is privileged with respect to hostuid/hostgid X, allowing
3389 * him to do the chown.
/*
 * @path : path to chown; for paths prefixed with "overlayfs:" or "aufs:"
 *         the text after the second ':' is located
 * @conf : container configuration providing the id mappings
 *
 * The host uid and gid that container root maps to are obtained with
 * get_mapped_rootid(). With euid 0 the chown() is done directly; when
 * container root already is the caller's euid nothing is chowned.
 * Otherwise "lxc-usernsexec" is executed with the -m maps built below to
 * run "chown 0:<gid> <path>", and the result of wait_for_pid() is
 * returned. The map for the path's gid (map4) is only passed when that gid
 * differs from the caller's egid.
 * NOTE(review): the fork and the error returns are not visible in this
 * excerpt.
 */
3391 int chown_mapped_root(char *path
, struct lxc_conf
*conf
)
3397 char *chownpath
= path
;
3399 if (!get_mapped_rootid(conf
, ID_TYPE_UID
, &val
)) {
3400 ERROR("No mapping for container root");
3403 rootuid
= (uid_t
) val
;
3404 if (!get_mapped_rootid(conf
, ID_TYPE_GID
, &val
)) {
3405 ERROR("No mapping for container root");
3408 rootgid
= (gid_t
) val
;
3411 * In case of overlay, we want only the writeable layer
3414 if (strncmp(path
, "overlayfs:", 10) == 0 || strncmp(path
, "aufs:", 5) == 0) {
3415 chownpath
= strchr(path
, ':');
3417 ERROR("Bad overlay path: %s", path
);
3420 chownpath
= strchr(chownpath
+1, ':');
3422 ERROR("Bad overlay path: %s", path
);
3428 if (geteuid() == 0) {
3429 if (chown(path
, rootuid
, rootgid
) < 0) {
3430 ERROR("Error chowning %s", path
);
3436 if (rootuid
== geteuid()) {
3438 INFO("%s: container root is our uid; no need to chown" ,__func__
);
3444 SYSERROR("Failed forking");
3448 int hostuid
= geteuid(), hostgid
= getegid(), ret
;
3450 char map1
[100], map2
[100], map3
[100], map4
[100], map5
[100];
3452 char *args1
[] = { "lxc-usernsexec", "-m", map1
, "-m", map2
,
3453 "-m", map3
, "-m", map5
,
3454 "--", "chown", ugid
, path
, NULL
};
3455 char *args2
[] = { "lxc-usernsexec", "-m", map1
, "-m", map2
,
3456 "-m", map3
, "-m", map4
, "-m", map5
,
3457 "--", "chown", ugid
, path
, NULL
};
3459 // save the current gid of "path"
3460 if (stat(path
, &sb
) < 0) {
3461 ERROR("Error stat %s", path
);
3466 * A file has to be group-owned by a gid mapped into the
3467 * container, or the container won't be privileged over it.
3469 if (sb
.st_uid
== geteuid() &&
3470 mapped_hostid(sb
.st_gid
, conf
, ID_TYPE_GID
) < 0 &&
3471 chown(path
, -1, hostgid
) < 0) {
3472 ERROR("Failed chgrping %s", path
);
3477 ret
= snprintf(map1
, 100, "u:0:%d:1", rootuid
);
3478 if (ret
< 0 || ret
>= 100) {
3479 ERROR("Error uid printing map string");
3483 // "u:hostuid:hostuid:1"
3484 ret
= snprintf(map2
, 100, "u:%d:%d:1", hostuid
, hostuid
);
3485 if (ret
< 0 || ret
>= 100) {
3486 ERROR("Error uid printing map string");
3491 ret
= snprintf(map3
, 100, "g:0:%d:1", rootgid
);
3492 if (ret
< 0 || ret
>= 100) {
3493 ERROR("Error gid printing map string");
3497 // "g:pathgid:rootgid+pathgid:1"
3498 ret
= snprintf(map4
, 100, "g:%d:%d:1", (gid_t
)sb
.st_gid
,
3499 rootgid
+ (gid_t
)sb
.st_gid
);
3500 if (ret
< 0 || ret
>= 100) {
3501 ERROR("Error gid printing map string");
3505 // "g:hostgid:hostgid:1"
3506 ret
= snprintf(map5
, 100, "g:%d:%d:1", hostgid
, hostgid
);
3507 if (ret
< 0 || ret
>= 100) {
3508 ERROR("Error gid printing map string");
3512 // "0:pathgid" (chown)
3513 ret
= snprintf(ugid
, 100, "0:%d", (gid_t
)sb
.st_gid
);
3514 if (ret
< 0 || ret
>= 100) {
3515 ERROR("Error owner printing format string for chown");
3519 if (hostgid
== sb
.st_gid
)
3520 ret
= execvp("lxc-usernsexec", args1
);
3522 ret
= execvp("lxc-usernsexec", args2
);
3523 SYSERROR("Failed executing usernsexec");
3526 return wait_for_pid(pid
);
/*
 * ttys_shift_ids: when the container has id mappings and a non-empty
 * console name, chown the console device to the mapped container root with
 * chown_mapped_root().
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3529 int ttys_shift_ids(struct lxc_conf
*c
)
3531 if (lxc_list_empty(&c
->id_map
))
3534 if (strcmp(c
->console
.name
, "") !=0 && chown_mapped_root(c
->console
.name
, c
) < 0) {
3535 ERROR("Failed to chown %s", c
->console
.name
);
3543 * _do_tmp_proc_mount: Mount /proc inside container if not already
3546 * @rootfs : the rootfs where proc should be mounted
3548 * Returns < 0 on failure, 0 if the correct proc was already mounted
3549 * and 1 if a new proc was mounted.
/*
 * do_tmp_proc_mount: make sure the container's own /proc is mounted
 * below @rootfs.
 *
 * <rootfs>/proc/self is read as a symlink: an unreadable link means no
 * proc is mounted there, a target other than "1" means a foreign proc
 * instance is mounted, which is lazily detached before mounting a new one.
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, ret;

	ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (ret < 0 || ret >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	/*
	 * readlink() does not append a terminator, so keep the last byte of
	 * the zeroed buffer out of its reach; the buffer is used as a string
	 * below.
	 */
	memset(link, 0, sizeof(link));
	linklen = readlink(path, link, sizeof(link) - 1);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);

	/* shorter than the path formatted above, so it cannot be truncated */
	ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
	if (linklen < 0) /* /proc not mounted */
		goto domount;

	/* the container's init sees itself as pid 1 */
	if (strcmp(link, "1") != 0) {
		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
		goto domount;
	}

	/* the right proc is already mounted */
	return 0;

domount:
	if (mount("proc", path, "proc", 0, NULL))
		return -1;
	INFO("Mounted /proc in container for security transition");
	return 1;
}
/*
 * tmp_proc_mount: mount /proc for the setup phase.
 *
 * Without a rootfs path, proc is mounted directly on "/proc" and a failure
 * is only logged. Otherwise do_tmp_proc_mount() is run on the rootfs mount
 * point; when it reports that a new proc was mounted (1),
 * tmp_umount_proc is set so that tmp_proc_unmount() knows about it.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3584 int tmp_proc_mount(struct lxc_conf
*lxc_conf
)
3588 if (lxc_conf
->rootfs
.path
== NULL
|| strlen(lxc_conf
->rootfs
.path
) == 0) {
3589 if (mount("proc", "/proc", "proc", 0, NULL
)) {
3590 SYSERROR("Failed mounting /proc, proceeding");
3595 mounted
= do_tmp_proc_mount(lxc_conf
->rootfs
.mount
);
3596 if (mounted
== -1) {
3597 SYSERROR("failed to mount /proc in the container.");
3599 } else if (mounted
== 1) {
3600 lxc_conf
->tmp_umount_proc
= 1;
/*
 * tmp_proc_unmount: counterpart of tmp_proc_mount(); acts only when
 * tmp_umount_proc is set and clears the flag again.
 * NOTE(review): the unmount call itself is not visible in this excerpt.
 */
3605 void tmp_proc_unmount(struct lxc_conf
*lxc_conf
)
3607 if (lxc_conf
->tmp_umount_proc
== 1) {
3609 lxc_conf
->tmp_umount_proc
= 0;
/*
 * remount_all_slave: change every mount that /proc/self/mountinfo lists
 * with a "shared" tag to MS_SLAVE.
 *
 * Failures are logged and do not abort: neither a missing mountinfo file
 * nor a failing mount() call stops container startup.
 * NOTE(review): the error message says "rslave" but the visible flag is
 * MS_SLAVE only - confirm whether MS_REC is intended.
 */
3613 void remount_all_slave(void)
3615 /* walk /proc/self/mountinfo and change any shared entries to slave */
3616 FILE *f
= fopen("/proc/self/mountinfo", "r");
3621 SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
3622 ERROR("Continuing container startup...");
3626 while (getline(&line
, &len
, f
) != -1) {
3627 char *target
, *opts
;
3628 target
= get_field(line
, 4);
3631 opts
= get_field(target
, 2);
3634 null_endofword(opts
);
3635 if (!strstr(opts
, "shared"))
3637 null_endofword(target
);
3638 if (mount(NULL
, target
, NULL
, MS_SLAVE
, NULL
)) {
3639 SYSERROR("Failed to make %s rslave", target
);
3640 ERROR("Continuing...");
3647 void lxc_execute_bind_init(struct lxc_conf
*conf
)
3650 char path
[PATH_MAX
], destpath
[PATH_MAX
], *p
;
3652 /* If init exists in the container, don't bind mount a static one */
3653 p
= choose_init(conf
->rootfs
.mount
);
3659 ret
= snprintf(path
, PATH_MAX
, SBINDIR
"/init.lxc.static");
3660 if (ret
< 0 || ret
>= PATH_MAX
) {
3661 WARN("Path name too long searching for lxc.init.static");
3665 if (!file_exists(path
)) {
3666 INFO("%s does not exist on host", path
);
3670 ret
= snprintf(destpath
, PATH_MAX
, "%s%s", conf
->rootfs
.mount
, "/init.lxc.static");
3671 if (ret
< 0 || ret
>= PATH_MAX
) {
3672 WARN("Path name too long for container's lxc.init.static");
3676 if (!file_exists(destpath
)) {
3677 FILE * pathfile
= fopen(destpath
, "wb");
3679 SYSERROR("Failed to create mount target '%s'", destpath
);
3685 ret
= mount(path
, destpath
, "none", MS_BIND
, NULL
);
3687 SYSERROR("Failed to bind lxc.init.static into container");
3688 INFO("lxc.init.static bound into container at %s", path
);
3692 * This does the work of remounting / if it is shared, calling the
3693 * container pre-mount hooks, and mounting the rootfs.
/*
 * do_rootfs_setup: prepare the container's root filesystem.
 *
 * @conf    : container configuration
 * @name    : container name, used for the hooks and in log messages
 * @lxcpath : lxc path handed to the hooks
 *
 * If conf->rootfs_setup is already set, the rootfs mount point is only
 * bind-mounted onto itself. Otherwise shared mounts are turned into slave
 * mounts, the "pre-mount" hooks run, setup_rootfs() is called and
 * conf->rootfs_setup is set.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3695 int do_rootfs_setup(struct lxc_conf
*conf
, const char *name
, const char *lxcpath
)
3697 if (conf
->rootfs_setup
) {
3699 * rootfs was set up in another namespace. bind-mount it
3700 * to give us a mount in our own ns so we can pivot_root to it
3702 const char *path
= conf
->rootfs
.mount
;
3703 if (mount(path
, path
, "rootfs", MS_BIND
, NULL
) < 0) {
3704 ERROR("Failed to bind-mount container / onto itself");
3710 remount_all_slave();
3712 if (run_lxc_hooks(name
, "pre-mount", conf
, lxcpath
, NULL
)) {
3713 ERROR("failed to run pre-mount hooks for container '%s'.", name
);
3717 if (setup_rootfs(conf
)) {
3718 ERROR("failed to setup rootfs for '%s'", name
);
3722 conf
->rootfs_setup
= true;
/*
 * verify_start_hooks: check every configured start hook by calling stat()
 * on "<rootfs mount><hookname>", i.e. the hook path as seen below the
 * container's root filesystem.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3726 static bool verify_start_hooks(struct lxc_conf
*conf
)
3728 struct lxc_list
*it
;
3729 char path
[MAXPATHLEN
];
3730 lxc_list_for_each(it
, &conf
->hooks
[LXCHOOK_START
]) {
3731 char *hookname
= it
->elem
;
3735 ret
= snprintf(path
, MAXPATHLEN
, "%s%s",
3736 conf
->rootfs
.mount
, hookname
);
3737 if (ret
< 0 || ret
>= MAXPATHLEN
)
3739 ret
= stat(path
, &st
);
3741 SYSERROR("Start hook %s not found in container rootfs",
/*
 * send_fd: pass the descriptor @fd over the unix socket @sock using
 * lxc_abstract_unix_send_fd() with no payload data.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3751 static int send_fd(int sock
, int fd
)
3753 int ret
= lxc_abstract_unix_send_fd(sock
, fd
, NULL
, 0);
3757 SYSERROR("Error sending tty fd to parent");
/*
 * send_ttys_to_parent: send the slave and then the master descriptor of
 * every allocated pty over handler->ttysock[0].
 *
 * Each descriptor is closed locally and set to -1 once it has been sent;
 * afterwards both ends of the tty socket pair are closed.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3764 static int send_ttys_to_parent(struct lxc_handler
*handler
)
3766 struct lxc_conf
*conf
= handler
->conf
;
3767 const struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
3769 int sock
= handler
->ttysock
[0];
3771 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
3772 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3773 if (send_fd(sock
, pty_info
->slave
) < 0)
3775 close(pty_info
->slave
);
3776 pty_info
->slave
= -1;
3777 if (send_fd(sock
, pty_info
->master
) < 0)
3779 close(pty_info
->master
);
3780 pty_info
->master
= -1;
3783 close(handler
->ttysock
[0]);
3784 close(handler
->ttysock
[1]);
3789 ERROR("Error writing tty fd to parent");
/*
 * lxc_setup: run the container setup steps for @handler, in this order:
 * rootfs, utsname (unless the UTS namespace is inherited), network,
 * autodev mount, automatic mounts except cgroup, fstab mounts and mount
 * entries, start hook verification, static init bind mount (execute mode
 * only), cgroup automounts, "mount" hooks, "autodev" hooks and /dev
 * population, console, kmsg, /dev symlinks, temporary /proc, pivot_root,
 * pts, tty creation and hand-over to the parent, tty setup, the
 * container_ttys environment variable, personality and capabilities.
 *
 * A kmsg setup failure is only logged. Requesting both kept and dropped
 * capabilities is reported as an error.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
3793 int lxc_setup(struct lxc_handler
*handler
)
3795 const char *name
= handler
->name
;
3796 struct lxc_conf
*lxc_conf
= handler
->conf
;
3797 const char *lxcpath
= handler
->lxcpath
;
3799 if (do_rootfs_setup(lxc_conf
, name
, lxcpath
) < 0) {
3800 ERROR("Error setting up rootfs mount after spawn");
3804 if (lxc_conf
->inherit_ns_fd
[LXC_NS_UTS
] == -1) {
3805 if (setup_utsname(lxc_conf
->utsname
)) {
3806 ERROR("failed to setup the utsname for '%s'", name
);
3811 if (setup_network(&lxc_conf
->network
)) {
3812 ERROR("failed to setup the network for '%s'", name
);
3816 if (lxc_conf
->autodev
> 0) {
3817 if (mount_autodev(name
, lxc_conf
->rootfs
.mount
, lxcpath
)) {
3818 ERROR("failed to mount /dev in the container");
3823 /* do automatic mounts (mainly /proc and /sys), but exclude
3824 * those that need to wait until other stuff has finished
3826 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& ~LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
3827 ERROR("failed to setup the automatic mounts for '%s'", name
);
3831 if (setup_mount(&lxc_conf
->rootfs
, lxc_conf
->fstab
, name
)) {
3832 ERROR("failed to setup the mounts for '%s'", name
);
3836 if (!lxc_list_empty(&lxc_conf
->mount_list
) && setup_mount_entries(&lxc_conf
->rootfs
, &lxc_conf
->mount_list
, name
)) {
3837 ERROR("failed to setup the mount entries for '%s'", name
);
3841 /* Make sure any start hooks are in the rootfs */
3842 if (!verify_start_hooks(lxc_conf
))
3845 if (lxc_conf
->is_execute
)
3846 lxc_execute_bind_init(lxc_conf
);
3848 /* now mount only cgroup, if wanted;
3849 * before, /sys could not have been mounted
3850 * (is either mounted automatically or via fstab entries)
3852 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
3853 ERROR("failed to setup the automatic mounts for '%s'", name
);
3857 if (run_lxc_hooks(name
, "mount", lxc_conf
, lxcpath
, NULL
)) {
3858 ERROR("failed to run mount hooks for container '%s'.", name
);
3862 if (lxc_conf
->autodev
> 0) {
3863 if (run_lxc_hooks(name
, "autodev", lxc_conf
, lxcpath
, NULL
)) {
3864 ERROR("failed to run autodev hooks for container '%s'.", name
);
3867 if (fill_autodev(lxc_conf
->rootfs
.mount
)) {
3868 ERROR("failed to populate /dev in the container");
3873 if (!lxc_conf
->is_execute
&& setup_console(&lxc_conf
->rootfs
, &lxc_conf
->console
, lxc_conf
->ttydir
)) {
3874 ERROR("failed to setup the console for '%s'", name
);
3878 if (lxc_conf
->kmsg
) {
3879 if (setup_kmsg(&lxc_conf
->rootfs
, &lxc_conf
->console
)) // don't fail
3880 ERROR("failed to setup kmsg for '%s'", name
);
3883 if (!lxc_conf
->is_execute
&& setup_dev_symlinks(&lxc_conf
->rootfs
)) {
3884 ERROR("failed to setup /dev symlinks for '%s'", name
);
3888 /* mount /proc if it's not already there */
3889 if (tmp_proc_mount(lxc_conf
) < 0) {
3890 ERROR("failed to LSM mount proc for '%s'", name
);
3894 if (setup_pivot_root(&lxc_conf
->rootfs
)) {
3895 ERROR("failed to set rootfs for '%s'", name
);
3899 if (setup_pts(lxc_conf
->pts
)) {
3900 ERROR("failed to setup the new pts instance");
3904 if (lxc_create_tty(name
, lxc_conf
)) {
3905 ERROR("failed to create the ttys");
3909 if (send_ttys_to_parent(handler
) < 0) {
3910 ERROR("failure sending console info to parent");
3915 if (!lxc_conf
->is_execute
&& setup_tty(lxc_conf
)) {
3916 ERROR("failed to setup the ttys for '%s'", name
);
3920 if (lxc_conf
->pty_names
&& setenv("container_ttys", lxc_conf
->pty_names
, 1))
3921 SYSERROR("failed to set environment variable for container ptys");
3924 if (setup_personality(lxc_conf
->personality
)) {
3925 ERROR("failed to setup personality");
3929 if (!lxc_list_empty(&lxc_conf
->keepcaps
)) {
3930 if (!lxc_list_empty(&lxc_conf
->caps
)) {
3931 ERROR("Simultaneously requested dropping and keeping caps");
3934 if (dropcaps_except(&lxc_conf
->keepcaps
)) {
3935 ERROR("failed to keep requested caps");
3938 } else if (setup_caps(&lxc_conf
->caps
)) {
3939 ERROR("failed to drop capabilities");
3943 NOTICE("'%s' is setup.", name
);
/*
 * run_lxc_hooks: run all scripts configured for one hook point.
 *
 * @name    : container name passed to the scripts
 * @hook    : one of "pre-start", "pre-mount", "mount", "autodev", "start",
 *            "post-stop" or "clone"
 * @conf    : configuration holding the per-hook script lists
 * @lxcpath : lxc path passed to the scripts
 * @argv    : extra arguments handed to run_script_argv()
 *
 * The hook name selects the LXCHOOK_* list, whose entries are run in list
 * order through run_script_argv().
 * NOTE(review): the handling of an unknown hook name and of a failing
 * script is not visible in this excerpt.
 */
3948 int run_lxc_hooks(const char *name
, char *hook
, struct lxc_conf
*conf
,
3949 const char *lxcpath
, char *argv
[])
3952 struct lxc_list
*it
;
3954 if (strcmp(hook
, "pre-start") == 0)
3955 which
= LXCHOOK_PRESTART
;
3956 else if (strcmp(hook
, "pre-mount") == 0)
3957 which
= LXCHOOK_PREMOUNT
;
3958 else if (strcmp(hook
, "mount") == 0)
3959 which
= LXCHOOK_MOUNT
;
3960 else if (strcmp(hook
, "autodev") == 0)
3961 which
= LXCHOOK_AUTODEV
;
3962 else if (strcmp(hook
, "start") == 0)
3963 which
= LXCHOOK_START
;
3964 else if (strcmp(hook
, "post-stop") == 0)
3965 which
= LXCHOOK_POSTSTOP
;
3966 else if (strcmp(hook
, "clone") == 0)
3967 which
= LXCHOOK_CLONE
;
3970 lxc_list_for_each(it
, &conf
->hooks
[which
]) {
3972 char *hookname
= it
->elem
;
3973 ret
= run_script_argv(name
, "lxc", hookname
, hook
, lxcpath
, argv
);
/*
 * lxc_remove_nic: release the network device stored in the list node @it.
 *
 * Visible here: the veth pair name (veth devices only), upscript, hwaddr
 * and both gateway addresses are freed, and the ipv4 and ipv6 address
 * lists are walked with the deletion-safe iterator.
 * NOTE(review): the loop bodies and the release of the node itself are not
 * visible in this excerpt.
 */
3980 static void lxc_remove_nic(struct lxc_list
*it
)
3982 struct lxc_netdev
*netdev
= it
->elem
;
3983 struct lxc_list
*it2
,*next
;
3989 if (netdev
->type
== LXC_NET_VETH
)
3990 free(netdev
->priv
.veth_attr
.pair
);
3991 free(netdev
->upscript
);
3992 free(netdev
->hwaddr
);
3994 free(netdev
->ipv4_gateway
);
3995 free(netdev
->ipv6_gateway
);
3996 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4001 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
4010 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
/*
 * lxc_clear_nic: clear one network device, or one of its address lists.
 *
 * The leading number of @key is parsed with sscanf() as the index of the
 * device in c->network; a ".ipv4" or ".ipv6" suffix selects the
 * corresponding address list of that device.
 * NOTE(review): the branch for a key without suffix, the loop bodies and
 * most return statements are not visible in this excerpt.
 */
4011 int lxc_clear_nic(struct lxc_conf
*c
, const char *key
)
4015 struct lxc_list
*it
;
4016 struct lxc_netdev
*netdev
;
4018 p1
= strchr(key
, '.');
4019 if (!p1
|| *(p1
+1) == '\0')
4022 ret
= sscanf(key
, "%d", &idx
);
4023 if (ret
!= 1) return -1;
4028 lxc_list_for_each(it
, &c
->network
) {
4033 if (i
< idx
) // we don't have that many nics defined
4036 if (!it
|| !it
->elem
)
4043 } else if (strcmp(p1
, ".ipv4") == 0) {
4044 struct lxc_list
*it2
,*next
;
4045 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4050 } else if (strcmp(p1
, ".ipv6") == 0) {
4051 struct lxc_list
*it2
,*next
;
4052 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
/*
 * lxc_clear_config_network: walk c->network with the deletion-safe
 * iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt; presumably every device is removed - confirm.
 */
4063 int lxc_clear_config_network(struct lxc_conf
*c
)
4065 struct lxc_list
*it
,*next
;
4066 lxc_list_for_each_safe(it
, &c
->network
, next
) {
/*
 * lxc_clear_config_caps: walk the list of capabilities to drop (c->caps)
 * with the deletion-safe iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
4072 int lxc_clear_config_caps(struct lxc_conf
*c
)
4074 struct lxc_list
*it
,*next
;
4076 lxc_list_for_each_safe(it
, &c
->caps
, next
) {
/*
 * lxc_free_idmap: walk the id map list @id_map with the deletion-safe
 * iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt; callers in this file use it to dispose of a map list.
 */
4084 static int lxc_free_idmap(struct lxc_list
*id_map
) {
4085 struct lxc_list
*it
, *next
;
4087 lxc_list_for_each_safe(it
, id_map
, next
) {
4095 int lxc_clear_idmaps(struct lxc_conf
*c
)
4097 return lxc_free_idmap(&c
->id_map
);
/*
 * lxc_clear_config_keepcaps: walk the list of capabilities to keep
 * (c->keepcaps) with the deletion-safe iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
4100 int lxc_clear_config_keepcaps(struct lxc_conf
*c
)
4102 struct lxc_list
*it
,*next
;
4104 lxc_list_for_each_safe(it
, &c
->keepcaps
, next
) {
/*
 * lxc_clear_cgroups: remove cgroup settings from c->cgroup.
 *
 * @key is compared against "lxc.cgroup"; k points 11 characters into @key,
 * i.e. behind a "lxc.cgroup." prefix, and is matched against each entry's
 * subsystem unless all entries are selected.
 * NOTE(review): the assignment that selects "all", the rest of the loop
 * body and the return value are not visible in this excerpt.
 */
4112 int lxc_clear_cgroups(struct lxc_conf
*c
, const char *key
)
4114 struct lxc_list
*it
,*next
;
4116 const char *k
= key
+ 11;
4118 if (strcmp(key
, "lxc.cgroup") == 0)
4121 lxc_list_for_each_safe(it
, &c
->cgroup
, next
) {
4122 struct lxc_cgroup
*cg
= it
->elem
;
4123 if (!all
&& strcmp(cg
->subsystem
, k
) != 0)
4126 free(cg
->subsystem
);
/*
 * lxc_clear_groups: walk c->groups with the deletion-safe iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
4134 int lxc_clear_groups(struct lxc_conf
*c
)
4136 struct lxc_list
*it
,*next
;
4138 lxc_list_for_each_safe(it
, &c
->groups
, next
) {
/*
 * lxc_clear_environment: walk c->environment with the deletion-safe
 * iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
4146 int lxc_clear_environment(struct lxc_conf
*c
)
4148 struct lxc_list
*it
,*next
;
4150 lxc_list_for_each_safe(it
, &c
->environment
, next
) {
/*
 * lxc_clear_mount_entries: walk c->mount_list with the deletion-safe
 * iterator.
 * NOTE(review): the loop body and return value are not visible in this
 * excerpt.
 */
4159 int lxc_clear_mount_entries(struct lxc_conf
*c
)
4161 struct lxc_list
*it
,*next
;
4163 lxc_list_for_each_safe(it
, &c
->mount_list
, next
) {
/*
 * lxc_clear_automounts: reset the automatic mount settings of @c.
 * NOTE(review): the body is not visible in this excerpt; the description
 * is inferred from the name only - confirm.
 */
4171 int lxc_clear_automounts(struct lxc_conf
*c
)
/*
 * lxc_clear_hooks: remove hook scripts from the configuration.
 *
 * @key is compared against "lxc.hook"; k points 9 characters into @key,
 * i.e. behind a "lxc.hook." prefix, and is matched against
 * lxchook_names[] to pick the hook list to walk. A key that matches
 * nothing is reported as "Invalid hook key".
 * NOTE(review): the assignments to all/done, the loop body and the return
 * values are not visible in this excerpt.
 */
4177 int lxc_clear_hooks(struct lxc_conf
*c
, const char *key
)
4179 struct lxc_list
*it
,*next
;
4180 bool all
= false, done
= false;
4181 const char *k
= key
+ 9;
4184 if (strcmp(key
, "lxc.hook") == 0)
4187 for (i
=0; i
<NUM_LXC_HOOKS
; i
++) {
4188 if (all
|| strcmp(k
, lxchook_names
[i
]) == 0) {
4189 lxc_list_for_each_safe(it
, &c
->hooks
[i
], next
) {
4199 ERROR("Invalid hook key: %s", key
);
/*
 * lxc_clear_saved_nics: free the orig_name of each of the
 * conf->num_savednics saved nics and then the saved_nics array itself.
 * A NULL saved_nics pointer is checked first.
 */
4205 static void lxc_clear_saved_nics(struct lxc_conf
*conf
)
4209 if (!conf
->saved_nics
)
4211 for (i
=0; i
< conf
->num_savednics
; i
++)
4212 free(conf
->saved_nics
[i
].orig_name
);
4213 free(conf
->saved_nics
);
/*
 * lxc_clear_aliens: walk conf->aliens with the deletion-safe iterator.
 * NOTE(review): the loop body is not visible in this excerpt.
 */
4216 static inline void lxc_clear_aliens(struct lxc_conf
*conf
)
4218 struct lxc_list
*it
,*next
;
4220 lxc_list_for_each_safe(it
, &conf
->aliens
, next
) {
/*
 * lxc_clear_includes: walk conf->includes with the deletion-safe iterator.
 * NOTE(review): the loop body is not visible in this excerpt.
 */
4227 static inline void lxc_clear_includes(struct lxc_conf
*conf
)
4229 struct lxc_list
*it
,*next
;
4231 lxc_list_for_each_safe(it
, &conf
->includes
, next
) {
/*
 * lxc_conf_free: release the resources owned by @conf.
 *
 * Frees the string members (console paths, rootfs fields, logfile,
 * utsname, init_cmd, unexpanded_config, pty_names, LSM profile/context)
 * and runs the lxc_clear_*() helpers plus lxc_seccomp_free() for the list
 * and seccomp state. Cgroups and hooks are cleared with their bare
 * "lxc.cgroup" / "lxc.hook" keys.
 * NOTE(review): a few lines of this function, including whether @conf
 * itself is freed, are not visible in this excerpt.
 */
4238 void lxc_conf_free(struct lxc_conf
*conf
)
4242 free(conf
->console
.log_path
);
4243 free(conf
->console
.path
);
4244 free(conf
->rootfs
.mount
);
4245 free(conf
->rootfs
.options
);
4246 free(conf
->rootfs
.path
);
4247 free(conf
->rootfs
.pivot
);
4248 free(conf
->logfile
);
4249 free(conf
->utsname
);
4253 free(conf
->init_cmd
);
4254 free(conf
->unexpanded_config
);
4255 free(conf
->pty_names
);
4256 lxc_clear_config_network(conf
);
4257 free(conf
->lsm_aa_profile
);
4258 free(conf
->lsm_se_context
);
4259 lxc_seccomp_free(conf
);
4260 lxc_clear_config_caps(conf
);
4261 lxc_clear_config_keepcaps(conf
);
4262 lxc_clear_cgroups(conf
, "lxc.cgroup");
4263 lxc_clear_hooks(conf
, "lxc.hook");
4264 lxc_clear_mount_entries(conf
);
4265 lxc_clear_saved_nics(conf
);
4266 lxc_clear_idmaps(conf
);
4267 lxc_clear_groups(conf
);
4268 lxc_clear_includes(conf
);
4269 lxc_clear_aliens(conf
);
4270 lxc_clear_environment(conf
);
/*
 * Argument bundle handed to run_userns_fn(), which reads from p[0] and
 * then calls fn(arg).
 * NOTE(review): the member declarations are not visible in this excerpt.
 */
4274 struct userns_fn_data
{
/*
 * run_userns_fn: entry point of the child created by userns_exec_1().
 *
 * @data : struct userns_fn_data describing the work
 *
 * Waits for one byte on the pipe d->p[0] and then returns the result of
 * d->fn(d->arg).
 * NOTE(review): the handling of a failed read is not visible in this
 * excerpt.
 */
4280 static int run_userns_fn(void *data
)
4282 struct userns_fn_data
*d
= data
;
4284 // we're not sharing with the parent any more, if it was a thread
4287 if (read(d
->p
[0], &c
, 1) != 1)
4290 return d
->fn(d
->arg
);
4294 * Add ID_TYPE_UID/ID_TYPE_GID entries to an existing lxc_conf,
4295 * if they are not already there.
/*
 * idmap_add_id: build a new id map list for @conf that also covers the
 * host @uid and @gid.
 *
 * For each of @uid/@gid that mapped_hostid() does not find, an entry is
 * added whose nsid comes from find_unmapped_nsuid() and whose hostid is
 * the given id. Every entry of conf->id_map is then copied into the new
 * list. On allocation failure the partial list is released with
 * lxc_free_idmap().
 * NOTE(review): the range of the added entries, the error paths and the
 * return statements are not visible in this excerpt; the caller treats
 * NULL as failure.
 */
4297 static struct lxc_list
*idmap_add_id(struct lxc_conf
*conf
,
4298 uid_t uid
, gid_t gid
)
4300 int hostuid_mapped
= mapped_hostid(uid
, conf
, ID_TYPE_UID
);
4301 int hostgid_mapped
= mapped_hostid(gid
, conf
, ID_TYPE_GID
);
4302 struct lxc_list
*new = NULL
, *tmp
, *it
, *next
;
4303 struct id_map
*entry
;
4305 new = malloc(sizeof(*new));
4307 ERROR("Out of memory building id map");
4312 if (hostuid_mapped
< 0) {
4313 hostuid_mapped
= find_unmapped_nsuid(conf
, ID_TYPE_UID
);
4314 if (hostuid_mapped
< 0)
4316 tmp
= malloc(sizeof(*tmp
));
4319 entry
= malloc(sizeof(*entry
));
4325 entry
->idtype
= ID_TYPE_UID
;
4326 entry
->nsid
= hostuid_mapped
;
4327 entry
->hostid
= (unsigned long) uid
;
4329 lxc_list_add_tail(new, tmp
);
4331 if (hostgid_mapped
< 0) {
4332 hostgid_mapped
= find_unmapped_nsuid(conf
, ID_TYPE_GID
);
4333 if (hostgid_mapped
< 0)
4335 tmp
= malloc(sizeof(*tmp
));
4338 entry
= malloc(sizeof(*entry
));
4344 entry
->idtype
= ID_TYPE_GID
;
4345 entry
->nsid
= hostgid_mapped
;
4346 entry
->hostid
= (unsigned long) gid
;
4348 lxc_list_add_tail(new, tmp
);
4350 lxc_list_for_each_safe(it
, &conf
->id_map
, next
) {
4351 tmp
= malloc(sizeof(*tmp
));
4354 entry
= malloc(sizeof(*entry
));
4359 memset(entry
, 0, sizeof(*entry
));
4360 memcpy(entry
, it
->elem
, sizeof(*entry
));
4362 lxc_list_add_tail(new, tmp
);
4368 ERROR("Out of memory building a new uid/gid map");
4370 lxc_free_idmap(new);
4376 * Run a function in a new user namespace.
4377 * The caller's euid/egid will be mapped in if it is not already.
/*
 * @conf : configuration whose id map is applied to the new namespace
 * @fn   : function to run in the child
 * @data : argument for @fn
 *
 * The child is created with lxc_clone(..., CLONE_NEWUSER) and starts in
 * run_userns_fn(). The parent builds the id map with idmap_add_id() for
 * its own euid/egid, installs it with lxc_map_ids(), writes one byte to
 * the pipe to release the child and then waits for it with wait_for_pid().
 * NOTE(review): the error paths and the return statements are not visible
 * in this excerpt.
 */
4379 int userns_exec_1(struct lxc_conf
*conf
, int (*fn
)(void *), void *data
)
4382 struct userns_fn_data d
;
4385 struct lxc_list
*idmap
;
4389 SYSERROR("opening pipe");
4396 pid
= lxc_clone(run_userns_fn
, &d
, CLONE_NEWUSER
);
4402 if ((idmap
= idmap_add_id(conf
, geteuid(), getegid())) == NULL
) {
4403 ERROR("Error adding self to container uid/gid map");
4407 ret
= lxc_map_ids(idmap
, pid
);
4408 lxc_free_idmap(idmap
);
4411 ERROR("Error setting up child mappings");
4416 if (write(p
[1], &c
, 1) != 1) {
4417 SYSERROR("writing to pipe to child");
4421 ret
= wait_for_pid(pid
);
4433 /* not thread-safe, do not use from api without first forking */
/*
 * getuname: look up the passwd entry of the effective uid with getpwuid()
 * and return a strdup()ed copy of its user name, which the caller owns.
 * NOTE(review): the handling of a failed lookup is not visible in this
 * excerpt.
 */
4434 static char* getuname(void)
4436 struct passwd
*result
;
4438 result
= getpwuid(geteuid());
4442 return strdup(result
->pw_name
);
4445 /* not thread-safe, do not use from api without first forking */
/*
 * getgname: look up the group entry of the effective gid with getgrgid()
 * and return a strdup()ed copy of its group name, which the caller owns.
 * NOTE(review): the handling of a failed lookup is not visible in this
 * excerpt.
 */
4446 static char *getgname(void)
4448 struct group
*result
;
4450 result
= getgrgid(getegid());
4454 return strdup(result
->gr_name
);
4457 /* not thread-safe, do not use from api without first forking */
/*
 * suggest_default_idmap: tell the user which lxc.id_map lines to put into
 * ~/.config/lxc/default.conf.
 *
 * The caller's user name is looked up and matched against the first
 * ':'-separated field of every line of the subordinate id file; the
 * resulting uid/gid start and range values are printed as suggested
 * mappings. Without a usable range a hint about missing subuids/subgids is
 * printed instead.
 * NOTE(review): the second pass reports a missing subgid configuration but
 * opens subuidfile again, so the suggested gid range is read from the
 * subuid file - confirm whether a separate subgid file variable should be
 * used here.
 * NOTE(review): the parsing of the numeric fields and the end of the
 * function are not visible in this excerpt.
 */
4458 void suggest_default_idmap(void)
4461 unsigned int uid
= 0, urange
= 0, gid
= 0, grange
= 0;
4463 char *uname
, *gname
;
4466 if (!(uname
= getuname()))
4469 if (!(gname
= getgname())) {
4474 f
= fopen(subuidfile
, "r");
4476 ERROR("Your system is not configured with subuids");
4481 while (getline(&line
, &len
, f
) != -1) {
4482 char *p
= strchr(line
, ':'), *p2
;
4489 if (strcmp(line
, uname
))
4491 p2
= strchr(p
, ':');
4503 f
= fopen(subuidfile
, "r");
4505 ERROR("Your system is not configured with subgids");
4510 while (getline(&line
, &len
, f
) != -1) {
4511 char *p
= strchr(line
, ':'), *p2
;
4518 if (strcmp(line
, uname
))
4520 p2
= strchr(p
, ':');
4534 if (!urange
|| !grange
) {
4535 ERROR("You do not have subuids or subgids allocated");
4536 ERROR("Unprivileged containers require subuids and subgids");
4540 ERROR("You must either run as root, or define uid mappings");
4541 ERROR("To pass uid mappings to lxc-create, you could create");
4542 ERROR("~/.config/lxc/default.conf:");
4543 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG
);
4544 ERROR("lxc.id_map = u 0 %u %u", uid
, urange
);
4545 ERROR("lxc.id_map = g 0 %u %u", gid
, grange
);