2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
34 #include <sys/syscall.h>
40 #include <../include/openpty.h>
43 #include <linux/loop.h>
45 #include <sys/types.h>
46 #include <sys/utsname.h>
47 #include <sys/param.h>
49 #include <sys/socket.h>
50 #include <sys/mount.h>
52 #include <sys/prctl.h>
54 #include <arpa/inet.h>
56 #include <netinet/in.h>
66 #include "caps.h" /* for lxc_caps_last_cap() */
70 #include "namespace.h"
73 #if HAVE_SYS_CAPABILITY_H
74 #include <sys/capability.h>
77 #if HAVE_SYS_PERSONALITY_H
78 #include <sys/personality.h>
82 #include <../include/lxcmntent.h>
87 #include "lxcseccomp.h"
89 lxc_log_define(lxc_conf
, lxc
);
92 #define MAXINDEXLEN 20
94 #define MAXLINELEN 128
96 #if HAVE_SYS_CAPABILITY_H
98 #define CAP_SETFCAP 31
101 #ifndef CAP_MAC_OVERRIDE
102 #define CAP_MAC_OVERRIDE 32
105 #ifndef CAP_MAC_ADMIN
106 #define CAP_MAC_ADMIN 33
110 #ifndef PR_CAPBSET_DROP
111 #define PR_CAPBSET_DROP 24
114 #ifndef LO_FLAGS_AUTOCLEAR
115 #define LO_FLAGS_AUTOCLEAR 4
118 /* needed for cgroup automount checks, regardless of whether we
119 * have included linux/capability.h or not */
120 #ifndef CAP_SYS_ADMIN
121 #define CAP_SYS_ADMIN 21
124 /* Define pivot_root() if missing from the C library */
125 #ifndef HAVE_PIVOT_ROOT
126 static int pivot_root(const char * new_root
, const char * put_old
)
128 #ifdef __NR_pivot_root
129 return syscall(__NR_pivot_root
, new_root
, put_old
);
136 extern int pivot_root(const char * new_root
, const char * put_old
);
139 /* Define sethostname() if missing from the C library */
140 #ifndef HAVE_SETHOSTNAME
141 static int sethostname(const char * name
, size_t len
)
143 #ifdef __NR_sethostname
144 return syscall(__NR_sethostname
, name
, len
);
152 /* Define __S_ISTYPE if missing from the C library */
154 #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
157 char *lxchook_names
[NUM_LXC_HOOKS
] = {
158 "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
160 typedef int (*instanciate_cb
)(struct lxc_handler
*, struct lxc_netdev
*);
/*
 * Forward declaration of in_caplist(): it is declared here rather than
 * moving its definition, which would mean reshuffling the whole file.
 */
static int in_caplist(int cap, struct lxc_list *caps);

/*
 * Per-network-type setup callbacks. Each has the instanciate_cb signature
 * and is referenced from the netdev_conf dispatch table.
 */
static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev);
/*
 * Dispatch table from network type (LXC_NET_*) to the instanciate_*
 * callback that sets that type up. The array has LXC_NET_MAXCONFTYPE + 1
 * slots and is filled with designated initialisers, one per type.
 */
183 static instanciate_cb netdev_conf
[LXC_NET_MAXCONFTYPE
+ 1] = {
184 [LXC_NET_VETH
] = instanciate_veth
,
185 [LXC_NET_MACVLAN
] = instanciate_macvlan
,
186 [LXC_NET_VLAN
] = instanciate_vlan
,
187 [LXC_NET_PHYS
] = instanciate_phys
,
188 [LXC_NET_EMPTY
] = instanciate_empty
,
189 [LXC_NET_NONE
] = instanciate_none
,
/*
 * Per-network-type teardown callbacks. They share the instanciate_cb
 * signature and are referenced from the netdev_deconf dispatch table.
 */
static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev);
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev);
/*
 * Dispatch table from network type (LXC_NET_*) to the shutdown_* callback
 * for that type. It reuses the instanciate_cb function-pointer type and
 * mirrors netdev_conf slot for slot.
 */
199 static instanciate_cb netdev_deconf
[LXC_NET_MAXCONFTYPE
+ 1] = {
200 [LXC_NET_VETH
] = shutdown_veth
,
201 [LXC_NET_MACVLAN
] = shutdown_macvlan
,
202 [LXC_NET_VLAN
] = shutdown_vlan
,
203 [LXC_NET_PHYS
] = shutdown_phys
,
204 [LXC_NET_EMPTY
] = shutdown_empty
,
205 [LXC_NET_NONE
] = shutdown_none
,
/*
 * Table of textual mount options and the MS_* flag each one refers to.
 * Each row is { option name, integer, MS_* flag }. The integer is 1 on
 * exactly the rows whose name is the opposite of the flag ("rw" vs
 * MS_RDONLY, "suid" vs MS_NOSUID, "async" vs MS_SYNCHRONOUS, ...) and 0
 * on the rows that name the flag itself.
 * NOTE(review): presumably the integer means "clear this flag" -- confirm
 * against struct mount_opt and parse_mntopts(), neither of which is
 * visible in this section.
 */
208 static struct mount_opt mount_opt
[] = {
209 { "defaults", 0, 0 },
210 { "ro", 0, MS_RDONLY
},
211 { "rw", 1, MS_RDONLY
},
212 { "suid", 1, MS_NOSUID
},
213 { "nosuid", 0, MS_NOSUID
},
214 { "dev", 1, MS_NODEV
},
215 { "nodev", 0, MS_NODEV
},
216 { "exec", 1, MS_NOEXEC
},
217 { "noexec", 0, MS_NOEXEC
},
218 { "sync", 0, MS_SYNCHRONOUS
},
219 { "async", 1, MS_SYNCHRONOUS
},
220 { "dirsync", 0, MS_DIRSYNC
},
221 { "remount", 0, MS_REMOUNT
},
222 { "mand", 0, MS_MANDLOCK
},
223 { "nomand", 1, MS_MANDLOCK
},
224 { "atime", 1, MS_NOATIME
},
225 { "noatime", 0, MS_NOATIME
},
226 { "diratime", 1, MS_NODIRATIME
},
227 { "nodiratime", 0, MS_NODIRATIME
},
228 { "bind", 0, MS_BIND
},
229 { "rbind", 0, MS_BIND
|MS_REC
},
230 { "relatime", 0, MS_RELATIME
},
231 { "norelatime", 1, MS_RELATIME
},
232 { "strictatime", 0, MS_STRICTATIME
},
233 { "nostrictatime", 1, MS_STRICTATIME
},
237 #if HAVE_SYS_CAPABILITY_H
238 static struct caps_opt caps_opt
[] = {
239 { "chown", CAP_CHOWN
},
240 { "dac_override", CAP_DAC_OVERRIDE
},
241 { "dac_read_search", CAP_DAC_READ_SEARCH
},
242 { "fowner", CAP_FOWNER
},
243 { "fsetid", CAP_FSETID
},
244 { "kill", CAP_KILL
},
245 { "setgid", CAP_SETGID
},
246 { "setuid", CAP_SETUID
},
247 { "setpcap", CAP_SETPCAP
},
248 { "linux_immutable", CAP_LINUX_IMMUTABLE
},
249 { "net_bind_service", CAP_NET_BIND_SERVICE
},
250 { "net_broadcast", CAP_NET_BROADCAST
},
251 { "net_admin", CAP_NET_ADMIN
},
252 { "net_raw", CAP_NET_RAW
},
253 { "ipc_lock", CAP_IPC_LOCK
},
254 { "ipc_owner", CAP_IPC_OWNER
},
255 { "sys_module", CAP_SYS_MODULE
},
256 { "sys_rawio", CAP_SYS_RAWIO
},
257 { "sys_chroot", CAP_SYS_CHROOT
},
258 { "sys_ptrace", CAP_SYS_PTRACE
},
259 { "sys_pacct", CAP_SYS_PACCT
},
260 { "sys_admin", CAP_SYS_ADMIN
},
261 { "sys_boot", CAP_SYS_BOOT
},
262 { "sys_nice", CAP_SYS_NICE
},
263 { "sys_resource", CAP_SYS_RESOURCE
},
264 { "sys_time", CAP_SYS_TIME
},
265 { "sys_tty_config", CAP_SYS_TTY_CONFIG
},
266 { "mknod", CAP_MKNOD
},
267 { "lease", CAP_LEASE
},
268 #ifdef CAP_AUDIT_WRITE
269 { "audit_write", CAP_AUDIT_WRITE
},
271 #ifdef CAP_AUDIT_CONTROL
272 { "audit_control", CAP_AUDIT_CONTROL
},
274 { "setfcap", CAP_SETFCAP
},
275 { "mac_override", CAP_MAC_OVERRIDE
},
276 { "mac_admin", CAP_MAC_ADMIN
},
278 { "syslog", CAP_SYSLOG
},
280 #ifdef CAP_WAKE_ALARM
281 { "wake_alarm", CAP_WAKE_ALARM
},
285 static struct caps_opt caps_opt
[] = {};
288 static int run_buffer(char *buffer
)
290 struct lxc_popen_FILE
*f
;
294 f
= lxc_popen(buffer
);
296 SYSERROR("popen failed");
300 output
= malloc(LXC_LOG_BUFFER_SIZE
);
302 ERROR("failed to allocate memory for script output");
307 while(fgets(output
, LXC_LOG_BUFFER_SIZE
, f
->f
))
308 DEBUG("script output: %s", output
);
314 SYSERROR("Script exited on error");
316 } else if (WIFEXITED(ret
) && WEXITSTATUS(ret
) != 0) {
317 ERROR("Script exited with status %d", WEXITSTATUS(ret
));
319 } else if (WIFSIGNALED(ret
)) {
320 ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret
),
321 strsignal(WTERMSIG(ret
)));
328 static int run_script_argv(const char *name
, const char *section
,
329 const char *script
, const char *hook
, const char *lxcpath
,
336 INFO("Executing script '%s' for container '%s', config section '%s'",
337 script
, name
, section
);
339 for (i
=0; argsin
&& argsin
[i
]; i
++)
340 size
+= strlen(argsin
[i
]) + 1;
342 size
+= strlen(hook
) + 1;
344 size
+= strlen(script
);
345 size
+= strlen(name
);
346 size
+= strlen(section
);
352 buffer
= alloca(size
);
354 ERROR("failed to allocate memory");
358 ret
= snprintf(buffer
, size
, "%s %s %s %s", script
, name
, section
, hook
);
359 if (ret
< 0 || ret
>= size
) {
360 ERROR("Script name too long");
364 for (i
=0; argsin
&& argsin
[i
]; i
++) {
367 rc
= snprintf(buffer
+ ret
, len
, " %s", argsin
[i
]);
368 if (rc
< 0 || rc
>= len
) {
369 ERROR("Script args too long");
375 return run_buffer(buffer
);
378 static int run_script(const char *name
, const char *section
,
379 const char *script
, ...)
386 INFO("Executing script '%s' for container '%s', config section '%s'",
387 script
, name
, section
);
389 va_start(ap
, script
);
390 while ((p
= va_arg(ap
, char *)))
391 size
+= strlen(p
) + 1;
394 size
+= strlen(script
);
395 size
+= strlen(name
);
396 size
+= strlen(section
);
402 buffer
= alloca(size
);
404 ERROR("failed to allocate memory");
408 ret
= snprintf(buffer
, size
, "%s %s %s", script
, name
, section
);
409 if (ret
< 0 || ret
>= size
) {
410 ERROR("Script name too long");
414 va_start(ap
, script
);
415 while ((p
= va_arg(ap
, char *))) {
418 rc
= snprintf(buffer
+ ret
, len
, " %s", p
);
419 if (rc
< 0 || rc
>= len
) {
420 ERROR("Script args too long");
427 return run_buffer(buffer
);
430 static int find_fstype_cb(char* buffer
, void *data
)
438 unsigned long mntflags
;
442 /* we don't try 'nodev' entries */
443 if (strstr(buffer
, "nodev"))
447 fstype
+= lxc_char_left_gc(fstype
, strlen(fstype
));
448 fstype
[lxc_char_right_gc(fstype
, strlen(fstype
))] = '\0';
450 /* ignore blank line and comment */
451 if (fstype
[0] == '\0' || fstype
[0] == '#')
454 DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
455 cbarg
->rootfs
, cbarg
->target
, fstype
);
457 if (parse_mntopts(cbarg
->options
, &mntflags
, &mntdata
) < 0) {
462 if (mount(cbarg
->rootfs
, cbarg
->target
, fstype
, mntflags
, mntdata
)) {
463 DEBUG("mount failed with error: %s", strerror(errno
));
469 INFO("mounted '%s' on '%s', with fstype '%s'",
470 cbarg
->rootfs
, cbarg
->target
, fstype
);
475 static int mount_unknown_fs(const char *rootfs
, const char *target
,
491 * find the filesystem type with brute force:
492 * first we check with /etc/filesystems, in case the modules
493 * are auto-loaded and fall back to the supported kernel fs
500 for (i
= 0; i
< sizeof(fsfile
)/sizeof(fsfile
[0]); i
++) {
504 if (access(fsfile
[i
], F_OK
))
507 ret
= lxc_file_for_each_line(fsfile
[i
], find_fstype_cb
, &cbarg
);
509 ERROR("failed to parse '%s'", fsfile
[i
]);
517 ERROR("failed to determine fs type for '%s'", rootfs
);
521 static int mount_rootfs_dir(const char *rootfs
, const char *target
,
524 unsigned long mntflags
;
528 if (parse_mntopts(options
, &mntflags
, &mntdata
) < 0) {
533 ret
= mount(rootfs
, target
, "none", MS_BIND
| MS_REC
| mntflags
, mntdata
);
539 static int setup_lodev(const char *rootfs
, int fd
, struct loop_info64
*loinfo
)
544 rfd
= open(rootfs
, O_RDWR
);
546 SYSERROR("failed to open '%s'", rootfs
);
550 memset(loinfo
, 0, sizeof(*loinfo
));
552 loinfo
->lo_flags
= LO_FLAGS_AUTOCLEAR
;
554 if (ioctl(fd
, LOOP_SET_FD
, rfd
)) {
555 SYSERROR("failed to LOOP_SET_FD");
559 if (ioctl(fd
, LOOP_SET_STATUS64
, loinfo
)) {
560 SYSERROR("failed to LOOP_SET_STATUS64");
571 static int mount_rootfs_file(const char *rootfs
, const char *target
,
574 struct dirent dirent
, *direntp
;
575 struct loop_info64 loinfo
;
576 int ret
= -1, fd
= -1, rc
;
578 char path
[MAXPATHLEN
];
580 dir
= opendir("/dev");
582 SYSERROR("failed to open '/dev'");
586 while (!readdir_r(dir
, &dirent
, &direntp
)) {
591 if (!strcmp(direntp
->d_name
, "."))
594 if (!strcmp(direntp
->d_name
, ".."))
597 if (strncmp(direntp
->d_name
, "loop", 4))
600 rc
= snprintf(path
, MAXPATHLEN
, "/dev/%s", direntp
->d_name
);
601 if (rc
< 0 || rc
>= MAXPATHLEN
)
604 fd
= open(path
, O_RDWR
);
608 if (ioctl(fd
, LOOP_GET_STATUS64
, &loinfo
) == 0) {
613 if (errno
!= ENXIO
) {
614 WARN("unexpected error for ioctl on '%s': %m",
620 DEBUG("found '%s' free lodev", path
);
622 ret
= setup_lodev(rootfs
, fd
, &loinfo
);
624 ret
= mount_unknown_fs(path
, target
, options
);
631 WARN("failed to close directory");
636 static int mount_rootfs_block(const char *rootfs
, const char *target
,
639 return mount_unknown_fs(rootfs
, target
, options
);
644 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
645 * the duration of the container run, to prevent the container from marking
646 * the underlying fs readonly on shutdown. Unlink the file immediately so
647 * that no name pollution happens.
648 * return -1 on error.
649 * return -2 if nothing needed to be pinned.
650 * return an open fd (>=0) if we pinned it.
652 int pin_rootfs(const char *rootfs
)
654 char absrootfs
[MAXPATHLEN
];
655 char absrootfspin
[MAXPATHLEN
];
659 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
662 if (!realpath(rootfs
, absrootfs
))
665 if (access(absrootfs
, F_OK
))
668 if (stat(absrootfs
, &s
))
671 if (!S_ISDIR(s
.st_mode
))
674 ret
= snprintf(absrootfspin
, MAXPATHLEN
, "%s/lxc.hold", absrootfs
);
675 if (ret
>= MAXPATHLEN
)
678 fd
= open(absrootfspin
, O_CREAT
| O_RDWR
, S_IWUSR
|S_IRUSR
);
681 (void)unlink(absrootfspin
);
685 static int lxc_mount_auto_mounts(struct lxc_conf
*conf
, int flags
, struct lxc_handler
*handler
)
693 const char *destination
;
697 } default_mounts
[] = {
698 /* Read-only bind-mounting... In older kernels, doing that required
699 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
700 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
701 * kernel 2.6.26 onwards. However, this apparently does not work on
702 * kernel 3.8. Unfortunately, on that very same kernel, doing the
703 * same trick as above doesn't seem to work either, there one needs
704 * to ALSO specify MS_BIND for the remount, otherwise the entire
705 * fs is remounted read-only or the mount fails because it's busy...
706 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
709 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
710 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys", "%r/proc/sys", NULL
, MS_BIND
, NULL
},
711 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
712 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL
, MS_BIND
, NULL
},
713 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sysrq-trigger", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
714 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_RW
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
715 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RW
, "sysfs", "%r/sys", "sysfs", 0, NULL
},
716 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RO
, "sysfs", "%r/sys", "sysfs", MS_RDONLY
, NULL
},
717 { 0, 0, NULL
, NULL
, NULL
, 0, NULL
}
720 for (i
= 0; default_mounts
[i
].match_mask
; i
++) {
721 if ((flags
& default_mounts
[i
].match_mask
) == default_mounts
[i
].match_flag
) {
723 char *destination
= NULL
;
726 if (default_mounts
[i
].source
) {
727 /* will act like strdup if %r is not present */
728 source
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].source
);
730 SYSERROR("memory allocation error");
734 if (default_mounts
[i
].destination
) {
735 /* will act like strdup if %r is not present */
736 destination
= lxc_string_replace("%r", conf
->rootfs
.mount
, default_mounts
[i
].destination
);
739 SYSERROR("memory allocation error");
745 r
= mount(source
, destination
, default_mounts
[i
].fstype
, default_mounts
[i
].flags
, default_mounts
[i
].options
);
748 SYSERROR("error mounting %s on %s", source
, destination
);
758 if (flags
& LXC_AUTO_CGROUP_MASK
) {
761 cg_flags
= flags
& LXC_AUTO_CGROUP_MASK
;
762 /* If the type of cgroup mount was not specified, it depends on the
763 * container's capabilities as to what makes sense: if we have
764 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
765 * anyway, so we may as well default to read-write; then the admin
766 * will not be given a false sense of security. (And if they really
767 * want mixed r/o r/w, then they can explicitly specify :mixed.)
768 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
769 * :mixed, because then the container can't remount it read-write. */
770 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
|| cg_flags
== LXC_AUTO_CGROUP_FULL_NOSPEC
) {
771 int has_sys_admin
= 0;
772 if (!lxc_list_empty(&conf
->keepcaps
)) {
773 has_sys_admin
= in_caplist(CAP_SYS_ADMIN
, &conf
->keepcaps
);
775 has_sys_admin
= !in_caplist(CAP_SYS_ADMIN
, &conf
->caps
);
777 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
) {
778 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_RW
: LXC_AUTO_CGROUP_MIXED
;
780 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_FULL_RW
: LXC_AUTO_CGROUP_FULL_MIXED
;
784 if (!cgroup_mount(conf
->rootfs
.mount
, handler
, cg_flags
)) {
785 SYSERROR("error mounting /sys/fs/cgroup");
793 static int mount_rootfs(const char *rootfs
, const char *target
, const char *options
)
795 char absrootfs
[MAXPATHLEN
];
799 typedef int (*rootfs_cb
)(const char *, const char *, const char *);
805 { S_IFDIR
, mount_rootfs_dir
},
806 { S_IFBLK
, mount_rootfs_block
},
807 { S_IFREG
, mount_rootfs_file
},
810 if (!realpath(rootfs
, absrootfs
)) {
811 SYSERROR("failed to get real path for '%s'", rootfs
);
815 if (access(absrootfs
, F_OK
)) {
816 SYSERROR("'%s' is not accessible", absrootfs
);
820 if (stat(absrootfs
, &s
)) {
821 SYSERROR("failed to stat '%s'", absrootfs
);
825 for (i
= 0; i
< sizeof(rtfs_type
)/sizeof(rtfs_type
[0]); i
++) {
827 if (!__S_ISTYPE(s
.st_mode
, rtfs_type
[i
].type
))
830 return rtfs_type
[i
].cb(absrootfs
, target
, options
);
833 ERROR("unsupported rootfs type for '%s'", absrootfs
);
837 static int setup_utsname(struct utsname
*utsname
)
842 if (sethostname(utsname
->nodename
, strlen(utsname
->nodename
))) {
843 SYSERROR("failed to set the hostname to '%s'", utsname
->nodename
);
847 INFO("'%s' hostname has been setup", utsname
->nodename
);
852 struct dev_symlinks
{
/*
 * Symlinks that setup_dev_symlinks() creates under <rootfs>/dev.
 * In each row the first string is the link target handed to symlink()
 * as d->oldpath, and the second is the entry name (d->name) appended to
 * "<rootfs>/dev/".
 * NOTE(review): field order is inferred from that usage; the struct
 * declaration itself is not fully visible here.
 */
857 static const struct dev_symlinks dev_symlinks
[] = {
858 {"/proc/self/fd", "fd"},
859 {"/proc/self/fd/0", "stdin"},
860 {"/proc/self/fd/1", "stdout"},
861 {"/proc/self/fd/2", "stderr"},
864 static int setup_dev_symlinks(const struct lxc_rootfs
*rootfs
)
866 char path
[MAXPATHLEN
];
871 for (i
= 0; i
< sizeof(dev_symlinks
) / sizeof(dev_symlinks
[0]); i
++) {
872 const struct dev_symlinks
*d
= &dev_symlinks
[i
];
873 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
, d
->name
);
874 if (ret
< 0 || ret
>= MAXPATHLEN
)
878 * Stat the path first. If we don't get an error
879 * accept it as is and don't try to create it
881 if (!stat(path
, &s
)) {
885 ret
= symlink(d
->oldpath
, path
);
887 if (ret
&& errno
!= EEXIST
) {
888 if ( errno
== EROFS
) {
889 WARN("Warning: Read Only file system while creating %s", path
);
891 SYSERROR("Error creating %s", path
);
899 static int setup_tty(const struct lxc_rootfs
*rootfs
,
900 const struct lxc_tty_info
*tty_info
, char *ttydir
)
902 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
908 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
910 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
912 ret
= snprintf(path
, sizeof(path
), "%s/dev/tty%d",
913 rootfs
->mount
, i
+ 1);
914 if (ret
>= sizeof(path
)) {
915 ERROR("pathname too long for ttys");
919 /* create dev/lxc/tty%d" */
920 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/tty%d",
921 rootfs
->mount
, ttydir
, i
+ 1);
922 if (ret
>= sizeof(lxcpath
)) {
923 ERROR("pathname too long for ttys");
926 ret
= creat(lxcpath
, 0660);
927 if (ret
==-1 && errno
!= EEXIST
) {
928 SYSERROR("error creating %s", lxcpath
);
934 if (ret
&& errno
!= ENOENT
) {
935 SYSERROR("error unlinking %s", path
);
939 if (mount(pty_info
->name
, lxcpath
, "none", MS_BIND
, 0)) {
940 WARN("failed to mount '%s'->'%s'",
941 pty_info
->name
, path
);
945 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/tty%d", ttydir
, i
+1);
946 if (ret
>= sizeof(lxcpath
)) {
947 ERROR("tty pathname too long");
950 ret
= symlink(lxcpath
, path
);
952 SYSERROR("failed to create symlink for tty %d", i
+1);
956 /* If we populated /dev, then we need to create /dev/ttyN */
957 if (access(path
, F_OK
)) {
958 ret
= creat(path
, 0660);
960 SYSERROR("error creating %s", path
);
961 /* this isn't fatal, continue */
966 if (mount(pty_info
->name
, path
, "none", MS_BIND
, 0)) {
967 WARN("failed to mount '%s'->'%s'",
968 pty_info
->name
, path
);
974 INFO("%d tty(s) has been setup", tty_info
->nbtty
);
979 static int setup_rootfs_pivot_root_cb(char *buffer
, void *data
)
981 struct lxc_list
*mountlist
, *listentry
, *iterator
;
982 char *pivotdir
, *mountpoint
, *mountentry
, *saveptr
= NULL
;
987 cbparm
= (void **)data
;
989 mountlist
= cbparm
[0];
990 pivotdir
= cbparm
[1];
992 /* parse entry, first field is mountname, ignore */
993 mountpoint
= strtok_r(mountentry
, " ", &saveptr
);
997 /* second field is mountpoint */
998 mountpoint
= strtok_r(NULL
, " ", &saveptr
);
1002 /* only consider mountpoints below old root fs */
1003 if (strncmp(mountpoint
, pivotdir
, strlen(pivotdir
)))
1006 /* filter duplicate mountpoints */
1008 lxc_list_for_each(iterator
, mountlist
) {
1009 if (!strcmp(iterator
->elem
, mountpoint
)) {
1017 /* add entry to list */
1018 listentry
= malloc(sizeof(*listentry
));
1020 SYSERROR("malloc for mountpoint listentry failed");
1024 listentry
->elem
= strdup(mountpoint
);
1025 if (!listentry
->elem
) {
1026 SYSERROR("strdup failed");
1030 lxc_list_add_tail(mountlist
, listentry
);
1035 static int umount_oldrootfs(const char *oldrootfs
)
1037 char path
[MAXPATHLEN
];
1039 struct lxc_list mountlist
, *iterator
, *next
;
1040 int ok
, still_mounted
, last_still_mounted
;
1043 /* read and parse /proc/mounts in old root fs */
1044 lxc_list_init(&mountlist
);
1046 /* oldrootfs is on the top tree directory now */
1047 rc
= snprintf(path
, sizeof(path
), "/%s", oldrootfs
);
1048 if (rc
>= sizeof(path
)) {
1049 ERROR("rootfs name too long");
1052 cbparm
[0] = &mountlist
;
1054 cbparm
[1] = strdup(path
);
1056 SYSERROR("strdup failed");
1060 rc
= snprintf(path
, sizeof(path
), "%s/proc/mounts", oldrootfs
);
1061 if (rc
>= sizeof(path
)) {
1062 ERROR("container proc/mounts name too long");
1066 ok
= lxc_file_for_each_line(path
,
1067 setup_rootfs_pivot_root_cb
, &cbparm
);
1069 SYSERROR("failed to read or parse mount list '%s'", path
);
1073 /* umount filesystems until none left or list no longer shrinks */
1076 last_still_mounted
= still_mounted
;
1079 lxc_list_for_each_safe(iterator
, &mountlist
, next
) {
1081 /* umount normally */
1082 if (!umount(iterator
->elem
)) {
1083 DEBUG("umounted '%s'", (char *)iterator
->elem
);
1084 lxc_list_del(iterator
);
1091 } while (still_mounted
> 0 && still_mounted
!= last_still_mounted
);
1094 lxc_list_for_each(iterator
, &mountlist
) {
1096 /* let's try a lazy umount */
1097 if (!umount2(iterator
->elem
, MNT_DETACH
)) {
1098 INFO("lazy unmount of '%s'", (char *)iterator
->elem
);
1102 /* be more brutal (nfs) */
1103 if (!umount2(iterator
->elem
, MNT_FORCE
)) {
1104 INFO("forced unmount of '%s'", (char *)iterator
->elem
);
1108 WARN("failed to unmount '%s'", (char *)iterator
->elem
);
1114 static int setup_rootfs_pivot_root(const char *rootfs
, const char *pivotdir
)
1116 char path
[MAXPATHLEN
];
1117 int remove_pivotdir
= 0;
1120 /* change into new root fs */
1121 if (chdir(rootfs
)) {
1122 SYSERROR("can't chdir to new rootfs '%s'", rootfs
);
1127 pivotdir
= "lxc_putold";
1129 /* compute the full path to pivotdir under rootfs */
1130 rc
= snprintf(path
, sizeof(path
), "%s/%s", rootfs
, pivotdir
);
1131 if (rc
>= sizeof(path
)) {
1132 ERROR("pivot dir name too long");
1136 if (access(path
, F_OK
)) {
1138 if (mkdir_p(path
, 0755) < 0) {
1139 SYSERROR("failed to create pivotdir '%s'", path
);
1143 remove_pivotdir
= 1;
1144 DEBUG("created '%s' directory", path
);
1147 DEBUG("mountpoint for old rootfs is '%s'", path
);
1149 /* pivot_root into our new root fs */
1150 if (pivot_root(".", path
)) {
1151 SYSERROR("pivot_root syscall failed");
1156 SYSERROR("can't chdir to / after pivot_root");
1160 DEBUG("pivot_root syscall to '%s' successful", rootfs
);
1162 /* we switch from absolute path to relative path */
1163 if (umount_oldrootfs(pivotdir
))
1166 /* remove temporary mount point, we don't consider the removing
1168 if (remove_pivotdir
&& rmdir(pivotdir
))
1169 WARN("can't remove mountpoint '%s': %m", pivotdir
);
1175 * Check to see if a directory has something mounted on it and,
1176 * if it does, return the fstype.
1178 * Code largely based on detect_shared_rootfs below
1180 * Returns: # of matching entries in /proc/self/mounts
1181 * if != 0 fstype is filled with the last filesystem value.
1182 * if == 0 no matches found, fstype unchanged.
1184 * ToDo: Maybe return the mount options in another parameter...
1187 #define LINELEN 4096
1188 #define MAX_FSTYPE_LEN 128
/*
 * mount_check_fs - look for mounts on a directory in /proc/self/mounts.
 *
 * dir:    path compared (exact strcmp) against the mount-point field of
 *         each line; it must exist and be a directory or the
 *         access/stat/S_ISDIR test below rejects it.
 * fstype: output buffer; on a match the filesystem-type field is copied in,
 *         truncated to MAX_FSTYPE_LEN - 1 bytes and NUL-terminated.
 *
 * The closing DEBUG reports found_fs, the match counter.
 *
 * NOTE(review): callers in this file pass fstype == NULL
 * (mount_check_fs("/dev/.lxc", NULL), mount_check_fs(host_path, NULL)).
 * Confirm that the strncpy below is guarded for that case. The final
 * DEBUG passes fstype to "%s" regardless, which is a NULL string argument
 * for those callers, and an unwritten buffer when nothing matched.
 */
1189 static int mount_check_fs( const char *dir
, char *fstype
)
1191 char buf
[LINELEN
], *p
;
1197 DEBUG("entering mount_check_fs for %s", dir
);
1199 if ( 0 != access(dir
, F_OK
) || 0 != stat(dir
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1203 f
= fopen("/proc/self/mounts", "r");
/* One line per mount; fields are separated by single spaces. */
1206 while (fgets(buf
, LINELEN
, f
)) {
1207 p
= index(buf
, ' ');
1218 /* Compare the mount-point field of this entry with the requested dir */
1219 if( strcmp( p2
, dir
) ) {
1224 p
= index( p2
, ' ');
/* strncpy does not terminate on truncation, hence the explicit NUL below. */
1232 strncpy( fstype
, p2
, MAX_FSTYPE_LEN
- 1 );
1233 fstype
[ MAX_FSTYPE_LEN
- 1 ] = '\0';
1239 DEBUG("mount_check_fs returning %d last %s", found_fs
, fstype
);
1245 * Locate a devtmpfs mount (should be on /dev) and create a container
1246 * subdirectory on it which we can then bind mount to the container
1247 * /dev instead of mounting a tmpfs there.
1248 * If we fail, return NULL.
1249 * Else return the pointer to the name buffer with the string to
1250 * the devtmpfs subdirectory.
1253 static char *mk_devtmpfs(const char *name
, char *path
, const char *lxcpath
)
1257 char tmp_path
[MAXPATHLEN
];
1258 char fstype
[MAX_FSTYPE_LEN
];
1259 char *base_path
= "/dev/.lxc";
1260 char *user_path
= "/dev/.lxc/user";
1263 if ( 0 != access(base_path
, F_OK
) || 0 != stat(base_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1264 /* This is just making /dev/.lxc it better work or we're done */
1265 ret
= mkdir(base_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1267 SYSERROR( "Unable to create /dev/.lxc for autodev" );
1273 * Programmers notes:
1274 * We can not do mounts in this area of code that we want
1275 * to be visible in the host. Consequently, /dev/.lxc must
1276 * be set up earlier if we need a tmpfs mounted there.
1277 * That only affects the rare cases where autodev is enabled
1278 * for a container and devtmpfs is not mounted on /dev in the
1279 * host. In that case, we'll fall back to the old method
1280 * of mounting a tmpfs in the container and have no visibility
1281 * into the container /dev.
1283 if( ! mount_check_fs( "/dev", fstype
)
1284 || strcmp( "devtmpfs", fstype
) ) {
1285 /* Either /dev was not mounted or was not devtmpfs */
1287 if ( ! mount_check_fs( "/dev/.lxc", NULL
) ) {
1289 * /dev/.lxc is not already mounted
1290 * Doing a mount here does no good, since
1291 * it's not visible in the host.
1294 ERROR("/dev/.lxc is not setup - taking fallback" );
1299 if ( 0 != access(user_path
, F_OK
) || 0 != stat(user_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1301 * This is making /dev/.lxc/user path for non-priv users.
1302 * If this doesn't work, we'll have to fall back in the
1303 * case of non-priv users. It's mode 1777 like /tmp.
1305 ret
= mkdir(user_path
, S_IRWXU
| S_IRWXG
| S_IRWXO
| S_ISVTX
);
1307 /* Issue an error but don't fail yet! */
1308 ERROR("Unable to create /dev/.lxc/user");
1310 /* Umask tends to screw us up here */
1311 chmod(user_path
, S_IRWXU
| S_IRWXG
| S_IRWXO
| S_ISVTX
);
1315 * Since the container name must be unique within a given
1316 * lxcpath, we're going to use a hash of the path
1317 * /lxcpath/name as our hash name in /dev/.lxc/
1320 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s", lxcpath
, name
);
1321 if (ret
< 0 || ret
>= MAXPATHLEN
)
1324 hash
= fnv_64a_buf(tmp_path
, ret
, FNV1A_64_INIT
);
1326 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s.%016" PRIx64
, base_path
, name
, hash
);
1327 if (ret
< 0 || ret
>= MAXPATHLEN
)
1330 if ( 0 != access(tmp_path
, F_OK
) || 0 != stat(tmp_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1331 ret
= mkdir(tmp_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1333 /* Something must have failed with the base_path...
1334 * Maybe unpriv user. Try user_path now... */
1335 INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );
1337 ret
= snprintf(tmp_path
, MAXPATHLEN
, "%s/%s.%016" PRIx64
, user_path
, name
, hash
);
1338 if (ret
< 0 || ret
>= MAXPATHLEN
)
1341 if ( 0 != access(tmp_path
, F_OK
) || 0 != stat(tmp_path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1342 ret
= mkdir(tmp_path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1344 ERROR("Container /dev setup in host /dev failed - taking fallback" );
1351 strcpy( path
, tmp_path
);
1357 * Do we want to add options for max size of /dev and a file to
1358 * specify which devices to create?
/*
 * mount_autodev - put a /dev in place under the container root.
 *
 * name:    container name, used to build "<lxcpath>/<name>/rootfs.dev".
 * root:    container root; the mount target is "<root>/dev".
 * lxcpath: base directory the container lives under.
 *
 * If mk_devtmpfs() yields a host-side directory, host_path is replaced by a
 * symlink to it and the directory is bind-mounted on <root>/dev. Otherwise
 * a tmpfs is mounted on <root>/dev, unless mount_check_fs() finds something
 * already mounted on host_path, in which case host_path is bind-mounted.
 * Afterwards "<root>/dev/pts" is created if it is not already a directory.
 */
1360 static int mount_autodev(const char *name
, char *root
, const char *lxcpath
)
1364 char path
[MAXPATHLEN
];
1365 char host_path
[MAXPATHLEN
];
1366 char devtmpfs_path
[MAXPATHLEN
];
1368 INFO("Mounting /dev under %s", root
);
1370 ret
= snprintf(host_path
, MAXPATHLEN
, "%s/%s/rootfs.dev", lxcpath
, name
);
/*
 * NOTE(review): snprintf returns the untruncated length, so a result equal
 * to MAXPATHLEN already means the output was cut short. This test and the
 * next one use '>' and let that case through; the "/dev/pts" check further
 * down uses '>='. These two should most likely be '>=' as well.
 */
1371 if (ret
< 0 || ret
> MAXPATHLEN
)
1374 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", root
);
1375 if (ret
< 0 || ret
> MAXPATHLEN
)
1378 if (mk_devtmpfs( name
, devtmpfs_path
, lxcpath
) ) {
1380 * Get rid of old links and directoriess
1381 * This could be either a symlink and we remove it,
1382 * or an empty directory and we remove it,
1383 * or non-existant and we don't care,
1384 * or a non-empty directory, and we will then emit an error
1385 * but we will not fail out the process.
1387 unlink( host_path
);
1389 ret
= symlink(devtmpfs_path
, host_path
);
1392 SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path
, devtmpfs_path
);
1394 DEBUG("Bind mounting %s to %s", devtmpfs_path
, path
);
1395 ret
= mount(devtmpfs_path
, path
, NULL
, MS_BIND
, 0 );
1397 /* Only mount a tmpfs here if we don't already have a mount on host_path */
1398 if ( ! mount_check_fs( host_path
, NULL
) ) {
1399 DEBUG("Mounting tmpfs to %s", host_path
);
1400 ret
= mount("none", path
, "tmpfs", 0, "size=100000,mode=755");
1402 /* This allows someone to manually set up a mount */
1403 DEBUG("Bind mounting %s to %s", host_path
, path
);
1404 ret
= mount(host_path
, path
, NULL
, MS_BIND
, 0 );
1408 SYSERROR("Failed to mount /dev at %s", root
);
1411 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/pts", root
);
1412 if (ret
< 0 || ret
>= MAXPATHLEN
)
1415 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1416 * If not, then create it and exit if that fails...
1418 if ( 0 != access(path
, F_OK
) || 0 != stat(path
, &s
) || 0 == S_ISDIR(s
.st_mode
) ) {
1419 ret
= mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1421 SYSERROR("Failed to create /dev/pts in container");
1426 INFO("Mounted /dev under %s", root
);
/*
 * Character-device nodes that setup_autodev() creates under <root>/dev.
 * Each row is a node name, the mode passed to mknod(), and two numbers
 * passed to makedev(d->maj, d->min). The first six rows request
 * S_IRWXU|S_IRWXG|S_IRWXO; "console" requests S_IRUSR|S_IWUSR only.
 * setup_autodev() sets umask(S_IXUSR|S_IXGRP|S_IXOTH) before calling
 * mknod(), so the execute bits listed here are masked off.
 * NOTE(review): field order (name, mode, maj, min) is inferred from that
 * usage; the struct declaration is not visible in this section.
 */
1437 static const struct lxc_devs lxc_devs
[] = {
1438 { "null", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 3 },
1439 { "zero", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 5 },
1440 { "full", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 7 },
1441 { "urandom", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 9 },
1442 { "random", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 8 },
1443 { "tty", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 5, 0 },
1444 { "console", S_IFCHR
| S_IRUSR
| S_IWUSR
, 5, 1 },
1447 static int setup_autodev(const char *root
)
1450 char path
[MAXPATHLEN
];
1454 INFO("Creating initial consoles under %s/dev", root
);
1456 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", root
);
1457 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1458 ERROR("Error calculating container /dev location");
1462 INFO("Populating /dev under %s", root
);
1463 cmask
= umask(S_IXUSR
| S_IXGRP
| S_IXOTH
);
1464 for (i
= 0; i
< sizeof(lxc_devs
) / sizeof(lxc_devs
[0]); i
++) {
1465 const struct lxc_devs
*d
= &lxc_devs
[i
];
1466 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/%s", root
, d
->name
);
1467 if (ret
< 0 || ret
>= MAXPATHLEN
)
1469 ret
= mknod(path
, d
->mode
, makedev(d
->maj
, d
->min
));
1470 if (ret
&& errno
!= EEXIST
) {
1471 SYSERROR("Error creating %s", d
->name
);
1477 INFO("Populated /dev under %s", root
);
1482 * I'll forgive you for asking whether all of this is needed :) The
1484 * pivot_root will fail if the new root, the put_old dir, or the parent
1485 * of current->fs->root are MS_SHARED. (parent of current->fs_root may
1486 * or may not be current->fs_root - if we assumed it always was, we could
1487 * just mount --make-rslave /). So,
1488 * 1. mount a tiny tmpfs to be parent of current->fs->root.
1489 * 2. make that MS_SLAVE
1490 * 3. make a 'root' directory under that
1491 * 4. mount --rbind / under the $tinyroot/root.
1492 * 5. make that rslave
1493 * 6. chdir and chroot into $tinyroot/root
1494 * 7. $tinyroot will be unmounted by our parent in start.c
1496 static int chroot_into_slave(struct lxc_conf
*conf
)
1498 char path
[MAXPATHLEN
];
1499 const char *destpath
= conf
->rootfs
.mount
;
1502 if (mount(destpath
, destpath
, NULL
, MS_BIND
, 0)) {
1503 SYSERROR("failed to mount %s bind", destpath
);
1506 if (mount("", destpath
, NULL
, MS_SLAVE
, 0)) {
1507 SYSERROR("failed to make %s slave", destpath
);
1510 if (mount("none", destpath
, "tmpfs", 0, "size=10000,mode=755")) {
1511 SYSERROR("Failed to mount tmpfs / at %s", destpath
);
1514 ret
= snprintf(path
, MAXPATHLEN
, "%s/root", destpath
);
1515 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1516 ERROR("out of memory making root path");
1519 if (mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
)) {
1520 SYSERROR("Failed to create /dev/pts in container");
1523 if (mount("/", path
, NULL
, MS_BIND
|MS_REC
, 0)) {
1524 SYSERROR("Failed to rbind mount / to %s", path
);
1527 if (mount("", destpath
, NULL
, MS_SLAVE
|MS_REC
, 0)) {
1528 SYSERROR("Failed to make tmp-/ at %s rslave", path
);
1532 SYSERROR("Failed to chroot into tmp-/");
1536 SYSERROR("Failed to chdir into tmp-/");
1539 INFO("Chrooted into tmp-/ at %s", path
);
1543 static int setup_rootfs(struct lxc_conf
*conf
)
1545 const struct lxc_rootfs
*rootfs
= &conf
->rootfs
;
1547 if (!rootfs
->path
) {
1548 if (mount("", "/", NULL
, MS_SLAVE
|MS_REC
, 0)) {
1549 SYSERROR("Failed to make / rslave");
1555 if (access(rootfs
->mount
, F_OK
)) {
1556 SYSERROR("failed to access to '%s', check it is present",
1561 // First try mounting rootfs using a bdev
1562 struct bdev
*bdev
= bdev_init(conf
, rootfs
->path
, rootfs
->mount
, rootfs
->options
);
1563 if (bdev
&& bdev
->ops
->mount(bdev
) == 0) {
1565 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
1570 if (mount_rootfs(rootfs
->path
, rootfs
->mount
, rootfs
->options
)) {
1571 ERROR("failed to mount rootfs");
1575 DEBUG("mounted '%s' on '%s'", rootfs
->path
, rootfs
->mount
);
1580 static int setup_pivot_root(const struct lxc_rootfs
*rootfs
)
1585 if (setup_rootfs_pivot_root(rootfs
->mount
, rootfs
->pivot
)) {
1586 ERROR("failed to setup pivot root");
1593 static int setup_pts(int pts
)
1595 char target
[PATH_MAX
];
1600 if (!access("/dev/pts/ptmx", F_OK
) && umount("/dev/pts")) {
1601 SYSERROR("failed to umount 'dev/pts'");
1605 if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL
,
1606 "newinstance,ptmxmode=0666,mode=0620,gid=5")) {
1607 SYSERROR("failed to mount a new instance of '/dev/pts'");
1611 if (access("/dev/ptmx", F_OK
)) {
1612 if (!symlink("/dev/pts/ptmx", "/dev/ptmx"))
1614 SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
1618 if (realpath("/dev/ptmx", target
) && !strcmp(target
, "/dev/pts/ptmx"))
1621 /* fallback here, /dev/pts/ptmx exists just mount bind */
1622 if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND
, 0)) {
1623 SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
1627 INFO("created new pts instance");
/*
 * Switch the process execution domain to 'persona'.
 *
 * A value of -1 means "leave the personality alone".  When the build has
 * no <sys/personality.h> the function is a no-op.  Returns 0 on success
 * or when nothing had to be done, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
1650 static int setup_dev_console(const struct lxc_rootfs
*rootfs
,
1651 const struct lxc_console
*console
)
1653 char path
[MAXPATHLEN
];
1657 ret
= snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1658 if (ret
>= sizeof(path
)) {
1659 ERROR("console path too long");
1663 if (access(path
, F_OK
)) {
1664 WARN("rootfs specified but no console found at '%s'", path
);
1668 if (console
->master
< 0) {
1673 if (stat(path
, &s
)) {
1674 SYSERROR("failed to stat '%s'", path
);
1678 if (chmod(console
->name
, s
.st_mode
)) {
1679 SYSERROR("failed to set mode '0%o' to '%s'",
1680 s
.st_mode
, console
->name
);
1684 if (mount(console
->name
, path
, "none", MS_BIND
, 0)) {
1685 ERROR("failed to mount '%s' on '%s'", console
->name
, path
);
1689 INFO("console has been setup");
1693 static int setup_ttydir_console(const struct lxc_rootfs
*rootfs
,
1694 const struct lxc_console
*console
,
1697 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
1700 /* create rootfs/dev/<ttydir> directory */
1701 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
,
1703 if (ret
>= sizeof(path
))
1705 ret
= mkdir(path
, 0755);
1706 if (ret
&& errno
!= EEXIST
) {
1707 SYSERROR("failed with errno %d to create %s", errno
, path
);
1710 INFO("created %s", path
);
1712 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/console",
1713 rootfs
->mount
, ttydir
);
1714 if (ret
>= sizeof(lxcpath
)) {
1715 ERROR("console path too long");
1719 snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1721 if (ret
&& errno
!= ENOENT
) {
1722 SYSERROR("error unlinking %s", path
);
1726 ret
= creat(lxcpath
, 0660);
1727 if (ret
==-1 && errno
!= EEXIST
) {
1728 SYSERROR("error %d creating %s", errno
, lxcpath
);
1734 if (console
->master
< 0) {
1739 if (mount(console
->name
, lxcpath
, "none", MS_BIND
, 0)) {
1740 ERROR("failed to mount '%s' on '%s'", console
->name
, lxcpath
);
1744 /* create symlink from rootfs/dev/console to 'lxc/console' */
1745 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/console", ttydir
);
1746 if (ret
>= sizeof(lxcpath
)) {
1747 ERROR("lxc/console path too long");
1750 ret
= symlink(lxcpath
, path
);
1752 SYSERROR("failed to create symlink for console");
1756 INFO("console has been setup on %s", lxcpath
);
1761 static int setup_console(const struct lxc_rootfs
*rootfs
,
1762 const struct lxc_console
*console
,
1765 /* We don't have a rootfs, /dev/console will be shared */
1769 return setup_dev_console(rootfs
, console
);
1771 return setup_ttydir_console(rootfs
, console
, ttydir
);
1774 static int setup_kmsg(const struct lxc_rootfs
*rootfs
,
1775 const struct lxc_console
*console
)
1777 char kpath
[MAXPATHLEN
];
1782 ret
= snprintf(kpath
, sizeof(kpath
), "%s/dev/kmsg", rootfs
->mount
);
1783 if (ret
< 0 || ret
>= sizeof(kpath
))
1786 ret
= unlink(kpath
);
1787 if (ret
&& errno
!= ENOENT
) {
1788 SYSERROR("error unlinking %s", kpath
);
1792 ret
= symlink("console", kpath
);
1794 SYSERROR("failed to create symlink for kmsg");
1801 static void parse_mntopt(char *opt
, unsigned long *flags
, char **data
)
1803 struct mount_opt
*mo
;
1805 /* If opt is found in mount_opt, set or clear flags.
1806 * Otherwise append it to data. */
1808 for (mo
= &mount_opt
[0]; mo
->name
!= NULL
; mo
++) {
1809 if (!strncmp(opt
, mo
->name
, strlen(mo
->name
))) {
1811 *flags
&= ~mo
->flag
;
1823 int parse_mntopts(const char *mntopts
, unsigned long *mntflags
,
1827 char *p
, *saveptr
= NULL
;
1835 s
= strdup(mntopts
);
1837 SYSERROR("failed to allocate memory");
1841 data
= malloc(strlen(s
) + 1);
1843 SYSERROR("failed to allocate memory");
1849 for (p
= strtok_r(s
, ",", &saveptr
); p
!= NULL
;
1850 p
= strtok_r(NULL
, ",", &saveptr
))
1851 parse_mntopt(p
, mntflags
, &data
);
1862 static int mount_entry(const char *fsname
, const char *target
,
1863 const char *fstype
, unsigned long mountflags
,
1864 const char *data
, int optional
)
1866 if (mount(fsname
, target
, fstype
, mountflags
& ~MS_REMOUNT
, data
)) {
1868 INFO("failed to mount '%s' on '%s' (optional): %s", fsname
,
1869 target
, strerror(errno
));
1873 SYSERROR("failed to mount '%s' on '%s'", fsname
, target
);
1878 if ((mountflags
& MS_REMOUNT
) || (mountflags
& MS_BIND
)) {
1880 DEBUG("remounting %s on %s to respect bind or remount options",
1883 if (mount(fsname
, target
, fstype
,
1884 mountflags
| MS_REMOUNT
, data
)) {
1886 INFO("failed to mount '%s' on '%s' (optional): %s",
1887 fsname
, target
, strerror(errno
));
1891 SYSERROR("failed to mount '%s' on '%s'",
1898 DEBUG("mounted '%s' on '%s', type '%s'", fsname
, target
, fstype
);
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntopt
 *
 * mntent->mnt_opts is rewritten in place as a comma-separated list with
 * those lxc-only options taken out.  An option is matched the way
 * hasmntopt() matches it: it must start a token and be followed by the
 * end of the token or by '='.  Text that merely contains one of the names
 * (e.g. "context=optional_t") is left untouched.  Every occurrence is
 * removed, empty tokens are squeezed out, and no leading or trailing
 * comma is left behind.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = {
		"create=dir",
		"create=file",
		"optional",
	};
	char *src, *dst;

	if (!mntent || !mntent->mnt_opts)
		return;

	/* dst never overtakes src, so compaction can happen in place */
	src = dst = mntent->mnt_opts;
	while (*src) {
		size_t toklen = strcspn(src, ",");
		bool drop = (toklen == 0);
		size_t i;

		for (i = 0; !drop && i < sizeof(culled) / sizeof(culled[0]); i++) {
			size_t optlen = strlen(culled[i]);

			if (toklen >= optlen &&
			    !strncmp(src, culled[i], optlen) &&
			    (toklen == optlen || src[optlen] == '='))
				drop = true;
		}

		if (!drop) {
			if (dst != mntent->mnt_opts)
				*dst++ = ',';
			memmove(dst, src, toklen);
			dst += toklen;
		}

		src += toklen;
		if (*src == ',')
			src++;
	}
	*dst = '\0';
}
1928 static inline int mount_entry_on_systemfs(struct mntent
*mntent
)
1930 unsigned long mntflags
;
1933 FILE *pathfile
= NULL
;
1934 char* pathdirname
= NULL
;
1935 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1937 if (hasmntopt(mntent
, "create=dir")) {
1938 if (mkdir_p(mntent
->mnt_dir
, 0755) < 0) {
1939 WARN("Failed to create mount target '%s'", mntent
->mnt_dir
);
1944 if (hasmntopt(mntent
, "create=file") && access(mntent
->mnt_dir
, F_OK
)) {
1945 pathdirname
= strdup(mntent
->mnt_dir
);
1946 pathdirname
= dirname(pathdirname
);
1947 if (mkdir_p(pathdirname
, 0755) < 0) {
1948 WARN("Failed to create target directory");
1950 pathfile
= fopen(mntent
->mnt_dir
, "wb");
1952 WARN("Failed to create mount target '%s'", mntent
->mnt_dir
);
1959 cull_mntent_opt(mntent
);
1961 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
1966 ret
= mount_entry(mntent
->mnt_fsname
, mntent
->mnt_dir
,
1967 mntent
->mnt_type
, mntflags
, mntdata
, optional
);
1975 static int mount_entry_on_absolute_rootfs(struct mntent
*mntent
,
1976 const struct lxc_rootfs
*rootfs
,
1977 const char *lxc_name
)
1980 char path
[MAXPATHLEN
];
1981 unsigned long mntflags
;
1983 int r
, ret
= 0, offset
;
1984 const char *lxcpath
;
1985 FILE *pathfile
= NULL
;
1986 char *pathdirname
= NULL
;
1987 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1989 lxcpath
= lxc_global_config_value("lxc.lxcpath");
1991 ERROR("Out of memory");
1995 /* if rootfs->path is a blockdev path, allow container fstab to
1996 * use $lxcpath/CN/rootfs as the target prefix */
1997 r
= snprintf(path
, MAXPATHLEN
, "%s/%s/rootfs", lxcpath
, lxc_name
);
1998 if (r
< 0 || r
>= MAXPATHLEN
)
2001 aux
= strstr(mntent
->mnt_dir
, path
);
2003 offset
= strlen(path
);
2008 aux
= strstr(mntent
->mnt_dir
, rootfs
->path
);
2010 WARN("ignoring mount point '%s'", mntent
->mnt_dir
);
2013 offset
= strlen(rootfs
->path
);
2017 r
= snprintf(path
, MAXPATHLEN
, "%s/%s", rootfs
->mount
,
2019 if (r
< 0 || r
>= MAXPATHLEN
) {
2020 WARN("pathnme too long for '%s'", mntent
->mnt_dir
);
2025 if (hasmntopt(mntent
, "create=dir")) {
2026 if (mkdir_p(path
, 0755) < 0) {
2027 WARN("Failed to create mount target '%s'", path
);
2032 if (hasmntopt(mntent
, "create=file") && access(path
, F_OK
)) {
2033 pathdirname
= strdup(path
);
2034 pathdirname
= dirname(pathdirname
);
2035 if (mkdir_p(pathdirname
, 0755) < 0) {
2036 WARN("Failed to create target directory");
2038 pathfile
= fopen(path
, "wb");
2040 WARN("Failed to create mount target '%s'", path
);
2046 cull_mntent_opt(mntent
);
2048 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
2053 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
,
2054 mntflags
, mntdata
, optional
);
2063 static int mount_entry_on_relative_rootfs(struct mntent
*mntent
,
2066 char path
[MAXPATHLEN
];
2067 unsigned long mntflags
;
2070 FILE *pathfile
= NULL
;
2071 char *pathdirname
= NULL
;
2072 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
2074 /* relative to root mount point */
2075 ret
= snprintf(path
, sizeof(path
), "%s/%s", rootfs
, mntent
->mnt_dir
);
2076 if (ret
>= sizeof(path
)) {
2077 ERROR("path name too long");
2081 if (hasmntopt(mntent
, "create=dir")) {
2082 if (mkdir_p(path
, 0755) < 0) {
2083 WARN("Failed to create mount target '%s'", path
);
2088 if (hasmntopt(mntent
, "create=file") && access(path
, F_OK
)) {
2089 pathdirname
= strdup(path
);
2090 pathdirname
= dirname(pathdirname
);
2091 if (mkdir_p(pathdirname
, 0755) < 0) {
2092 WARN("Failed to create target directory");
2094 pathfile
= fopen(path
, "wb");
2096 WARN("Failed to create mount target '%s'", path
);
2102 cull_mntent_opt(mntent
);
2104 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
2109 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
,
2110 mntflags
, mntdata
, optional
);
2118 static int mount_file_entries(const struct lxc_rootfs
*rootfs
, FILE *file
,
2119 const char *lxc_name
)
2121 struct mntent mntent
;
2125 while (getmntent_r(file
, &mntent
, buf
, sizeof(buf
))) {
2127 if (!rootfs
->path
) {
2128 if (mount_entry_on_systemfs(&mntent
))
2133 /* We have a separate root, mounts are relative to it */
2134 if (mntent
.mnt_dir
[0] != '/') {
2135 if (mount_entry_on_relative_rootfs(&mntent
,
2141 if (mount_entry_on_absolute_rootfs(&mntent
, rootfs
, lxc_name
))
2147 INFO("mount points have been setup");
2152 static int setup_mount(const struct lxc_rootfs
*rootfs
, const char *fstab
,
2153 const char *lxc_name
)
2161 file
= setmntent(fstab
, "r");
2163 SYSERROR("failed to use '%s'", fstab
);
2167 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
2173 static int setup_mount_entries(const struct lxc_rootfs
*rootfs
, struct lxc_list
*mount
,
2174 const char *lxc_name
)
2177 struct lxc_list
*iterator
;
2183 ERROR("tmpfile error: %m");
2187 lxc_list_for_each(iterator
, mount
) {
2188 mount_entry
= iterator
->elem
;
2189 fprintf(file
, "%s\n", mount_entry
);
2194 ret
= mount_file_entries(rootfs
, file
, lxc_name
);
2200 static int parse_cap(const char *cap
)
2205 if (!strcmp(cap
, "none"))
2208 for (i
= 0; i
< sizeof(caps_opt
)/sizeof(caps_opt
[0]); i
++) {
2210 if (strcmp(cap
, caps_opt
[i
].name
))
2213 capid
= caps_opt
[i
].value
;
2218 /* try to see if it's numeric, so the user may specify
2219 * capabilities that the running kernel knows about but
2222 capid
= strtol(cap
, &ptr
, 10);
2223 if (!ptr
|| *ptr
!= '\0' || errno
!= 0)
2224 /* not a valid number */
2226 else if (capid
> lxc_caps_last_cap())
2227 /* we have a number but it's not a valid
2235 int in_caplist(int cap
, struct lxc_list
*caps
)
2237 struct lxc_list
*iterator
;
2240 lxc_list_for_each(iterator
, caps
) {
2241 capid
= parse_cap(iterator
->elem
);
2249 static int setup_caps(struct lxc_list
*caps
)
2251 struct lxc_list
*iterator
;
2255 lxc_list_for_each(iterator
, caps
) {
2257 drop_entry
= iterator
->elem
;
2259 capid
= parse_cap(drop_entry
);
2262 ERROR("unknown capability %s", drop_entry
);
2266 DEBUG("drop capability '%s' (%d)", drop_entry
, capid
);
2268 if (prctl(PR_CAPBSET_DROP
, capid
, 0, 0, 0)) {
2269 SYSERROR("failed to remove %s capability", drop_entry
);
2275 DEBUG("capabilities have been setup");
2280 static int dropcaps_except(struct lxc_list
*caps
)
2282 struct lxc_list
*iterator
;
2285 int numcaps
= lxc_caps_last_cap() + 1;
2286 INFO("found %d capabilities", numcaps
);
2288 if (numcaps
<= 0 || numcaps
> 200)
2291 // caplist[i] is 1 if we keep capability i
2292 int *caplist
= alloca(numcaps
* sizeof(int));
2293 memset(caplist
, 0, numcaps
* sizeof(int));
2295 lxc_list_for_each(iterator
, caps
) {
2297 keep_entry
= iterator
->elem
;
2299 capid
= parse_cap(keep_entry
);
2305 ERROR("unknown capability %s", keep_entry
);
2309 DEBUG("keep capability '%s' (%d)", keep_entry
, capid
);
2313 for (i
=0; i
<numcaps
; i
++) {
2316 if (prctl(PR_CAPBSET_DROP
, i
, 0, 0, 0)) {
2317 SYSERROR("failed to remove capability %d", i
);
2322 DEBUG("capabilities have been setup");
2327 static int setup_hw_addr(char *hwaddr
, const char *ifname
)
2329 struct sockaddr sockaddr
;
2333 ret
= lxc_convert_mac(hwaddr
, &sockaddr
);
2335 ERROR("mac address '%s' conversion failed : %s",
2336 hwaddr
, strerror(-ret
));
2340 memcpy(ifr
.ifr_name
, ifname
, IFNAMSIZ
);
2341 ifr
.ifr_name
[IFNAMSIZ
-1] = '\0';
2342 memcpy((char *) &ifr
.ifr_hwaddr
, (char *) &sockaddr
, sizeof(sockaddr
));
2344 fd
= socket(AF_INET
, SOCK_DGRAM
, 0);
2346 ERROR("socket failure : %s", strerror(errno
));
2350 ret
= ioctl(fd
, SIOCSIFHWADDR
, &ifr
);
2353 ERROR("ioctl failure : %s", strerror(errno
));
2355 DEBUG("mac address '%s' on '%s' has been setup", hwaddr
, ifr
.ifr_name
);
2360 static int setup_ipv4_addr(struct lxc_list
*ip
, int ifindex
)
2362 struct lxc_list
*iterator
;
2363 struct lxc_inetdev
*inetdev
;
2366 lxc_list_for_each(iterator
, ip
) {
2368 inetdev
= iterator
->elem
;
2370 err
= lxc_ipv4_addr_add(ifindex
, &inetdev
->addr
,
2371 &inetdev
->bcast
, inetdev
->prefix
);
2373 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2374 ifindex
, strerror(-err
));
2382 static int setup_ipv6_addr(struct lxc_list
*ip
, int ifindex
)
2384 struct lxc_list
*iterator
;
2385 struct lxc_inet6dev
*inet6dev
;
2388 lxc_list_for_each(iterator
, ip
) {
2390 inet6dev
= iterator
->elem
;
2392 err
= lxc_ipv6_addr_add(ifindex
, &inet6dev
->addr
,
2393 &inet6dev
->mcast
, &inet6dev
->acast
,
2396 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2397 ifindex
, strerror(-err
));
2405 static int setup_netdev(struct lxc_netdev
*netdev
)
2407 char ifname
[IFNAMSIZ
];
2408 char *current_ifname
= ifname
;
2411 /* empty network namespace */
2412 if (!netdev
->ifindex
) {
2413 if (netdev
->flags
& IFF_UP
) {
2414 err
= lxc_netdev_up("lo");
2416 ERROR("failed to set the loopback up : %s",
2421 if (netdev
->type
!= LXC_NET_VETH
)
2423 netdev
->ifindex
= if_nametoindex(netdev
->name
);
2426 /* get the new ifindex in case of physical netdev */
2427 if (netdev
->type
== LXC_NET_PHYS
) {
2428 if (!(netdev
->ifindex
= if_nametoindex(netdev
->link
))) {
2429 ERROR("failed to get ifindex for %s",
2435 /* retrieve the name of the interface */
2436 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2437 ERROR("no interface corresponding to index '%d'",
2442 /* default: let the system choose an interface name */
2444 netdev
->name
= netdev
->type
== LXC_NET_PHYS
?
2445 netdev
->link
: "eth%d";
2447 /* rename the interface name */
2448 if (strcmp(ifname
, netdev
->name
) != 0) {
2449 err
= lxc_netdev_rename_by_name(ifname
, netdev
->name
);
2451 ERROR("failed to rename %s->%s : %s", ifname
, netdev
->name
,
2457 /* Re-read the name of the interface because its name has changed
2458 * and would be automatically allocated by the system
2460 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2461 ERROR("no interface corresponding to index '%d'",
2466 /* set a mac address */
2467 if (netdev
->hwaddr
) {
2468 if (setup_hw_addr(netdev
->hwaddr
, current_ifname
)) {
2469 ERROR("failed to setup hw address for '%s'",
2475 /* setup ipv4 addresses on the interface */
2476 if (setup_ipv4_addr(&netdev
->ipv4
, netdev
->ifindex
)) {
2477 ERROR("failed to setup ip addresses for '%s'",
2482 /* setup ipv6 addresses on the interface */
2483 if (setup_ipv6_addr(&netdev
->ipv6
, netdev
->ifindex
)) {
2484 ERROR("failed to setup ipv6 addresses for '%s'",
2489 /* set the network device up */
2490 if (netdev
->flags
& IFF_UP
) {
2493 err
= lxc_netdev_up(current_ifname
);
2495 ERROR("failed to set '%s' up : %s", current_ifname
,
2500 /* the network is up, make the loopback up too */
2501 err
= lxc_netdev_up("lo");
2503 ERROR("failed to set the loopback up : %s",
2509 /* We can only set up the default routes after bringing
2510 * up the interface, since bringing up the interface adds
2511 * the link-local routes and we can't add a default
2512 * route if the gateway is not reachable. */
2514 /* setup ipv4 gateway on the interface */
2515 if (netdev
->ipv4_gateway
) {
2516 if (!(netdev
->flags
& IFF_UP
)) {
2517 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname
);
2521 if (lxc_list_empty(&netdev
->ipv4
)) {
2522 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname
);
2526 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2528 err
= lxc_ipv4_dest_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2530 ERROR("failed to add ipv4 dest for '%s': %s",
2531 ifname
, strerror(-err
));
2534 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2536 ERROR("failed to setup ipv4 gateway for '%s': %s",
2537 ifname
, strerror(-err
));
2538 if (netdev
->ipv4_gateway_auto
) {
2539 char buf
[INET_ADDRSTRLEN
];
2540 inet_ntop(AF_INET
, netdev
->ipv4_gateway
, buf
, sizeof(buf
));
2541 ERROR("tried to set autodetected ipv4 gateway '%s'", buf
);
2548 /* setup ipv6 gateway on the interface */
2549 if (netdev
->ipv6_gateway
) {
2550 if (!(netdev
->flags
& IFF_UP
)) {
2551 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname
);
2555 if (lxc_list_empty(&netdev
->ipv6
) && !IN6_IS_ADDR_LINKLOCAL(netdev
->ipv6_gateway
)) {
2556 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname
);
2560 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2562 err
= lxc_ipv6_dest_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2564 ERROR("failed to add ipv6 dest for '%s': %s",
2565 ifname
, strerror(-err
));
2568 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2570 ERROR("failed to setup ipv6 gateway for '%s': %s",
2571 ifname
, strerror(-err
));
2572 if (netdev
->ipv6_gateway_auto
) {
2573 char buf
[INET6_ADDRSTRLEN
];
2574 inet_ntop(AF_INET6
, netdev
->ipv6_gateway
, buf
, sizeof(buf
));
2575 ERROR("tried to set autodetected ipv6 gateway '%s'", buf
);
2582 DEBUG("'%s' has been setup", current_ifname
);
2587 static int setup_network(struct lxc_list
*network
)
2589 struct lxc_list
*iterator
;
2590 struct lxc_netdev
*netdev
;
2592 lxc_list_for_each(iterator
, network
) {
2594 netdev
= iterator
->elem
;
2596 if (setup_netdev(netdev
)) {
2597 ERROR("failed to setup netdev");
2602 if (!lxc_list_empty(network
))
2603 INFO("network has been setup");
2608 /* try to move physical nics to the init netns */
2609 void restore_phys_nics_to_netns(int netnsfd
, struct lxc_conf
*conf
)
2612 char path
[MAXPATHLEN
];
2617 ret
= snprintf(path
, MAXPATHLEN
, "/proc/self/ns/net");
2618 if (ret
< 0 || ret
>= MAXPATHLEN
) {
2619 WARN("Failed to open monitor netns fd");
2622 if ((oldfd
= open(path
, O_RDONLY
)) < 0) {
2623 SYSERROR("Failed to open monitor netns fd");
2626 if (setns(netnsfd
, 0) != 0) {
2627 SYSERROR("Failed to enter container netns to reset nics");
2631 for (i
=0; i
<conf
->num_savednics
; i
++) {
2632 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2633 if (lxc_netdev_move_by_index(s
->ifindex
, 1))
2634 WARN("Error moving nic index:%d back to host netns",
2637 if (setns(oldfd
, 0) != 0)
2638 SYSERROR("Failed to re-enter monitor's netns");
2642 void lxc_rename_phys_nics_on_shutdown(int netnsfd
, struct lxc_conf
*conf
)
2646 if (conf
->num_savednics
== 0)
2649 INFO("running to reset %d nic names", conf
->num_savednics
);
2650 restore_phys_nics_to_netns(netnsfd
, conf
);
2651 for (i
=0; i
<conf
->num_savednics
; i
++) {
2652 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2653 INFO("resetting nic %d to %s", s
->ifindex
, s
->orig_name
);
2654 lxc_netdev_rename_by_index(s
->ifindex
, s
->orig_name
);
2657 conf
->num_savednics
= 0;
2660 static char *default_rootfs_mount
= LXCROOTFSMOUNT
;
2662 struct lxc_conf
*lxc_conf_init(void)
2664 struct lxc_conf
*new;
2667 new = malloc(sizeof(*new));
2669 ERROR("lxc_conf_init : %m");
2672 memset(new, 0, sizeof(*new));
2674 new->loglevel
= LXC_LOG_PRIORITY_NOTSET
;
2675 new->personality
= -1;
2677 new->console
.log_path
= NULL
;
2678 new->console
.log_fd
= -1;
2679 new->console
.path
= NULL
;
2680 new->console
.peer
= -1;
2681 new->console
.peerpty
.busy
= -1;
2682 new->console
.peerpty
.master
= -1;
2683 new->console
.peerpty
.slave
= -1;
2684 new->console
.master
= -1;
2685 new->console
.slave
= -1;
2686 new->console
.name
[0] = '\0';
2687 new->maincmd_fd
= -1;
2689 new->rootfs
.mount
= strdup(default_rootfs_mount
);
2690 if (!new->rootfs
.mount
) {
2691 ERROR("lxc_conf_init : %m");
2696 lxc_list_init(&new->cgroup
);
2697 lxc_list_init(&new->network
);
2698 lxc_list_init(&new->mount_list
);
2699 lxc_list_init(&new->caps
);
2700 lxc_list_init(&new->keepcaps
);
2701 lxc_list_init(&new->id_map
);
2702 lxc_list_init(&new->includes
);
2703 lxc_list_init(&new->aliens
);
2704 lxc_list_init(&new->environment
);
2705 for (i
=0; i
<NUM_LXC_HOOKS
; i
++)
2706 lxc_list_init(&new->hooks
[i
]);
2707 lxc_list_init(&new->groups
);
2708 new->lsm_aa_profile
= NULL
;
2709 new->lsm_se_context
= NULL
;
2710 new->tmp_umount_proc
= 0;
2712 for (i
= 0; i
< LXC_NS_MAX
; i
++)
2713 new->inherit_ns_fd
[i
] = -1;
2718 static int instanciate_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2720 char veth1buf
[IFNAMSIZ
], *veth1
;
2721 char veth2buf
[IFNAMSIZ
], *veth2
;
2724 if (netdev
->priv
.veth_attr
.pair
)
2725 veth1
= netdev
->priv
.veth_attr
.pair
;
2727 err
= snprintf(veth1buf
, sizeof(veth1buf
), "vethXXXXXX");
2728 if (err
>= sizeof(veth1buf
)) { /* can't *really* happen, but... */
2729 ERROR("veth1 name too long");
2732 veth1
= lxc_mkifname(veth1buf
);
2734 ERROR("failed to allocate a temporary name");
2737 /* store away for deconf */
2738 memcpy(netdev
->priv
.veth_attr
.veth1
, veth1
, IFNAMSIZ
);
2741 snprintf(veth2buf
, sizeof(veth2buf
), "vethXXXXXX");
2742 veth2
= lxc_mkifname(veth2buf
);
2744 ERROR("failed to allocate a temporary name");
2748 err
= lxc_veth_create(veth1
, veth2
);
2750 ERROR("failed to create %s-%s : %s", veth1
, veth2
,
2755 /* changing the high byte of the mac address to 0xfe, the bridge interface
2756 * will always keep the host's mac address and not take the mac address
2758 err
= setup_private_host_hw_addr(veth1
);
2760 ERROR("failed to change mac address of host interface '%s' : %s",
2761 veth1
, strerror(-err
));
2766 err
= lxc_netdev_set_mtu(veth1
, atoi(netdev
->mtu
));
2768 err
= lxc_netdev_set_mtu(veth2
, atoi(netdev
->mtu
));
2770 ERROR("failed to set mtu '%s' for %s-%s : %s",
2771 netdev
->mtu
, veth1
, veth2
, strerror(-err
));
2777 err
= lxc_bridge_attach(netdev
->link
, veth1
);
2779 ERROR("failed to attach '%s' to the bridge '%s' : %s",
2780 veth1
, netdev
->link
, strerror(-err
));
2785 netdev
->ifindex
= if_nametoindex(veth2
);
2786 if (!netdev
->ifindex
) {
2787 ERROR("failed to retrieve the index for %s", veth2
);
2791 err
= lxc_netdev_up(veth1
);
2793 ERROR("failed to set %s up : %s", veth1
, strerror(-err
));
2797 if (netdev
->upscript
) {
2798 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2799 "veth", veth1
, (char*) NULL
);
2804 DEBUG("instanciated veth '%s/%s', index is '%d'",
2805 veth1
, veth2
, netdev
->ifindex
);
2810 lxc_netdev_delete_by_name(veth1
);
2811 if (!netdev
->priv
.veth_attr
.pair
&& veth1
)
2818 static int shutdown_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2823 if (netdev
->priv
.veth_attr
.pair
)
2824 veth1
= netdev
->priv
.veth_attr
.pair
;
2826 veth1
= netdev
->priv
.veth_attr
.veth1
;
2828 if (netdev
->downscript
) {
2829 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2830 "down", "veth", veth1
, (char*) NULL
);
2837 static int instanciate_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2839 char peerbuf
[IFNAMSIZ
], *peer
;
2842 if (!netdev
->link
) {
2843 ERROR("no link specified for macvlan netdev");
2847 err
= snprintf(peerbuf
, sizeof(peerbuf
), "mcXXXXXX");
2848 if (err
>= sizeof(peerbuf
))
2851 peer
= lxc_mkifname(peerbuf
);
2853 ERROR("failed to make a temporary name");
2857 err
= lxc_macvlan_create(netdev
->link
, peer
,
2858 netdev
->priv
.macvlan_attr
.mode
);
2860 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2861 peer
, netdev
->link
, strerror(-err
));
2865 netdev
->ifindex
= if_nametoindex(peer
);
2866 if (!netdev
->ifindex
) {
2867 ERROR("failed to retrieve the index for %s", peer
);
2871 if (netdev
->upscript
) {
2872 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2873 "macvlan", netdev
->link
, (char*) NULL
);
2878 DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
2879 peer
, netdev
->ifindex
, netdev
->priv
.macvlan_attr
.mode
);
2883 lxc_netdev_delete_by_name(peer
);
2888 static int shutdown_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2892 if (netdev
->downscript
) {
2893 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2894 "down", "macvlan", netdev
->link
,
2902 /* XXX: merge with instanciate_macvlan */
2903 static int instanciate_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2905 char peer
[IFNAMSIZ
];
2908 if (!netdev
->link
) {
2909 ERROR("no link specified for vlan netdev");
2913 err
= snprintf(peer
, sizeof(peer
), "vlan%d", netdev
->priv
.vlan_attr
.vid
);
2914 if (err
>= sizeof(peer
)) {
2915 ERROR("peer name too long");
2919 err
= lxc_vlan_create(netdev
->link
, peer
, netdev
->priv
.vlan_attr
.vid
);
2921 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2922 peer
, netdev
->link
, strerror(-err
));
2926 netdev
->ifindex
= if_nametoindex(peer
);
2927 if (!netdev
->ifindex
) {
2928 ERROR("failed to retrieve the ifindex for %s", peer
);
2929 lxc_netdev_delete_by_name(peer
);
2933 DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000",
2939 static int shutdown_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2944 static int instanciate_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2946 if (!netdev
->link
) {
2947 ERROR("no link specified for the physical interface");
2951 netdev
->ifindex
= if_nametoindex(netdev
->link
);
2952 if (!netdev
->ifindex
) {
2953 ERROR("failed to retrieve the index for %s", netdev
->link
);
2957 if (netdev
->upscript
) {
2959 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2960 "up", "phys", netdev
->link
, (char*) NULL
);
2968 static int shutdown_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2972 if (netdev
->downscript
) {
2973 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2974 "down", "phys", netdev
->link
, (char*) NULL
);
2981 static int instanciate_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2983 netdev
->ifindex
= 0;
2987 static int instanciate_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2989 netdev
->ifindex
= 0;
2990 if (netdev
->upscript
) {
2992 err
= run_script(handler
->name
, "net", netdev
->upscript
,
2993 "up", "empty", (char*) NULL
);
3000 static int shutdown_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3004 if (netdev
->downscript
) {
3005 err
= run_script(handler
->name
, "net", netdev
->downscript
,
3006 "down", "empty", (char*) NULL
);
3013 static int shutdown_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3018 int lxc_requests_empty_network(struct lxc_handler
*handler
)
3020 struct lxc_list
*network
= &handler
->conf
->network
;
3021 struct lxc_list
*iterator
;
3022 struct lxc_netdev
*netdev
;
3023 bool found_none
= false, found_nic
= false;
3025 if (lxc_list_empty(network
))
3028 lxc_list_for_each(iterator
, network
) {
3030 netdev
= iterator
->elem
;
3032 if (netdev
->type
== LXC_NET_NONE
)
3037 if (found_none
&& !found_nic
)
3042 int lxc_create_network(struct lxc_handler
*handler
)
3044 struct lxc_list
*network
= &handler
->conf
->network
;
3045 struct lxc_list
*iterator
;
3046 struct lxc_netdev
*netdev
;
3047 int am_root
= (getuid() == 0);
3052 lxc_list_for_each(iterator
, network
) {
3054 netdev
= iterator
->elem
;
3056 if (netdev
->type
< 0 || netdev
->type
> LXC_NET_MAXCONFTYPE
) {
3057 ERROR("invalid network configuration type '%d'",
3062 if (netdev_conf
[netdev
->type
](handler
, netdev
)) {
3063 ERROR("failed to create netdev");
3072 void lxc_delete_network(struct lxc_handler
*handler
)
3074 struct lxc_list
*network
= &handler
->conf
->network
;
3075 struct lxc_list
*iterator
;
3076 struct lxc_netdev
*netdev
;
3078 lxc_list_for_each(iterator
, network
) {
3079 netdev
= iterator
->elem
;
3081 if (netdev
->ifindex
!= 0 && netdev
->type
== LXC_NET_PHYS
) {
3082 if (lxc_netdev_rename_by_index(netdev
->ifindex
, netdev
->link
))
3083 WARN("failed to rename to the initial name the " \
3084 "netdev '%s'", netdev
->link
);
3088 if (netdev_deconf
[netdev
->type
](handler
, netdev
)) {
3089 WARN("failed to destroy netdev");
3092 /* Recent kernel remove the virtual interfaces when the network
3093 * namespace is destroyed but in case we did not moved the
3094 * interface to the network namespace, we have to destroy it
3096 if (netdev
->ifindex
!= 0 &&
3097 lxc_netdev_delete_by_index(netdev
->ifindex
))
3098 WARN("failed to remove interface '%s'", netdev
->name
);
3102 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3104 /* lxc-user-nic returns "interface_name:interface_name\n" */
3105 #define MAX_BUFFER_SIZE IFNAMSIZ*2 + 2
3106 static int unpriv_assign_nic(struct lxc_netdev
*netdev
, pid_t pid
)
3109 int bytes
, pipefd
[2];
3110 char *token
, *saveptr
= NULL
;
3111 char buffer
[MAX_BUFFER_SIZE
];
3113 if (netdev
->type
!= LXC_NET_VETH
) {
3114 ERROR("nic type %d not support for unprivileged use",
3119 if(pipe(pipefd
) < 0) {
3120 SYSERROR("pipe failed");
3124 if ((child
= fork()) < 0) {
3131 if (child
== 0) { // child
3132 /* close the read-end of the pipe */
3134 /* redirect the stdout to write-end of the pipe */
3135 dup2(pipefd
[1], STDOUT_FILENO
);
3136 /* close the write-end of the pipe */
3139 // Call lxc-user-nic pid type bridge
3141 char *args
[] = {LXC_USERNIC_PATH
, pidstr
, "veth", netdev
->link
, netdev
->name
, NULL
};
3142 snprintf(pidstr
, 19, "%lu", (unsigned long) pid
);
3144 execvp(args
[0], args
);
3145 SYSERROR("execvp lxc-user-nic");
3149 /* close the write-end of the pipe */
3152 bytes
= read(pipefd
[0], &buffer
, MAX_BUFFER_SIZE
);
3154 SYSERROR("read failed");
3156 buffer
[bytes
- 1] = '\0';
3158 if (wait_for_pid(child
) != 0) {
3163 /* close the read-end of the pipe */
3166 /* fill netdev->name field */
3167 token
= strtok_r(buffer
, ":", &saveptr
);
3170 netdev
->name
= malloc(IFNAMSIZ
+1);
3171 if (!netdev
->name
) {
3172 ERROR("Out of memory");
3175 memset(netdev
->name
, 0, IFNAMSIZ
+1);
3176 strncpy(netdev
->name
, token
, IFNAMSIZ
);
3178 /* fill netdev->veth_attr.pair field */
3179 token
= strtok_r(NULL
, ":", &saveptr
);
3182 netdev
->priv
.veth_attr
.pair
= strdup(token
);
3183 if (!netdev
->priv
.veth_attr
.pair
) {
3184 ERROR("Out of memory");
3191 int lxc_assign_network(struct lxc_list
*network
, pid_t pid
)
3193 struct lxc_list
*iterator
;
3194 struct lxc_netdev
*netdev
;
3195 int am_root
= (getuid() == 0);
3198 lxc_list_for_each(iterator
, network
) {
3200 netdev
= iterator
->elem
;
3202 if (netdev
->type
== LXC_NET_VETH
&& !am_root
) {
3203 if (unpriv_assign_nic(netdev
, pid
))
3205 // lxc-user-nic has moved the nic to the new ns.
3206 // unpriv_assign_nic() fills in netdev->name.
3207 // netdev->ifindex will be filed in at setup_netdev.
3211 /* empty network namespace, nothing to move */
3212 if (!netdev
->ifindex
)
3215 err
= lxc_netdev_move_by_index(netdev
->ifindex
, pid
);
3217 ERROR("failed to move '%s' to the container : %s",
3218 netdev
->link
, strerror(-err
));
3222 DEBUG("move '%s' to '%d'", netdev
->name
, pid
);
3228 static int write_id_mapping(enum idtype idtype
, pid_t pid
, const char *buf
,
3231 char path
[PATH_MAX
];
3235 ret
= snprintf(path
, PATH_MAX
, "/proc/%d/%cid_map", pid
, idtype
== ID_TYPE_UID
? 'u' : 'g');
3236 if (ret
< 0 || ret
>= PATH_MAX
) {
3237 fprintf(stderr
, "%s: path name too long\n", __func__
);
3240 f
= fopen(path
, "w");
3245 ret
= fwrite(buf
, buf_size
, 1, f
);
3247 SYSERROR("writing id mapping");
3248 closeret
= fclose(f
);
3250 SYSERROR("writing id mapping");
3251 return ret
< 0 ? ret
: closeret
;
3254 int lxc_map_ids(struct lxc_list
*idmap
, pid_t pid
)
3256 struct lxc_list
*iterator
;
3258 int ret
= 0, use_shadow
= 0;
3260 char *buf
= NULL
, *pos
, *cmdpath
= NULL
;
3262 cmdpath
= on_path("newuidmap", NULL
);
3269 cmdpath
= on_path("newgidmap", NULL
);
3276 if (!use_shadow
&& geteuid()) {
3277 ERROR("Missing newuidmap/newgidmap");
3281 for(type
= ID_TYPE_UID
; type
<= ID_TYPE_GID
; type
++) {
3285 buf
= pos
= malloc(4096);
3291 pos
+= sprintf(buf
, "new%cidmap %d",
3292 type
== ID_TYPE_UID
? 'u' : 'g',
3295 lxc_list_for_each(iterator
, idmap
) {
3296 /* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
3297 map
= iterator
->elem
;
3298 if (map
->idtype
!= type
)
3302 left
= 4096 - (pos
- buf
);
3303 fill
= snprintf(pos
, left
, "%s%lu %lu %lu%s",
3304 use_shadow
? " " : "",
3305 map
->nsid
, map
->hostid
, map
->range
,
3306 use_shadow
? "" : "\n");
3307 if (fill
<= 0 || fill
>= left
)
3308 SYSERROR("snprintf failed, too many mappings");
3315 ret
= write_id_mapping(type
, pid
, buf
, pos
-buf
);
3317 left
= 4096 - (pos
- buf
);
3318 fill
= snprintf(pos
, left
, "\n");
3319 if (fill
<= 0 || fill
>= left
)
3320 SYSERROR("snprintf failed, too many mappings");
3335 * return the host uid/gid to which the container root is mapped in
3337 * Return true if id was found, false otherwise.
3339 bool get_mapped_rootid(struct lxc_conf
*conf
, enum idtype idtype
,
3342 struct lxc_list
*it
;
3345 lxc_list_for_each(it
, &conf
->id_map
) {
3347 if (map
->idtype
!= idtype
)
3357 int mapped_hostid(unsigned id
, struct lxc_conf
*conf
, enum idtype idtype
)
3359 struct lxc_list
*it
;
3361 lxc_list_for_each(it
, &conf
->id_map
) {
3363 if (map
->idtype
!= idtype
)
3365 if (id
>= map
->hostid
&& id
< map
->hostid
+ map
->range
)
3366 return (id
- map
->hostid
) + map
->nsid
;
3371 int find_unmapped_nsuid(struct lxc_conf
*conf
, enum idtype idtype
)
3373 struct lxc_list
*it
;
3375 unsigned int freeid
= 0;
3377 lxc_list_for_each(it
, &conf
->id_map
) {
3379 if (map
->idtype
!= idtype
)
3381 if (freeid
>= map
->nsid
&& freeid
< map
->nsid
+ map
->range
) {
3382 freeid
= map
->nsid
+ map
->range
;
3389 int lxc_find_gateway_addresses(struct lxc_handler
*handler
)
3391 struct lxc_list
*network
= &handler
->conf
->network
;
3392 struct lxc_list
*iterator
;
3393 struct lxc_netdev
*netdev
;
3396 lxc_list_for_each(iterator
, network
) {
3397 netdev
= iterator
->elem
;
3399 if (!netdev
->ipv4_gateway_auto
&& !netdev
->ipv6_gateway_auto
)
3402 if (netdev
->type
!= LXC_NET_VETH
&& netdev
->type
!= LXC_NET_MACVLAN
) {
3403 ERROR("gateway = auto only supported for "
3404 "veth and macvlan");
3408 if (!netdev
->link
) {
3409 ERROR("gateway = auto needs a link interface");
3413 link_index
= if_nametoindex(netdev
->link
);
3417 if (netdev
->ipv4_gateway_auto
) {
3418 if (lxc_ipv4_addr_get(link_index
, &netdev
->ipv4_gateway
)) {
3419 ERROR("failed to automatically find ipv4 gateway "
3420 "address from link interface '%s'", netdev
->link
);
3425 if (netdev
->ipv6_gateway_auto
) {
3426 if (lxc_ipv6_addr_get(link_index
, &netdev
->ipv6_gateway
)) {
3427 ERROR("failed to automatically find ipv6 gateway "
3428 "address from link interface '%s'", netdev
->link
);
3437 int lxc_create_tty(const char *name
, struct lxc_conf
*conf
)
3439 struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
3442 /* no tty in the configuration */
3446 tty_info
->pty_info
=
3447 malloc(sizeof(*tty_info
->pty_info
)*conf
->tty
);
3448 if (!tty_info
->pty_info
) {
3449 SYSERROR("failed to allocate pty_info");
3453 for (i
= 0; i
< conf
->tty
; i
++) {
3455 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3458 ret
= openpty(&pty_info
->master
, &pty_info
->slave
,
3459 pty_info
->name
, NULL
, NULL
);
3462 SYSERROR("failed to create pty #%d", i
);
3463 tty_info
->nbtty
= i
;
3464 lxc_delete_tty(tty_info
);
3468 DEBUG("allocated pty '%s' (%d/%d)",
3469 pty_info
->name
, pty_info
->master
, pty_info
->slave
);
3471 /* Prevent leaking the file descriptors to the container */
3472 fcntl(pty_info
->master
, F_SETFD
, FD_CLOEXEC
);
3473 fcntl(pty_info
->slave
, F_SETFD
, FD_CLOEXEC
);
3478 tty_info
->nbtty
= conf
->tty
;
3480 INFO("tty's configured");
3485 void lxc_delete_tty(struct lxc_tty_info
*tty_info
)
3489 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
3490 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3492 close(pty_info
->master
);
3493 close(pty_info
->slave
);
3496 free(tty_info
->pty_info
);
3497 tty_info
->nbtty
= 0;
3501 * chown_mapped_root: for an unprivileged user with uid/gid X to
3502 * chown a dir to subuid/subgid Y, he needs to run chown as root
3503 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3504 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3505 * root is privileged with respect to hostuid/hostgid X, allowing
3506 * him to do the chown.
3508 int chown_mapped_root(char *path
, struct lxc_conf
*conf
)
3514 char *chownpath
= path
;
3516 if (!get_mapped_rootid(conf
, ID_TYPE_UID
, &val
)) {
3517 ERROR("No mapping for container root");
3520 rootuid
= (uid_t
) val
;
3521 if (!get_mapped_rootid(conf
, ID_TYPE_GID
, &val
)) {
3522 ERROR("No mapping for container root");
3525 rootgid
= (gid_t
) val
;
3528 * In case of overlay, we want only the writeable layer
3531 if (strncmp(path
, "overlayfs:", 10) == 0 || strncmp(path
, "aufs:", 5) == 0) {
3532 chownpath
= strchr(path
, ':');
3534 ERROR("Bad overlay path: %s", path
);
3537 chownpath
= strchr(chownpath
+1, ':');
3539 ERROR("Bad overlay path: %s", path
);
3545 if (geteuid() == 0) {
3546 if (chown(path
, rootuid
, rootgid
) < 0) {
3547 ERROR("Error chowning %s", path
);
3553 if (rootuid
== geteuid()) {
3555 INFO("%s: container root is our uid; no need to chown" ,__func__
);
3561 SYSERROR("Failed forking");
3565 int hostuid
= geteuid(), hostgid
= getegid(), ret
;
3567 char map1
[100], map2
[100], map3
[100], map4
[100], map5
[100];
3569 char *args1
[] = { "lxc-usernsexec", "-m", map1
, "-m", map2
,
3570 "-m", map3
, "-m", map5
,
3571 "--", "chown", ugid
, path
, NULL
};
3572 char *args2
[] = { "lxc-usernsexec", "-m", map1
, "-m", map2
,
3573 "-m", map3
, "-m", map4
, "-m", map5
,
3574 "--", "chown", ugid
, path
, NULL
};
3576 // save the current gid of "path"
3577 if (stat(path
, &sb
) < 0) {
3578 ERROR("Error stat %s", path
);
3583 * A file has to be group-owned by a gid mapped into the
3584 * container, or the container won't be privileged over it.
3586 if (sb
.st_uid
== geteuid() &&
3587 mapped_hostid(sb
.st_gid
, conf
, ID_TYPE_GID
) < 0 &&
3588 chown(path
, -1, hostgid
) < 0) {
3589 ERROR("Failed chgrping %s", path
);
3594 ret
= snprintf(map1
, 100, "u:0:%d:1", rootuid
);
3595 if (ret
< 0 || ret
>= 100) {
3596 ERROR("Error uid printing map string");
3600 // "u:hostuid:hostuid:1"
3601 ret
= snprintf(map2
, 100, "u:%d:%d:1", hostuid
, hostuid
);
3602 if (ret
< 0 || ret
>= 100) {
3603 ERROR("Error uid printing map string");
3608 ret
= snprintf(map3
, 100, "g:0:%d:1", rootgid
);
3609 if (ret
< 0 || ret
>= 100) {
3610 ERROR("Error gid printing map string");
3614 // "g:pathgid:rootgid+pathgid:1"
3615 ret
= snprintf(map4
, 100, "g:%d:%d:1", (gid_t
)sb
.st_gid
,
3616 rootgid
+ (gid_t
)sb
.st_gid
);
3617 if (ret
< 0 || ret
>= 100) {
3618 ERROR("Error gid printing map string");
3622 // "g:hostgid:hostgid:1"
3623 ret
= snprintf(map5
, 100, "g:%d:%d:1", hostgid
, hostgid
);
3624 if (ret
< 0 || ret
>= 100) {
3625 ERROR("Error gid printing map string");
3629 // "0:pathgid" (chown)
3630 ret
= snprintf(ugid
, 100, "0:%d", (gid_t
)sb
.st_gid
);
3631 if (ret
< 0 || ret
>= 100) {
3632 ERROR("Error owner printing format string for chown");
3636 if (hostgid
== sb
.st_gid
)
3637 ret
= execvp("lxc-usernsexec", args1
);
3639 ret
= execvp("lxc-usernsexec", args2
);
3640 SYSERROR("Failed executing usernsexec");
3643 return wait_for_pid(pid
);
3646 int ttys_shift_ids(struct lxc_conf
*c
)
3650 if (lxc_list_empty(&c
->id_map
))
3653 for (i
= 0; i
< c
->tty_info
.nbtty
; i
++) {
3654 struct lxc_pty_info
*pty_info
= &c
->tty_info
.pty_info
[i
];
3656 if (chown_mapped_root(pty_info
->name
, c
) < 0) {
3657 ERROR("Failed to chown %s", pty_info
->name
);
3662 if (strcmp(c
->console
.name
, "") !=0 && chown_mapped_root(c
->console
.name
, c
) < 0) {
3663 ERROR("Failed to chown %s", c
->console
.name
);
3671 * This routine is called when the configuration does not already specify a value
3672 * for autodev (mounting a file system on /dev and populating it in a container).
3673 * If a hard override value has not been specified, then we try to apply some
3674 * heuristics to determine if we should switch to autodev mode.
3676 * For instance, if the container has an /etc/systemd/system directory then it
3677 * is probably running systemd as the init process and it needs the autodev
3678 * mount to prevent it from mounting devtmpfs on /dev on its own causing conflicts
3681 * We may also want to enable autodev if the host has devtmpfs mounted on its
3682 * /dev as this then enable us to use subdirectories under /dev for the container
3683 * /dev directories and we can fake udev devices.
3689 #define MAX_SYMLINK_DEPTH 32
3691 static int check_autodev( const char *rootfs
, void *data
)
3693 struct start_args
*arg
= data
;
3697 char absrootfs
[MAXPATHLEN
];
3698 char path
[MAXPATHLEN
];
3699 char abs_path
[MAXPATHLEN
];
3700 char *command
= "/sbin/init";
3702 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
3705 if (!realpath(rootfs
, absrootfs
))
3708 if( arg
&& arg
->argv
[0] ) {
3709 command
= arg
->argv
[0];
3710 DEBUG("Set exec command to %s", command
);
3713 strncpy( path
, command
, MAXPATHLEN
-1 );
3715 if ( 0 != access(path
, F_OK
) || 0 != stat(path
, &s
) )
3718 /* Dereference down the symlink merry path testing as we go. */
3719 /* If anything references systemd in the path - set autodev! */
3720 /* Renormalize to the rootfs before each dereference */
3721 /* Relative symlinks should fall out in the wash even with .. */
3723 if ( strstr( path
, "systemd" ) ) {
3724 INFO("Container with systemd init detected - enabling autodev!");
3728 ret
= snprintf(abs_path
, MAXPATHLEN
-1, "%s/%s", absrootfs
, path
);
3729 if (ret
< 0 || ret
> MAXPATHLEN
)
3732 ret
= readlink( abs_path
, path
, MAXPATHLEN
-1 );
3734 if ( ( ret
<= 0 ) || ( ++loop_count
> MAX_SYMLINK_DEPTH
) ) {
3735 break; /* Break out for other tests */
3741 * Add future checks here.
3742 * Return positive if we should go autodev
3743 * Return 0 if we should NOT go autodev
3744 * Return negative if we encounter an error or can not determine...
3747 /* All else fails, we don't need autodev */
3748 INFO("Autodev not required.");
/*
 * do_tmp_proc_mount: Mount /proc inside container if not already
 * mounted
 *
 * @rootfs : the rootfs where proc should be mounted
 *
 * The check reads the <rootfs>/proc/self symlink: if it resolves to "1"
 * the proc instance there is accepted; any other target is lazily
 * unmounted and replaced.
 *
 * Returns < 0 on failure, 0 if the correct proc was already mounted
 * and 1 if a new proc was mounted.
 */
static int do_tmp_proc_mount(const char *rootfs)
{
	char path[MAXPATHLEN];
	char link[20];
	int linklen, ret;

	ret = snprintf(path, MAXPATHLEN, "%s/proc/self", rootfs);
	if (ret < 0 || ret >= MAXPATHLEN) {
		SYSERROR("proc path name too long");
		return -1;
	}

	/*
	 * readlink() does not terminate its result, so hand it one byte
	 * less than the zeroed buffer: link is then always a valid string.
	 */
	memset(link, 0, sizeof(link));
	linklen = readlink(path, link, sizeof(link) - 1);
	INFO("I am %d, /proc/self points to '%s'", getpid(), link);
	ret = snprintf(path, MAXPATHLEN, "%s/proc", rootfs);
	if (linklen < 0) /* /proc not mounted */
		goto domount;
	/* can't be longer than rootfs/proc/1 */
	if (strcmp(link, "1") != 0) {
		/* wrong /procs mounted */
		umount2(path, MNT_DETACH); /* ignore failure */
		goto domount;
	}
	/* the right proc is already mounted */
	return 0;

domount:
	if (mount("proc", path, "proc", 0, NULL))
		return -1;
	INFO("Mounted /proc in container for security transition");
	return 1;
}
3794 int tmp_proc_mount(struct lxc_conf
*lxc_conf
)
3798 if (lxc_conf
->rootfs
.path
== NULL
|| strlen(lxc_conf
->rootfs
.path
) == 0) {
3799 if (mount("proc", "/proc", "proc", 0, NULL
)) {
3800 SYSERROR("Failed mounting /proc, proceeding");
3805 mounted
= do_tmp_proc_mount(lxc_conf
->rootfs
.mount
);
3806 if (mounted
== -1) {
3807 SYSERROR("failed to mount /proc in the container.");
3809 } else if (mounted
== 1) {
3810 lxc_conf
->tmp_umount_proc
= 1;
3815 void tmp_proc_unmount(struct lxc_conf
*lxc_conf
)
3817 if (lxc_conf
->tmp_umount_proc
== 1) {
3819 lxc_conf
->tmp_umount_proc
= 0;
/*
 * null_endofword: terminate @word at the end of its first word.
 *
 * Scans forward to the first space, tab or end of string and stores a
 * '\0' there, so @word afterwards holds only the leading run of
 * non-blank characters. The buffer is modified in place.
 */
static void null_endofword(char *word)
{
	while (*word && *word != ' ' && *word != '\t')
		word++;
	*word = '\0';
}
/*
 * skip @nfields spaces in @src
 *
 * Each step moves past one run of non-blank characters and the single
 * space or tab that follows it. Returns a pointer into @src at the
 * start of the field reached, or at the terminating '\0' when @src
 * holds fewer than @nfields separators. Never returns NULL.
 */
static char *get_field(char *src, int nfields)
{
	char *p = src;
	int i;

	for (i = 0; i < nfields; i++) {
		while (*p && *p != ' ' && *p != '\t')
			p++;
		/* ran out of input: stay on the terminator */
		if (!*p)
			break;
		p++;
	}
	return p;
}
3848 static void remount_all_slave(void)
3850 /* walk /proc/mounts and change any shared entries to slave */
3851 FILE *f
= fopen("/proc/self/mountinfo", "r");
3856 SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
3857 ERROR("Continuing container startup...");
3861 while (getline(&line
, &len
, f
) != -1) {
3862 char *target
, *opts
;
3863 target
= get_field(line
, 4);
3866 opts
= get_field(target
, 2);
3869 null_endofword(opts
);
3870 if (!strstr(opts
, "shared"))
3872 null_endofword(target
);
3873 if (mount(NULL
, target
, NULL
, MS_SLAVE
, NULL
)) {
3874 SYSERROR("Failed to make %s rslave", target
);
3875 ERROR("Continuing...");
3883 void lxc_execute_bind_init(struct lxc_conf
*conf
)
3886 char path
[PATH_MAX
], destpath
[PATH_MAX
], *p
;
3888 /* If init exists in the container, don't bind mount a static one */
3889 p
= choose_init(conf
->rootfs
.mount
);
3895 ret
= snprintf(path
, PATH_MAX
, SBINDIR
"/init.lxc.static");
3896 if (ret
< 0 || ret
>= PATH_MAX
) {
3897 WARN("Path name too long searching for lxc.init.static");
3901 if (!file_exists(path
)) {
3902 INFO("%s does not exist on host", path
);
3906 ret
= snprintf(destpath
, PATH_MAX
, "%s%s", conf
->rootfs
.mount
, "/init.lxc.static");
3907 if (ret
< 0 || ret
>= PATH_MAX
) {
3908 WARN("Path name too long for container's lxc.init.static");
3912 if (!file_exists(destpath
)) {
3913 FILE * pathfile
= fopen(destpath
, "wb");
3915 SYSERROR("Failed to create mount target '%s'", destpath
);
3921 ret
= mount(path
, destpath
, "none", MS_BIND
, NULL
);
3923 SYSERROR("Failed to bind lxc.init.static into container");
3924 INFO("lxc.init.static bound into container at %s", path
);
3928 * This does the work of remounting / if it is shared, calling the
3929 * container pre-mount hooks, and mounting the rootfs.
3931 int do_rootfs_setup(struct lxc_conf
*conf
, const char *name
, const char *lxcpath
)
3933 if (conf
->rootfs_setup
) {
3935 * rootfs was set up in another namespace. bind-mount it
3936 * to give us a mount in our own ns so we can pivot_root to it
3938 const char *path
= conf
->rootfs
.mount
;
3939 if (mount(path
, path
, "rootfs", MS_BIND
, NULL
) < 0) {
3940 ERROR("Failed to bind-mount container / onto itself");
3945 if (detect_ramfs_rootfs()) {
3946 if (chroot_into_slave(conf
)) {
3947 ERROR("Failed to chroot into slave /");
3952 remount_all_slave();
3954 if (run_lxc_hooks(name
, "pre-mount", conf
, lxcpath
, NULL
)) {
3955 ERROR("failed to run pre-mount hooks for container '%s'.", name
);
3959 if (setup_rootfs(conf
)) {
3960 ERROR("failed to setup rootfs for '%s'", name
);
3964 conf
->rootfs_setup
= true;
3968 static bool verify_start_hooks(struct lxc_conf
*conf
)
3970 struct lxc_list
*it
;
3971 char path
[MAXPATHLEN
];
3972 lxc_list_for_each(it
, &conf
->hooks
[LXCHOOK_START
]) {
3973 char *hookname
= it
->elem
;
3977 ret
= snprintf(path
, MAXPATHLEN
, "%s%s",
3978 conf
->rootfs
.mount
, hookname
);
3979 if (ret
< 0 || ret
>= MAXPATHLEN
)
3981 ret
= stat(path
, &st
);
3983 SYSERROR("Start hook %s not found in container rootfs",
3992 int lxc_setup(struct lxc_handler
*handler
)
3994 const char *name
= handler
->name
;
3995 struct lxc_conf
*lxc_conf
= handler
->conf
;
3996 const char *lxcpath
= handler
->lxcpath
;
3997 void *data
= handler
->data
;
3999 if (do_rootfs_setup(lxc_conf
, name
, lxcpath
) < 0) {
4000 ERROR("Error setting up rootfs mount after spawn");
4004 if (lxc_conf
->inherit_ns_fd
[LXC_NS_UTS
] == -1) {
4005 if (setup_utsname(lxc_conf
->utsname
)) {
4006 ERROR("failed to setup the utsname for '%s'", name
);
4011 if (setup_network(&lxc_conf
->network
)) {
4012 ERROR("failed to setup the network for '%s'", name
);
4016 if (lxc_conf
->autodev
< 0) {
4017 lxc_conf
->autodev
= check_autodev(lxc_conf
->rootfs
.mount
, data
);
4020 if (lxc_conf
->autodev
> 0) {
4021 if (mount_autodev(name
, lxc_conf
->rootfs
.mount
, lxcpath
)) {
4022 ERROR("failed to mount /dev in the container");
4027 /* do automatic mounts (mainly /proc and /sys), but exclude
4028 * those that need to wait until other stuff has finished
4030 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& ~LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
4031 ERROR("failed to setup the automatic mounts for '%s'", name
);
4035 if (setup_mount(&lxc_conf
->rootfs
, lxc_conf
->fstab
, name
)) {
4036 ERROR("failed to setup the mounts for '%s'", name
);
4040 if (!lxc_list_empty(&lxc_conf
->mount_list
) && setup_mount_entries(&lxc_conf
->rootfs
, &lxc_conf
->mount_list
, name
)) {
4041 ERROR("failed to setup the mount entries for '%s'", name
);
4045 /* Make sure any start hooks are in the rootfs */
4046 if (!verify_start_hooks(lxc_conf
))
4049 if (lxc_conf
->is_execute
)
4050 lxc_execute_bind_init(lxc_conf
);
4052 /* now mount only cgroup, if wanted;
4053 * before, /sys could not have been mounted
4054 * (is either mounted automatically or via fstab entries)
4056 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
4057 ERROR("failed to setup the automatic mounts for '%s'", name
);
4061 if (run_lxc_hooks(name
, "mount", lxc_conf
, lxcpath
, NULL
)) {
4062 ERROR("failed to run mount hooks for container '%s'.", name
);
4066 if (lxc_conf
->autodev
> 0) {
4067 if (run_lxc_hooks(name
, "autodev", lxc_conf
, lxcpath
, NULL
)) {
4068 ERROR("failed to run autodev hooks for container '%s'.", name
);
4071 if (setup_autodev(lxc_conf
->rootfs
.mount
)) {
4072 ERROR("failed to populate /dev in the container");
4077 if (!lxc_conf
->is_execute
&& setup_console(&lxc_conf
->rootfs
, &lxc_conf
->console
, lxc_conf
->ttydir
)) {
4078 ERROR("failed to setup the console for '%s'", name
);
4082 if (lxc_conf
->kmsg
) {
4083 if (setup_kmsg(&lxc_conf
->rootfs
, &lxc_conf
->console
)) // don't fail
4084 ERROR("failed to setup kmsg for '%s'", name
);
4087 if (!lxc_conf
->is_execute
&& setup_tty(&lxc_conf
->rootfs
, &lxc_conf
->tty_info
, lxc_conf
->ttydir
)) {
4088 ERROR("failed to setup the ttys for '%s'", name
);
4092 if (!lxc_conf
->is_execute
&& setup_dev_symlinks(&lxc_conf
->rootfs
)) {
4093 ERROR("failed to setup /dev symlinks for '%s'", name
);
4097 /* mount /proc if it's not already there */
4098 if (tmp_proc_mount(lxc_conf
) < 0) {
4099 ERROR("failed to LSM mount proc for '%s'", name
);
4103 if (setup_pivot_root(&lxc_conf
->rootfs
)) {
4104 ERROR("failed to set rootfs for '%s'", name
);
4108 if (setup_pts(lxc_conf
->pts
)) {
4109 ERROR("failed to setup the new pts instance");
4113 if (setup_personality(lxc_conf
->personality
)) {
4114 ERROR("failed to setup personality");
4118 if (lxc_list_empty(&lxc_conf
->id_map
)) {
4119 if (!lxc_list_empty(&lxc_conf
->keepcaps
)) {
4120 if (!lxc_list_empty(&lxc_conf
->caps
)) {
4121 ERROR("Simultaneously requested dropping and keeping caps");
4124 if (dropcaps_except(&lxc_conf
->keepcaps
)) {
4125 ERROR("failed to keep requested caps");
4128 } else if (setup_caps(&lxc_conf
->caps
)) {
4129 ERROR("failed to drop capabilities");
4134 NOTICE("'%s' is setup.", name
);
4139 int run_lxc_hooks(const char *name
, char *hook
, struct lxc_conf
*conf
,
4140 const char *lxcpath
, char *argv
[])
4143 struct lxc_list
*it
;
4145 if (strcmp(hook
, "pre-start") == 0)
4146 which
= LXCHOOK_PRESTART
;
4147 else if (strcmp(hook
, "pre-mount") == 0)
4148 which
= LXCHOOK_PREMOUNT
;
4149 else if (strcmp(hook
, "mount") == 0)
4150 which
= LXCHOOK_MOUNT
;
4151 else if (strcmp(hook
, "autodev") == 0)
4152 which
= LXCHOOK_AUTODEV
;
4153 else if (strcmp(hook
, "start") == 0)
4154 which
= LXCHOOK_START
;
4155 else if (strcmp(hook
, "post-stop") == 0)
4156 which
= LXCHOOK_POSTSTOP
;
4157 else if (strcmp(hook
, "clone") == 0)
4158 which
= LXCHOOK_CLONE
;
4161 lxc_list_for_each(it
, &conf
->hooks
[which
]) {
4163 char *hookname
= it
->elem
;
4164 ret
= run_script_argv(name
, "lxc", hookname
, hook
, lxcpath
, argv
);
4171 static void lxc_remove_nic(struct lxc_list
*it
)
4173 struct lxc_netdev
*netdev
= it
->elem
;
4174 struct lxc_list
*it2
,*next
;
4182 if (netdev
->type
== LXC_NET_VETH
&& netdev
->priv
.veth_attr
.pair
)
4183 free(netdev
->priv
.veth_attr
.pair
);
4184 if (netdev
->upscript
)
4185 free(netdev
->upscript
);
4187 free(netdev
->hwaddr
);
4190 if (netdev
->ipv4_gateway
)
4191 free(netdev
->ipv4_gateway
);
4192 if (netdev
->ipv6_gateway
)
4193 free(netdev
->ipv6_gateway
);
4194 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4199 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
4208 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
4209 int lxc_clear_nic(struct lxc_conf
*c
, const char *key
)
4213 struct lxc_list
*it
;
4214 struct lxc_netdev
*netdev
;
4216 p1
= index(key
, '.');
4217 if (!p1
|| *(p1
+1) == '\0')
4220 ret
= sscanf(key
, "%d", &idx
);
4221 if (ret
!= 1) return -1;
4226 lxc_list_for_each(it
, &c
->network
) {
4231 if (i
< idx
) // we don't have that many nics defined
4234 if (!it
|| !it
->elem
)
4241 } else if (strcmp(p1
, ".ipv4") == 0) {
4242 struct lxc_list
*it2
,*next
;
4243 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4248 } else if (strcmp(p1
, ".ipv6") == 0) {
4249 struct lxc_list
*it2
,*next
;
4250 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
4255 } else if (strcmp(p1
, ".link") == 0) {
4258 netdev
->link
= NULL
;
4260 } else if (strcmp(p1
, ".name") == 0) {
4263 netdev
->name
= NULL
;
4265 } else if (strcmp(p1
, ".script.up") == 0) {
4266 if (netdev
->upscript
) {
4267 free(netdev
->upscript
);
4268 netdev
->upscript
= NULL
;
4270 } else if (strcmp(p1
, ".hwaddr") == 0) {
4271 if (netdev
->hwaddr
) {
4272 free(netdev
->hwaddr
);
4273 netdev
->hwaddr
= NULL
;
4275 } else if (strcmp(p1
, ".mtu") == 0) {
4280 } else if (strcmp(p1
, ".ipv4_gateway") == 0) {
4281 if (netdev
->ipv4_gateway
) {
4282 free(netdev
->ipv4_gateway
);
4283 netdev
->ipv4_gateway
= NULL
;
4285 } else if (strcmp(p1
, ".ipv6_gateway") == 0) {
4286 if (netdev
->ipv6_gateway
) {
4287 free(netdev
->ipv6_gateway
);
4288 netdev
->ipv6_gateway
= NULL
;
4296 int lxc_clear_config_network(struct lxc_conf
*c
)
4298 struct lxc_list
*it
,*next
;
4299 lxc_list_for_each_safe(it
, &c
->network
, next
) {
4305 int lxc_clear_config_caps(struct lxc_conf
*c
)
4307 struct lxc_list
*it
,*next
;
4309 lxc_list_for_each_safe(it
, &c
->caps
, next
) {
4317 static int lxc_free_idmap(struct lxc_list
*id_map
) {
4318 struct lxc_list
*it
, *next
;
4320 lxc_list_for_each_safe(it
, id_map
, next
) {
4328 int lxc_clear_idmaps(struct lxc_conf
*c
)
4330 return lxc_free_idmap(&c
->id_map
);
4333 int lxc_clear_config_keepcaps(struct lxc_conf
*c
)
4335 struct lxc_list
*it
,*next
;
4337 lxc_list_for_each_safe(it
, &c
->keepcaps
, next
) {
4345 int lxc_clear_cgroups(struct lxc_conf
*c
, const char *key
)
4347 struct lxc_list
*it
,*next
;
4349 const char *k
= key
+ 11;
4351 if (strcmp(key
, "lxc.cgroup") == 0)
4354 lxc_list_for_each_safe(it
, &c
->cgroup
, next
) {
4355 struct lxc_cgroup
*cg
= it
->elem
;
4356 if (!all
&& strcmp(cg
->subsystem
, k
) != 0)
4359 free(cg
->subsystem
);
4367 int lxc_clear_groups(struct lxc_conf
*c
)
4369 struct lxc_list
*it
,*next
;
4371 lxc_list_for_each_safe(it
, &c
->groups
, next
) {
4379 int lxc_clear_mount_entries(struct lxc_conf
*c
)
4381 struct lxc_list
*it
,*next
;
4383 lxc_list_for_each_safe(it
, &c
->mount_list
, next
) {
4391 int lxc_clear_automounts(struct lxc_conf
*c
)
4397 int lxc_clear_hooks(struct lxc_conf
*c
, const char *key
)
4399 struct lxc_list
*it
,*next
;
4400 bool all
= false, done
= false;
4401 const char *k
= key
+ 9;
4404 if (strcmp(key
, "lxc.hook") == 0)
4407 for (i
=0; i
<NUM_LXC_HOOKS
; i
++) {
4408 if (all
|| strcmp(k
, lxchook_names
[i
]) == 0) {
4409 lxc_list_for_each_safe(it
, &c
->hooks
[i
], next
) {
4419 ERROR("Invalid hook key: %s", key
);
4425 static void lxc_clear_saved_nics(struct lxc_conf
*conf
)
4429 if (!conf
->saved_nics
)
4431 for (i
=0; i
< conf
->num_savednics
; i
++)
4432 free(conf
->saved_nics
[i
].orig_name
);
4433 free(conf
->saved_nics
);
4436 static inline void lxc_clear_aliens(struct lxc_conf
*conf
)
4438 struct lxc_list
*it
,*next
;
4440 lxc_list_for_each_safe(it
, &conf
->aliens
, next
) {
4447 static inline void lxc_clear_includes(struct lxc_conf
*conf
)
4449 struct lxc_list
*it
,*next
;
4451 lxc_list_for_each_safe(it
, &conf
->includes
, next
) {
4458 void lxc_conf_free(struct lxc_conf
*conf
)
4462 if (conf
->console
.path
)
4463 free(conf
->console
.path
);
4464 if (conf
->rootfs
.mount
)
4465 free(conf
->rootfs
.mount
);
4466 if (conf
->rootfs
.options
)
4467 free(conf
->rootfs
.options
);
4468 if (conf
->rootfs
.path
)
4469 free(conf
->rootfs
.path
);
4470 if (conf
->rootfs
.pivot
)
4471 free(conf
->rootfs
.pivot
);
4473 free(conf
->logfile
);
4475 free(conf
->utsname
);
4482 lxc_clear_config_network(conf
);
4483 if (conf
->lsm_aa_profile
)
4484 free(conf
->lsm_aa_profile
);
4485 if (conf
->lsm_se_context
)
4486 free(conf
->lsm_se_context
);
4487 lxc_seccomp_free(conf
);
4488 lxc_clear_config_caps(conf
);
4489 lxc_clear_config_keepcaps(conf
);
4490 lxc_clear_cgroups(conf
, "lxc.cgroup");
4491 lxc_clear_hooks(conf
, "lxc.hook");
4492 lxc_clear_mount_entries(conf
);
4493 lxc_clear_saved_nics(conf
);
4494 lxc_clear_idmaps(conf
);
4495 lxc_clear_groups(conf
);
4496 lxc_clear_includes(conf
);
4497 lxc_clear_aliens(conf
);
4501 struct userns_fn_data
{
4507 static int run_userns_fn(void *data
)
4509 struct userns_fn_data
*d
= data
;
4511 // we're not sharing with the parent any more, if it was a thread
4514 if (read(d
->p
[0], &c
, 1) != 1)
4517 return d
->fn(d
->arg
);
4521 * Add ID_TYPE_UID/ID_TYPE_GID entries to an existing lxc_conf,
4522 * if they are not already there.
4524 static struct lxc_list
*idmap_add_id(struct lxc_conf
*conf
,
4525 uid_t uid
, gid_t gid
)
4527 int hostuid_mapped
= mapped_hostid(uid
, conf
, ID_TYPE_UID
);
4528 int hostgid_mapped
= mapped_hostid(gid
, conf
, ID_TYPE_GID
);
4529 struct lxc_list
*new = NULL
, *tmp
, *it
, *next
;
4530 struct id_map
*entry
;
4532 new = malloc(sizeof(*new));
4534 ERROR("Out of memory building id map");
4539 if (hostuid_mapped
< 0) {
4540 hostuid_mapped
= find_unmapped_nsuid(conf
, ID_TYPE_UID
);
4541 if (hostuid_mapped
< 0)
4543 tmp
= malloc(sizeof(*tmp
));
4546 entry
= malloc(sizeof(*entry
));
4552 entry
->idtype
= ID_TYPE_UID
;
4553 entry
->nsid
= hostuid_mapped
;
4554 entry
->hostid
= (unsigned long) uid
;
4556 lxc_list_add_tail(new, tmp
);
4558 if (hostgid_mapped
< 0) {
4559 hostgid_mapped
= find_unmapped_nsuid(conf
, ID_TYPE_GID
);
4560 if (hostgid_mapped
< 0)
4562 tmp
= malloc(sizeof(*tmp
));
4565 entry
= malloc(sizeof(*entry
));
4571 entry
->idtype
= ID_TYPE_GID
;
4572 entry
->nsid
= hostgid_mapped
;
4573 entry
->hostid
= (unsigned long) gid
;
4575 lxc_list_add_tail(new, tmp
);
4577 lxc_list_for_each_safe(it
, &conf
->id_map
, next
) {
4578 tmp
= malloc(sizeof(*tmp
));
4581 entry
= malloc(sizeof(*entry
));
4586 memset(entry
, 0, sizeof(*entry
));
4587 memcpy(entry
, it
->elem
, sizeof(*entry
));
4589 lxc_list_add_tail(new, tmp
);
4595 ERROR("Out of memory building a new uid/gid map");
4597 lxc_free_idmap(new);
4603 * Run a function in a new user namespace.
4604 * The caller's euid/egid will be mapped in if it is not already.
4606 int userns_exec_1(struct lxc_conf
*conf
, int (*fn
)(void *), void *data
)
4609 struct userns_fn_data d
;
4612 struct lxc_list
*idmap
;
4616 SYSERROR("opening pipe");
4623 pid
= lxc_clone(run_userns_fn
, &d
, CLONE_NEWUSER
);
4629 if ((idmap
= idmap_add_id(conf
, geteuid(), getegid())) == NULL
) {
4630 ERROR("Error adding self to container uid/gid map");
4634 ret
= lxc_map_ids(idmap
, pid
);
4635 lxc_free_idmap(idmap
);
4638 ERROR("Error setting up child mappings");
4643 if (write(p
[1], &c
, 1) != 1) {
4644 SYSERROR("writing to pipe to child");
4648 ret
= wait_for_pid(pid
);