2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
40 #include <arpa/inet.h>
41 #include <linux/loop.h>
43 #include <netinet/in.h>
45 #include <sys/mount.h>
46 #include <sys/param.h>
47 #include <sys/prctl.h>
49 #include <sys/socket.h>
50 #include <sys/sysmacros.h>
51 #include <sys/syscall.h>
52 #include <sys/types.h>
53 #include <sys/utsname.h>
58 # include <sys/mkdev.h>
62 #include <sys/statvfs.h>
68 #include <../include/openpty.h>
71 #ifdef HAVE_LINUX_MEMFD_H
72 #include <linux/memfd.h>
77 #include "caps.h" /* for lxc_caps_last_cap() */
84 #include "lxcoverlay.h"
85 #include "lxcseccomp.h"
86 #include "namespace.h"
93 #include <sys/capability.h>
96 #if HAVE_SYS_PERSONALITY_H
97 #include <sys/personality.h>
101 #include <../include/lxcmntent.h>
103 #include <../include/prlimit.h>
109 lxc_log_define(lxc_conf
, lxc
);
113 #define CAP_SETFCAP 31
116 #ifndef CAP_MAC_OVERRIDE
117 #define CAP_MAC_OVERRIDE 32
120 #ifndef CAP_MAC_ADMIN
121 #define CAP_MAC_ADMIN 33
125 #ifndef PR_CAPBSET_DROP
126 #define PR_CAPBSET_DROP 24
129 #ifndef LO_FLAGS_AUTOCLEAR
130 #define LO_FLAGS_AUTOCLEAR 4
141 /* needed for cgroup automount checks, regardless of whether we
142 * have included linux/capability.h or not */
143 #ifndef CAP_SYS_ADMIN
144 #define CAP_SYS_ADMIN 21
147 /* Define pivot_root() if missing from the C library */
148 #ifndef HAVE_PIVOT_ROOT
149 static int pivot_root(const char * new_root
, const char * put_old
)
151 #ifdef __NR_pivot_root
152 return syscall(__NR_pivot_root
, new_root
, put_old
);
159 extern int pivot_root(const char * new_root
, const char * put_old
);
162 /* Define sethostname() if missing from the C library */
163 #ifndef HAVE_SETHOSTNAME
164 static int sethostname(const char * name
, size_t len
)
166 #ifdef __NR_sethostname
167 return syscall(__NR_sethostname
, name
, len
);
175 /* Define __S_ISTYPE if missing from the C library */
177 #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
181 #define MS_PRIVATE (1<<18)
185 #define MS_LAZYTIME (1<<25)
190 #define MFD_CLOEXEC 0x0001U
193 #ifndef MFD_ALLOW_SEALING
194 #define MFD_ALLOW_SEALING 0x0002U
197 #ifndef HAVE_MEMFD_CREATE
198 static int memfd_create(const char *name
, unsigned int flags
) {
199 #ifndef __NR_memfd_create
201 #define __NR_memfd_create 356
202 #elif defined __x86_64__
203 #define __NR_memfd_create 319
204 #elif defined __arm__
205 #define __NR_memfd_create 385
206 #elif defined __aarch64__
207 #define __NR_memfd_create 279
208 #elif defined __s390__
209 #define __NR_memfd_create 350
210 #elif defined __powerpc__
211 #define __NR_memfd_create 360
212 #elif defined __sparc__
213 #define __NR_memfd_create 348
214 #elif defined __blackfin__
215 #define __NR_memfd_create 390
216 #elif defined __ia64__
217 #define __NR_memfd_create 1340
218 #elif defined _MIPS_SIM
219 #if _MIPS_SIM == _MIPS_SIM_ABI32
220 #define __NR_memfd_create 4354
222 #if _MIPS_SIM == _MIPS_SIM_NABI32
223 #define __NR_memfd_create 6318
225 #if _MIPS_SIM == _MIPS_SIM_ABI64
226 #define __NR_memfd_create 5314
230 #ifdef __NR_memfd_create
231 return syscall(__NR_memfd_create
, name
, flags
);
238 extern int memfd_create(const char *name
, unsigned int flags
);
241 char *lxchook_names
[NUM_LXC_HOOKS
] = {
242 "pre-start", "pre-mount", "mount", "autodev", "start", "stop", "post-stop", "clone", "destroy" };
244 typedef int (*instantiate_cb
)(struct lxc_handler
*, struct lxc_netdev
*);
263 * The lxc_conf of the container currently being worked on in an
265 * This is used in the error calls
268 __thread
struct lxc_conf
*current_config
;
270 struct lxc_conf
*current_config
;
273 /* Declare this here, since we don't want to reshuffle the whole file. */
274 static int in_caplist(int cap
, struct lxc_list
*caps
);
276 static int instantiate_veth(struct lxc_handler
*, struct lxc_netdev
*);
277 static int instantiate_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
278 static int instantiate_vlan(struct lxc_handler
*, struct lxc_netdev
*);
279 static int instantiate_phys(struct lxc_handler
*, struct lxc_netdev
*);
280 static int instantiate_empty(struct lxc_handler
*, struct lxc_netdev
*);
281 static int instantiate_none(struct lxc_handler
*, struct lxc_netdev
*);
283 static instantiate_cb netdev_conf
[LXC_NET_MAXCONFTYPE
+ 1] = {
284 [LXC_NET_VETH
] = instantiate_veth
,
285 [LXC_NET_MACVLAN
] = instantiate_macvlan
,
286 [LXC_NET_VLAN
] = instantiate_vlan
,
287 [LXC_NET_PHYS
] = instantiate_phys
,
288 [LXC_NET_EMPTY
] = instantiate_empty
,
289 [LXC_NET_NONE
] = instantiate_none
,
292 static int shutdown_veth(struct lxc_handler
*, struct lxc_netdev
*);
293 static int shutdown_macvlan(struct lxc_handler
*, struct lxc_netdev
*);
294 static int shutdown_vlan(struct lxc_handler
*, struct lxc_netdev
*);
295 static int shutdown_phys(struct lxc_handler
*, struct lxc_netdev
*);
296 static int shutdown_empty(struct lxc_handler
*, struct lxc_netdev
*);
297 static int shutdown_none(struct lxc_handler
*, struct lxc_netdev
*);
299 static instantiate_cb netdev_deconf
[LXC_NET_MAXCONFTYPE
+ 1] = {
300 [LXC_NET_VETH
] = shutdown_veth
,
301 [LXC_NET_MACVLAN
] = shutdown_macvlan
,
302 [LXC_NET_VLAN
] = shutdown_vlan
,
303 [LXC_NET_PHYS
] = shutdown_phys
,
304 [LXC_NET_EMPTY
] = shutdown_empty
,
305 [LXC_NET_NONE
] = shutdown_none
,
308 static struct mount_opt mount_opt
[] = {
309 { "async", 1, MS_SYNCHRONOUS
},
310 { "atime", 1, MS_NOATIME
},
311 { "bind", 0, MS_BIND
},
312 { "defaults", 0, 0 },
313 { "dev", 1, MS_NODEV
},
314 { "diratime", 1, MS_NODIRATIME
},
315 { "dirsync", 0, MS_DIRSYNC
},
316 { "exec", 1, MS_NOEXEC
},
317 { "lazytime", 0, MS_LAZYTIME
},
318 { "mand", 0, MS_MANDLOCK
},
319 { "noatime", 0, MS_NOATIME
},
320 { "nodev", 0, MS_NODEV
},
321 { "nodiratime", 0, MS_NODIRATIME
},
322 { "noexec", 0, MS_NOEXEC
},
323 { "nomand", 1, MS_MANDLOCK
},
324 { "norelatime", 1, MS_RELATIME
},
325 { "nostrictatime", 1, MS_STRICTATIME
},
326 { "nosuid", 0, MS_NOSUID
},
327 { "rbind", 0, MS_BIND
|MS_REC
},
328 { "relatime", 0, MS_RELATIME
},
329 { "remount", 0, MS_REMOUNT
},
330 { "ro", 0, MS_RDONLY
},
331 { "rw", 1, MS_RDONLY
},
332 { "strictatime", 0, MS_STRICTATIME
},
333 { "suid", 1, MS_NOSUID
},
334 { "sync", 0, MS_SYNCHRONOUS
},
339 static struct caps_opt caps_opt
[] = {
340 { "chown", CAP_CHOWN
},
341 { "dac_override", CAP_DAC_OVERRIDE
},
342 { "dac_read_search", CAP_DAC_READ_SEARCH
},
343 { "fowner", CAP_FOWNER
},
344 { "fsetid", CAP_FSETID
},
345 { "kill", CAP_KILL
},
346 { "setgid", CAP_SETGID
},
347 { "setuid", CAP_SETUID
},
348 { "setpcap", CAP_SETPCAP
},
349 { "linux_immutable", CAP_LINUX_IMMUTABLE
},
350 { "net_bind_service", CAP_NET_BIND_SERVICE
},
351 { "net_broadcast", CAP_NET_BROADCAST
},
352 { "net_admin", CAP_NET_ADMIN
},
353 { "net_raw", CAP_NET_RAW
},
354 { "ipc_lock", CAP_IPC_LOCK
},
355 { "ipc_owner", CAP_IPC_OWNER
},
356 { "sys_module", CAP_SYS_MODULE
},
357 { "sys_rawio", CAP_SYS_RAWIO
},
358 { "sys_chroot", CAP_SYS_CHROOT
},
359 { "sys_ptrace", CAP_SYS_PTRACE
},
360 { "sys_pacct", CAP_SYS_PACCT
},
361 { "sys_admin", CAP_SYS_ADMIN
},
362 { "sys_boot", CAP_SYS_BOOT
},
363 { "sys_nice", CAP_SYS_NICE
},
364 { "sys_resource", CAP_SYS_RESOURCE
},
365 { "sys_time", CAP_SYS_TIME
},
366 { "sys_tty_config", CAP_SYS_TTY_CONFIG
},
367 { "mknod", CAP_MKNOD
},
368 { "lease", CAP_LEASE
},
369 #ifdef CAP_AUDIT_READ
370 { "audit_read", CAP_AUDIT_READ
},
372 #ifdef CAP_AUDIT_WRITE
373 { "audit_write", CAP_AUDIT_WRITE
},
375 #ifdef CAP_AUDIT_CONTROL
376 { "audit_control", CAP_AUDIT_CONTROL
},
378 { "setfcap", CAP_SETFCAP
},
379 { "mac_override", CAP_MAC_OVERRIDE
},
380 { "mac_admin", CAP_MAC_ADMIN
},
382 { "syslog", CAP_SYSLOG
},
384 #ifdef CAP_WAKE_ALARM
385 { "wake_alarm", CAP_WAKE_ALARM
},
387 #ifdef CAP_BLOCK_SUSPEND
388 { "block_suspend", CAP_BLOCK_SUSPEND
},
392 static struct caps_opt caps_opt
[] = {};
395 static struct limit_opt limit_opt
[] = {
400 { "core", RLIMIT_CORE
},
403 { "cpu", RLIMIT_CPU
},
406 { "data", RLIMIT_DATA
},
409 { "fsize", RLIMIT_FSIZE
},
412 { "locks", RLIMIT_LOCKS
},
414 #ifdef RLIMIT_MEMLOCK
415 { "memlock", RLIMIT_MEMLOCK
},
417 #ifdef RLIMIT_MSGQUEUE
418 { "msgqueue", RLIMIT_MSGQUEUE
},
421 { "nice", RLIMIT_NICE
},
424 { "nofile", RLIMIT_NOFILE
},
427 { "nproc", RLIMIT_NPROC
},
430 { "rss", RLIMIT_RSS
},
433 { "rtprio", RLIMIT_RTPRIO
},
436 { "rttime", RLIMIT_RTTIME
},
438 #ifdef RLIMIT_SIGPENDING
439 { "sigpending", RLIMIT_SIGPENDING
},
442 { "stack", RLIMIT_STACK
},
446 static int run_buffer(char *buffer
)
448 struct lxc_popen_FILE
*f
;
452 f
= lxc_popen(buffer
);
454 SYSERROR("Failed to popen() %s.", buffer
);
458 output
= malloc(LXC_LOG_BUFFER_SIZE
);
460 ERROR("Failed to allocate memory for %s.", buffer
);
465 while (fgets(output
, LXC_LOG_BUFFER_SIZE
, f
->f
))
466 DEBUG("Script %s with output: %s.", buffer
, output
);
472 SYSERROR("Script exited with error.");
474 } else if (WIFEXITED(ret
) && WEXITSTATUS(ret
) != 0) {
475 ERROR("Script exited with status %d.", WEXITSTATUS(ret
));
477 } else if (WIFSIGNALED(ret
)) {
478 ERROR("Script terminated by signal %d.", WTERMSIG(ret
));
485 static int run_script_argv(const char *name
, const char *section
,
486 const char *script
, const char *hook
,
487 const char *lxcpath
, char **argsin
)
493 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
494 script
, name
, section
);
496 for (i
= 0; argsin
&& argsin
[i
]; i
++)
497 size
+= strlen(argsin
[i
]) + 1;
499 size
+= strlen(hook
) + 1;
501 size
+= strlen(script
);
502 size
+= strlen(name
);
503 size
+= strlen(section
);
509 buffer
= alloca(size
);
511 ERROR("Failed to allocate memory.");
516 snprintf(buffer
, size
, "%s %s %s %s", script
, name
, section
, hook
);
517 if (ret
< 0 || (size_t)ret
>= size
) {
518 ERROR("Script name too long.");
522 for (i
= 0; argsin
&& argsin
[i
]; i
++) {
523 int len
= size
- ret
;
525 rc
= snprintf(buffer
+ ret
, len
, " %s", argsin
[i
]);
526 if (rc
< 0 || rc
>= len
) {
527 ERROR("Script args too long.");
533 return run_buffer(buffer
);
536 static int run_script(const char *name
, const char *section
, const char *script
,
544 INFO("Executing script \"%s\" for container \"%s\", config section \"%s\".",
545 script
, name
, section
);
547 va_start(ap
, script
);
548 while ((p
= va_arg(ap
, char *)))
549 size
+= strlen(p
) + 1;
552 size
+= strlen(script
);
553 size
+= strlen(name
);
554 size
+= strlen(section
);
560 buffer
= alloca(size
);
562 ERROR("Failed to allocate memory.");
566 ret
= snprintf(buffer
, size
, "%s %s %s", script
, name
, section
);
567 if (ret
< 0 || ret
>= size
) {
568 ERROR("Script name too long.");
572 va_start(ap
, script
);
573 while ((p
= va_arg(ap
, char *))) {
574 int len
= size
- ret
;
576 rc
= snprintf(buffer
+ ret
, len
, " %s", p
);
577 if (rc
< 0 || rc
>= len
) {
578 ERROR("Script args too long.");
585 return run_buffer(buffer
);
588 static int mount_rootfs_dir(const char *rootfs
, const char *target
,
591 unsigned long mntflags
;
595 if (parse_mntopts(options
, &mntflags
, &mntdata
) < 0) {
600 ret
= mount(rootfs
, target
, "none", MS_BIND
| MS_REC
| mntflags
, mntdata
);
606 static int lxc_mount_rootfs_file(const char *rootfs
, const char *target
,
610 char path
[MAXPATHLEN
];
612 loopfd
= lxc_prepare_loop_dev(rootfs
, path
, LO_FLAGS_AUTOCLEAR
);
615 DEBUG("prepared loop device \"%s\"", path
);
617 ret
= mount_unknown_fs(path
, target
, options
);
620 DEBUG("mounted rootfs \"%s\" on loop device \"%s\" via loop device \"%s\"", rootfs
, target
, path
);
625 static int mount_rootfs_block(const char *rootfs
, const char *target
,
628 return mount_unknown_fs(rootfs
, target
, options
);
633 * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for
634 * the duration of the container run, to prevent the container from marking
635 * the underlying fs readonly on shutdown. unlink the file immediately so
636 * no name pollution happens.
637 * return -1 on error.
638 * return -2 if nothing needed to be pinned.
639 * return an open fd (>=0) if we pinned it.
641 int pin_rootfs(const char *rootfs
)
643 char absrootfs
[MAXPATHLEN
];
644 char absrootfspin
[MAXPATHLEN
];
648 if (rootfs
== NULL
|| strlen(rootfs
) == 0)
651 if (!realpath(rootfs
, absrootfs
))
654 if (access(absrootfs
, F_OK
))
657 if (stat(absrootfs
, &s
))
660 if (!S_ISDIR(s
.st_mode
))
663 ret
= snprintf(absrootfspin
, MAXPATHLEN
, "%s/lxc.hold", absrootfs
);
664 if (ret
>= MAXPATHLEN
)
667 fd
= open(absrootfspin
, O_CREAT
| O_RDWR
, S_IWUSR
|S_IRUSR
);
670 (void)unlink(absrootfspin
);
675 * If we are asking to remount something, make sure that any
676 * NOEXEC etc are honored.
678 static unsigned long add_required_remount_flags(const char *s
, const char *d
,
683 unsigned long required_flags
= 0;
685 if (!(flags
& MS_REMOUNT
))
693 if (statvfs(s
, &sb
) < 0)
696 if (sb
.f_flag
& MS_NOSUID
)
697 required_flags
|= MS_NOSUID
;
698 if (sb
.f_flag
& MS_NODEV
)
699 required_flags
|= MS_NODEV
;
700 if (sb
.f_flag
& MS_RDONLY
)
701 required_flags
|= MS_RDONLY
;
702 if (sb
.f_flag
& MS_NOEXEC
)
703 required_flags
|= MS_NOEXEC
;
705 return flags
| required_flags
;
711 static int lxc_mount_auto_mounts(struct lxc_conf
*conf
, int flags
, struct lxc_handler
*handler
)
719 const char *destination
;
723 } default_mounts
[] = {
724 /* Read-only bind-mounting... In older kernels, doing that required
725 * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same
726 * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from
727 * kernel 2.6.26 onwards. However, this apparently does not work on
728 * kernel 3.8. Unfortunately, on that very same kernel, doing the
729 * same trick as above doesn't seem to work either, there one needs
730 * to ALSO specify MS_BIND for the remount, otherwise the entire
731 * fs is remounted read-only or the mount fails because it's busy...
732 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as
735 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
736 /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */
737 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys/net", "%r/proc/tty", NULL
, MS_BIND
, NULL
},
738 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sys", "%r/proc/sys", NULL
, MS_BIND
, NULL
},
739 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
740 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/tty", "%r/proc/sys/net", NULL
, MS_MOVE
, NULL
},
741 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL
, MS_BIND
, NULL
},
742 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_MIXED
, NULL
, "%r/proc/sysrq-trigger", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
743 { LXC_AUTO_PROC_MASK
, LXC_AUTO_PROC_RW
, "proc", "%r/proc", "proc", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
744 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RW
, "sysfs", "%r/sys", "sysfs", 0, NULL
},
745 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_RO
, "sysfs", "%r/sys", "sysfs", MS_RDONLY
, NULL
},
746 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "sysfs", "%r/sys", "sysfs", MS_NODEV
|MS_NOEXEC
|MS_NOSUID
, NULL
},
747 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "%r/sys", "%r/sys", NULL
, MS_BIND
, NULL
},
748 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, NULL
, "%r/sys", NULL
, MS_REMOUNT
|MS_BIND
|MS_RDONLY
, NULL
},
749 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "sysfs", "%r/sys/devices/virtual/net", "sysfs", 0, NULL
},
750 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, "%r/sys/devices/virtual/net/devices/virtual/net", "%r/sys/devices/virtual/net", NULL
, MS_BIND
, NULL
},
751 { LXC_AUTO_SYS_MASK
, LXC_AUTO_SYS_MIXED
, NULL
, "%r/sys/devices/virtual/net", NULL
, MS_REMOUNT
|MS_BIND
|MS_NOSUID
|MS_NODEV
|MS_NOEXEC
, NULL
},
752 { 0, 0, NULL
, NULL
, NULL
, 0, NULL
}
755 for (i
= 0; default_mounts
[i
].match_mask
; i
++) {
756 if ((flags
& default_mounts
[i
].match_mask
) == default_mounts
[i
].match_flag
) {
758 char *destination
= NULL
;
760 unsigned long mflags
;
762 if (default_mounts
[i
].source
) {
763 /* will act like strdup if %r is not present */
764 source
= lxc_string_replace("%r", conf
->rootfs
.path
? conf
->rootfs
.mount
: "", default_mounts
[i
].source
);
766 SYSERROR("memory allocation error");
770 if (!default_mounts
[i
].destination
) {
771 ERROR("BUG: auto mounts destination %d was NULL", i
);
775 /* will act like strdup if %r is not present */
776 destination
= lxc_string_replace("%r", conf
->rootfs
.path
? conf
->rootfs
.mount
: "", default_mounts
[i
].destination
);
779 SYSERROR("memory allocation error");
784 mflags
= add_required_remount_flags(source
, destination
,
785 default_mounts
[i
].flags
);
786 r
= safe_mount(source
, destination
, default_mounts
[i
].fstype
, mflags
, default_mounts
[i
].options
, conf
->rootfs
.path
? conf
->rootfs
.mount
: NULL
);
788 if (r
< 0 && errno
== ENOENT
) {
789 INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source
, destination
);
793 SYSERROR("error mounting %s on %s flags %lu", source
, destination
, mflags
);
804 if (flags
& LXC_AUTO_CGROUP_MASK
) {
807 cg_flags
= flags
& LXC_AUTO_CGROUP_MASK
;
808 /* If the type of cgroup mount was not specified, it depends on the
809 * container's capabilities as to what makes sense: if we have
810 * CAP_SYS_ADMIN, the read-only part can be remounted read-write
811 * anyway, so we may as well default to read-write; then the admin
812 * will not be given a false sense of security. (And if they really
813 * want mixed r/o r/w, then they can explicitly specify :mixed.)
814 * OTOH, if the container lacks CAP_SYS_ADMIN, do only default to
815 * :mixed, because then the container can't remount it read-write. */
816 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
|| cg_flags
== LXC_AUTO_CGROUP_FULL_NOSPEC
) {
817 int has_sys_admin
= 0;
819 if (!lxc_list_empty(&conf
->keepcaps
))
820 has_sys_admin
= in_caplist(CAP_SYS_ADMIN
, &conf
->keepcaps
);
822 has_sys_admin
= !in_caplist(CAP_SYS_ADMIN
, &conf
->caps
);
824 if (cg_flags
== LXC_AUTO_CGROUP_NOSPEC
)
825 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_RW
: LXC_AUTO_CGROUP_MIXED
;
827 cg_flags
= has_sys_admin
? LXC_AUTO_CGROUP_FULL_RW
: LXC_AUTO_CGROUP_FULL_MIXED
;
830 if (!cgroup_mount(conf
->rootfs
.path
? conf
->rootfs
.mount
: "", handler
, cg_flags
)) {
831 SYSERROR("error mounting /sys/fs/cgroup");
839 static int mount_rootfs(const char *rootfs
, const char *target
, const char *options
)
841 char absrootfs
[MAXPATHLEN
];
845 typedef int (*rootfs_cb
)(const char *, const char *, const char *);
851 { S_IFDIR
, mount_rootfs_dir
},
852 { S_IFBLK
, mount_rootfs_block
},
853 { S_IFREG
, lxc_mount_rootfs_file
},
856 if (!realpath(rootfs
, absrootfs
)) {
857 SYSERROR("Failed to get real path for \"%s\".", rootfs
);
861 if (access(absrootfs
, F_OK
)) {
862 SYSERROR("The rootfs \"%s\" is not accessible.", absrootfs
);
866 if (stat(absrootfs
, &s
)) {
867 SYSERROR("Failed to stat the rootfs \"%s\".", absrootfs
);
871 for (i
= 0; i
< sizeof(rtfs_type
)/sizeof(rtfs_type
[0]); i
++) {
872 if (!__S_ISTYPE(s
.st_mode
, rtfs_type
[i
].type
))
875 return rtfs_type
[i
].cb(absrootfs
, target
, options
);
878 ERROR("Unsupported rootfs type for rootfs \"%s\".", absrootfs
);
882 static int setup_utsname(struct utsname
*utsname
)
887 if (sethostname(utsname
->nodename
, strlen(utsname
->nodename
))) {
888 SYSERROR("failed to set the hostname to '%s'", utsname
->nodename
);
892 INFO("'%s' hostname has been setup", utsname
->nodename
);
897 struct dev_symlinks
{
902 static const struct dev_symlinks dev_symlinks
[] = {
903 {"/proc/self/fd", "fd"},
904 {"/proc/self/fd/0", "stdin"},
905 {"/proc/self/fd/1", "stdout"},
906 {"/proc/self/fd/2", "stderr"},
909 static int setup_dev_symlinks(const struct lxc_rootfs
*rootfs
)
911 char path
[MAXPATHLEN
];
916 for (i
= 0; i
< sizeof(dev_symlinks
) / sizeof(dev_symlinks
[0]); i
++) {
917 const struct dev_symlinks
*d
= &dev_symlinks
[i
];
918 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->path
? rootfs
->mount
: "", d
->name
);
919 if (ret
< 0 || ret
>= MAXPATHLEN
)
923 * Stat the path first. If we don't get an error
924 * accept it as is and don't try to create it
926 if (!stat(path
, &s
)) {
930 ret
= symlink(d
->oldpath
, path
);
932 if (ret
&& errno
!= EEXIST
) {
933 if ( errno
== EROFS
) {
934 WARN("Warning: Read Only file system while creating %s", path
);
936 SYSERROR("Error creating %s", path
);
945 * Build a space-separated list of ptys to pass to systemd.
/*
 * Append one pty name to the "container_ttys=" string held in *pp.
 *
 * @pp:   address of the string being built. If *pp is NULL a new string
 *        "container_ttys=<name>" is allocated; otherwise the existing string
 *        is grown and " <name>" is appended, giving a space-separated list.
 * @name: NUL-terminated pty name to add; must not be NULL.
 *
 * Returns true on success. Returns false if memory could not be allocated;
 * in that case *pp is left exactly as it was (NULL stays NULL, an existing
 * string stays valid and unchanged).
 */
static bool append_ptyname(char **pp, char *name)
{
	static const char prefix[] = "container_ttys=";
	const size_t prefix_len = sizeof(prefix) - 1;
	const size_t name_len = strlen(name);
	size_t cur_len;
	char *p;

	if (!*pp) {
		/* First entry: prefix + name + terminating NUL. */
		p = malloc(prefix_len + name_len + 1);
		if (!p)
			return false;

		memcpy(p, prefix, prefix_len);
		/* name_len + 1 copies the terminating NUL as well. */
		memcpy(p + prefix_len, name, name_len + 1);
		*pp = p;
		return true;
	}

	/* Subsequent entry: room for ' ' + name + terminating NUL. */
	cur_len = strlen(*pp);
	p = realloc(*pp, cur_len + name_len + 2);
	if (!p)
		return false;

	p[cur_len] = ' ';
	memcpy(p + cur_len + 1, name, name_len + 1);
	/* Only publish the new pointer once realloc() has succeeded. */
	*pp = p;
	return true;
}
967 static int lxc_setup_tty(struct lxc_conf
*conf
)
970 const struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
971 char *ttydir
= conf
->ttydir
;
972 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
974 if (!conf
->rootfs
.path
)
977 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
978 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
980 ret
= snprintf(path
, sizeof(path
), "/dev/tty%d", i
+ 1);
981 if (ret
< 0 || (size_t)ret
>= sizeof(path
)) {
982 ERROR("pathname too long for ttys");
987 /* create /dev/<ttydir>/tty%d */
988 ret
= snprintf(lxcpath
, sizeof(lxcpath
),
989 "/dev/%s/tty%d", ttydir
, i
+ 1);
990 if (ret
< 0 || (size_t)ret
>= sizeof(lxcpath
)) {
991 ERROR("pathname too long for ttys");
995 ret
= creat(lxcpath
, 0660);
996 if (ret
< 0 && errno
!= EEXIST
) {
997 SYSERROR("failed to create \"%s\"", lxcpath
);
1004 if (ret
< 0 && errno
!= ENOENT
) {
1005 SYSERROR("failed to unlink \"%s\"", path
);
1009 ret
= mount(pty_info
->name
, lxcpath
, "none", MS_BIND
, 0);
1011 WARN("failed to bind mount \"%s\" onto \"%s\"",
1012 pty_info
->name
, path
);
1015 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info
->name
,
1018 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/tty%d",
1020 if (ret
< 0 || (size_t)ret
>= sizeof(lxcpath
)) {
1021 ERROR("tty pathname too long");
1025 ret
= symlink(lxcpath
, path
);
1027 SYSERROR("failed to create symlink \"%s\" -> \"%s\"",
1032 /* If we populated /dev, then we need to create
1035 ret
= access(path
, F_OK
);
1037 ret
= creat(path
, 0660);
1039 SYSERROR("failed to create \"%s\"", path
);
1040 /* this isn't fatal, continue */
1046 ret
= mount(pty_info
->name
, path
, "none", MS_BIND
, 0);
1048 SYSERROR("failed to mount '%s'->'%s'", pty_info
->name
, path
);
1052 DEBUG("bind mounted \"%s\" onto \"%s\"", pty_info
->name
,
1056 if (!append_ptyname(&conf
->pty_names
, pty_info
->name
)) {
1057 ERROR("Error setting up container_ttys string");
1062 INFO("finished setting up %d /dev/tty<N> device(s)", tty_info
->nbtty
);
1066 static int setup_rootfs_pivot_root(const char *rootfs
)
1068 int oldroot
= -1, newroot
= -1;
1070 oldroot
= open("/", O_DIRECTORY
| O_RDONLY
);
1072 SYSERROR("Error opening old-/ for fchdir");
1075 newroot
= open(rootfs
, O_DIRECTORY
| O_RDONLY
);
1077 SYSERROR("Error opening new-/ for fchdir");
1081 /* change into new root fs */
1082 if (fchdir(newroot
)) {
1083 SYSERROR("can't chdir to new rootfs '%s'", rootfs
);
1087 /* pivot_root into our new root fs */
1088 if (pivot_root(".", ".")) {
1089 SYSERROR("pivot_root syscall failed");
1094 * at this point the old-root is mounted on top of our new-root
1095 * To unmount it we must not be chdir'd into it, so escape back
1098 if (fchdir(oldroot
) < 0) {
1099 SYSERROR("Error entering oldroot");
1102 if (umount2(".", MNT_DETACH
) < 0) {
1103 SYSERROR("Error detaching old root");
1107 if (fchdir(newroot
) < 0) {
1108 SYSERROR("Error re-entering newroot");
1115 DEBUG("pivot_root syscall to '%s' successful", rootfs
);
1128 * Just create a path for /dev under $lxcpath/$name and in rootfs
1129 * If we hit an error, log it but don't fail yet.
1131 static int mount_autodev(const char *name
, const struct lxc_rootfs
*rootfs
, const char *lxcpath
)
1137 INFO("Mounting container /dev");
1139 /* $(rootfs->mount) + "/dev/pts" + '\0' */
1140 clen
= (rootfs
->path
? strlen(rootfs
->mount
) : 0) + 9;
1141 path
= alloca(clen
);
1143 ret
= snprintf(path
, clen
, "%s/dev", rootfs
->path
? rootfs
->mount
: "");
1144 if (ret
< 0 || ret
>= clen
)
1147 if (!dir_exists(path
)) {
1148 WARN("No /dev in container.");
1149 WARN("Proceeding without autodev setup");
1153 ret
= safe_mount("none", path
, "tmpfs", 0, "size=500000,mode=755",
1154 rootfs
->path
? rootfs
->mount
: NULL
);
1156 SYSERROR("Failed mounting tmpfs onto %s\n", path
);
1160 INFO("Mounted tmpfs onto %s", path
);
1162 ret
= snprintf(path
, clen
, "%s/dev/pts", rootfs
->path
? rootfs
->mount
: "");
1163 if (ret
< 0 || ret
>= clen
)
1167 * If we are running on a devtmpfs mapping, dev/pts may already exist.
1168 * If not, then create it and exit if that fails...
1170 if (!dir_exists(path
)) {
1171 ret
= mkdir(path
, S_IRWXU
| S_IRGRP
| S_IXGRP
| S_IROTH
| S_IXOTH
);
1173 SYSERROR("Failed to create /dev/pts in container");
1178 INFO("Mounted container /dev");
1189 static const struct lxc_devs lxc_devs
[] = {
1190 { "null", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 3 },
1191 { "zero", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 5 },
1192 { "full", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 7 },
1193 { "urandom", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 9 },
1194 { "random", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 1, 8 },
1195 { "tty", S_IFCHR
| S_IRWXU
| S_IRWXG
| S_IRWXO
, 5, 0 },
1198 static int lxc_fill_autodev(const struct lxc_rootfs
*rootfs
)
1201 char path
[MAXPATHLEN
];
1205 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev", rootfs
->path
? rootfs
->mount
: "");
1206 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1207 ERROR("Error calculating container /dev location");
1211 /* ignore, just don't try to fill in */
1212 if (!dir_exists(path
))
1215 INFO("populating container /dev");
1216 cmask
= umask(S_IXUSR
| S_IXGRP
| S_IXOTH
);
1217 for (i
= 0; i
< sizeof(lxc_devs
) / sizeof(lxc_devs
[0]); i
++) {
1218 const struct lxc_devs
*d
= &lxc_devs
[i
];
1220 ret
= snprintf(path
, MAXPATHLEN
, "%s/dev/%s", rootfs
->path
? rootfs
->mount
: "", d
->name
);
1221 if (ret
< 0 || ret
>= MAXPATHLEN
)
1224 ret
= mknod(path
, d
->mode
, makedev(d
->maj
, d
->min
));
1226 char hostpath
[MAXPATHLEN
];
1229 if (errno
== EEXIST
) {
1230 DEBUG("\"%s\" device already existed", path
);
1234 /* Unprivileged containers cannot create devices, so
1235 * bind mount the device from the host.
1237 ret
= snprintf(hostpath
, MAXPATHLEN
, "/dev/%s", d
->name
);
1238 if (ret
< 0 || ret
>= MAXPATHLEN
)
1240 pathfile
= fopen(path
, "wb");
1242 SYSERROR("Failed to create device mount target '%s'", path
);
1246 if (safe_mount(hostpath
, path
, 0, MS_BIND
, NULL
, rootfs
->path
? rootfs
->mount
: NULL
) != 0) {
1247 SYSERROR("Failed bind mounting device %s from host into container", d
->name
);
1250 DEBUG("bind mounted \"%s\" onto \"%s\"", hostpath
, path
);
1252 DEBUG("created device node \"%s\"", path
);
1257 INFO("populated container /dev");
1261 static int setup_rootfs(struct lxc_conf
*conf
)
1264 const struct lxc_rootfs
*rootfs
;
1266 rootfs
= &conf
->rootfs
;
1267 if (!rootfs
->path
) {
1268 if (mount("", "/", NULL
, MS_SLAVE
| MS_REC
, 0)) {
1269 SYSERROR("Failed to make / rslave.");
1275 if (access(rootfs
->mount
, F_OK
)) {
1276 SYSERROR("Failed to access to \"%s\". Check it is present.",
1281 /* First try mounting rootfs using a bdev. */
1282 bdev
= bdev_init(conf
, rootfs
->path
, rootfs
->mount
, rootfs
->options
);
1283 if (bdev
&& !bdev
->ops
->mount(bdev
)) {
1285 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1286 rootfs
->path
, rootfs
->mount
,
1287 rootfs
->options
? rootfs
->options
: "(null)");
1292 if (mount_rootfs(rootfs
->path
, rootfs
->mount
, rootfs
->options
)) {
1293 ERROR("Failed to mount rootfs \"%s\" onto \"%s\" with options \"%s\".",
1294 rootfs
->path
, rootfs
->mount
,
1295 rootfs
->options
? rootfs
->options
: "(null)");
1299 DEBUG("Mounted rootfs \"%s\" onto \"%s\" with options \"%s\".",
1300 rootfs
->path
, rootfs
->mount
,
1301 rootfs
->options
? rootfs
->options
: "(null)");
1305 int prepare_ramfs_root(char *root
)
1307 char buf
[LXC_LINELEN
], *p
;
1308 char nroot
[PATH_MAX
];
1313 if (realpath(root
, nroot
) == NULL
)
1316 if (chdir("/") == -1)
1320 * We could use MS_MOVE here, but in userns this mount is
1321 * locked and can't be moved.
1323 if (mount(root
, "/", NULL
, MS_REC
| MS_BIND
, NULL
) < 0) {
1324 SYSERROR("Failed to move %s into /", root
);
1328 if (mount(NULL
, "/", NULL
, MS_REC
| MS_PRIVATE
, NULL
) < 0) {
1329 SYSERROR("Failed to make . rprivate");
1334 * The following code cleans up inherited mounts which are not
1337 * The mountinfo file may not show all mounts if some mount points have
1338 * been unmounted between read operations on mountinfo. So we need to
1339 * read mountinfo a few times.
1341 * This loop can be skipped if a container uses userns, because all
1342 * inherited mounts are locked and we should live with all this trash.
1347 f
= fopen("./proc/self/mountinfo", "r");
1349 SYSERROR("Unable to open /proc/self/mountinfo");
1352 while (fgets(buf
, LXC_LINELEN
, f
)) {
1353 for (p
= buf
, i
=0; p
&& i
< 4; i
++)
1354 p
= strchr(p
+1, ' ');
1357 p2
= strchr(p
+1, ' ');
1364 if (strcmp(p
+ 1, "/") == 0)
1366 if (strcmp(p
+ 1, "/proc") == 0)
1369 if (umount2(p
, MNT_DETACH
) == 0)
1377 /* This also can be skipped if a container uses userns */
1378 umount2("./proc", MNT_DETACH
);
1380 /* It is weird, but chdir("..") moves us in a new root */
1381 if (chdir("..") == -1) {
1382 SYSERROR("Unable to change working directory");
1386 if (chroot(".") == -1) {
1387 SYSERROR("Unable to chroot");
1394 static int setup_pivot_root(const struct lxc_rootfs
*rootfs
)
1396 if (!rootfs
->path
) {
1397 DEBUG("container does not have a rootfs, so not doing pivot root");
1401 if (detect_ramfs_rootfs()) {
1402 DEBUG("detected that container is on ramfs");
1403 if (prepare_ramfs_root(rootfs
->mount
)) {
1404 ERROR("failed to prepare minimal ramfs root");
1408 DEBUG("prepared ramfs root for container");
1412 if (setup_rootfs_pivot_root(rootfs
->mount
) < 0) {
1413 ERROR("failed to pivot root");
1417 DEBUG("finished pivot root");
1421 static int lxc_setup_devpts(int num_pts
)
1424 const char *devpts_mntopts
= "newinstance,ptmxmode=0666,mode=0620,gid=5";
1427 DEBUG("no new devpts instance will be mounted since no pts "
1428 "devices are requested");
1432 /* Unmount old devpts instance. */
1433 ret
= access("/dev/pts/ptmx", F_OK
);
1435 ret
= umount("/dev/pts");
1437 SYSERROR("failed to unmount old devpts instance");
1440 DEBUG("unmounted old /dev/pts instance");
1443 /* Create mountpoint for devpts instance. */
1444 ret
= mkdir("/dev/pts", 0755);
1445 if (ret
< 0 && errno
!= EEXIST
) {
1446 SYSERROR("failed to create the \"/dev/pts\" directory");
1450 /* Mount new devpts instance. */
1451 ret
= mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL
, devpts_mntopts
);
1453 SYSERROR("failed to mount new devpts instance");
1456 DEBUG("mount new devpts instance with options \"%s\"", devpts_mntopts
);
1458 /* Remove any pre-existing /dev/ptmx file. */
1459 ret
= access("/dev/ptmx", F_OK
);
1461 ret
= remove("/dev/ptmx");
1463 SYSERROR("failed to remove existing \"/dev/ptmx\"");
1466 DEBUG("removed existing \"/dev/ptmx\"");
1469 /* Create dummy /dev/ptmx file as bind mountpoint for /dev/pts/ptmx. */
1470 ret
= open("/dev/ptmx", O_CREAT
, 0666);
1472 SYSERROR("failed to create dummy \"/dev/ptmx\" file as bind mount target");
1476 DEBUG("created dummy \"/dev/ptmx\" file as bind mount target");
1478 /* Bind mount /dev/pts/ptmx onto the dummy /dev/ptmx file. */
1479 ret
= mount("/dev/pts/ptmx", "/dev/ptmx", NULL
, MS_BIND
, NULL
);
1481 DEBUG("bind mounted \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1484 /* Fallthrough and try to create a symlink. */
1485 ERROR("failed to bind mount \"/dev/pts/ptmx\" to \"/dev/ptmx\"");
1488 /* Remove the dummy /dev/ptmx file we created above. */
1489 ret
= remove("/dev/ptmx");
1491 SYSERROR("failed to remove existing \"/dev/ptmx\"");
1495 /* Fallback option: Create symlink /dev/ptmx -> /dev/pts/ptmx. */
1496 ret
= symlink("/dev/pts/ptmx", "/dev/ptmx");
1498 SYSERROR("failed to create symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
1501 DEBUG("created symlink \"/dev/ptmx\" -> \"/dev/pts/ptmx\"");
1506 static int setup_personality(int persona
)
1508 #if HAVE_SYS_PERSONALITY_H
1512 if (personality(persona
) < 0) {
1513 SYSERROR("failed to set personality to '0x%x'", persona
);
1517 INFO("set personality to '0x%x'", persona
);
1523 static int lxc_setup_dev_console(const struct lxc_rootfs
*rootfs
,
1524 const struct lxc_console
*console
)
1526 char path
[MAXPATHLEN
];
1529 if (console
->path
&& !strcmp(console
->path
, "none"))
1532 ret
= snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1533 if (ret
< 0 || (size_t)ret
>= sizeof(path
))
1536 /* When we are asked to setup a console we remove any previous
1537 * /dev/console bind-mounts.
1539 if (file_exists(path
)) {
1540 ret
= lxc_unstack_mountpoint(path
, false);
1542 ERROR("failed to unmount \"%s\": %s", path
, strerror(errno
));
1545 DEBUG("cleared all (%d) mounts from \"%s\"", ret
, path
);
1549 SYSERROR("error unlinking %s", path
);
1554 /* For unprivileged containers autodev or automounts will already have
1555 * taken care of creating /dev/console.
1557 fd
= open(path
, O_CREAT
| O_EXCL
, S_IXUSR
| S_IXGRP
| S_IXOTH
);
1559 if (errno
!= EEXIST
) {
1560 SYSERROR("failed to create console");
1567 if (chmod(console
->name
, S_IXUSR
| S_IXGRP
| S_IXOTH
)) {
1568 SYSERROR("failed to set mode '0%o' to '%s'", S_IXUSR
| S_IXGRP
| S_IXOTH
, console
->name
);
1572 if (safe_mount(console
->name
, path
, "none", MS_BIND
, 0, rootfs
->mount
) < 0) {
1573 ERROR("failed to mount '%s' on '%s'", console
->name
, path
);
1577 DEBUG("mounted pts device \"%s\" onto \"%s\"", console
->name
, path
);
1581 static int lxc_setup_ttydir_console(const struct lxc_rootfs
*rootfs
,
1582 const struct lxc_console
*console
,
1586 char path
[MAXPATHLEN
], lxcpath
[MAXPATHLEN
];
1588 /* create rootfs/dev/<ttydir> directory */
1589 ret
= snprintf(path
, sizeof(path
), "%s/dev/%s", rootfs
->mount
, ttydir
);
1590 if (ret
< 0 || (size_t)ret
>= sizeof(path
))
1593 ret
= mkdir(path
, 0755);
1594 if (ret
&& errno
!= EEXIST
) {
1595 SYSERROR("failed with errno %d to create %s", errno
, path
);
1598 DEBUG("created directory for console and tty devices at \%s\"", path
);
1600 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/dev/%s/console", rootfs
->mount
, ttydir
);
1601 if (ret
< 0 || (size_t)ret
>= sizeof(lxcpath
))
1604 ret
= creat(lxcpath
, 0660);
1605 if (ret
== -1 && errno
!= EEXIST
) {
1606 SYSERROR("error %d creating %s", errno
, lxcpath
);
1612 ret
= snprintf(path
, sizeof(path
), "%s/dev/console", rootfs
->mount
);
1613 if (ret
< 0 || (size_t)ret
>= sizeof(lxcpath
))
1616 /* When we are asked to setup a console we remove any previous
1617 * /dev/console bind-mounts.
1619 if (console
->path
&& !strcmp(console
->path
, "none")) {
1621 ret
= stat(path
, &st
);
1623 if (errno
== ENOENT
)
1625 SYSERROR("failed stat() \"%s\"", path
);
1629 /* /dev/console must be character device with major number 5 and
1630 * minor number 1. If not, give benefit of the doubt and assume
1631 * the user has mounted something else right there on purpose.
1633 if (((st
.st_mode
& S_IFMT
) != S_IFCHR
) || major(st
.st_rdev
) != 5 || minor(st
.st_rdev
) != 1)
1636 /* In case the user requested a bind-mount for /dev/console and
1637 * requests a ttydir we move the mount to the
1638 * /dev/<ttydir/console.
1639 * Note, we only move the uppermost mount and clear all other
1640 * mounts underneath for safety.
1641 * If it is a character device created via mknod() we simply
1644 ret
= safe_mount(path
, lxcpath
, "none", MS_MOVE
, NULL
, rootfs
->mount
);
1646 if (errno
!= EINVAL
) {
1647 ERROR("failed to MS_MOVE \"%s\" to \"%s\": %s", path
, lxcpath
, strerror(errno
));
1650 /* path was not a mountpoint */
1651 ret
= rename(path
, lxcpath
);
1653 ERROR("failed to rename \"%s\" to \"%s\": %s", path
, lxcpath
, strerror(errno
));
1656 DEBUG("renamed \"%s\" to \"%s\"", path
, lxcpath
);
1658 DEBUG("moved mount \"%s\" to \"%s\"", path
, lxcpath
);
1661 /* Clear all remaining bind-mounts. */
1662 ret
= lxc_unstack_mountpoint(path
, false);
1664 ERROR("failed to unmount \"%s\": %s", path
, strerror(errno
));
1667 DEBUG("cleared all (%d) mounts from \"%s\"", ret
, path
);
1670 if (file_exists(path
)) {
1671 ret
= lxc_unstack_mountpoint(path
, false);
1673 ERROR("failed to unmount \"%s\": %s", path
, strerror(errno
));
1676 DEBUG("cleared all (%d) mounts from \"%s\"", ret
, path
);
1680 if (safe_mount(console
->name
, lxcpath
, "none", MS_BIND
, 0, rootfs
->mount
) < 0) {
1681 ERROR("failed to mount '%s' on '%s'", console
->name
, lxcpath
);
1684 DEBUG("mounted \"%s\" onto \"%s\"", console
->name
, lxcpath
);
1687 /* create symlink from rootfs /dev/console to '<ttydir>/console' */
1688 ret
= snprintf(lxcpath
, sizeof(lxcpath
), "%s/console", ttydir
);
1689 if (ret
< 0 || (size_t)ret
>= sizeof(lxcpath
))
1693 if (ret
&& errno
!= ENOENT
) {
1694 SYSERROR("error unlinking %s", path
);
1698 ret
= symlink(lxcpath
, path
);
1700 SYSERROR("failed to create symlink for console from \"%s\" to \"%s\"", lxcpath
, path
);
1704 DEBUG("console has been setup under \"%s\" and symlinked to \"%s\"", lxcpath
, path
);
1708 static int lxc_setup_console(const struct lxc_rootfs
*rootfs
,
1709 const struct lxc_console
*console
, char *ttydir
)
1711 /* We don't have a rootfs, /dev/console will be shared. */
1712 if (!rootfs
->path
) {
1713 DEBUG("/dev/console will be shared with the host");
1718 return lxc_setup_dev_console(rootfs
, console
);
1720 return lxc_setup_ttydir_console(rootfs
, console
, ttydir
);
1723 static int setup_kmsg(const struct lxc_rootfs
*rootfs
,
1724 const struct lxc_console
*console
)
1726 char kpath
[MAXPATHLEN
];
1731 ret
= snprintf(kpath
, sizeof(kpath
), "%s/dev/kmsg", rootfs
->mount
);
1732 if (ret
< 0 || ret
>= sizeof(kpath
))
1735 ret
= unlink(kpath
);
1736 if (ret
&& errno
!= ENOENT
) {
1737 SYSERROR("error unlinking %s", kpath
);
1741 ret
= symlink("console", kpath
);
1743 SYSERROR("failed to create symlink for kmsg");
1750 static void parse_mntopt(char *opt
, unsigned long *flags
, char **data
)
1752 struct mount_opt
*mo
;
1754 /* If opt is found in mount_opt, set or clear flags.
1755 * Otherwise append it to data. */
1757 for (mo
= &mount_opt
[0]; mo
->name
!= NULL
; mo
++) {
1758 if (!strncmp(opt
, mo
->name
, strlen(mo
->name
))) {
1760 *flags
&= ~mo
->flag
;
1772 int parse_mntopts(const char *mntopts
, unsigned long *mntflags
,
1776 char *p
, *saveptr
= NULL
;
1784 s
= strdup(mntopts
);
1786 SYSERROR("failed to allocate memory");
1790 data
= malloc(strlen(s
) + 1);
1792 SYSERROR("failed to allocate memory");
1798 for (p
= strtok_r(s
, ",", &saveptr
); p
!= NULL
;
1799 p
= strtok_r(NULL
, ",", &saveptr
))
1800 parse_mntopt(p
, mntflags
, &data
);
1811 static void null_endofword(char *word
)
1813 while (*word
&& *word
!= ' ' && *word
!= '\t')
1819 * skip @nfields spaces in @src
1821 static char *get_field(char *src
, int nfields
)
1826 for (i
= 0; i
< nfields
; i
++) {
1827 while (*p
&& *p
!= ' ' && *p
!= '\t')
1836 static int mount_entry(const char *fsname
, const char *target
,
1837 const char *fstype
, unsigned long mountflags
,
1838 const char *data
, int optional
, int dev
, const char *rootfs
)
1844 if (safe_mount(fsname
, target
, fstype
, mountflags
& ~MS_REMOUNT
, data
, rootfs
)) {
1846 INFO("failed to mount '%s' on '%s' (optional): %s", fsname
,
1847 target
, strerror(errno
));
1851 SYSERROR("failed to mount '%s' on '%s'", fsname
, target
);
1856 if ((mountflags
& MS_REMOUNT
) || (mountflags
& MS_BIND
)) {
1857 DEBUG("remounting %s on %s to respect bind or remount options",
1858 fsname
? fsname
: "(none)", target
? target
: "(none)");
1859 unsigned long rqd_flags
= 0;
1860 if (mountflags
& MS_RDONLY
)
1861 rqd_flags
|= MS_RDONLY
;
1863 if (statvfs(fsname
, &sb
) == 0) {
1864 unsigned long required_flags
= rqd_flags
;
1865 if (sb
.f_flag
& MS_NOSUID
)
1866 required_flags
|= MS_NOSUID
;
1867 if (sb
.f_flag
& MS_NODEV
&& !dev
)
1868 required_flags
|= MS_NODEV
;
1869 if (sb
.f_flag
& MS_RDONLY
)
1870 required_flags
|= MS_RDONLY
;
1871 if (sb
.f_flag
& MS_NOEXEC
)
1872 required_flags
|= MS_NOEXEC
;
1873 DEBUG("(at remount) flags for %s was %lu, required extra flags are %lu", fsname
, sb
.f_flag
, required_flags
);
1875 * If this was a bind mount request, and required_flags
1876 * does not have any flags which are not already in
1877 * mountflags, then skip the remount
1879 if (!(mountflags
& MS_REMOUNT
)) {
1880 if (!(required_flags
& ~mountflags
) && rqd_flags
== 0) {
1881 DEBUG("mountflags already was %lu, skipping remount",
1886 mountflags
|= required_flags
;
1890 if (mount(fsname
, target
, fstype
,
1891 mountflags
| MS_REMOUNT
, data
) < 0) {
1893 INFO("failed to mount '%s' on '%s' (optional): %s",
1894 fsname
, target
, strerror(errno
));
1898 SYSERROR("failed to mount '%s' on '%s'",
1908 DEBUG("mounted '%s' on '%s', type '%s'", fsname
, target
, fstype
);
/*
 * Remove 'optional', 'create=dir', and 'create=file' from mntopt
 *
 * mntent->mnt_opts is rewritten in place. Only complete comma-separated
 * options are removed, so an option whose value merely contains one of these
 * words (e.g. "lowerdir=/srv/optional/x") is left alone. Every occurrence is
 * removed and no dangling ',' is left behind when the last option is culled.
 */
static void cull_mntent_opt(struct mntent *mntent)
{
	static const char *const culled[] = { "create=dir",
					      "create=file",
					      "optional",
					      NULL };
	char *opts = mntent->mnt_opts;
	char *rd, *wr;

	if (!opts)
		return;

	/* Compact the list in place: rd walks the options, wr trails behind
	 * and receives the ones that are kept. wr never overtakes rd.
	 */
	rd = wr = opts;
	while (*rd) {
		size_t len = strcspn(rd, ",");
		int drop = 0;
		int i;

		for (i = 0; culled[i]; i++) {
			if (strlen(culled[i]) == len &&
			    strncmp(rd, culled[i], len) == 0) {
				drop = 1;
				break;
			}
		}

		if (!drop) {
			if (wr != opts)
				*wr++ = ',';
			memmove(wr, rd, len);
			wr += len;
		}

		rd += len;
		if (*rd == ',')
			rd++;
	}
	*wr = '\0';
}
/* Create whatever a mount entry needs before it can be mounted.
 *
 * - "overlay*" / "aufs*" entries: delegate directory creation to ovl_mkdir()
 *   or aufs_mkdir(); a failure there aborts immediately.
 * - "create=dir":  create @path (and parents) as a directory.
 * - "create=file": if @path is not accessible, create its parent directory
 *   and then @path itself as an empty file.
 *
 * Returns 0 on success, -1 on failure.
 * NOTE(review): which failures set the return value was not visible in the
 * reviewed listing; a failed parent-directory creation only warns because the
 * following fopen() reports the real outcome - confirm against callers.
 */
static int mount_entry_create_dir_file(const struct mntent *mntent,
				       const char* path, const struct lxc_rootfs *rootfs,
				       const char *lxc_name, const char *lxc_path)
{
	int ret = 0;
	FILE *pathfile = NULL;

	if (strncmp(mntent->mnt_type, "overlay", 7) == 0) {
		if (ovl_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	} else if (strncmp(mntent->mnt_type, "aufs", 4) == 0) {
		if (aufs_mkdir(mntent, rootfs, lxc_name, lxc_path) < 0)
			return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		if (mkdir_p(path, 0755) < 0) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		}
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		/* dirname() may modify its argument and may return a pointer
		 * that is not the start of the buffer (or not inside it at
		 * all), so the allocation is tracked separately from the
		 * result and only the allocation is freed.
		 */
		char *pathcopy = strdup(path);
		char *pathdirname;

		if (!pathcopy) {
			SYSERROR("failed to allocate memory");
			return -1;
		}

		pathdirname = dirname(pathcopy);
		if (mkdir_p(pathdirname, 0755) < 0) {
			WARN("Failed to create target directory");
		}
		free(pathcopy);

		pathfile = fopen(path, "wb");
		if (!pathfile) {
			WARN("Failed to create mount target '%s'", path);
			ret = -1;
		} else {
			fclose(pathfile);
		}
	}

	return ret;
}
1979 /* rootfs, lxc_name, and lxc_path can be NULL when the container is created
1980 * without a rootfs. */
1981 static inline int mount_entry_on_generic(struct mntent
*mntent
,
1982 const char* path
, const struct lxc_rootfs
*rootfs
,
1983 const char *lxc_name
, const char *lxc_path
)
1985 unsigned long mntflags
;
1988 bool optional
= hasmntopt(mntent
, "optional") != NULL
;
1989 bool dev
= hasmntopt(mntent
, "dev") != NULL
;
1991 char *rootfs_path
= NULL
;
1992 if (rootfs
&& rootfs
->path
)
1993 rootfs_path
= rootfs
->mount
;
1995 ret
= mount_entry_create_dir_file(mntent
, path
, rootfs
, lxc_name
, lxc_path
);
1998 return optional
? 0 : -1;
2000 cull_mntent_opt(mntent
);
2002 if (parse_mntopts(mntent
->mnt_opts
, &mntflags
, &mntdata
) < 0) {
2007 ret
= mount_entry(mntent
->mnt_fsname
, path
, mntent
->mnt_type
, mntflags
,
2008 mntdata
, optional
, dev
, rootfs_path
);
2014 static inline int mount_entry_on_systemfs(struct mntent
*mntent
)
2016 char path
[MAXPATHLEN
];
2019 /* For containers created without a rootfs all mounts are treated as
2020 * absolute paths starting at / on the host. */
2021 if (mntent
->mnt_dir
[0] != '/')
2022 ret
= snprintf(path
, sizeof(path
), "/%s", mntent
->mnt_dir
);
2024 ret
= snprintf(path
, sizeof(path
), "%s", mntent
->mnt_dir
);
2026 if (ret
< 0 || ret
>= sizeof(path
)) {
2027 ERROR("path name too long");
2031 return mount_entry_on_generic(mntent
, path
, NULL
, NULL
, NULL
);
2034 static int mount_entry_on_absolute_rootfs(struct mntent
*mntent
,
2035 const struct lxc_rootfs
*rootfs
,
2036 const char *lxc_name
,
2037 const char *lxc_path
)
2040 char path
[MAXPATHLEN
];
2041 int r
, ret
= 0, offset
;
2042 const char *lxcpath
;
2044 lxcpath
= lxc_global_config_value("lxc.lxcpath");
2046 ERROR("Out of memory");
2050 /* if rootfs->path is a blockdev path, allow container fstab to
2051 * use $lxcpath/CN/rootfs as the target prefix */
2052 r
= snprintf(path
, MAXPATHLEN
, "%s/%s/rootfs", lxcpath
, lxc_name
);
2053 if (r
< 0 || r
>= MAXPATHLEN
)
2056 aux
= strstr(mntent
->mnt_dir
, path
);
2058 offset
= strlen(path
);
2063 aux
= strstr(mntent
->mnt_dir
, rootfs
->path
);
2065 WARN("ignoring mount point '%s'", mntent
->mnt_dir
);
2068 offset
= strlen(rootfs
->path
);
2072 r
= snprintf(path
, MAXPATHLEN
, "%s/%s", rootfs
->mount
,
2074 if (r
< 0 || r
>= MAXPATHLEN
) {
2075 WARN("pathnme too long for '%s'", mntent
->mnt_dir
);
2079 return mount_entry_on_generic(mntent
, path
, rootfs
, lxc_name
, lxc_path
);
2082 static int mount_entry_on_relative_rootfs(struct mntent
*mntent
,
2083 const struct lxc_rootfs
*rootfs
,
2084 const char *lxc_name
,
2085 const char *lxc_path
)
2087 char path
[MAXPATHLEN
];
2090 /* relative to root mount point */
2091 ret
= snprintf(path
, sizeof(path
), "%s/%s", rootfs
->mount
, mntent
->mnt_dir
);
2092 if (ret
< 0 || ret
>= sizeof(path
)) {
2093 ERROR("path name too long");
2097 return mount_entry_on_generic(mntent
, path
, rootfs
, lxc_name
, lxc_path
);
2100 static int mount_file_entries(const struct lxc_rootfs
*rootfs
, FILE *file
,
2101 const char *lxc_name
, const char *lxc_path
)
2103 struct mntent mntent
;
2107 while (getmntent_r(file
, &mntent
, buf
, sizeof(buf
))) {
2109 if (!rootfs
->path
) {
2110 if (mount_entry_on_systemfs(&mntent
))
2115 /* We have a separate root, mounts are relative to it */
2116 if (mntent
.mnt_dir
[0] != '/') {
2117 if (mount_entry_on_relative_rootfs(&mntent
, rootfs
, lxc_name
, lxc_path
))
2122 if (mount_entry_on_absolute_rootfs(&mntent
, rootfs
, lxc_name
, lxc_path
))
2128 INFO("mount points have been setup");
2133 static int setup_mount(const struct lxc_rootfs
*rootfs
, const char *fstab
,
2134 const char *lxc_name
, const char *lxc_path
)
2142 file
= setmntent(fstab
, "r");
2144 SYSERROR("failed to use '%s'", fstab
);
2148 ret
= mount_file_entries(rootfs
, file
, lxc_name
, lxc_path
);
2154 FILE *make_anonymous_mount_file(struct lxc_list
*mount
)
2158 struct lxc_list
*iterator
;
2162 fd
= memfd_create("lxc_mount_file", MFD_CLOEXEC
);
2164 if (errno
!= ENOSYS
)
2168 file
= fdopen(fd
, "r+");
2172 int saved_errno
= errno
;
2175 ERROR("Could not create mount entry file: %s.", strerror(saved_errno
));
2179 lxc_list_for_each(iterator
, mount
) {
2180 mount_entry
= iterator
->elem
;
2181 ret
= fprintf(file
, "%s\n", mount_entry
);
2182 if (ret
< strlen(mount_entry
))
2183 WARN("Could not write mount entry to anonymous mount file.");
2186 if (fseek(file
, 0, SEEK_SET
) < 0) {
2194 static int setup_mount_entries(const struct lxc_rootfs
*rootfs
,
2195 struct lxc_list
*mount
, const char *lxc_name
,
2196 const char *lxc_path
)
2201 file
= make_anonymous_mount_file(mount
);
2205 ret
= mount_file_entries(rootfs
, file
, lxc_name
, lxc_path
);
2211 static int parse_cap(const char *cap
)
2217 if (!strcmp(cap
, "none"))
2220 for (i
= 0; i
< sizeof(caps_opt
)/sizeof(caps_opt
[0]); i
++) {
2222 if (strcmp(cap
, caps_opt
[i
].name
))
2225 capid
= caps_opt
[i
].value
;
2230 /* try to see if it's numeric, so the user may specify
2231 * capabilities that the running kernel knows about but
2234 capid
= strtol(cap
, &ptr
, 10);
2235 if (!ptr
|| *ptr
!= '\0' || errno
!= 0)
2236 /* not a valid number */
2238 else if (capid
> lxc_caps_last_cap())
2239 /* we have a number but it's not a valid
2247 int in_caplist(int cap
, struct lxc_list
*caps
)
2249 struct lxc_list
*iterator
;
2252 lxc_list_for_each(iterator
, caps
) {
2253 capid
= parse_cap(iterator
->elem
);
2261 static int setup_caps(struct lxc_list
*caps
)
2263 struct lxc_list
*iterator
;
2267 lxc_list_for_each(iterator
, caps
) {
2269 drop_entry
= iterator
->elem
;
2271 capid
= parse_cap(drop_entry
);
2274 ERROR("unknown capability %s", drop_entry
);
2278 DEBUG("drop capability '%s' (%d)", drop_entry
, capid
);
2280 if (prctl(PR_CAPBSET_DROP
, capid
, 0, 0, 0)) {
2281 SYSERROR("failed to remove %s capability", drop_entry
);
2287 DEBUG("capabilities have been setup");
2292 static int dropcaps_except(struct lxc_list
*caps
)
2294 struct lxc_list
*iterator
;
2297 int numcaps
= lxc_caps_last_cap() + 1;
2298 INFO("found %d capabilities", numcaps
);
2300 if (numcaps
<= 0 || numcaps
> 200)
2303 // caplist[i] is 1 if we keep capability i
2304 int *caplist
= alloca(numcaps
* sizeof(int));
2305 memset(caplist
, 0, numcaps
* sizeof(int));
2307 lxc_list_for_each(iterator
, caps
) {
2309 keep_entry
= iterator
->elem
;
2311 capid
= parse_cap(keep_entry
);
2317 ERROR("unknown capability %s", keep_entry
);
2321 DEBUG("keep capability '%s' (%d)", keep_entry
, capid
);
2325 for (i
=0; i
<numcaps
; i
++) {
2328 if (prctl(PR_CAPBSET_DROP
, i
, 0, 0, 0)) {
2329 SYSERROR("failed to remove capability %d", i
);
2334 DEBUG("capabilities have been setup");
2339 static int setup_hw_addr(char *hwaddr
, const char *ifname
)
2341 struct sockaddr sockaddr
;
2343 int ret
, fd
, saved_errno
;
2345 ret
= lxc_convert_mac(hwaddr
, &sockaddr
);
2347 ERROR("mac address '%s' conversion failed : %s",
2348 hwaddr
, strerror(-ret
));
2352 memcpy(ifr
.ifr_name
, ifname
, IFNAMSIZ
);
2353 ifr
.ifr_name
[IFNAMSIZ
-1] = '\0';
2354 memcpy((char *) &ifr
.ifr_hwaddr
, (char *) &sockaddr
, sizeof(sockaddr
));
2356 fd
= socket(AF_INET
, SOCK_DGRAM
, 0);
2358 ERROR("socket failure : %s", strerror(errno
));
2362 ret
= ioctl(fd
, SIOCSIFHWADDR
, &ifr
);
2363 saved_errno
= errno
;
2366 ERROR("ioctl failure : %s", strerror(saved_errno
));
2368 DEBUG("mac address '%s' on '%s' has been setup", hwaddr
, ifr
.ifr_name
);
2373 static int setup_ipv4_addr(struct lxc_list
*ip
, int ifindex
)
2375 struct lxc_list
*iterator
;
2376 struct lxc_inetdev
*inetdev
;
2379 lxc_list_for_each(iterator
, ip
) {
2381 inetdev
= iterator
->elem
;
2383 err
= lxc_ipv4_addr_add(ifindex
, &inetdev
->addr
,
2384 &inetdev
->bcast
, inetdev
->prefix
);
2386 ERROR("failed to setup_ipv4_addr ifindex %d : %s",
2387 ifindex
, strerror(-err
));
2395 static int setup_ipv6_addr(struct lxc_list
*ip
, int ifindex
)
2397 struct lxc_list
*iterator
;
2398 struct lxc_inet6dev
*inet6dev
;
2401 lxc_list_for_each(iterator
, ip
) {
2403 inet6dev
= iterator
->elem
;
2405 err
= lxc_ipv6_addr_add(ifindex
, &inet6dev
->addr
,
2406 &inet6dev
->mcast
, &inet6dev
->acast
,
2409 ERROR("failed to setup_ipv6_addr ifindex %d : %s",
2410 ifindex
, strerror(-err
));
2418 static int setup_netdev(struct lxc_netdev
*netdev
)
2420 char ifname
[IFNAMSIZ
];
2421 char *current_ifname
= ifname
;
2424 /* empty network namespace */
2425 if (!netdev
->ifindex
) {
2426 if (netdev
->flags
& IFF_UP
) {
2427 err
= lxc_netdev_up("lo");
2429 ERROR("failed to set the loopback up : %s",
2434 if (netdev
->type
!= LXC_NET_VETH
)
2436 netdev
->ifindex
= if_nametoindex(netdev
->name
);
2439 /* get the new ifindex in case of physical netdev */
2440 if (netdev
->type
== LXC_NET_PHYS
) {
2441 if (!(netdev
->ifindex
= if_nametoindex(netdev
->link
))) {
2442 ERROR("failed to get ifindex for %s",
2448 /* retrieve the name of the interface */
2449 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2450 ERROR("no interface corresponding to index '%d'",
2455 /* default: let the system to choose one interface name */
2457 netdev
->name
= netdev
->type
== LXC_NET_PHYS
?
2458 netdev
->link
: "eth%d";
2460 /* rename the interface name */
2461 if (strcmp(ifname
, netdev
->name
) != 0) {
2462 err
= lxc_netdev_rename_by_name(ifname
, netdev
->name
);
2464 ERROR("failed to rename %s->%s : %s", ifname
, netdev
->name
,
2470 /* Re-read the name of the interface because its name has changed
2471 * and would be automatically allocated by the system
2473 if (!if_indextoname(netdev
->ifindex
, current_ifname
)) {
2474 ERROR("no interface corresponding to index '%d'",
2479 /* set a mac address */
2480 if (netdev
->hwaddr
) {
2481 if (setup_hw_addr(netdev
->hwaddr
, current_ifname
)) {
2482 ERROR("failed to setup hw address for '%s'",
2488 /* setup ipv4 addresses on the interface */
2489 if (setup_ipv4_addr(&netdev
->ipv4
, netdev
->ifindex
)) {
2490 ERROR("failed to setup ip addresses for '%s'",
2495 /* setup ipv6 addresses on the interface */
2496 if (setup_ipv6_addr(&netdev
->ipv6
, netdev
->ifindex
)) {
2497 ERROR("failed to setup ipv6 addresses for '%s'",
2502 /* set the network device up */
2503 if (netdev
->flags
& IFF_UP
) {
2506 err
= lxc_netdev_up(current_ifname
);
2508 ERROR("failed to set '%s' up : %s", current_ifname
,
2513 /* the network is up, make the loopback up too */
2514 err
= lxc_netdev_up("lo");
2516 ERROR("failed to set the loopback up : %s",
2522 /* We can only set up the default routes after bringing
2523 * up the interface, since bringing up the interface adds
2524 * the link-local routes and we can't add a default
2525 * route if the gateway is not reachable. */
2527 /* setup ipv4 gateway on the interface */
2528 if (netdev
->ipv4_gateway
) {
2529 if (!(netdev
->flags
& IFF_UP
)) {
2530 ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname
);
2534 if (lxc_list_empty(&netdev
->ipv4
)) {
2535 ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname
);
2539 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2541 err
= lxc_ipv4_dest_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2543 ERROR("failed to add ipv4 dest for '%s': %s",
2544 ifname
, strerror(-err
));
2547 err
= lxc_ipv4_gateway_add(netdev
->ifindex
, netdev
->ipv4_gateway
);
2549 ERROR("failed to setup ipv4 gateway for '%s': %s",
2550 ifname
, strerror(-err
));
2551 if (netdev
->ipv4_gateway_auto
) {
2552 char buf
[INET_ADDRSTRLEN
];
2553 inet_ntop(AF_INET
, netdev
->ipv4_gateway
, buf
, sizeof(buf
));
2554 ERROR("tried to set autodetected ipv4 gateway '%s'", buf
);
2561 /* setup ipv6 gateway on the interface */
2562 if (netdev
->ipv6_gateway
) {
2563 if (!(netdev
->flags
& IFF_UP
)) {
2564 ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname
);
2568 if (lxc_list_empty(&netdev
->ipv6
) && !IN6_IS_ADDR_LINKLOCAL(netdev
->ipv6_gateway
)) {
2569 ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname
);
2573 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2575 err
= lxc_ipv6_dest_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2577 ERROR("failed to add ipv6 dest for '%s': %s",
2578 ifname
, strerror(-err
));
2581 err
= lxc_ipv6_gateway_add(netdev
->ifindex
, netdev
->ipv6_gateway
);
2583 ERROR("failed to setup ipv6 gateway for '%s': %s",
2584 ifname
, strerror(-err
));
2585 if (netdev
->ipv6_gateway_auto
) {
2586 char buf
[INET6_ADDRSTRLEN
];
2587 inet_ntop(AF_INET6
, netdev
->ipv6_gateway
, buf
, sizeof(buf
));
2588 ERROR("tried to set autodetected ipv6 gateway '%s'", buf
);
2595 DEBUG("'%s' has been setup", current_ifname
);
2600 static int setup_network(struct lxc_list
*network
)
2602 struct lxc_list
*iterator
;
2603 struct lxc_netdev
*netdev
;
2605 lxc_list_for_each(iterator
, network
) {
2607 netdev
= iterator
->elem
;
2609 if (setup_netdev(netdev
)) {
2610 ERROR("failed to setup netdev");
2615 if (!lxc_list_empty(network
))
2616 INFO("network has been setup");
2621 static int parse_resource(const char *res
) {
2625 for (i
= 0; i
< sizeof(limit_opt
)/sizeof(limit_opt
[0]); ++i
) {
2626 if (strcmp(res
, limit_opt
[i
].name
) == 0)
2627 return limit_opt
[i
].value
;
2630 /* try to see if it's numeric, so the user may specify
2631 * resources that the running kernel knows about but
2633 if (lxc_safe_int(res
, &resid
) == 0)
2638 int setup_resource_limits(struct lxc_list
*limits
, pid_t pid
) {
2639 struct lxc_list
*it
;
2640 struct lxc_limit
*lim
;
2643 lxc_list_for_each(it
, limits
) {
2646 resid
= parse_resource(lim
->resource
);
2648 ERROR("unknown resource %s", lim
->resource
);
2652 if (prlimit(pid
, resid
, &lim
->limit
, NULL
) != 0) {
2653 ERROR("failed to set limit %s: %s", lim
->resource
, strerror(errno
));
2660 /* try to move physical nics to the init netns */
2661 void lxc_restore_phys_nics_to_netns(int netnsfd
, struct lxc_conf
*conf
)
2664 char ifname
[IFNAMSIZ
];
2666 if (netnsfd
< 0 || conf
->num_savednics
== 0)
2669 INFO("Running to reset %d nic names.", conf
->num_savednics
);
2671 oldfd
= lxc_preserve_ns(getpid(), "net");
2673 SYSERROR("Failed to open monitor netns fd.");
2677 if (setns(netnsfd
, 0) != 0) {
2678 SYSERROR("Failed to enter container netns to reset nics");
2682 for (i
=0; i
<conf
->num_savednics
; i
++) {
2683 struct saved_nic
*s
= &conf
->saved_nics
[i
];
2684 /* retrieve the name of the interface */
2685 if (!if_indextoname(s
->ifindex
, ifname
)) {
2686 WARN("no interface corresponding to index '%d'", s
->ifindex
);
2689 if (lxc_netdev_move_by_name(ifname
, 1, s
->orig_name
))
2690 WARN("Error moving nic name:%s back to host netns", ifname
);
2693 conf
->num_savednics
= 0;
2695 if (setns(oldfd
, 0) != 0)
2696 SYSERROR("Failed to re-enter monitor's netns");
2700 static char *default_rootfs_mount
= LXCROOTFSMOUNT
;
2702 struct lxc_conf
*lxc_conf_init(void)
2704 struct lxc_conf
*new;
2707 new = malloc(sizeof(*new));
2709 ERROR("lxc_conf_init : %m");
2712 memset(new, 0, sizeof(*new));
2714 new->loglevel
= LXC_LOG_PRIORITY_NOTSET
;
2715 new->personality
= -1;
2717 new->console
.log_path
= NULL
;
2718 new->console
.log_fd
= -1;
2719 new->console
.path
= NULL
;
2720 new->console
.peer
= -1;
2721 new->console
.peerpty
.busy
= -1;
2722 new->console
.peerpty
.master
= -1;
2723 new->console
.peerpty
.slave
= -1;
2724 new->console
.master
= -1;
2725 new->console
.slave
= -1;
2726 new->console
.name
[0] = '\0';
2727 new->maincmd_fd
= -1;
2729 new->rootfs
.mount
= strdup(default_rootfs_mount
);
2730 if (!new->rootfs
.mount
) {
2731 ERROR("lxc_conf_init : %m");
2737 lxc_list_init(&new->cgroup
);
2738 lxc_list_init(&new->network
);
2739 lxc_list_init(&new->mount_list
);
2740 lxc_list_init(&new->caps
);
2741 lxc_list_init(&new->keepcaps
);
2742 lxc_list_init(&new->id_map
);
2743 lxc_list_init(&new->includes
);
2744 lxc_list_init(&new->aliens
);
2745 lxc_list_init(&new->environment
);
2746 lxc_list_init(&new->limits
);
2747 for (i
=0; i
<NUM_LXC_HOOKS
; i
++)
2748 lxc_list_init(&new->hooks
[i
]);
2749 lxc_list_init(&new->groups
);
2750 new->lsm_aa_profile
= NULL
;
2751 new->lsm_se_context
= NULL
;
2752 new->tmp_umount_proc
= 0;
2754 for (i
= 0; i
< LXC_NS_MAX
; i
++)
2755 new->inherit_ns_fd
[i
] = -1;
2757 /* if running in a new user namespace, init and COMMAND
2758 * default to running as UID/GID 0 when using lxc-execute */
2765 static int instantiate_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2767 char *veth1
, *veth2
;
2768 char veth1buf
[IFNAMSIZ
], veth2buf
[IFNAMSIZ
];
2769 int bridge_index
, err
;
2770 unsigned int mtu
= 0;
2772 if (netdev
->priv
.veth_attr
.pair
) {
2773 veth1
= netdev
->priv
.veth_attr
.pair
;
2774 if (handler
->conf
->reboot
)
2775 lxc_netdev_delete_by_name(veth1
);
2777 err
= snprintf(veth1buf
, sizeof(veth1buf
), "vethXXXXXX");
2778 if (err
>= sizeof(veth1buf
)) { /* can't *really* happen, but... */
2779 ERROR("veth1 name too long");
2782 veth1
= lxc_mkifname(veth1buf
);
2784 ERROR("failed to allocate a temporary name");
2787 /* store away for deconf */
2788 memcpy(netdev
->priv
.veth_attr
.veth1
, veth1
, IFNAMSIZ
);
2791 snprintf(veth2buf
, sizeof(veth2buf
), "vethXXXXXX");
2792 veth2
= lxc_mkifname(veth2buf
);
2794 ERROR("failed to allocate a temporary name");
2798 err
= lxc_veth_create(veth1
, veth2
);
2800 ERROR("failed to create veth pair \"%s\" and \"%s\": %s", veth1
,
2801 veth2
, strerror(-err
));
2805 /* changing the high byte of the mac address to 0xfe, the bridge interface
2806 * will always keep the host's mac address and not take the mac address
2808 err
= setup_private_host_hw_addr(veth1
);
2810 ERROR("failed to change mac address of host interface \"%s\": %s",
2811 veth1
, strerror(-err
));
2815 netdev
->ifindex
= if_nametoindex(veth2
);
2816 if (!netdev
->ifindex
) {
2817 ERROR("failed to retrieve the index for \"%s\"", veth2
);
2822 if (lxc_safe_uint(netdev
->mtu
, &mtu
) < 0)
2823 WARN("failed to parse mtu from");
2825 INFO("retrieved mtu %d", mtu
);
2826 } else if (netdev
->link
) {
2827 bridge_index
= if_nametoindex(netdev
->link
);
2829 mtu
= netdev_get_mtu(bridge_index
);
2830 INFO("retrieved mtu %d from %s", mtu
, netdev
->link
);
2832 mtu
= netdev_get_mtu(netdev
->ifindex
);
2833 INFO("retrieved mtu %d from %s", mtu
, veth2
);
2838 err
= lxc_netdev_set_mtu(veth1
, mtu
);
2840 err
= lxc_netdev_set_mtu(veth2
, mtu
);
2842 ERROR("failed to set mtu \"%d\" for veth pair \"%s\" "
2844 mtu
, veth1
, veth2
, strerror(-err
));
2850 err
= lxc_bridge_attach(handler
->lxcpath
, handler
->name
, netdev
->link
, veth1
);
2852 ERROR("failed to attach \"%s\" to bridge \"%s\": %s",
2853 veth1
, netdev
->link
, strerror(-err
));
2856 INFO("attached \"%s\" to bridge \"%s\"", veth1
, netdev
->link
);
2859 err
= lxc_netdev_up(veth1
);
2861 ERROR("failed to set \"%s\" up: %s", veth1
, strerror(-err
));
2865 if (netdev
->upscript
) {
2866 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2867 "veth", veth1
, (char*) NULL
);
2872 DEBUG("instantiated veth \"%s/%s\", index is \"%d\"", veth1
, veth2
,
2878 if (netdev
->ifindex
!= 0)
2879 lxc_netdev_delete_by_name(veth1
);
2880 if (!netdev
->priv
.veth_attr
.pair
)
2886 static int shutdown_veth(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2891 if (netdev
->priv
.veth_attr
.pair
)
2892 veth1
= netdev
->priv
.veth_attr
.pair
;
2894 veth1
= netdev
->priv
.veth_attr
.veth1
;
2896 if (netdev
->downscript
) {
2897 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2898 "down", "veth", veth1
, (char*) NULL
);
2905 static int instantiate_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2907 char peerbuf
[IFNAMSIZ
], *peer
;
2910 if (!netdev
->link
) {
2911 ERROR("no link specified for macvlan netdev");
2915 err
= snprintf(peerbuf
, sizeof(peerbuf
), "mcXXXXXX");
2916 if (err
>= sizeof(peerbuf
))
2919 peer
= lxc_mkifname(peerbuf
);
2921 ERROR("failed to make a temporary name");
2925 err
= lxc_macvlan_create(netdev
->link
, peer
,
2926 netdev
->priv
.macvlan_attr
.mode
);
2928 ERROR("failed to create macvlan interface '%s' on '%s' : %s",
2929 peer
, netdev
->link
, strerror(-err
));
2933 netdev
->ifindex
= if_nametoindex(peer
);
2934 if (!netdev
->ifindex
) {
2935 ERROR("failed to retrieve the index for %s", peer
);
2939 if (netdev
->upscript
) {
2940 err
= run_script(handler
->name
, "net", netdev
->upscript
, "up",
2941 "macvlan", netdev
->link
, (char*) NULL
);
2946 DEBUG("instantiated macvlan '%s', index is '%d' and mode '%d'",
2947 peer
, netdev
->ifindex
, netdev
->priv
.macvlan_attr
.mode
);
2951 lxc_netdev_delete_by_name(peer
);
// shutdown_macvlan: tear-down hook for a macvlan network device.
// When netdev->downscript is set it is run through run_script() with
// handler->name, "net", <downscript>, "down", "macvlan", <link>; the rest of
// the argument list and the return handling are not visible here.
2956 static int shutdown_macvlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2960 if (netdev
->downscript
) {
2961 err
= run_script(handler
->name
, "net", netdev
->downscript
,
2962 "down", "macvlan", netdev
->link
,
2970 /* XXX: merge with instantiate_macvlan */
2971 static int instantiate_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
2973 char peer
[IFNAMSIZ
];
2975 static uint16_t vlan_cntr
= 0;
2976 unsigned int mtu
= 0;
2978 if (!netdev
->link
) {
2979 ERROR("no link specified for vlan netdev");
2983 err
= snprintf(peer
, sizeof(peer
), "vlan%d-%d", netdev
->priv
.vlan_attr
.vid
, vlan_cntr
++);
2984 if (err
>= sizeof(peer
)) {
2985 ERROR("peer name too long");
2989 err
= lxc_vlan_create(netdev
->link
, peer
, netdev
->priv
.vlan_attr
.vid
);
2991 ERROR("failed to create vlan interface '%s' on '%s' : %s",
2992 peer
, netdev
->link
, strerror(-err
));
2996 netdev
->ifindex
= if_nametoindex(peer
);
2997 if (!netdev
->ifindex
) {
2998 ERROR("failed to retrieve the ifindex for %s", peer
);
2999 lxc_netdev_delete_by_name(peer
);
3003 DEBUG("instantiated vlan '%s', ifindex is '%d'", " vlan1000",
3006 if (lxc_safe_uint(netdev
->mtu
, &mtu
) < 0) {
3007 ERROR("Failed to retrieve mtu from: '%d'/'%s'.",
3008 netdev
->ifindex
, netdev
->name
);
3011 err
= lxc_netdev_set_mtu(peer
, mtu
);
3013 ERROR("failed to set mtu '%s' for %s : %s",
3014 netdev
->mtu
, peer
, strerror(-err
));
3015 lxc_netdev_delete_by_name(peer
);
// shutdown_vlan: tear-down hook for vlan devices; it has the same
// (handler, netdev) signature as the other shutdown_* hooks.
// NOTE(review): the body is not visible in this copy.
3023 static int shutdown_vlan(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
// instantiate_phys: prepare an existing physical interface for the container.
// netdev->link names the host interface and is required; its index is looked
// up with if_nametoindex() and stored in netdev->ifindex (0 is treated as
// "not found"). The optional upscript is run with "up", "phys", <link>.
3028 static int instantiate_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3030 if (!netdev
->link
) {
3031 ERROR("no link specified for the physical interface");
3035 netdev
->ifindex
= if_nametoindex(netdev
->link
);
3036 if (!netdev
->ifindex
) {
3037 ERROR("failed to retrieve the index for %s", netdev
->link
);
3041 if (netdev
->upscript
) {
3043 err
= run_script(handler
->name
, "net", netdev
->upscript
,
3044 "up", "phys", netdev
->link
, (char*) NULL
);
// shutdown_phys: tear-down hook for a physical interface.
// Runs the optional downscript with "down", "phys", <link>.
3052 static int shutdown_phys(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3056 if (netdev
->downscript
) {
3057 err
= run_script(handler
->name
, "net", netdev
->downscript
,
3058 "down", "phys", netdev
->link
, (char*) NULL
);
// instantiate_none: hook for the "none" network type; the only visible
// effect is resetting netdev->ifindex to 0.
3065 static int instantiate_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3067 netdev
->ifindex
= 0;
// instantiate_empty: hook for the "empty" network type.
// Resets netdev->ifindex to 0 (no interface is created) and runs the optional
// upscript with "up", "empty".
3071 static int instantiate_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3073 netdev
->ifindex
= 0;
3074 if (netdev
->upscript
) {
3076 err
= run_script(handler
->name
, "net", netdev
->upscript
,
3077 "up", "empty", (char*) NULL
);
// shutdown_empty: tear-down hook for the "empty" network type.
// Runs the optional downscript with "down", "empty".
3084 static int shutdown_empty(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
3088 if (netdev
->downscript
) {
3089 err
= run_script(handler
->name
, "net", netdev
->downscript
,
3090 "down", "empty", (char*) NULL
);
// shutdown_none: tear-down hook for the "none" network type; it has the same
// (handler, netdev) signature as the other shutdown_* hooks.
// NOTE(review): the body is not visible in this copy.
3097 static int shutdown_none(struct lxc_handler
*handler
, struct lxc_netdev
*netdev
)
// lxc_requests_empty_network: inspect handler->conf->network for entries of
// type LXC_NET_NONE.
// The list is walked once, tracking found_none and found_nic; the final test
// is "found_none && !found_nic", i.e. a "none" entry with no other nic.
// An empty list is handled up front. The values returned in each case are not
// visible here.
3102 int lxc_requests_empty_network(struct lxc_handler
*handler
)
3104 struct lxc_list
*network
= &handler
->conf
->network
;
3105 struct lxc_list
*iterator
;
3106 struct lxc_netdev
*netdev
;
3107 bool found_none
= false, found_nic
= false;
3109 if (lxc_list_empty(network
))
3112 lxc_list_for_each(iterator
, network
) {
3114 netdev
= iterator
->elem
;
3116 if (netdev
->type
== LXC_NET_NONE
)
3121 if (found_none
&& !found_nic
)
// lxc_create_network: instantiate every netdev in handler->conf->network.
// Each entry's type is range-checked against [0, LXC_NET_MAXCONFTYPE] and
// then dispatched through the netdev_conf[] table of instantiate hooks; a
// non-zero hook result is reported as "failed to create netdev".
// am_root records whether the caller's real uid is 0; how it is used is not
// visible here.
3126 int lxc_create_network(struct lxc_handler
*handler
)
3128 struct lxc_list
*network
= &handler
->conf
->network
;
3129 struct lxc_list
*iterator
;
3130 struct lxc_netdev
*netdev
;
3131 int am_root
= (getuid() == 0);
3136 lxc_list_for_each(iterator
, network
) {
3138 netdev
= iterator
->elem
;
3140 if (netdev
->type
< 0 || netdev
->type
> LXC_NET_MAXCONFTYPE
) {
3141 ERROR("invalid network configuration type '%d'",
3146 if (netdev_conf
[netdev
->type
](handler
, netdev
)) {
3147 ERROR("failed to create netdev");
// lxc_delete_network: best-effort teardown of every netdev in
// handler->conf->network. For each device, as far as the visible lines show:
//  - a physical device (LXC_NET_PHYS) with a non-zero ifindex is renamed back
//    to netdev->link; a failure only produces a WARN;
//  - the type-specific netdev_deconf[] hook is called; a failure only WARNs;
//  - a non-zero ifindex is deleted by index: ENODEV is logged at INFO level,
//    any other negative result clears deleted_all and WARNs;
//  - for veth devices with a non-zero ifindex, when !am_unpriv(), the
//    host-side interface (the configured pair, else a non-empty veth1) is
//    deleted by name; the veth1 buffer is zeroed afterwards.
// deleted_all starts out true; presumably it is the return value (the return
// statement is not visible here).
3156 bool lxc_delete_network(struct lxc_handler
*handler
)
3159 struct lxc_list
*network
= &handler
->conf
->network
;
3160 struct lxc_list
*iterator
;
3161 struct lxc_netdev
*netdev
;
3162 bool deleted_all
= true;
3164 lxc_list_for_each(iterator
, network
) {
3165 netdev
= iterator
->elem
;
3167 if (netdev
->ifindex
!= 0 && netdev
->type
== LXC_NET_PHYS
) {
3168 if (lxc_netdev_rename_by_index(netdev
->ifindex
, netdev
->link
))
3169 WARN("Failed to rename interface with index %d "
3170 "to its initial name \"%s\".",
3171 netdev
->ifindex
, netdev
->link
);
3175 if (netdev_deconf
[netdev
->type
](handler
, netdev
)) {
3176 WARN("Failed to destroy netdev");
3179 /* Recent kernel remove the virtual interfaces when the network
3180 * namespace is destroyed but in case we did not moved the
3181 * interface to the network namespace, we have to destroy it
3183 if (netdev
->ifindex
!= 0) {
3184 ret
= lxc_netdev_delete_by_index(netdev
->ifindex
);
3185 if (-ret
== ENODEV
) {
3186 INFO("Interface \"%s\" with index %d already "
3187 "deleted or existing in different network "
3189 netdev
->name
? netdev
->name
: "(null)",
3191 } else if (ret
< 0) {
3192 deleted_all
= false;
3193 WARN("Failed to remove interface \"%s\" with "
3195 netdev
->name
? netdev
->name
: "(null)",
3196 netdev
->ifindex
, strerror(-ret
));
3198 INFO("Removed interface \"%s\" with index %d.",
3199 netdev
->name
? netdev
->name
: "(null)",
3204 /* Explicitly delete host veth device to prevent lingering
3205 * devices. We had issues in LXD around this.
3207 if (netdev
->ifindex
!= 0 && netdev
->type
== LXC_NET_VETH
&& !am_unpriv()) {
3209 if (netdev
->priv
.veth_attr
.pair
) {
3210 hostveth
= netdev
->priv
.veth_attr
.pair
;
3211 ret
= lxc_netdev_delete_by_name(hostveth
);
3213 WARN("Failed to remove interface \"%s\" from host: %s.", hostveth
, strerror(-ret
));
3215 INFO("Removed interface \"%s\" from host.", hostveth
);
3217 } else if (strlen(netdev
->priv
.veth_attr
.veth1
) > 0) {
3218 hostveth
= netdev
->priv
.veth_attr
.veth1
;
3219 ret
= lxc_netdev_delete_by_name(hostveth
);
3221 WARN("Failed to remove \"%s\" from host: %s.", hostveth
, strerror(-ret
));
3223 INFO("Removed interface \"%s\" from host.", hostveth
);
3224 memset((void *)&netdev
->priv
.veth_attr
.veth1
, 0, sizeof(netdev
->priv
.veth_attr
.veth1
));
3233 #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
3235 /* lxc-user-nic returns "interface_name:interface_name\n" */
3236 #define MAX_BUFFER_SIZE IFNAMSIZ * 2 + 2
3237 static int unpriv_assign_nic(const char *lxcpath
, char *lxcname
,
3238 struct lxc_netdev
*netdev
, pid_t pid
)
3241 int bytes
, pipefd
[2];
3242 char *token
, *saveptr
= NULL
;
3243 char buffer
[MAX_BUFFER_SIZE
];
3244 char netdev_link
[IFNAMSIZ
+ 1];
3246 if (netdev
->type
!= LXC_NET_VETH
) {
3247 ERROR("nic type %d not support for unprivileged use",
3252 if (pipe(pipefd
) < 0) {
3253 SYSERROR("pipe failed");
3265 if (child
== 0) { // child
3266 /* Call lxc-user-nic pid type bridge. */
3268 char pidstr
[LXC_NUMSTRLEN64
];
3270 close(pipefd
[0]); /* Close the read-end of the pipe. */
3272 /* Redirect stdout to write-end of the pipe. */
3273 ret
= dup2(pipefd
[1], STDOUT_FILENO
);
3274 close(pipefd
[1]); /* Close the write-end of the pipe. */
3276 SYSERROR("Failed to dup2() to redirect stdout to pipe file descriptor.");
3281 strncpy(netdev_link
, netdev
->link
, IFNAMSIZ
);
3283 strncpy(netdev_link
, "none", IFNAMSIZ
);
3285 ret
= snprintf(pidstr
, LXC_NUMSTRLEN64
, "%d", pid
);
3286 if (ret
< 0 || ret
>= LXC_NUMSTRLEN64
)
3288 pidstr
[LXC_NUMSTRLEN64
- 1] = '\0';
3290 INFO("Execing lxc-user-nic %s %s %s veth %s %s", lxcpath
,
3291 lxcname
, pidstr
, netdev_link
, netdev
->name
);
3292 execlp(LXC_USERNIC_PATH
, LXC_USERNIC_PATH
, lxcpath
, lxcname
,
3293 pidstr
, "veth", netdev_link
, netdev
->name
, NULL
);
3295 SYSERROR("Failed to exec lxc-user-nic.");
3299 /* close the write-end of the pipe */
3302 bytes
= read(pipefd
[0], &buffer
, MAX_BUFFER_SIZE
);
3304 SYSERROR("Failed to read from pipe file descriptor.");
3305 buffer
[bytes
- 1] = '\0';
3307 if (wait_for_pid(child
) != 0) {
3312 /* close the read-end of the pipe */
3315 /* fill netdev->name field */
3316 token
= strtok_r(buffer
, ":", &saveptr
);
3320 netdev
->name
= malloc(IFNAMSIZ
+ 1);
3321 if (!netdev
->name
) {
3322 SYSERROR("Failed to allocate memory.");
3325 memset(netdev
->name
, 0, IFNAMSIZ
+ 1);
3326 strncpy(netdev
->name
, token
, IFNAMSIZ
);
3328 /* fill netdev->veth_attr.pair field */
3329 token
= strtok_r(NULL
, ":", &saveptr
);
3333 netdev
->priv
.veth_attr
.pair
= strdup(token
);
3334 if (!netdev
->priv
.veth_attr
.pair
) {
3335 ERROR("Failed to allocate memory.");
// lxc_assign_network: hand each configured netdev over to process `pid`.
// As far as the visible lines show:
//  - veth devices are delegated to unpriv_assign_nic() when the caller's real
//    uid is not 0; an INFO notes that mtu is ignored in that case (the
//    condition guarding that message is not visible here);
//  - devices with ifindex == 0 have nothing to move;
//  - otherwise the interface name is looked up with if_indextoname() and the
//    device is moved with lxc_netdev_move_by_name(ifname, pid, NULL).
// NOTE(review): the move-failure message prints netdev->link, while the
// interface that was actually moved is ifname; confirm this is intended.
3342 int lxc_assign_network(const char *lxcpath
, char *lxcname
,
3343 struct lxc_list
*network
, pid_t pid
)
3345 struct lxc_list
*iterator
;
3346 struct lxc_netdev
*netdev
;
3347 char ifname
[IFNAMSIZ
];
3348 int am_root
= (getuid() == 0);
3351 lxc_list_for_each(iterator
, network
) {
3353 netdev
= iterator
->elem
;
3355 if (netdev
->type
== LXC_NET_VETH
&& !am_root
) {
3357 INFO("mtu ignored due to insufficient privilege");
3358 if (unpriv_assign_nic(lxcpath
, lxcname
, netdev
, pid
))
3360 // lxc-user-nic has moved the nic to the new ns.
3361 // unpriv_assign_nic() fills in netdev->name.
3362 // netdev->ifindex will be filled in at setup_netdev.
3366 /* empty network namespace, nothing to move */
3367 if (!netdev
->ifindex
)
3370 /* retrieve the name of the interface */
3371 if (!if_indextoname(netdev
->ifindex
, ifname
)) {
3372 ERROR("no interface corresponding to index '%d'", netdev
->ifindex
);
3376 err
= lxc_netdev_move_by_name(ifname
, pid
, NULL
);
3378 ERROR("failed to move '%s' to the container : %s",
3379 netdev
->link
, strerror(-err
));
3383 DEBUG("move '%s'/'%s' to '%d': .", ifname
, netdev
->name
, pid
);
// write_id_mapping: write a prepared mapping string to /proc/<pid>/uid_map or
// /proc/<pid>/gid_map, selected by idtype (ID_TYPE_UID -> 'u', otherwise 'g').
// The path is built with snprintf() and checked for truncation, the file is
// opened O_WRONLY, and buf is written with lxc_write_nointr(); a result
// different from buf_size is reported with SYSERROR.
// NOTE(review): the path-length ERROR prints the possibly truncated path.
3389 static int write_id_mapping(enum idtype idtype
, pid_t pid
, const char *buf
,
3392 char path
[MAXPATHLEN
];
3395 ret
= snprintf(path
, MAXPATHLEN
, "/proc/%d/%cid_map", pid
,
3396 idtype
== ID_TYPE_UID
? 'u' : 'g');
3397 if (ret
< 0 || ret
>= MAXPATHLEN
) {
3398 ERROR("failed to create path \"%s\"", path
);
3402 fd
= open(path
, O_WRONLY
);
3404 SYSERROR("failed to open \"%s\"", path
);
3409 ret
= lxc_write_nointr(fd
, buf
, buf_size
);
3410 if (ret
!= buf_size
) {
3411 SYSERROR("failed to write %cid mapping to \"%s\"",
3412 idtype
== ID_TYPE_UID
? 'u' : 'g', path
);
3421 /* Check whether a binary exist and has either CAP_SETUID, CAP_SETGID or both. */
3422 static int idmaptool_on_path_and_privileged(const char *binary
, cap_value_t cap
)
3429 path
= on_path(binary
, NULL
);
3433 ret
= stat(path
, &st
);
3439 /* Check if the binary is setuid. */
3440 if (st
.st_mode
& S_ISUID
) {
3441 DEBUG("The binary \"%s\" does have the setuid bit set.", path
);
3446 #if HAVE_LIBCAP && LIBCAP_SUPPORTS_FILE_CAPABILITIES
3447 /* Check if it has the CAP_SETUID capability. */
3448 if ((cap
& CAP_SETUID
) &&
3449 lxc_file_cap_is_set(path
, CAP_SETUID
, CAP_EFFECTIVE
) &&
3450 lxc_file_cap_is_set(path
, CAP_SETUID
, CAP_PERMITTED
)) {
3451 DEBUG("The binary \"%s\" has CAP_SETUID in its CAP_EFFECTIVE "
3452 "and CAP_PERMITTED sets.", path
);
3457 /* Check if it has the CAP_SETGID capability. */
3458 if ((cap
& CAP_SETGID
) &&
3459 lxc_file_cap_is_set(path
, CAP_SETGID
, CAP_EFFECTIVE
) &&
3460 lxc_file_cap_is_set(path
, CAP_SETGID
, CAP_PERMITTED
)) {
3461 DEBUG("The binary \"%s\" has CAP_SETGID in its CAP_EFFECTIVE "
3462 "and CAP_PERMITTED sets.", path
);
3467 /* If we cannot check for file capabilities we need to give the benefit
3468 * of the doubt. Otherwise we might fail even though all the necessary
3469 * file capabilities are set.
3471 DEBUG("Cannot check for file capabilites as full capability support is "
3472 "missing. Manual intervention needed.");
// lxc_map_ids_exec_wrapper: callback that replaces the current process with
// "/bin/sh -c <args>"; args is treated as the command string.
// NOTE(review): the string is interpreted by a shell; lxc_map_ids() passes
// this wrapper to run_command().
3481 int lxc_map_ids_exec_wrapper(void *args
)
3483 execl("/bin/sh", "sh", "-c", (char *)args
, (char *)NULL
);
// lxc_map_ids: install the uid and gid mappings from `idmap` for process `pid`.
// Visible flow:
//  - idmaptool_on_path_and_privileged() probes newuidmap and newgidmap; when
//    both probes return > 0 the tools are reported as functional (presumably
//    this is where use_shadow gets set), otherwise the DEBUG message says the
//    maps are written directly;
//  - the for loop runs once per id type ('u', then 'g'); each list entry of
//    the matching type is appended to mapbuf with snprintf(), prefixed by a
//    space when use_shadow is set and terminated by a newline when it is not;
//  - the mapping is applied either through run_command() with
//    lxc_map_ids_exec_wrapper or through write_id_mapping();
//  - mapbuf is cleared with memset() after each pass.
// NOTE(review): when snprintf() reports an error or truncation
// (fill <= 0 || fill >= left) only a SYSERROR is visible; confirm the loop
// does not go on to advance pos past the buffer.
3487 int lxc_map_ids(struct lxc_list
*idmap
, pid_t pid
)
3490 struct lxc_list
*iterator
;
3495 char cmd_output
[MAXPATHLEN
];
3496 /* strlen("new@idmap") = 9
3504 * We add some additional space to make sure that we really have
3505 * LXC_IDMAPLEN bytes available for our the {g,u]id mapping.
3507 char mapbuf
[9 + 1 + LXC_NUMSTRLEN64
+ 1 + LXC_IDMAPLEN
] = {0};
3508 int ret
= 0, uidmap
= 0, gidmap
= 0;
3509 bool use_shadow
= false, had_entry
= false;
3511 /* If new{g,u}idmap exists, that is, if shadow is handing out subuid
3512 * ranges, then insist that root also reserve ranges in subuid. This
3513 * will protected it by preventing another user from being handed the
3516 uidmap
= idmaptool_on_path_and_privileged("newuidmap", CAP_SETUID
);
3517 gidmap
= idmaptool_on_path_and_privileged("newgidmap", CAP_SETGID
);
3518 if (uidmap
> 0 && gidmap
> 0) {
3519 DEBUG("Functional newuidmap and newgidmap binary found.");
3522 /* In case unprivileged users run application containers via
3523 * execute() or a start*() there are valid cases where they may
3524 * only want to map their own {g,u}id. Let's not block them from
3525 * doing so by requiring geteuid() == 0.
3527 DEBUG("No newuidmap and newgidmap binary found. Trying to "
3528 "write directly with euid %d.", geteuid());
3531 for (type
= ID_TYPE_UID
, u_or_g
= 'u'; type
<= ID_TYPE_GID
;
3532 type
++, u_or_g
= 'g') {
3536 pos
+= sprintf(mapbuf
, "new%cidmap %d", u_or_g
, pid
);
3538 lxc_list_for_each(iterator
, idmap
) {
3539 /* The kernel only takes <= 4k for writes to
3540 * /proc/<nr>/[ug]id_map
3542 map
= iterator
->elem
;
3543 if (map
->idtype
!= type
)
3548 left
= LXC_IDMAPLEN
- (pos
- mapbuf
);
3549 fill
= snprintf(pos
, left
, "%s%lu %lu %lu%s",
3550 use_shadow
? " " : "", map
->nsid
,
3551 map
->hostid
, map
->range
,
3552 use_shadow
? "" : "\n");
3553 if (fill
<= 0 || fill
>= left
)
3554 SYSERROR("Too many {g,u}id mappings defined.");
3561 /* Try to catch the ouput of new{g,u}idmap to make debugging
3565 ret
= run_command(cmd_output
, sizeof(cmd_output
),
3566 lxc_map_ids_exec_wrapper
,
3569 ERROR("new%cidmap failed to write mapping: %s",
3570 u_or_g
, cmd_output
);
3574 ret
= write_id_mapping(type
, pid
, mapbuf
, pos
- mapbuf
);
3579 memset(mapbuf
, 0, sizeof(mapbuf
));
// get_mapped_rootid: walk conf->id_map, skipping entries whose idtype differs
// from the requested one. The existing description states that it returns
// true when the id was found and false otherwise; the statements that select
// the entry and store the result are not visible here.
3586 * return the host uid/gid to which the container root is mapped in
3588 * Return true if id was found, false otherwise.
3590 bool get_mapped_rootid(struct lxc_conf
*conf
, enum idtype idtype
,
3593 struct lxc_list
*it
;
3596 lxc_list_for_each(it
, &conf
->id_map
) {
3598 if (map
->idtype
!= idtype
)
// mapped_hostid: translate host id `id` into the id it has inside the
// container, using the conf->id_map entries of the given idtype.
// For the first entry whose host range [hostid, hostid + range) contains id,
// the result is (id - hostid) + nsid.
// The value returned when no entry matches is not visible here;
// chown_mapped_root() tests the result with "< 0".
3608 int mapped_hostid(unsigned id
, struct lxc_conf
*conf
, enum idtype idtype
)
3610 struct lxc_list
*it
;
3612 lxc_list_for_each(it
, &conf
->id_map
) {
3614 if (map
->idtype
!= idtype
)
3616 if (id
>= map
->hostid
&& id
< map
->hostid
+ map
->range
)
3617 return (id
- map
->hostid
) + map
->nsid
;
// find_unmapped_nsid: search for a container-side id of the given idtype that
// no conf->id_map entry covers.
// freeid starts at 0; whenever it falls inside an entry's namespace range
// [nsid, nsid + range) it is moved to the first id after that range.
// How the walk continues after such a move and what is returned are not
// visible here.
3622 int find_unmapped_nsid(struct lxc_conf
*conf
, enum idtype idtype
)
3624 struct lxc_list
*it
;
3626 unsigned int freeid
= 0;
3628 lxc_list_for_each(it
, &conf
->id_map
) {
3630 if (map
->idtype
!= idtype
)
3632 if (freeid
>= map
->nsid
&& freeid
< map
->nsid
+ map
->range
) {
3633 freeid
= map
->nsid
+ map
->range
;
// lxc_find_gateway_addresses: resolve "gateway = auto" settings for every
// netdev in handler->conf->network.
// Devices with neither ipv4_gateway_auto nor ipv6_gateway_auto are skipped.
// Automatic gateways are supported only for veth and macvlan devices and
// need netdev->link. The link's interface index is looked up with
// if_nametoindex() and the gateway addresses are taken from that interface
// with lxc_ipv4_addr_get() / lxc_ipv6_addr_get(); a non-zero result from
// either is reported as an error.
3640 int lxc_find_gateway_addresses(struct lxc_handler
*handler
)
3642 struct lxc_list
*network
= &handler
->conf
->network
;
3643 struct lxc_list
*iterator
;
3644 struct lxc_netdev
*netdev
;
3647 lxc_list_for_each(iterator
, network
) {
3648 netdev
= iterator
->elem
;
3650 if (!netdev
->ipv4_gateway_auto
&& !netdev
->ipv6_gateway_auto
)
3653 if (netdev
->type
!= LXC_NET_VETH
&& netdev
->type
!= LXC_NET_MACVLAN
) {
3654 ERROR("gateway = auto only supported for "
3655 "veth and macvlan");
3659 if (!netdev
->link
) {
3660 ERROR("gateway = auto needs a link interface");
3664 link_index
= if_nametoindex(netdev
->link
);
3668 if (netdev
->ipv4_gateway_auto
) {
3669 if (lxc_ipv4_addr_get(link_index
, &netdev
->ipv4_gateway
)) {
3670 ERROR("failed to automatically find ipv4 gateway "
3671 "address from link interface '%s'", netdev
->link
);
3676 if (netdev
->ipv6_gateway_auto
) {
3677 if (lxc_ipv6_addr_get(link_index
, &netdev
->ipv6_gateway
)) {
3678 ERROR("failed to automatically find ipv6 gateway "
3679 "address from link interface '%s'", netdev
->link
);
// lxc_create_tty: allocate conf->tty pty pairs and record them in
// conf->tty_info.
// An array of conf->tty pty_info entries is allocated with malloc(); each
// entry is filled by openpty(), which stores the master fd, slave fd and
// device name. If openpty() fails for entry i, nbtty is set to i so that
// lxc_delete_tty() releases exactly the pairs opened so far.
// FD_CLOEXEC is then set on both descriptors so they are not leaked to the
// container; a failure there only produces a WARN.
// On completion nbtty is set to conf->tty.
3688 int lxc_create_tty(const char *name
, struct lxc_conf
*conf
)
3690 struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
3693 /* no tty in the configuration */
3697 tty_info
->pty_info
= malloc(sizeof(*tty_info
->pty_info
) * conf
->tty
);
3698 if (!tty_info
->pty_info
) {
3699 SYSERROR("failed to allocate struct *pty_info");
3703 for (i
= 0; i
< conf
->tty
; i
++) {
3704 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3707 ret
= openpty(&pty_info
->master
, &pty_info
->slave
,
3708 pty_info
->name
, NULL
, NULL
);
3711 SYSERROR("failed to create pty device number %d", i
);
3712 tty_info
->nbtty
= i
;
3713 lxc_delete_tty(tty_info
);
3717 DEBUG("allocated pty \"%s\" with master fd %d and slave fd %d",
3718 pty_info
->name
, pty_info
->master
, pty_info
->slave
);
3720 /* Prevent leaking the file descriptors to the container */
3721 ret
= fcntl(pty_info
->master
, F_SETFD
, FD_CLOEXEC
);
3723 WARN("failed to set FD_CLOEXEC flag on master fd %d of "
3724 "pty device \"%s\": %s",
3725 pty_info
->master
, pty_info
->name
, strerror(errno
));
3727 ret
= fcntl(pty_info
->slave
, F_SETFD
, FD_CLOEXEC
);
3729 WARN("failed to set FD_CLOEXEC flag on slave fd %d of "
3730 "pty device \"%s\": %s",
3731 pty_info
->slave
, pty_info
->name
, strerror(errno
));
3736 tty_info
->nbtty
= conf
->tty
;
3738 INFO("finished allocating %d pts devices", conf
->tty
);
// lxc_delete_tty: release everything recorded in tty_info.
// Closes the master and slave fd of each of the nbtty entries, frees the
// pty_info array, and resets pty_info to NULL and nbtty to 0 so the
// structure can be reused or released again without a double free.
3742 void lxc_delete_tty(struct lxc_tty_info
*tty_info
)
3746 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
3747 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
3749 close(pty_info
->master
);
3750 close(pty_info
->slave
);
3753 free(tty_info
->pty_info
);
3754 tty_info
->pty_info
= NULL
;
3755 tty_info
->nbtty
= 0;
// chown_mapped_root_exec_wrapper: callback that replaces the current process
// with "lxc-usernsexec", using args as the argv array for execvp().
// chown_mapped_root() passes this wrapper to run_command().
3759 int chown_mapped_root_exec_wrapper(void *args
)
3761 execvp("lxc-usernsexec", args
);
// Visible flow of chown_mapped_root():
//  - look up the host uid and gid that container root maps to with
//    get_mapped_rootid(); a failed lookup is an ERROR;
//  - for paths prefixed "overlayfs:" or "aufs:", advance chownpath past the
//    second ':' (ERROR "Bad overlay path" otherwise);
//  - chown(path, rootuid, rootgid) is attempted directly (the condition under
//    which this branch runs is not visible here);
//  - if rootuid equals our effective uid no chown is needed;
//  - otherwise stat() the path, change its group to our effective gid when we
//    own it and its gid is not mapped into the container, then build the
//    "u:..." / "g:..." map strings and the "0:<gid>" owner string and run
//    lxc-usernsexec through run_command(); args1 or args2 is presumably
//    selected by the "hostgid == sb.st_gid" test.
3766 * chown_mapped_root: for an unprivileged user with uid/gid X to
3767 * chown a dir to subuid/subgid Y, he needs to run chown as root
3768 * in a userns where nsid 0 is mapped to hostuid/hostgid Y, and
3769 * nsid Y is mapped to hostuid/hostgid X. That way, the container
3770 * root is privileged with respect to hostuid/hostgid X, allowing
3771 * him to do the chown.
3773 int chown_mapped_root(char *path
, struct lxc_conf
*conf
)
3775 uid_t rootuid
, rootgid
;
3777 char *chownpath
= path
;
3778 int hostuid
, hostgid
, ret
;
3780 char map1
[100], map2
[100], map3
[100], map4
[100], map5
[100];
3782 char *args1
[] = {"lxc-usernsexec",
3787 "--", "chown", ugid
, path
,
3789 char *args2
[] = {"lxc-usernsexec",
3795 "--", "chown", ugid
, path
,
3797 char cmd_output
[MAXPATHLEN
];
3799 hostuid
= geteuid();
3800 hostgid
= getegid();
3802 if (!get_mapped_rootid(conf
, ID_TYPE_UID
, &val
)) {
3803 ERROR("No uid mapping for container root");
3806 rootuid
= (uid_t
)val
;
3807 if (!get_mapped_rootid(conf
, ID_TYPE_GID
, &val
)) {
3808 ERROR("No gid mapping for container root");
3811 rootgid
= (gid_t
)val
;
3814 * In case of overlay, we want only the writeable layer to be chowned
3816 if (strncmp(path
, "overlayfs:", 10) == 0 || strncmp(path
, "aufs:", 5) == 0) {
3817 chownpath
= strchr(path
, ':');
3819 ERROR("Bad overlay path: %s", path
);
3822 chownpath
= strchr(chownpath
+ 1, ':');
3824 ERROR("Bad overlay path: %s", path
);
3831 if (chown(path
, rootuid
, rootgid
) < 0) {
3832 ERROR("Error chowning %s", path
);
3838 if (rootuid
== hostuid
) {
3840 INFO("%s: container root is our uid; no need to chown" ,__func__
);
3844 // save the current gid of "path"
3845 if (stat(path
, &sb
) < 0) {
3846 ERROR("Error stat %s", path
);
3851 * A file has to be group-owned by a gid mapped into the
3852 * container, or the container won't be privileged over it.
3854 DEBUG("trying to chown \"%s\" to %d", path
, hostgid
);
3855 if (sb
.st_uid
== hostuid
&&
3856 mapped_hostid(sb
.st_gid
, conf
, ID_TYPE_GID
) < 0 &&
3857 chown(path
, -1, hostgid
) < 0) {
3858 ERROR("Failed chgrping %s", path
);
3863 ret
= snprintf(map1
, 100, "u:0:%d:1", rootuid
);
3864 if (ret
< 0 || ret
>= 100) {
3865 ERROR("Error uid printing map string");
3869 // "u:hostuid:hostuid:1"
3870 ret
= snprintf(map2
, 100, "u:%d:%d:1", hostuid
, hostuid
);
3871 if (ret
< 0 || ret
>= 100) {
3872 ERROR("Error uid printing map string");
3877 ret
= snprintf(map3
, 100, "g:0:%d:1", rootgid
);
3878 if (ret
< 0 || ret
>= 100) {
3879 ERROR("Error gid printing map string");
3883 // "g:pathgid:rootgid+pathgid:1"
3884 ret
= snprintf(map4
, 100, "g:%d:%d:1", (gid_t
)sb
.st_gid
,
3885 rootgid
+ (gid_t
)sb
.st_gid
);
3886 if (ret
< 0 || ret
>= 100) {
3887 ERROR("Error gid printing map string");
3891 // "g:hostgid:hostgid:1"
3892 ret
= snprintf(map5
, 100, "g:%d:%d:1", hostgid
, hostgid
);
3893 if (ret
< 0 || ret
>= 100) {
3894 ERROR("Error gid printing map string");
3898 // "0:pathgid" (chown)
3899 ret
= snprintf(ugid
, 100, "0:%d", (gid_t
)sb
.st_gid
);
3900 if (ret
< 0 || ret
>= 100) {
3901 ERROR("Error owner printing format string for chown");
3905 if (hostgid
== sb
.st_gid
)
3906 ret
= run_command(cmd_output
, sizeof(cmd_output
),
3907 chown_mapped_root_exec_wrapper
,
3910 ret
= run_command(cmd_output
, sizeof(cmd_output
),
3911 chown_mapped_root_exec_wrapper
,
3914 ERROR("lxc-usernsexec failed: %s", cmd_output
);
// ttys_shift_ids: give the container's mapped root ownership of the console.
// An empty c->id_map is handled up front. When c->console.name is a non-empty
// string it is chowned with chown_mapped_root(); a negative result is
// reported as an ERROR.
3919 int ttys_shift_ids(struct lxc_conf
*c
)
3921 if (lxc_list_empty(&c
->id_map
))
3924 if (strcmp(c
->console
.name
, "") !=0 && chown_mapped_root(c
->console
.name
, c
) < 0) {
3925 ERROR("Failed to chown %s", c
->console
.name
);
// lxc_create_tmp_proc_mount: make sure /proc is mounted for the container.
// lxc_mount_proc_if_needed() is called with the rootfs mount point, or with
// "" when no rootfs path is configured. A result of -1 is logged and, per the
// existing comment, only tolerated when there is no rootfs; a result of 1
// sets conf->tmp_umount_proc so that tmp_proc_unmount() can act on it later.
3932 /* NOTE: Must not be called from inside the container namespace! */
3933 int lxc_create_tmp_proc_mount(struct lxc_conf
*conf
)
3937 mounted
= lxc_mount_proc_if_needed(conf
->rootfs
.path
? conf
->rootfs
.mount
: "");
3938 if (mounted
== -1) {
3939 SYSERROR("failed to mount /proc in the container");
3940 /* continue only if there is no rootfs */
3941 if (conf
->rootfs
.path
)
3943 } else if (mounted
== 1) {
3944 conf
->tmp_umount_proc
= 1;
// tmp_proc_unmount: counterpart of lxc_create_tmp_proc_mount().
// Acts only when lxc_conf->tmp_umount_proc is 1 and clears the flag
// afterwards; the unmount call itself is not visible here.
3950 void tmp_proc_unmount(struct lxc_conf
*lxc_conf
)
3952 if (lxc_conf
->tmp_umount_proc
== 1) {
3954 lxc_conf
->tmp_umount_proc
= 0;
// remount_all_slave: mark every shared mount in the current mount namespace
// as a slave mount.
// Each line of /proc/self/mountinfo is split with get_field(): field 4 is
// used as the mount target and the field two further on as the options that
// are searched for "shared". Matching targets are remounted with MS_SLAVE.
// Failures are logged and startup continues.
3958 void remount_all_slave(void)
3960 /* walk /proc/self/mountinfo and change any shared entries to slave */
3961 FILE *f
= fopen("/proc/self/mountinfo", "r");
3966 SYSERROR("Failed to open /proc/self/mountinfo to mark all shared");
3967 ERROR("Continuing container startup...");
3971 while (getline(&line
, &len
, f
) != -1) {
3972 char *target
, *opts
;
3973 target
= get_field(line
, 4);
3976 opts
= get_field(target
, 2);
3979 null_endofword(opts
);
3980 if (!strstr(opts
, "shared"))
3982 null_endofword(target
);
3983 if (mount(NULL
, target
, NULL
, MS_SLAVE
, NULL
)) {
3984 SYSERROR("Failed to make %s rslave", target
);
3985 ERROR("Continuing...");
// lxc_execute_bind_init: bind-mount the host's static init binary into the
// container.
// choose_init() is consulted first; per the existing comment nothing is
// mounted when the container already has an init. Otherwise the host path is
// SBINDIR "/init.lxc.static" and the target is <rootfs mount>/init.lxc.static.
// A missing target file is created with fopen(..., "wb") to serve as the
// mount point, and the source is bind-mounted onto it with safe_mount().
// All failures are logged only; the function returns void.
// NOTE(review): the final INFO prints path (the host source) although the
// message reads "bound into container at"; destpath may have been intended.
3992 void lxc_execute_bind_init(struct lxc_conf
*conf
)
3995 char path
[PATH_MAX
], destpath
[PATH_MAX
], *p
;
3997 /* If init exists in the container, don't bind mount a static one */
3998 p
= choose_init(conf
->rootfs
.mount
);
4004 ret
= snprintf(path
, PATH_MAX
, SBINDIR
"/init.lxc.static");
4005 if (ret
< 0 || ret
>= PATH_MAX
) {
4006 WARN("Path name too long searching for lxc.init.static");
4010 if (!file_exists(path
)) {
4011 INFO("%s does not exist on host", path
);
4015 ret
= snprintf(destpath
, PATH_MAX
, "%s%s", conf
->rootfs
.mount
, "/init.lxc.static");
4016 if (ret
< 0 || ret
>= PATH_MAX
) {
4017 WARN("Path name too long for container's lxc.init.static");
4021 if (!file_exists(destpath
)) {
4022 FILE * pathfile
= fopen(destpath
, "wb");
4024 SYSERROR("Failed to create mount target '%s'", destpath
);
4030 ret
= safe_mount(path
, destpath
, "none", MS_BIND
, NULL
, conf
->rootfs
.mount
);
4032 SYSERROR("Failed to bind lxc.init.static into container");
4033 INFO("lxc.init.static bound into container at %s", path
);
// do_rootfs_setup: prepare the container's root filesystem.
// If conf->rootfs_setup is already set, the rootfs mount point is bind-mounted
// onto itself so that this namespace has its own mount to pivot_root into.
// Otherwise remount_all_slave() is called, the "pre-mount" hooks are run and
// setup_rootfs() mounts the rootfs; conf->rootfs_setup is then set to true.
4037 * This does the work of remounting / if it is shared, calling the
4038 * container pre-mount hooks, and mounting the rootfs.
4040 int do_rootfs_setup(struct lxc_conf
*conf
, const char *name
, const char *lxcpath
)
4042 if (conf
->rootfs_setup
) {
4044 * rootfs was set up in another namespace. bind-mount it
4045 * to give us a mount in our own ns so we can pivot_root to it
4047 const char *path
= conf
->rootfs
.mount
;
4048 if (mount(path
, path
, "rootfs", MS_BIND
, NULL
) < 0) {
4049 ERROR("Failed to bind-mount container / onto itself");
4055 remount_all_slave();
4057 if (run_lxc_hooks(name
, "pre-mount", conf
, lxcpath
, NULL
)) {
4058 ERROR("failed to run pre-mount hooks for container '%s'.", name
);
4062 if (setup_rootfs(conf
)) {
4063 ERROR("failed to setup rootfs for '%s'", name
);
4067 conf
->rootfs_setup
= true;
// verify_start_hooks: check that every configured start hook exists inside
// the container.
// For each entry of conf->hooks[LXCHOOK_START] the hook name is appended to
// the rootfs mount point (or to "" when no rootfs path is configured) and the
// resulting path is checked with stat(); a missing hook is reported with
// SYSERROR.
4071 static bool verify_start_hooks(struct lxc_conf
*conf
)
4073 struct lxc_list
*it
;
4074 char path
[MAXPATHLEN
];
4075 lxc_list_for_each(it
, &conf
->hooks
[LXCHOOK_START
]) {
4076 char *hookname
= it
->elem
;
4080 ret
= snprintf(path
, MAXPATHLEN
, "%s%s",
4081 conf
->rootfs
.path
? conf
->rootfs
.mount
: "", hookname
);
4082 if (ret
< 0 || ret
>= MAXPATHLEN
)
4084 ret
= stat(path
, &st
);
4086 SYSERROR("Start hook %s not found in container",
// send_fd: pass file descriptor fd over the unix socket sock using
// lxc_abstract_unix_send_fd() with no payload (NULL, 0); a failure is
// reported as "Error sending tty fd to parent".
4096 static int send_fd(int sock
, int fd
)
4098 int ret
= lxc_abstract_unix_send_fd(sock
, fd
, NULL
, 0);
4102 SYSERROR("Error sending tty fd to parent");
// send_ttys_to_parent: transfer all allocated pty pairs to the parent over
// handler->ttysock[0].
// For each of the nbtty entries the slave fd is sent first, then the master
// fd; afterwards both local descriptors are closed and set to -1 because the
// parent now holds them. Both ends of ttysock are closed once the loop is
// done. The exact error branching is not visible here.
4109 static int send_ttys_to_parent(struct lxc_handler
*handler
)
4112 struct lxc_conf
*conf
= handler
->conf
;
4113 const struct lxc_tty_info
*tty_info
= &conf
->tty_info
;
4114 int sock
= handler
->ttysock
[0];
4116 for (i
= 0; i
< tty_info
->nbtty
; i
++) {
4117 struct lxc_pty_info
*pty_info
= &tty_info
->pty_info
[i
];
4118 ret
= send_fd(sock
, pty_info
->slave
);
4120 send_fd(sock
, pty_info
->master
);
4121 TRACE("sending pty \"%s\" with master fd %d and slave fd %d to "
4123 pty_info
->name
, pty_info
->master
, pty_info
->slave
);
4124 close(pty_info
->slave
);
4125 pty_info
->slave
= -1;
4126 close(pty_info
->master
);
4127 pty_info
->master
= -1;
4129 ERROR("failed to send pty \"%s\" with master fd %d and "
4130 "slave fd %d to parent : %s",
4131 pty_info
->name
, pty_info
->master
, pty_info
->slave
,
4137 close(handler
->ttysock
[0]);
4138 close(handler
->ttysock
[1]);
4143 ERROR("Error writing tty fd to parent");
// lxc_setup: perform the in-container setup steps in a fixed order. Each
// step logs an ERROR on failure (kmsg setup is explicitly non-fatal):
//   1. rootfs (do_rootfs_setup)
//   2. utsname, unless a UTS namespace fd is inherited
//   3. network
//   4. /dev mount when autodev is enabled
//   5. automatic mounts except the cgroup ones
//   6. fstab mounts and configured mount entries
//   7. verification that start hooks exist in the container
//   8. bind-mount of the static init for lxc-execute
//   9. cgroup automatic mounts (deferred because /sys must be mounted first)
//  10. "mount" hooks; "autodev" hooks and /dev population when autodev is on
//  11. console, optional kmsg, /dev symlinks
//  12. temporary /proc mount, pivot_root, devpts
//  13. tty creation, sending the ttys to the parent, tty setup
//  14. container_ttys environment variable, personality
//  15. capabilities: lxc.cap.keep and lxc.cap.drop are mutually exclusive
// Steps 11 (console, /dev symlinks) and 13 (tty setup) are skipped for
// lxc-execute containers (lxc_conf->is_execute).
4147 int lxc_setup(struct lxc_handler
*handler
)
4149 const char *name
= handler
->name
;
4150 struct lxc_conf
*lxc_conf
= handler
->conf
;
4151 const char *lxcpath
= handler
->lxcpath
;
4153 if (do_rootfs_setup(lxc_conf
, name
, lxcpath
) < 0) {
4154 ERROR("Error setting up rootfs mount after spawn");
4158 if (lxc_conf
->inherit_ns_fd
[LXC_NS_UTS
] == -1) {
4159 if (setup_utsname(lxc_conf
->utsname
)) {
4160 ERROR("failed to setup the utsname for '%s'", name
);
4165 if (setup_network(&lxc_conf
->network
)) {
4166 ERROR("failed to setup the network for '%s'", name
);
4170 if (lxc_conf
->autodev
> 0) {
4171 if (mount_autodev(name
, &lxc_conf
->rootfs
, lxcpath
)) {
4172 ERROR("failed to mount /dev in the container");
4177 /* do automatic mounts (mainly /proc and /sys), but exclude
4178 * those that need to wait until other stuff has finished
4180 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& ~LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
4181 ERROR("failed to setup the automatic mounts for '%s'", name
);
4185 if (setup_mount(&lxc_conf
->rootfs
, lxc_conf
->fstab
, name
, lxcpath
)) {
4186 ERROR("failed to setup the mounts for '%s'", name
);
4190 if (!lxc_list_empty(&lxc_conf
->mount_list
) && setup_mount_entries(&lxc_conf
->rootfs
, &lxc_conf
->mount_list
, name
, lxcpath
)) {
4191 ERROR("failed to setup the mount entries for '%s'", name
);
4195 /* Make sure any start hooks are in the container */
4196 if (!verify_start_hooks(lxc_conf
))
4199 if (lxc_conf
->is_execute
)
4200 lxc_execute_bind_init(lxc_conf
);
4202 /* now mount only cgroup, if wanted;
4203 * before, /sys could not have been mounted
4204 * (is either mounted automatically or via fstab entries)
4206 if (lxc_mount_auto_mounts(lxc_conf
, lxc_conf
->auto_mounts
& LXC_AUTO_CGROUP_MASK
, handler
) < 0) {
4207 ERROR("failed to setup the automatic mounts for '%s'", name
);
4211 if (run_lxc_hooks(name
, "mount", lxc_conf
, lxcpath
, NULL
)) {
4212 ERROR("failed to run mount hooks for container '%s'.", name
);
4216 if (lxc_conf
->autodev
> 0) {
4217 if (run_lxc_hooks(name
, "autodev", lxc_conf
, lxcpath
, NULL
)) {
4218 ERROR("failed to run autodev hooks for container '%s'.", name
);
4221 if (lxc_fill_autodev(&lxc_conf
->rootfs
)) {
4222 ERROR("failed to populate /dev in the container");
4227 if (!lxc_conf
->is_execute
&& lxc_setup_console(&lxc_conf
->rootfs
, &lxc_conf
->console
, lxc_conf
->ttydir
)) {
4228 ERROR("failed to setup the console for '%s'", name
);
4232 if (lxc_conf
->kmsg
) {
4233 if (setup_kmsg(&lxc_conf
->rootfs
, &lxc_conf
->console
)) // don't fail
4234 ERROR("failed to setup kmsg for '%s'", name
);
4237 if (!lxc_conf
->is_execute
&& setup_dev_symlinks(&lxc_conf
->rootfs
)) {
4238 ERROR("failed to setup /dev symlinks for '%s'", name
);
4242 /* mount /proc if it's not already there */
4243 if (lxc_create_tmp_proc_mount(lxc_conf
) < 0) {
4244 ERROR("failed to LSM mount proc for '%s'", name
);
4248 if (setup_pivot_root(&lxc_conf
->rootfs
)) {
4249 ERROR("failed to set rootfs for '%s'", name
);
4253 if (lxc_setup_devpts(lxc_conf
->pts
)) {
4254 ERROR("failed to setup the new pts instance");
4258 if (lxc_create_tty(name
, lxc_conf
)) {
4259 ERROR("failed to create the ttys");
4263 if (send_ttys_to_parent(handler
) < 0) {
4264 ERROR("failure sending console info to parent");
4268 if (!lxc_conf
->is_execute
&& lxc_setup_tty(lxc_conf
)) {
4269 ERROR("failed to setup the ttys for '%s'", name
);
4273 if (lxc_conf
->pty_names
&& setenv("container_ttys", lxc_conf
->pty_names
, 1))
4274 SYSERROR("failed to set environment variable for container ptys");
4277 if (setup_personality(lxc_conf
->personality
)) {
4278 ERROR("failed to setup personality");
4282 if (!lxc_list_empty(&lxc_conf
->keepcaps
)) {
4283 if (!lxc_list_empty(&lxc_conf
->caps
)) {
4284 ERROR("Container requests lxc.cap.drop and lxc.cap.keep: either use lxc.cap.drop or lxc.cap.keep, not both.");
4287 if (dropcaps_except(&lxc_conf
->keepcaps
)) {
4288 ERROR("failed to keep requested caps");
4291 } else if (setup_caps(&lxc_conf
->caps
)) {
4292 ERROR("failed to drop capabilities");
4296 NOTICE("'%s' is setup.", name
);
// run_lxc_hooks: run every configured hook script of one kind.
// The hook name ("pre-start", "pre-mount", "mount", "autodev", "start",
// "stop", "post-stop", "clone" or "destroy") is translated into the matching
// LXCHOOK_* index; the handling of an unknown name is not visible here.
// Every entry of conf->hooks[which] is then run through run_script_argv()
// with name, "lxc", <hook script>, hook, lxcpath and argv.
4301 int run_lxc_hooks(const char *name
, char *hook
, struct lxc_conf
*conf
,
4302 const char *lxcpath
, char *argv
[])
4305 struct lxc_list
*it
;
4307 if (strcmp(hook
, "pre-start") == 0)
4308 which
= LXCHOOK_PRESTART
;
4309 else if (strcmp(hook
, "pre-mount") == 0)
4310 which
= LXCHOOK_PREMOUNT
;
4311 else if (strcmp(hook
, "mount") == 0)
4312 which
= LXCHOOK_MOUNT
;
4313 else if (strcmp(hook
, "autodev") == 0)
4314 which
= LXCHOOK_AUTODEV
;
4315 else if (strcmp(hook
, "start") == 0)
4316 which
= LXCHOOK_START
;
4317 else if (strcmp(hook
, "stop") == 0)
4318 which
= LXCHOOK_STOP
;
4319 else if (strcmp(hook
, "post-stop") == 0)
4320 which
= LXCHOOK_POSTSTOP
;
4321 else if (strcmp(hook
, "clone") == 0)
4322 which
= LXCHOOK_CLONE
;
4323 else if (strcmp(hook
, "destroy") == 0)
4324 which
= LXCHOOK_DESTROY
;
4327 lxc_list_for_each(it
, &conf
->hooks
[which
]) {
4329 char *hookname
= it
->elem
;
4330 ret
= run_script_argv(name
, "lxc", hookname
, hook
, lxcpath
, argv
);
// lxc_remove_nic: release the netdev stored in list node `it`.
// Visible frees: the veth pair name (veth devices only), upscript, hwaddr and
// both gateway addresses; the ipv4 and ipv6 address lists are walked with the
// _safe iterator, which allows entries to be removed during the walk. Any
// other frees and the removal of `it` itself are not visible here.
4337 static void lxc_remove_nic(struct lxc_list
*it
)
4339 struct lxc_netdev
*netdev
= it
->elem
;
4340 struct lxc_list
*it2
,*next
;
4346 if (netdev
->type
== LXC_NET_VETH
)
4347 free(netdev
->priv
.veth_attr
.pair
);
4348 free(netdev
->upscript
);
4349 free(netdev
->hwaddr
);
4351 free(netdev
->ipv4_gateway
);
4352 free(netdev
->ipv6_gateway
);
4353 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4358 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
// lxc_clear_nic: clear one nic, or only its ipv4 / ipv6 addresses, selected
// by key.
// The leading number of key is parsed with sscanf() as the nic index and the
// part starting at the first '.' (if any, and if something follows the dot)
// selects what to clear. The c->network list is walked to find the nic with
// that index; a too-large index or a missing element is rejected. The
// ".ipv4" and ".ipv6" suffixes walk the corresponding address list with the
// _safe iterator.
4367 /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */
4368 int lxc_clear_nic(struct lxc_conf
*c
, const char *key
)
4372 struct lxc_list
*it
;
4373 struct lxc_netdev
*netdev
;
4375 p1
= strchr(key
, '.');
4376 if (!p1
|| *(p1
+1) == '\0')
4379 ret
= sscanf(key
, "%d", &idx
);
4380 if (ret
!= 1) return -1;
4385 lxc_list_for_each(it
, &c
->network
) {
4390 if (i
< idx
) // we don't have that many nics defined
4393 if (!it
|| !it
->elem
)
4400 } else if (strcmp(p1
, ".ipv4") == 0) {
4401 struct lxc_list
*it2
,*next
;
4402 lxc_list_for_each_safe(it2
, &netdev
->ipv4
, next
) {
4407 } else if (strcmp(p1
, ".ipv6") == 0) {
4408 struct lxc_list
*it2
,*next
;
4409 lxc_list_for_each_safe(it2
, &netdev
->ipv6
, next
) {
// Walk c->network with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each NIC node is released here - confirm.
4420 int lxc_clear_config_network(struct lxc_conf
*c
)
4422 struct lxc_list
*it
,*next
;
4423 lxc_list_for_each_safe(it
, &c
->network
, next
) {
// Walk c->caps with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each entry is unlinked and freed - confirm.
4429 int lxc_clear_config_caps(struct lxc_conf
*c
)
4431 struct lxc_list
*it
,*next
;
4433 lxc_list_for_each_safe(it
, &c
->caps
, next
) {
// Walk the id mapping list id_map with the deletion-safe iterator.
// Callers in this file pass both the list embedded in a struct lxc_conf
// and a separately allocated list head.
// NOTE(review): the loop body and the return value are not visible in
// this listing; whether the list head itself is freed cannot be told
// from here.
4441 static int lxc_free_idmap(struct lxc_list
*id_map
) {
4442 struct lxc_list
*it
, *next
;
4444 lxc_list_for_each_safe(it
, id_map
, next
) {
// Clear the id mappings stored in c->id_map.
// Thin wrapper: forwards the address of the embedded list to
// lxc_free_idmap() and returns its result unchanged.
4452 int lxc_clear_idmaps(struct lxc_conf
*c
)
4454 return lxc_free_idmap(&c
->id_map
);
// Walk c->keepcaps with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each entry is unlinked and freed - confirm.
4457 int lxc_clear_config_keepcaps(struct lxc_conf
*c
)
4459 struct lxc_list
*it
,*next
;
4461 lxc_list_for_each_safe(it
, &c
->keepcaps
, next
) {
// Remove cgroup settings from c->cgroup, selected by key.
// A key equal to lxc.cgroup takes the first branch (its body is elided;
// presumably it sets all - confirm). A key that starts with the prefix
// lxc.cgroup. sets k to the text after that prefix.
// Inside the loop an entry is skipped when all is not set and its
// subsystem differs from k; for the remaining entries cg->subsystem is
// freed.
// NOTE(review): k stays NULL unless the prefix branch runs. The handling
// of any other key is on elided lines - confirm strcmp() is never reached
// with k equal to NULL.
4469 int lxc_clear_cgroups(struct lxc_conf
*c
, const char *key
)
4471 struct lxc_list
*it
,*next
;
4473 const char *k
= NULL
;
4475 if (strcmp(key
, "lxc.cgroup") == 0)
4477 else if (strncmp(key
, "lxc.cgroup.", sizeof("lxc.cgroup.")-1) == 0)
4478 k
= key
+ sizeof("lxc.cgroup.")-1;
4482 lxc_list_for_each_safe(it
, &c
->cgroup
, next
) {
4483 struct lxc_cgroup
*cg
= it
->elem
;
4484 if (!all
&& strcmp(cg
->subsystem
, k
) != 0)
4487 free(cg
->subsystem
);
// Remove resource limit settings from c->limits, selected by key.
// A key equal to lxc.limit takes the first branch (its body is elided;
// presumably it sets all - confirm). A key that starts with the prefix
// lxc.limit. sets k to the text after that prefix.
// Inside the loop an entry is skipped when all is not set and its
// resource name differs from k; for the remaining entries lim->resource
// is freed.
// NOTE(review): k stays NULL unless the prefix branch runs. The handling
// of any other key is on elided lines - confirm strcmp() is never reached
// with k equal to NULL.
4495 int lxc_clear_limits(struct lxc_conf
*c
, const char *key
)
4497 struct lxc_list
*it
, *next
;
4499 const char *k
= NULL
;
4501 if (strcmp(key
, "lxc.limit") == 0)
4503 else if (strncmp(key
, "lxc.limit.", sizeof("lxc.limit.")-1) == 0)
4504 k
= key
+ sizeof("lxc.limit.")-1;
4508 lxc_list_for_each_safe(it
, &c
->limits
, next
) {
4509 struct lxc_limit
*lim
= it
->elem
;
4510 if (!all
&& strcmp(lim
->resource
, k
) != 0)
4513 free(lim
->resource
);
// Walk c->groups with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each entry is unlinked and freed - confirm.
4520 int lxc_clear_groups(struct lxc_conf
*c
)
4522 struct lxc_list
*it
,*next
;
4524 lxc_list_for_each_safe(it
, &c
->groups
, next
) {
// Walk c->environment with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each entry is unlinked and freed - confirm.
4532 int lxc_clear_environment(struct lxc_conf
*c
)
4534 struct lxc_list
*it
,*next
;
4536 lxc_list_for_each_safe(it
, &c
->environment
, next
) {
// Walk c->mount_list with the deletion-safe iterator.
// NOTE(review): the loop body and the return value are not visible in
// this listing; presumably each entry is unlinked and freed - confirm.
4545 int lxc_clear_mount_entries(struct lxc_conf
*c
)
4547 struct lxc_list
*it
,*next
;
4549 lxc_list_for_each_safe(it
, &c
->mount_list
, next
) {
// Takes the container configuration c and returns an int status.
// NOTE(review): the whole body is missing from this listing, so what is
// reset and which value is returned cannot be documented from here.
4557 int lxc_clear_automounts(struct lxc_conf
*c
)
// Remove hook entries from c->hooks, selected by key.
// A key equal to lxc.hook takes the first branch (its body is elided;
// presumably it sets all - confirm). A key that starts with the prefix
// lxc.hook. sets k to the text after that prefix.
// Every hook slot i below NUM_LXC_HOOKS is visited; the list in
// c->hooks[i] is walked with the deletion-safe iterator when all is set
// or k equals lxchook_names[i].
// An error naming the key is logged on a path whose condition is elided
// (presumably when done is still false - confirm).
// NOTE(review): k stays NULL unless the prefix branch runs; confirm that
// an elided branch returns before strcmp() can see a NULL k.
4563 int lxc_clear_hooks(struct lxc_conf
*c
, const char *key
)
4565 struct lxc_list
*it
,*next
;
4566 bool all
= false, done
= false;
4567 const char *k
= NULL
;
4570 if (strcmp(key
, "lxc.hook") == 0)
4572 else if (strncmp(key
, "lxc.hook.", sizeof("lxc.hook.")-1) == 0)
4573 k
= key
+ sizeof("lxc.hook.")-1;
4577 for (i
=0; i
<NUM_LXC_HOOKS
; i
++) {
4578 if (all
|| strcmp(k
, lxchook_names
[i
]) == 0) {
4579 lxc_list_for_each_safe(it
, &c
->hooks
[i
], next
) {
4589 ERROR("Invalid hook key: %s", key
);
// Free the saved NIC table held in conf.
// conf->saved_nics is tested for NULL first (the statement guarded by
// that test is elided; presumably an early return - confirm). Then the
// orig_name string of each of the conf->num_savednics entries is freed,
// followed by the array itself.
// NOTE(review): the visible lines do not reset saved_nics or
// num_savednics afterwards.
4595 static void lxc_clear_saved_nics(struct lxc_conf
*conf
)
4599 if (!conf
->saved_nics
)
4601 for (i
=0; i
< conf
->num_savednics
; i
++)
4602 free(conf
->saved_nics
[i
].orig_name
);
4603 free(conf
->saved_nics
);
// Walk conf->aliens with the deletion-safe iterator.
// NOTE(review): the loop body is not visible in this listing; presumably
// each entry is unlinked and freed - confirm.
4606 static inline void lxc_clear_aliens(struct lxc_conf
*conf
)
4608 struct lxc_list
*it
,*next
;
4610 lxc_list_for_each_safe(it
, &conf
->aliens
, next
) {
// Walk conf->includes with the deletion-safe iterator.
// NOTE(review): the loop body is not visible in this listing; presumably
// each entry is unlinked and freed - confirm.
4617 void lxc_clear_includes(struct lxc_conf
*conf
)
4619 struct lxc_list
*it
,*next
;
4621 lxc_list_for_each_safe(it
, &conf
->includes
, next
) {
// Tear down a container configuration.
// If conf is the file-level current_config, that pointer is cleared so it
// does not dangle. The string members shown below are freed directly and
// the list members are released through the lxc_clear_* helpers;
// lxc_clear_cgroups, lxc_clear_hooks and lxc_clear_limits are passed the
// bare key so that they match every entry.
// NOTE(review): any NULL check on conf, the action guarded by the logfd
// test, the frees on the elided lines and the final release of conf
// itself are not visible in this listing.
4628 void lxc_conf_free(struct lxc_conf
*conf
)
4632 if (current_config
== conf
)
4633 current_config
= NULL
;
// Plain heap strings owned by the configuration.
4634 free(conf
->console
.log_path
);
4635 free(conf
->console
.path
);
4636 free(conf
->rootfs
.mount
);
4637 free(conf
->rootfs
.bdev_type
);
4638 free(conf
->rootfs
.options
);
4639 free(conf
->rootfs
.path
);
4640 free(conf
->logfile
);
// -1 is treated as the value meaning no log descriptor is held.
4641 if (conf
->logfd
!= -1)
4643 free(conf
->utsname
);
4647 free(conf
->init_cmd
);
4648 free(conf
->unexpanded_config
);
4649 free(conf
->pty_names
);
// List-valued and structured members are released via their helpers.
4651 lxc_clear_config_network(conf
);
4652 free(conf
->lsm_aa_profile
);
4653 free(conf
->lsm_se_context
);
4654 lxc_seccomp_free(conf
);
4655 lxc_clear_config_caps(conf
);
4656 lxc_clear_config_keepcaps(conf
);
4657 lxc_clear_cgroups(conf
, "lxc.cgroup");
4658 lxc_clear_hooks(conf
, "lxc.hook");
4659 lxc_clear_mount_entries(conf
);
4660 lxc_clear_saved_nics(conf
);
4661 lxc_clear_idmaps(conf
);
4662 lxc_clear_groups(conf
);
4663 lxc_clear_includes(conf
);
4664 lxc_clear_aliens(conf
);
4665 lxc_clear_environment(conf
);
4666 lxc_clear_limits(conf
, "lxc.limit");
// Argument bundle handed to run_userns_fn() through the clone call.
// Only fn_name (a label used when tracing) is visible here.
// run_userns_fn() also reads members named fn, arg and p; their
// declarations are on lines missing from this listing.
4670 struct userns_fn_data
{
4672 const char *fn_name
;
// Entry point of the child cloned by userns_exec_1().
// data is read as a struct userns_fn_data. The child blocks on a one-byte
// read from d->p[0] until the parent signals that the id mapping is in
// place, then traces d->fn_name and returns the result of d->fn(d->arg).
// NOTE(review): the close() calls announced by the comments and the
// action taken when the read does not return exactly one byte are on
// lines not visible in this listing.
4677 static int run_userns_fn(void *data
)
4679 struct userns_fn_data
*d
= data
;
4682 /* Close write end of the pipe. */
4685 /* Wait for parent to finish establishing a new mapping in the user
4686 * namespace we are executing in.
4688 if (read(d
->p
[0], &c
, 1) != 1)
4691 /* Close read end of the pipe. */
4695 TRACE("calling function \"%s\"", d
->fn_name
);
4696 /* Call function to run. */
4697 return d
->fn(d
->arg
);
// Look up the configured mapping that covers a host id.
// conf->id_map is scanned; entries whose idtype differs from the
// requested one are skipped, and an entry matches when id lies in the
// half-open range starting at map->hostid and spanning map->range ids.
// A fresh struct id_map is allocated and the matching entry is copied
// into it with memcpy.
// NOTE(review): the remaining parameter, the action taken on a match, the
// malloc failure path, any adjustment of the copy and the return
// statements are on lines not visible in this listing. Presumably the
// caller owns the returned copy - confirm.
4700 static struct id_map
*mapped_hostid_entry(struct lxc_conf
*conf
, unsigned id
,
4703 struct lxc_list
*it
;
4705 struct id_map
*retmap
= NULL
;
4707 lxc_list_for_each(it
, &conf
->id_map
) {
4709 if (map
->idtype
!= idtype
)
4712 if (id
>= map
->hostid
&& id
< map
->hostid
+ map
->range
) {
4721 retmap
= malloc(sizeof(*retmap
));
4725 memcpy(retmap
, map
, sizeof(*retmap
));
4730 * Allocate a new {g,u}id mapping for the given {g,u}id. Re-use an already
4731 * existing one or establish a new one.
// Produce an id_map entry for host id of the given type.
// First mapped_hostid_entry() is asked for an existing mapping. Otherwise
// find_unmapped_nsid() supplies a namespace id; a negative result is
// logged at debug level. A new entry is then allocated with idtype set to
// type, nsid set to the id just found and hostid set to id.
// NOTE(review): the test on the reused entry, the return statements, the
// malloc failure path and the assignment of entry->range are on lines not
// visible in this listing.
// NOTE(review): id is a uid_t but is printed with %d - confirm the format
// is acceptable on all targets.
4733 static struct id_map
*idmap_add(struct lxc_conf
*conf
, uid_t id
, enum idtype type
)
4736 struct id_map
*entry
= NULL
;
4738 /* Reuse existing mapping. */
4739 entry
= mapped_hostid_entry(conf
, id
, type
);
4743 /* Find new mapping. */
4744 hostid_mapped
= find_unmapped_nsid(conf
, type
);
4745 if (hostid_mapped
< 0) {
4746 DEBUG("failed to find free mapping for id %d", id
);
4750 entry
= malloc(sizeof(*entry
));
4754 entry
->idtype
= type
;
4755 entry
->nsid
= hostid_mapped
;
4756 entry
->hostid
= (unsigned long)id
;
4762 /* Run a function in a new user namespace.
4763 * The caller's euid/egid will be mapped if it is not already.
4764 * Afaict, userns_exec_1() is only used to operate based on privileges for the
4765 * user's own {g,u}id on the host and for the container root's unmapped {g,u}id.
4766 * This means we require only to establish a mapping from:
4767 * - the container root {g,u}id as seen from the host > user's host {g,u}id
4768 * - the container root -> some sub{g,u}id
4769 * The former we add, if the user did not specify a mapping. The latter we
4770 * retrieve from the container's configured {g,u}id mappings as it must have been
4771 * there to start the container in the first place.
// Run fn(data) in a child cloned into a new user namespace.
// Visible steps: a pipe is created, the child is cloned with
// CLONE_NEWUSER running run_userns_fn(), a private id map list is built
// from the first uid and first gid entries of conf->id_map (with nsid
// forced to 0) plus, when they differ, entries for the caller euid and
// egid, the list is applied to the child with lxc_map_ids(), one byte is
// written to the pipe to release the child, and the child is reaped with
// wait_for_pid().
// fn_name is only used for trace output in the child.
// NOTE(review): most error paths (goto targets, the checks after malloc,
// the pipe setup and close calls, the final return) are on lines that are
// not visible in this listing.
4773 int userns_exec_1(struct lxc_conf
*conf
, int (*fn
)(void *), void *data
,
4774 const char *fn_name
)
4778 struct userns_fn_data d
;
4780 struct lxc_list
*it
;
4784 struct lxc_list
*idmap
= NULL
, *tmplist
= NULL
;
4785 struct id_map
*container_root_uid
= NULL
, *container_root_gid
= NULL
,
4786 *host_uid_map
= NULL
, *host_gid_map
= NULL
;
4790 SYSERROR("opening pipe");
4794 d
.fn_name
= fn_name
;
4799 /* Clone child in new user namespace. */
4800 pid
= lxc_clone(run_userns_fn
, &d
, CLONE_NEWUSER
);
4802 ERROR("failed to clone child process in new user namespace");
4809 /* Find container root. */
4810 lxc_list_for_each(it
, &conf
->id_map
) {
// Copy the first uid entry and the first gid entry seen; nsid is set to 0
// in the copy while hostid and range are taken from the source entry.
4816 if (map
->idtype
== ID_TYPE_UID
&& container_root_uid
== NULL
) {
4817 container_root_uid
= malloc(sizeof(*container_root_uid
));
4818 if (!container_root_uid
)
4820 container_root_uid
->idtype
= map
->idtype
;
4821 container_root_uid
->hostid
= map
->hostid
;
4822 container_root_uid
->nsid
= 0;
4823 container_root_uid
->range
= map
->range
;
4824 } else if (map
->idtype
== ID_TYPE_GID
&& container_root_gid
== NULL
) {
4825 container_root_gid
= malloc(sizeof(*container_root_gid
));
4826 if (!container_root_gid
)
4828 container_root_gid
->idtype
= map
->idtype
;
4829 container_root_gid
->hostid
= map
->hostid
;
4830 container_root_gid
->nsid
= 0;
4831 container_root_gid
->range
= map
->range
;
4834 /* Found container root. */
4835 if (container_root_uid
&& container_root_gid
)
4839 /* This is actually checked earlier but it can't hurt. */
4840 if (!container_root_uid
|| !container_root_gid
) {
4841 ERROR("no mapping for container root found");
// Start with the host maps aliasing the container root entries; they are
// replaced below only when the caller ids differ from the root hostids.
4845 host_uid_map
= container_root_uid
;
4846 host_gid_map
= container_root_gid
;
4848 /* Check whether the {g,u}id of the user has a mapping. */
4851 if (euid
!= container_root_uid
->hostid
)
4852 host_uid_map
= idmap_add(conf
, euid
, ID_TYPE_UID
);
4854 if (egid
!= container_root_gid
->hostid
)
4855 host_gid_map
= idmap_add(conf
, egid
, ID_TYPE_GID
);
4857 if (!host_uid_map
) {
4858 DEBUG("failed to find mapping for uid %d", euid
);
4862 if (!host_gid_map
) {
4863 DEBUG("failed to find mapping for gid %d", egid
);
4867 /* Allocate new {g,u}id map list. */
4868 idmap
= malloc(sizeof(*idmap
));
4871 lxc_list_init(idmap
);
4873 /* Add container root to the map. */
4874 tmplist
= malloc(sizeof(*tmplist
));
4877 lxc_list_add_elem(tmplist
, container_root_uid
);
4878 lxc_list_add_tail(idmap
, tmplist
);
// A separate host uid entry is appended only when it is not the same
// object as the container root uid entry that was just added.
4880 if (host_uid_map
&& (host_uid_map
!= container_root_uid
)) {
4881 /* idmap will now keep track of that memory. */
4882 container_root_uid
= NULL
;
4884 /* Add container root to the map. */
4885 tmplist
= malloc(sizeof(*tmplist
));
4888 lxc_list_add_elem(tmplist
, host_uid_map
);
4889 lxc_list_add_tail(idmap
, tmplist
);
4891 /* idmap will now keep track of that memory. */
4892 container_root_uid
= NULL
;
4893 /* idmap will now keep track of that memory. */
4894 host_uid_map
= NULL
;
4896 tmplist
= malloc(sizeof(*tmplist
));
4899 lxc_list_add_elem(tmplist
, container_root_gid
);
4900 lxc_list_add_tail(idmap
, tmplist
);
// Same pattern for the gid side.
4902 if (host_gid_map
&& (host_gid_map
!= container_root_gid
)) {
4903 /* idmap will now keep track of that memory. */
4904 container_root_gid
= NULL
;
4906 tmplist
= malloc(sizeof(*tmplist
));
4909 lxc_list_add_elem(tmplist
, host_gid_map
);
4910 lxc_list_add_tail(idmap
, tmplist
);
4912 /* idmap will now keep track of that memory. */
4913 container_root_gid
= NULL
;
4914 /* idmap will now keep track of that memory. */
4915 host_gid_map
= NULL
;
// The list is only walked for tracing when trace level logging is active.
4917 if (lxc_log_get_level() == LXC_LOG_PRIORITY_TRACE
||
4918 conf
->loglevel
== LXC_LOG_PRIORITY_TRACE
) {
4919 lxc_list_for_each(it
, idmap
) {
4921 TRACE("establishing %cid mapping for \"%d\" in new "
4922 "user namespace: nsuid %lu - hostid %lu - range "
4924 (map
->idtype
== ID_TYPE_UID
) ? 'u' : 'g', pid
,
4925 map
->nsid
, map
->hostid
, map
->range
);
4929 /* Set up {g,u}id mapping for user namespace of child process. */
4930 ret
= lxc_map_ids(idmap
, pid
);
4932 ERROR("error setting up {g,u}id mappings for child process "
4938 /* Tell child to proceed. */
4939 if (write(p
[1], &c
, 1) != 1) {
4940 SYSERROR("failed telling child process \"%d\" to proceed", pid
);
4944 /* Wait for child to finish. */
4945 ret
= wait_for_pid(pid
);
// Cleanup: entries already handed to idmap were set to NULL above, so
// only entries that never reached the list are freed individually.
// NOTE(review): idmap is still NULL if an earlier step failed before the
// list was allocated; confirm an elided guard covers lxc_free_idmap().
// NOTE(review): the last two tests compare against container_root_* after
// those pointers may have been passed to free() - confirm this is
// intended.
4949 lxc_free_idmap(idmap
);
4950 if (container_root_uid
)
4951 free(container_root_uid
);
4952 if (container_root_gid
)
4953 free(container_root_gid
);
4954 if (host_uid_map
&& (host_uid_map
!= container_root_uid
))
4956 if (host_gid_map
&& (host_gid_map
!= container_root_gid
))
4966 /* not thread-safe, do not use from api without first forking */
// Return a strdup copy of the login name for the effective uid, looked up
// with getpwuid(). The copy is heap allocated, so the caller is expected
// to free it.
// NOTE(review): the handling of a failed getpwuid() lookup is on lines
// not visible in this listing.
4967 static char* getuname(void)
4969 struct passwd
*result
;
4971 result
= getpwuid(geteuid());
4975 return strdup(result
->pw_name
);
4978 /* not thread-safe, do not use from api without first forking */
// Return a strdup copy of the group name for the effective gid, looked up
// with getgrgid(). The copy is heap allocated, so the caller is expected
// to free it.
// NOTE(review): the handling of a failed getgrgid() lookup is on lines
// not visible in this listing.
4979 static char *getgname(void)
4981 struct group
*result
;
4983 result
= getgrgid(getegid());
4987 return strdup(result
->gr_name
);
4990 /* not thread-safe, do not use from api without first forking */
// Log a suggested lxc.id_map configuration for the current user.
// The user and group names are fetched with getuname() and getgname().
// The files named by subuidfile and subgidfile are read line by line;
// each line is split at colons, the first field is compared with uname,
// and the next two fields are parsed with lxc_safe_uint() into the start
// id and the range. If either range is still zero an error is logged,
// otherwise the suggested configuration lines are emitted through ERROR.
// NOTE(review): both loops compare the first field with uname; gname is
// fetched but never compared on the visible lines - confirm intended.
// NOTE(review): the comment and missing-colon checks, the terminator
// writes, fclose and free calls and all return statements are on lines
// not visible in this listing.
4991 void suggest_default_idmap(void)
4994 unsigned int uid
= 0, urange
= 0, gid
= 0, grange
= 0;
4996 char *uname
, *gname
;
4999 if (!(uname
= getuname()))
5002 if (!(gname
= getgname())) {
// Pass 1: subordinate uid allocation for this user.
5007 f
= fopen(subuidfile
, "r");
5009 ERROR("Your system is not configured with subuids");
5014 while (getline(&line
, &len
, f
) != -1) {
5015 size_t no_newline
= 0;
5016 char *p
= strchr(line
, ':'), *p2
;
5023 if (strcmp(line
, uname
))
5025 p2
= strchr(p
, ':');
// Cut the range field at the first newline before parsing it.
5032 no_newline
= strcspn(p2
, "\n");
5033 p2
[no_newline
] = '\0';
5035 if (lxc_safe_uint(p
, &uid
) < 0)
5036 WARN("Could not parse UID.");
5037 if (lxc_safe_uint(p2
, &urange
) < 0)
5038 WARN("Could not parse UID range.");
// Pass 2: subordinate gid allocation, same parsing as above.
5042 f
= fopen(subgidfile
, "r");
5044 ERROR("Your system is not configured with subgids");
5049 while (getline(&line
, &len
, f
) != -1) {
5050 size_t no_newline
= 0;
5051 char *p
= strchr(line
, ':'), *p2
;
5058 if (strcmp(line
, uname
))
5060 p2
= strchr(p
, ':');
5067 no_newline
= strcspn(p2
, "\n");
5068 p2
[no_newline
] = '\0';
5070 if (lxc_safe_uint(p
, &gid
) < 0)
5071 WARN("Could not parse GID.");
5072 if (lxc_safe_uint(p2
, &grange
) < 0)
5073 WARN("Could not parse GID range.");
// A zero range means no matching line was parsed successfully.
5079 if (!urange
|| !grange
) {
5080 ERROR("You do not have subuids or subgids allocated");
5081 ERROR("Unprivileged containers require subuids and subgids");
5085 ERROR("You must either run as root, or define uid mappings");
5086 ERROR("To pass uid mappings to lxc-create, you could create");
5087 ERROR("~/.config/lxc/default.conf:");
5088 ERROR("lxc.include = %s", LXC_DEFAULT_CONFIG
);
5089 ERROR("lxc.id_map = u 0 %u %u", uid
, urange
);
5090 ERROR("lxc.id_map = g 0 %u %u", gid
, grange
);
// Dismantle a list built by sort_cgroup_settings().
// Every node of result is visited with the deletion-safe iterator and
// unlinked with lxc_list_del().
// NOTE(review): the frees of the nodes and of the list head are on lines
// not visible in this listing. The nodes created in
// sort_cgroup_settings() share their elem pointers with the source list,
// so presumably elem is not freed here - confirm.
5096 static void free_cgroup_settings(struct lxc_list
*result
)
5098 struct lxc_list
*iterator
, *next
;
5100 lxc_list_for_each_safe(iterator
, result
, next
) {
5101 lxc_list_del(iterator
);
5108 * Return the list of cgroup_settings sorted according to the following rules
5109 * 1. Put memory.limit_in_bytes before memory.memsw.limit_in_bytes
5111 struct lxc_list
*sort_cgroup_settings(struct lxc_list
* cgroup_settings
)
5113 struct lxc_list
*result
;
5114 struct lxc_list
*memsw_limit
= NULL
;
5115 struct lxc_list
*it
= NULL
;
5116 struct lxc_cgroup
*cg
= NULL
;
5117 struct lxc_list
*item
= NULL
;
5119 result
= malloc(sizeof(*result
));
5121 ERROR("failed to allocate memory to sort cgroup settings");
5124 lxc_list_init(result
);
5126 /*Iterate over the cgroup settings and copy them to the output list*/
5127 lxc_list_for_each(it
, cgroup_settings
) {
5128 item
= malloc(sizeof(*item
));
5130 ERROR("failed to allocate memory to sort cgroup settings");
5131 free_cgroup_settings(result
);
5134 item
->elem
= it
->elem
;
5136 if (strcmp(cg
->subsystem
, "memory.memsw.limit_in_bytes") == 0) {
5137 /* Store the memsw_limit location */
5139 } else if (strcmp(cg
->subsystem
, "memory.limit_in_bytes") == 0 && memsw_limit
!= NULL
) {
5140 /* lxc.cgroup.memory.memsw.limit_in_bytes is found before
5141 * lxc.cgroup.memory.limit_in_bytes, swap these two items */
5142 item
->elem
= memsw_limit
->elem
;
5143 memsw_limit
->elem
= it
->elem
;
5145 lxc_list_add_tail(result
, item
);