2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
27 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
41 #include <sys/mount.h>
42 #include <sys/param.h>
43 #include <sys/prctl.h>
45 #include <sys/types.h>
52 #include "namespace.h"
54 #include "raw_syscalls.h"
55 #include "syscall_wrappers.h"
59 #include "include/strlcpy.h"
63 #include "include/strlcat.h"
67 #define O_PATH 010000000
71 #define O_NOFOLLOW 00400000
74 lxc_log_define(utils
, lxc
);
77 * if path is btrfs, tries to remove it and any subvolumes beneath it
79 extern bool btrfs_try_remove_subvol(const char *path
);
81 static int _recursive_rmdir(const char *dirname
, dev_t pdev
,
82 const char *exclude
, int level
, bool onedev
)
84 struct dirent
*direntp
;
87 char pathname
[PATH_MAX
];
88 bool hadexclude
= false;
90 dir
= opendir(dirname
);
92 ERROR("Failed to open \"%s\"", dirname
);
96 while ((direntp
= readdir(dir
))) {
100 if (!strcmp(direntp
->d_name
, ".") ||
101 !strcmp(direntp
->d_name
, ".."))
104 rc
= snprintf(pathname
, PATH_MAX
, "%s/%s", dirname
, direntp
->d_name
);
105 if (rc
< 0 || rc
>= PATH_MAX
) {
106 ERROR("The name of path is too long");
111 if (!level
&& exclude
&& !strcmp(direntp
->d_name
, exclude
)) {
112 ret
= rmdir(pathname
);
116 INFO("Not deleting snapshot \"%s\"", pathname
);
120 ret
= unlink(pathname
);
122 INFO("Failed to remove \"%s\"", pathname
);
125 SYSERROR("Failed to rmdir \"%s\"", pathname
);
134 ret
= lstat(pathname
, &mystat
);
136 SYSERROR("Failed to stat \"%s\"", pathname
);
141 if (onedev
&& mystat
.st_dev
!= pdev
) {
142 /* TODO should we be checking /proc/self/mountinfo for
143 * pathname and not doing this if found? */
144 if (btrfs_try_remove_subvol(pathname
))
145 INFO("Removed btrfs subvolume at \"%s\"", pathname
);
149 if (S_ISDIR(mystat
.st_mode
)) {
150 if (_recursive_rmdir(pathname
, pdev
, exclude
, level
+1, onedev
) < 0)
153 if (unlink(pathname
) < 0) {
154 SYSERROR("Failed to delete \"%s\"", pathname
);
160 if (rmdir(dirname
) < 0 && !btrfs_try_remove_subvol(dirname
) && !hadexclude
) {
161 SYSERROR("Failed to delete \"%s\"", dirname
);
167 SYSERROR("Failed to close directory \"%s\"", dirname
);
171 return failed
? -1 : 0;
174 /* In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
177 static bool is_native_overlayfs(const char *path
)
179 if (has_fs_type(path
, OVERLAY_SUPER_MAGIC
) ||
180 has_fs_type(path
, OVERLAYFS_SUPER_MAGIC
))
186 /* returns 0 on success, -1 if there were any failures */
187 extern int lxc_rmdir_onedev(const char *path
, const char *exclude
)
192 if (is_native_overlayfs(path
))
195 if (lstat(path
, &mystat
) < 0) {
199 SYSERROR("Failed to stat \"%s\"", path
);
203 return _recursive_rmdir(path
, mystat
.st_dev
, exclude
, 0, onedev
);
206 /* borrowed from iproute2 */
/*
 * Parse @arg as an unsigned 16-bit integer.
 *
 * @val:  where the parsed value is stored on success (left untouched on error)
 * @arg:  NUL-terminated string holding the number
 * @base: numeric base handed to strtoul() (0 lets strtoul() auto-detect)
 *
 * Returns 0 on success and -1 if @val or @arg is NULL, @arg is empty,
 * contains trailing garbage, does not fit into 16 bits, or strtoul()
 * reports an error through errno.
 */
extern int get_u16(unsigned short *val, const char *arg, int base)
{
	unsigned long res;
	char *ptr = NULL;

	if (!val || !arg || !*arg)
		return -1;

	/* strtoul() only reports range errors through errno, so clear it first. */
	errno = 0;
	res = strtoul(arg, &ptr, base);
	if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0)
		return -1;

	*val = (unsigned short)res;

	return 0;
}
/*
 * Create @dir and every missing parent directory, each with @mode.
 *
 * The path is walked component by component; for every prefix that ends on
 * a component boundary mkdir() is called and EEXIST is tolerated.
 *
 * Returns 0 on success, -1 on failure (errno is set by the failing call;
 * EINVAL for a NULL @dir, ENOMEM if the prefix copy cannot be allocated).
 */
int mkdir_p(const char *dir, mode_t mode)
{
	const char *orig = dir;
	const char *comp;
	const char *rest = dir;

	if (!dir) {
		errno = EINVAL;
		return -1;
	}

	do {
		char *makeme;
		int ret;

		/* Skip the separator(s), then the component that follows. */
		comp = rest + strspn(rest, "/");
		rest = comp + strcspn(comp, "/");

		errno = ENOMEM;
		makeme = strndup(orig, comp - orig);
		if (!makeme)
			return -1;

		/*
		 * For a relative path the very first prefix is the empty
		 * string; mkdir("") always fails with ENOENT, so skip it
		 * instead of aborting the whole walk.
		 */
		if (*makeme != '\0') {
			ret = mkdir(makeme, mode);
			if (ret < 0 && errno != EEXIST) {
				SYSERROR("Failed to create directory \"%s\"", makeme);
				free(makeme);
				return -1;
			}
		}

		free(makeme);
	} while (rest != comp);

	return 0;
}
261 if (stat(RUNTIME_PATH
, &sb
) < 0)
264 if (geteuid() == sb
.st_uid
|| getegid() == sb
.st_gid
) {
265 rundir
= strdup(RUNTIME_PATH
);
269 rundir
= getenv("XDG_RUNTIME_DIR");
271 rundir
= strdup(rundir
);
275 INFO("XDG_RUNTIME_DIR isn't set in the environment");
276 homedir
= getenv("HOME");
278 ERROR("HOME isn't set in the environment");
282 rundir
= malloc(sizeof(char) * (17 + strlen(homedir
)));
286 sprintf(rundir
, "%s/.cache/lxc/run/", homedir
);
/*
 * Wait for child @pid to terminate.
 *
 * waitpid() is retried when interrupted by a signal (EINTR) or when it
 * reports a pid other than @pid.
 *
 * Returns 0 only if the child exited normally with status 0, otherwise -1
 * (waitpid() failure, abnormal termination or non-zero exit status).
 */
int wait_for_pid(pid_t pid)
{
	int status, ret;

	for (;;) {
		ret = waitpid(pid, &status, 0);
		if (ret == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (ret == pid)
			break;
	}

	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
		return -1;

	return 0;
}
/*
 * Wait for child @pid to terminate and hand back its raw wait status.
 *
 * waitpid() is retried when interrupted by a signal (EINTR) or when it
 * reports a pid other than @pid.
 *
 * Returns the status word filled in by waitpid() (to be inspected with
 * WIFEXITED()/WEXITSTATUS()/...), or -1 if waitpid() failed.
 */
int lxc_wait_for_pid_status(pid_t pid)
{
	int status, ret;

	for (;;) {
		ret = waitpid(pid, &status, 0);
		if (ret == -1) {
			if (errno == EINTR)
				continue;

			return -1;
		}

		if (ret == pid)
			break;
	}

	return status;
}
333 #include <gnutls/gnutls.h>
334 #include <gnutls/crypto.h>
336 __attribute__((constructor
))
337 static void gnutls_lxc_init(void)
339 gnutls_global_init();
342 int sha1sum_file(char *fnam
, unsigned char *digest
)
352 f
= fopen_cloexec(fnam
, "r");
354 SYSERROR("Failed to open template \"%s\"", fnam
);
358 if (fseek(f
, 0, SEEK_END
) < 0) {
359 SYSERROR("Failed to seek to end of template");
364 if ((flen
= ftell(f
)) < 0) {
365 SYSERROR("Failed to tell size of template");
370 if (fseek(f
, 0, SEEK_SET
) < 0) {
371 SYSERROR("Failed to seek to start of template");
376 if ((buf
= malloc(flen
+1)) == NULL
) {
377 SYSERROR("Out of memory");
382 if (fread(buf
, 1, flen
, f
) != flen
) {
383 SYSERROR("Failed to read template");
390 SYSERROR("Failed to close template");
396 ret
= gnutls_hash_fast(GNUTLS_DIG_SHA1
, buf
, flen
, (void *)digest
);
402 struct lxc_popen_FILE
*lxc_popen(const char *command
)
407 struct lxc_popen_FILE
*fp
= NULL
;
409 ret
= pipe2(pipe_fds
, O_CLOEXEC
);
422 /* duplicate stdout */
423 if (pipe_fds
[1] != STDOUT_FILENO
)
424 ret
= dup2(pipe_fds
[1], STDOUT_FILENO
);
426 ret
= fcntl(pipe_fds
[1], F_SETFD
, 0);
432 /* duplicate stderr */
433 if (pipe_fds
[1] != STDERR_FILENO
)
434 ret
= dup2(pipe_fds
[1], STDERR_FILENO
);
436 ret
= fcntl(pipe_fds
[1], F_SETFD
, 0);
441 /* unblock all signals */
442 ret
= sigfillset(&mask
);
446 ret
= pthread_sigmask(SIG_UNBLOCK
, &mask
, NULL
);
450 execl("/bin/sh", "sh", "-c", command
, (char *)NULL
);
457 fp
= malloc(sizeof(*fp
));
461 memset(fp
, 0, sizeof(*fp
));
463 fp
->child_pid
= child_pid
;
464 fp
->pipe
= pipe_fds
[0];
466 /* From now on, closing fp->f will also close fp->pipe. So only ever
467 * call fclose(fp->f).
469 fp
->f
= fdopen(pipe_fds
[0], "r");
476 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
477 * called yet. Otherwise the fd belongs to the file opened by fdopen()
478 * since it isn't dup()ed.
480 if (fp
&& !fp
->f
&& pipe_fds
[0] >= 0)
483 if (pipe_fds
[1] >= 0)
495 int lxc_pclose(struct lxc_popen_FILE
*fp
)
504 wait_pid
= waitpid(fp
->child_pid
, &wstatus
, 0);
505 } while (wait_pid
< 0 && errno
== EINTR
);
516 int randseed(bool srand_it
)
520 * srand pre-seed function based on /dev/urandom
522 unsigned int seed
= time(NULL
) + getpid();
524 f
= fopen("/dev/urandom", "r");
526 int ret
= fread(&seed
, sizeof(seed
), 1, f
);
528 SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
539 uid_t
get_ns_uid(uid_t orig
)
543 uid_t nsid
, hostid
, range
;
546 f
= fopen("/proc/self/uid_map", "r");
548 SYSERROR("Failed to open uid_map");
552 while (getline(&line
, &sz
, f
) != -1) {
553 if (sscanf(line
, "%u %u %u", &nsid
, &hostid
, &range
) != 3)
556 if (hostid
<= orig
&& hostid
+ range
> orig
) {
557 nsid
+= orig
- hostid
;
562 nsid
= LXC_INVALID_UID
;
570 gid_t
get_ns_gid(gid_t orig
)
574 gid_t nsid
, hostid
, range
;
577 f
= fopen("/proc/self/gid_map", "r");
579 SYSERROR("Failed to open gid_map");
583 while (getline(&line
, &sz
, f
) != -1) {
584 if (sscanf(line
, "%u %u %u", &nsid
, &hostid
, &range
) != 3)
587 if (hostid
<= orig
&& hostid
+ range
> orig
) {
588 nsid
+= orig
- hostid
;
593 nsid
= LXC_INVALID_GID
;
/*
 * Report whether @path names an existing directory.
 *
 * stat() is used, so a symlink pointing at a directory counts as one.
 * Any stat() failure is reported as "no", whatever the reason.
 */
bool dir_exists(const char *path)
{
	struct stat sb;

	if (stat(path, &sb) < 0)
		/* Could be something other than ENOENT, just say "no". */
		return false;

	return S_ISDIR(sb.st_mode);
}
614 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
615 * FNV has good anti collision properties and we're not worried
616 * about pre-image resistance or one-way-ness, we're just trying to make
617 * the name unique in the 108 bytes of space we have.
/*
 * Fold @len bytes starting at @buf into the running 64-bit FNV-1a hash
 * @hval and return the updated hash.
 *
 * Pass the previous return value as @hval to hash data in several pieces.
 * A NULL @buf or a @len of 0 returns @hval unchanged.
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	const unsigned char *bp = buf;
	const unsigned char *end;

	if (!bp)
		return hval;

	for (end = bp + len; bp < end; bp++) {
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)*bp;

		/*
		 * multiply by the 64 bit FNV magic prime mod 2^64
		 * (0x100000001b3, spelled out as shifts and adds)
		 */
		hval += (hval << 1) + (hval << 4) + (hval << 5) +
			(hval << 7) + (hval << 8) + (hval << 40);
	}

	return hval;
}
637 bool is_shared_mountpoint(const char *path
)
639 char buf
[LXC_LINELEN
];
644 f
= fopen("/proc/self/mountinfo", "r");
648 while (fgets(buf
, LXC_LINELEN
, f
)) {
649 for (p
= buf
, i
= 0; p
&& i
< 4; i
++)
650 p
= strchr(p
+ 1, ' ');
654 p2
= strchr(p
+ 1, ' ');
659 if (strcmp(p
+ 1, path
) == 0) {
660 /* This is the path. Is it shared? */
661 p
= strchr(p2
+ 1, ' ');
662 if (p
&& strstr(p
, "shared:")) {
674 * Detect whether / is mounted MS_SHARED. The only way I know of to
675 * check that is through /proc/self/mountinfo.
676 * I'm only checking for /. If the container rootfs or mount location
677 * is MS_SHARED, but not '/', then you're out of luck - figuring that
678 * out would be too much work to be worth it.
680 int detect_shared_rootfs(void)
682 if (is_shared_mountpoint("/"))
688 bool switch_to_ns(pid_t pid
, const char *ns
)
691 char nspath
[PATH_MAX
];
693 /* Switch to new ns */
694 ret
= snprintf(nspath
, PATH_MAX
, "/proc/%d/ns/%s", pid
, ns
);
695 if (ret
< 0 || ret
>= PATH_MAX
)
698 fd
= open(nspath
, O_RDONLY
);
700 SYSERROR("Failed to open \"%s\"", nspath
);
706 SYSERROR("Failed to set process %d to \"%s\" of %d.", pid
, ns
, fd
);
716 * looking at fs/proc_namespace.c, it appears we can
717 * actually expect the rootfs entry to very specifically contain
718 * " - rootfs rootfs "
719 * IIUC, so long as we've chrooted so that rootfs is not our root,
720 * the rootfs entry should always be skipped in mountinfo contents.
722 bool detect_ramfs_rootfs(void)
730 f
= fopen("/proc/self/mountinfo", "r");
732 SYSERROR("Failed to open mountinfo");
736 while (getline(&line
, &len
, f
) != -1) {
737 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
738 p
= strchr(p
+ 1, ' ');
742 p2
= strchr(p
+ 1, ' ');
747 if (strcmp(p
+ 1, "/") == 0) {
748 /* This is '/'. Is it the ramfs? */
749 p
= strchr(p2
+ 1, '-');
750 if (p
&& strncmp(p
, "- rootfs rootfs ", 16) == 0) {
753 INFO("Rootfs is located on ramfs");
764 char *on_path(const char *cmd
, const char *rootfs
)
766 char *entry
= NULL
, *path
= NULL
;
767 char cmdpath
[PATH_MAX
];
770 path
= getenv("PATH");
778 lxc_iterate_parts (entry
, path
, ":") {
780 ret
= snprintf(cmdpath
, PATH_MAX
, "%s/%s/%s", rootfs
,
783 ret
= snprintf(cmdpath
, PATH_MAX
, "%s/%s", entry
, cmd
);
784 if (ret
< 0 || ret
>= PATH_MAX
)
787 if (access(cmdpath
, X_OK
) == 0) {
789 return strdup(cmdpath
);
797 bool cgns_supported(void)
799 return file_exists("/proc/self/ns/cgroup");
802 /* historically lxc-init has been under /usr/lib/lxc and under
803 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
805 char *choose_init(const char *rootfs
)
808 const char *empty
= "",
810 int ret
, env_set
= 0;
812 if (!getenv("PATH")) {
813 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
814 SYSERROR("Failed to setenv");
819 retv
= on_path("init.lxc", rootfs
);
822 if (unsetenv("PATH"))
823 SYSERROR("Failed to unsetenv");
828 retv
= malloc(PATH_MAX
);
837 ret
= snprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, SBINDIR
, "/init.lxc");
838 if (ret
< 0 || ret
>= PATH_MAX
) {
839 ERROR("The name of path is too long");
843 if (access(retv
, X_OK
) == 0)
846 ret
= snprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, LXCINITDIR
, "/lxc/lxc-init");
847 if (ret
< 0 || ret
>= PATH_MAX
) {
848 ERROR("The name of path is too long");
852 if (access(retv
, X_OK
) == 0)
855 ret
= snprintf(retv
, PATH_MAX
, "%s/usr/lib/lxc/lxc-init", tmp
);
856 if (ret
< 0 || ret
>= PATH_MAX
) {
857 ERROR("The name of path is too long");
861 if (access(retv
, X_OK
) == 0)
864 ret
= snprintf(retv
, PATH_MAX
, "%s/sbin/lxc-init", tmp
);
865 if (ret
< 0 || ret
>= PATH_MAX
) {
866 ERROR("The name of path is too long");
870 if (access(retv
, X_OK
) == 0)
874 * Last resort, look for the statically compiled init.lxc which we
875 * hopefully bind-mounted in.
876 * If we are called during container setup, and we get to this point,
877 * then the init.lxc.static from the host will need to be bind-mounted
878 * in. So we return NULL here to indicate that.
883 ret
= snprintf(retv
, PATH_MAX
, "/init.lxc.static");
884 if (ret
< 0 || ret
>= PATH_MAX
) {
885 WARN("Nonsense - name /lxc.init.static too long");
889 if (access(retv
, X_OK
) == 0)
898 * Given the '-t' template option to lxc-create, figure out what to
899 * do. If the template is a full executable path, use that. If it
900 * is something like 'sshd', then return $templatepath/lxc-sshd.
901 * On success return the template, on error return NULL.
903 char *get_template_path(const char *t
)
908 if (t
[0] == '/' && access(t
, X_OK
) == 0) {
913 len
= strlen(LXCTEMPLATEDIR
) + strlen(t
) + strlen("/lxc-") + 1;
919 ret
= snprintf(tpath
, len
, "%s/lxc-%s", LXCTEMPLATEDIR
, t
);
920 if (ret
< 0 || ret
>= len
) {
925 if (access(tpath
, X_OK
) < 0) {
926 SYSERROR("bad template: %s", t
);
935 * @path: a pathname where / replaced with '\0'.
936 * @offsetp: pointer to int showing which path segment was last seen.
937 * Updated on return to reflect the next segment.
938 * @fulllen: full original path length.
939 * Returns a pointer to the next path segment, or NULL if done.
/*
 * Step to the next segment of a path whose '/' separators were replaced
 * by '\0'.
 *
 * Starting at *@offsetp, the rest of the current segment is skipped, then
 * the run of '\0' bytes after it. *@offsetp is updated to the resulting
 * position unless it was already at or past @fulllen on entry.
 *
 * Returns a pointer to the next segment, or NULL when the end of the
 * original path (@fulllen bytes) has been reached.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int pos = *offsetp;

	if (pos >= fulllen)
		return NULL;

	/* Skip what is left of the current segment ... */
	while (pos < fulllen && path[pos] != '\0')
		pos++;

	/* ... and every separator that follows it. */
	while (pos < fulllen && path[pos] == '\0')
		pos++;

	*offsetp = pos;

	if (pos >= fulllen)
		return NULL;

	return &path[pos];
}
960 * Check that @subdir is a subdir of @dir. @len is the length of
961 * @dir (to avoid having to recalculate it).
/*
 * Check whether @subdir is @dir itself or lies below it.
 *
 * @len is strlen(@dir), passed in so it need not be recomputed. A match
 * requires @subdir to start with @dir and the prefix to end on a path
 * component boundary, so "/ab" is not a subdir of "/a".
 *
 * An empty @dir (@len == 0) is rejected rather than indexing dir[-1].
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t subdirlen = strlen(subdir);

	if (len == 0 || subdirlen < len)
		return false;

	if (strncmp(subdir, dir, len) != 0)
		return false;

	/* @dir already ends in a separator: the prefix match is enough. */
	if (dir[len - 1] == '/')
		return true;

	/* Otherwise the match must stop at a separator or at the end. */
	return subdir[len] == '/' || subdirlen == len;
}
983 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
984 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
986 static int check_symlink(int fd
)
991 ret
= fstat(fd
, &sb
);
995 if (S_ISLNK(sb
.st_mode
))
1002 * Open a file or directory, provided that it contains no symlinks.
1004 * CAVEAT: This function must not be used for other purposes than container
1005 * setup before executing the container's init
1007 static int open_if_safe(int dirfd
, const char *nextpath
)
1009 int newfd
= openat(dirfd
, nextpath
, O_RDONLY
| O_NOFOLLOW
);
1010 if (newfd
>= 0) /* Was not a symlink, all good. */
1016 if (errno
== EPERM
|| errno
== EACCES
) {
1017 /* We're not root (cause we got EPERM) so try opening with
1020 newfd
= openat(dirfd
, nextpath
, O_PATH
| O_NOFOLLOW
);
1022 /* O_PATH will return an fd for symlinks. We know
1023 * nextpath wasn't a symlink at last openat, so if fd is
1024 * now a link, then something fishy is going on.
1026 int ret
= check_symlink(newfd
);
1038 * Open a path intending for mounting, ensuring that the final path
1039 * is inside the container's rootfs.
1041 * CAVEAT: This function must not be used for other purposes than container
1042 * setup before executing the container's init
1044 * @target: path to be opened
1045 * @prefix_skip: a part of @target in which to ignore symbolic links. This
1046 * would be the container's rootfs.
1048 * Return an open fd for the path, or <0 on error.
1050 static int open_without_symlink(const char *target
, const char *prefix_skip
)
1052 int curlen
= 0, dirfd
, fulllen
, i
;
1055 fulllen
= strlen(target
);
1057 /* make sure prefix-skip makes sense */
1058 if (prefix_skip
&& strlen(prefix_skip
) > 0) {
1059 curlen
= strlen(prefix_skip
);
1060 if (!is_subdir(target
, prefix_skip
, curlen
)) {
1061 ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
1062 target
, prefix_skip
);
1067 * get_nextpath() expects the curlen argument to be
1068 * on a (turned into \0) / or before it, so decrement
1069 * curlen to make sure that happens
1078 /* Make a copy of target which we can hack up, and tokenize it */
1079 if ((dup
= strdup(target
)) == NULL
) {
1080 ERROR("Out of memory checking for symbolic link");
1084 for (i
= 0; i
< fulllen
; i
++) {
1089 dirfd
= open(prefix_skip
, O_RDONLY
);
1091 SYSERROR("Failed to open path \"%s\"", prefix_skip
);
1096 int newfd
, saved_errno
;
1099 if ((nextpath
= get_nextpath(dup
, &curlen
, fulllen
)) == NULL
)
1102 newfd
= open_if_safe(dirfd
, nextpath
);
1103 saved_errno
= errno
;
1108 errno
= saved_errno
;
1110 SYSERROR("%s in %s was a symbolic link!", nextpath
, target
);
1122 * Safely mount a path into a container, ensuring that the mount target
1123 * is under the container's @rootfs. (If @rootfs is NULL, then the container
1124 * uses the host's /)
1126 * CAVEAT: This function must not be used for other purposes than container
1127 * setup before executing the container's init
/*
 * Mount @src onto @dest after opening @dest with open_without_symlink()
 * relative to @rootfs, and mounting through the resulting
 * /proc/self/fd/<fd> path rather than through @dest itself.
 *
 * For a relative bind-mount source (MS_BIND set and @src not starting
 * with '/') the source is opened the same way and also passed to mount()
 * as /proc/self/fd/<fd>.
 *
 * @src, @fstype, @flags, @data: handed to mount()
 * @dest:   mount target
 * @rootfs: prefix of @dest in which symlinks are ignored; NULL is treated
 *          as "" (NOTE(review): that substitution is reconstructed from
 *          upstream, it is not visible in this listing - confirm)
 *
 * Returns 0 on success and a negative value on failure. errno from a
 * failed open or mount() is preserved across the cleanup close() calls.
 * NOTE(review): the -EINVAL return values are likewise reconstructed.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	int destfd, ret, saved_errno;
	/* Only needs enough for /proc/self/fd/<fd>. */
	char srcbuf[50], destbuf[50];
	int srcfd = -1;
	const char *mntsrc = src;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if ((flags & MS_BIND) && src && src[0] != '/') {
		INFO("This is a relative bind mount");

		srcfd = open_without_symlink(src, NULL);
		if (srcfd < 0)
			return srcfd;

		/* snprintf() returning >= the buffer size means truncation. */
		ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
		if (ret < 0 || (size_t)ret >= sizeof(srcbuf)) {
			close(srcfd);
			ERROR("Out of memory");
			return -EINVAL;
		}
		mntsrc = srcbuf;
	}

	destfd = open_without_symlink(dest, rootfs);
	if (destfd < 0) {
		if (srcfd != -1) {
			saved_errno = errno;
			close(srcfd);
			errno = saved_errno;
		}

		return destfd;
	}

	ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
	if (ret < 0 || (size_t)ret >= sizeof(destbuf)) {
		if (srcfd != -1)
			close(srcfd);

		close(destfd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	ret = mount(mntsrc, destbuf, fstype, flags, data);
	saved_errno = errno;
	if (srcfd != -1)
		close(srcfd);

	close(destfd);
	if (ret < 0) {
		errno = saved_errno;
		SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
		return ret;
	}

	return 0;
}
1195 * Mount a proc under @rootfs if proc self points to a pid other than
1196 * my own. This is needed to have a known-good proc mount for setting
1197 * up LSMs both at container startup and attach.
1199 * @rootfs : the rootfs where proc should be mounted
1201 * Returns < 0 on failure, 0 if the correct proc was already mounted
1202 * and 1 if a new proc was mounted.
1204 * NOTE: not to be called from inside the container namespace!
1206 int lxc_mount_proc_if_needed(const char *rootfs
)
1208 char path
[PATH_MAX
] = {0};
1209 int link_to_pid
, linklen
, mypid
, ret
;
1210 char link
[INTTYPE_TO_STRLEN(pid_t
)] = {0};
1212 ret
= snprintf(path
, PATH_MAX
, "%s/proc/self", rootfs
);
1213 if (ret
< 0 || ret
>= PATH_MAX
) {
1214 SYSERROR("The name of proc path is too long");
1218 linklen
= readlink(path
, link
, sizeof(link
));
1220 ret
= snprintf(path
, PATH_MAX
, "%s/proc", rootfs
);
1221 if (ret
< 0 || ret
>= PATH_MAX
) {
1222 SYSERROR("The name of proc path is too long");
1226 /* /proc not mounted */
1228 if (mkdir(path
, 0755) && errno
!= EEXIST
)
1232 } else if (linklen
>= sizeof(link
)) {
1233 link
[linklen
- 1] = '\0';
1234 ERROR("Readlink returned truncated content: \"%s\"", link
);
1238 mypid
= lxc_raw_getpid();
1239 INFO("I am %d, /proc/self points to \"%s\"", mypid
, link
);
1241 if (lxc_safe_int(link
, &link_to_pid
) < 0)
1244 /* correct procfs is already mounted */
1245 if (link_to_pid
== mypid
)
1248 ret
= umount2(path
, MNT_DETACH
);
1250 SYSWARN("Failed to umount \"%s\" with MNT_DETACH", path
);
1253 /* rootfs is NULL */
1254 if (!strcmp(rootfs
, ""))
1255 ret
= mount("proc", path
, "proc", 0, NULL
);
1257 ret
= safe_mount("proc", path
, "proc", 0, NULL
, rootfs
);
1261 INFO("Mounted /proc in container for security transition");
1265 int open_devnull(void)
1267 int fd
= open("/dev/null", O_RDWR
);
1269 SYSERROR("Can't open /dev/null");
1274 int set_stdfds(int fd
)
1281 ret
= dup2(fd
, STDIN_FILENO
);
1285 ret
= dup2(fd
, STDOUT_FILENO
);
1289 ret
= dup2(fd
, STDERR_FILENO
);
1296 int null_stdfds(void)
1301 fd
= open_devnull();
1303 ret
= set_stdfds(fd
);
1310 /* Check whether a signal is blocked by a process. */
1311 /* /proc/pid-to-str/status\0 = (6 + 21 + 7 + 1) */
1312 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1313 bool task_blocks_signal(pid_t pid
, int signal
)
1316 char status
[__PROC_STATUS_LEN
] = {0};
1318 uint64_t sigblk
= 0, one
= 1;
1323 ret
= snprintf(status
, __PROC_STATUS_LEN
, "/proc/%d/status", pid
);
1324 if (ret
< 0 || ret
>= __PROC_STATUS_LEN
)
1327 f
= fopen(status
, "r");
1331 while (getline(&line
, &n
, f
) != -1) {
1334 if (strncmp(line
, "SigBlk:", 7))
1337 numstr
= lxc_trim_whitespace_in_place(line
+ 7);
1338 ret
= lxc_safe_uint64(numstr
, &sigblk
, 16);
1345 if (sigblk
& (one
<< (signal
- 1)))
1354 int lxc_preserve_ns(const int pid
, const char *ns
)
1357 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1358 #define __NS_PATH_LEN 50
1359 char path
[__NS_PATH_LEN
];
1361 /* This way we can use this function to also check whether namespaces
1362 * are supported by the kernel by passing in the NULL or the empty
1365 ret
= snprintf(path
, __NS_PATH_LEN
, "/proc/%d/ns%s%s", pid
,
1366 !ns
|| strcmp(ns
, "") == 0 ? "" : "/",
1367 !ns
|| strcmp(ns
, "") == 0 ? "" : ns
);
1368 if (ret
< 0 || (size_t)ret
>= __NS_PATH_LEN
) {
1373 return open(path
, O_RDONLY
| O_CLOEXEC
);
1376 bool lxc_switch_uid_gid(uid_t uid
, gid_t gid
)
1380 if (gid
!= LXC_INVALID_GID
) {
1383 SYSERROR("Failed to switch to gid %d", gid
);
1386 NOTICE("Switched to gid %d", gid
);
1389 if (uid
!= LXC_INVALID_UID
) {
1392 SYSERROR("Failed to switch to uid %d", uid
);
1395 NOTICE("Switched to uid %d", uid
);
1401 /* Simple convenience function which enables uniform logging. */
1402 bool lxc_setgroups(int size
, gid_t list
[])
1404 if (setgroups(size
, list
) < 0) {
1405 SYSERROR("Failed to setgroups()");
1408 NOTICE("Dropped additional groups");
1413 static int lxc_get_unused_loop_dev_legacy(char *loop_name
)
1416 struct loop_info64 lo64
;
1418 int dfd
= -1, fd
= -1, ret
= -1;
1420 dir
= opendir("/dev");
1422 SYSERROR("Failed to open \"/dev\"");
1426 while ((dp
= readdir(dir
))) {
1427 if (strncmp(dp
->d_name
, "loop", 4) != 0)
1434 fd
= openat(dfd
, dp
->d_name
, O_RDWR
);
1438 ret
= ioctl(fd
, LOOP_GET_STATUS64
, &lo64
);
1440 if (ioctl(fd
, LOOP_GET_STATUS64
, &lo64
) == 0 ||
1448 ret
= snprintf(loop_name
, LO_NAME_SIZE
, "/dev/%s", dp
->d_name
);
1449 if (ret
< 0 || ret
>= LO_NAME_SIZE
) {
1466 static int lxc_get_unused_loop_dev(char *name_loop
)
1469 int fd_ctl
= -1, fd_tmp
= -1;
1471 fd_ctl
= open("/dev/loop-control", O_RDWR
| O_CLOEXEC
);
1473 SYSERROR("Failed to open loop control");
1477 loop_nr
= ioctl(fd_ctl
, LOOP_CTL_GET_FREE
);
1479 SYSERROR("Failed to get loop control");
1483 ret
= snprintf(name_loop
, LO_NAME_SIZE
, "/dev/loop%d", loop_nr
);
1484 if (ret
< 0 || ret
>= LO_NAME_SIZE
)
1487 fd_tmp
= open(name_loop
, O_RDWR
| O_CLOEXEC
);
1489 SYSERROR("Failed to open loop \"%s\"", name_loop
);
1496 int lxc_prepare_loop_dev(const char *source
, char *loop_dev
, int flags
)
1499 struct loop_info64 lo64
;
1500 int fd_img
= -1, fret
= -1, fd_loop
= -1;
1502 fd_loop
= lxc_get_unused_loop_dev(loop_dev
);
1504 if (fd_loop
!= -ENODEV
)
1507 fd_loop
= lxc_get_unused_loop_dev_legacy(loop_dev
);
1512 fd_img
= open(source
, O_RDWR
| O_CLOEXEC
);
1514 SYSERROR("Failed to open source \"%s\"", source
);
1518 ret
= ioctl(fd_loop
, LOOP_SET_FD
, fd_img
);
1520 SYSERROR("Failed to set loop fd");
1524 memset(&lo64
, 0, sizeof(lo64
));
1525 lo64
.lo_flags
= flags
;
1527 ret
= ioctl(fd_loop
, LOOP_SET_STATUS64
, &lo64
);
1529 SYSERROR("Failed to set loop status64");
1539 if (fret
< 0 && fd_loop
>= 0) {
1547 int lxc_unstack_mountpoint(const char *path
, bool lazy
)
1553 ret
= umount2(path
, lazy
? MNT_DETACH
: 0);
1555 /* We consider anything else than EINVAL deadly to prevent going
1556 * into an infinite loop. (The other alternative is constantly
1557 * parsing /proc/self/mountinfo which is yucky and probably
1560 if (errno
!= EINVAL
)
1563 /* Just stop counting when this happens. That'd just be so
1564 * stupid that we won't even bother trying to report back the
1565 * correct value anymore.
1567 if (umounts
!= INT_MAX
)
1570 /* We succeeded in umounting. Make sure that there's no other
1571 * mountpoint stacked underneath.
1579 int run_command(char *buf
, size_t buf_size
, int (*child_fn
)(void *), void *args
)
1582 int ret
, fret
, pipefd
[2];
1585 /* Make sure our callers do not receive uninitialized memory. */
1586 if (buf_size
> 0 && buf
)
1589 if (pipe(pipefd
) < 0) {
1590 SYSERROR("Failed to create pipe");
1594 child
= lxc_raw_clone(0);
1598 SYSERROR("Failed to create new process");
1603 /* Close the read-end of the pipe. */
1606 /* Redirect std{err,out} to write-end of the
1609 ret
= dup2(pipefd
[1], STDOUT_FILENO
);
1611 ret
= dup2(pipefd
[1], STDERR_FILENO
);
1613 /* Close the write-end of the pipe. */
1617 SYSERROR("Failed to duplicate std{err,out} file descriptor");
1618 _exit(EXIT_FAILURE
);
1621 /* Does not return. */
1623 ERROR("Failed to exec command");
1624 _exit(EXIT_FAILURE
);
1627 /* close the write-end of the pipe */
1630 if (buf
&& buf_size
> 0) {
1631 bytes
= lxc_read_nointr(pipefd
[0], buf
, buf_size
- 1);
1633 buf
[bytes
- 1] = '\0';
1636 fret
= wait_for_pid(child
);
1637 /* close the read-end of the pipe */
1643 bool lxc_nic_exists(char *nic
)
1645 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1646 char path
[__LXC_SYS_CLASS_NET_LEN
];
1650 if (!strcmp(nic
, "none"))
1653 ret
= snprintf(path
, __LXC_SYS_CLASS_NET_LEN
, "/sys/class/net/%s", nic
);
1654 if (ret
< 0 || (size_t)ret
>= __LXC_SYS_CLASS_NET_LEN
)
1657 ret
= stat(path
, &sb
);
/*
 * Round @n up to the nearest power of two.
 *
 * Returns @n itself if it already is a power of two, otherwise the next
 * larger one. 0 is returned for @n == 0 and when the result would not fit
 * into 64 bits (@n > 2^63).
 *
 * NOTE(review): only the signature and the comment below survive in this
 * listing; the body is reconstructed from the upstream function - confirm.
 */
uint64_t lxc_find_next_power2(uint64_t n)
{
	/* 0 is not valid input. We return 0 to the caller since 0 is not a
	 * valid power of two.
	 */
	if (n == 0)
		return 0;

	/* Exactly one bit set: already a power of two. */
	if (!(n & (n - 1)))
		return n;

	/* Clear the lowest set bit until only the highest one is left ... */
	while (n & (n - 1))
		n = n & (n - 1);

	/* ... then move one power up. */
	n = n << 1;
	return n;
}
1682 int lxc_set_death_signal(int signal
, pid_t parent
)
1687 ret
= prctl(PR_SET_PDEATHSIG
, prctl_arg(signal
), prctl_arg(0),
1688 prctl_arg(0), prctl_arg(0));
1690 /* Check whether we have been orphaned. */
1691 ppid
= (pid_t
)syscall(SYS_getppid
);
1692 if (ppid
!= parent
) {
1693 ret
= raise(SIGKILL
);
/*
 * Set (@cloexec == true) or clear (@cloexec == false) the close-on-exec
 * flag on @fd.
 *
 * The descriptor flags are only written back when they actually change.
 *
 * Returns 0 on success and -errno if fcntl() fails.
 */
int fd_cloexec(int fd, bool cloexec)
{
	int oflags, nflags;

	oflags = fcntl(fd, F_GETFD, 0);
	if (oflags < 0)
		return -errno;

	nflags = cloexec ? (oflags | FD_CLOEXEC) : (oflags & ~FD_CLOEXEC);
	if (nflags == oflags)
		return 0;

	if (fcntl(fd, F_SETFD, nflags) < 0)
		return -errno;

	return 0;
}
/*
 * Remove the directory tree rooted at @dirname.
 *
 * Only directories are descended into and removed with rmdir(); entries
 * that are not directories are left alone, so a tree that still contains
 * such entries fails at the rmdir() stage.
 *
 * The walk continues after a failure; only the first failure is logged.
 *
 * Returns 0 if everything was removed, -1 if any step failed.
 */
int recursive_destroy(char *dirname)
{
	int ret;
	struct dirent *direntp;
	DIR *dir;
	int r = 0;

	dir = opendir(dirname);
	if (!dir) {
		SYSERROR("Failed to open dir \"%s\"", dirname);
		return -1;
	}

	while ((direntp = readdir(dir))) {
		char *pathname;
		struct stat mystat;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		pathname = must_make_path(dirname, direntp->d_name, NULL);

		ret = lstat(pathname, &mystat);
		if (ret < 0) {
			if (!r)
				SYSWARN("Failed to stat \"%s\"", pathname);

			r = -1;
		} else if (S_ISDIR(mystat.st_mode)) {
			ret = recursive_destroy(pathname);
			if (ret < 0)
				r = -1;
		}

		free(pathname);
	}

	ret = rmdir(dirname);
	if (ret < 0) {
		if (!r)
			SYSWARN("Failed to delete \"%s\"", dirname);

		r = -1;
	}

	ret = closedir(dir);
	if (ret < 0) {
		/* This is a close failure, not a delete failure. */
		if (!r)
			SYSWARN("Failed to close directory \"%s\"", dirname);

		r = -1;
	}

	return r;
}
1788 int lxc_setup_keyring(void)
1790 key_serial_t keyring
;
1793 /* Try to allocate a new session keyring for the container to prevent
1794 * information leaks.
1796 keyring
= keyctl(KEYCTL_JOIN_SESSION_KEYRING
, prctl_arg(0),
1797 prctl_arg(0), prctl_arg(0), prctl_arg(0));
1801 DEBUG("The keyctl() syscall is not supported or blocked");
1806 DEBUG("Failed to access kernel keyring. Continuing...");
1809 SYSERROR("Failed to create kernel keyring");