1 /* SPDX-License-Identifier: LGPL-2.1+ */
6 #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */
21 #include <sys/mount.h>
22 #include <sys/param.h>
23 #include <sys/prctl.h>
25 #include <sys/types.h>
33 #include "memory_utils.h"
34 #include "namespace.h"
35 #include "open_utils.h"
37 #include "process_utils.h"
38 #include "syscall_wrappers.h"
50 #define O_PATH 010000000
54 #define O_NOFOLLOW 00400000
57 lxc_log_define(utils
, lxc
);
60 * if path is btrfs, tries to remove it and any subvolumes beneath it
62 extern bool btrfs_try_remove_subvol(const char *path
);
64 static int _recursive_rmdir(const char *dirname
, dev_t pdev
,
65 const char *exclude
, int level
, bool onedev
)
67 __do_closedir
DIR *dir
= NULL
;
69 bool hadexclude
= false;
71 struct dirent
*direntp
;
72 char pathname
[PATH_MAX
];
74 dir
= opendir(dirname
);
76 return log_error(-1, "Failed to open \"%s\"", dirname
);
78 while ((direntp
= readdir(dir
))) {
82 if (strequal(direntp
->d_name
, ".") ||
83 strequal(direntp
->d_name
, ".."))
86 rc
= strnprintf(pathname
, sizeof(pathname
), "%s/%s", dirname
, direntp
->d_name
);
88 ERROR("The name of path is too long");
93 if (!level
&& exclude
&& strequal(direntp
->d_name
, exclude
)) {
94 ret
= rmdir(pathname
);
98 INFO("Not deleting snapshot \"%s\"", pathname
);
102 ret
= unlink(pathname
);
104 INFO("Failed to remove \"%s\"", pathname
);
107 SYSERROR("Failed to rmdir \"%s\"", pathname
);
116 ret
= lstat(pathname
, &mystat
);
118 SYSERROR("Failed to stat \"%s\"", pathname
);
123 if (onedev
&& mystat
.st_dev
!= pdev
) {
124 if (btrfs_try_remove_subvol(pathname
))
125 INFO("Removed btrfs subvolume at \"%s\"", pathname
);
129 if (S_ISDIR(mystat
.st_mode
)) {
130 if (_recursive_rmdir(pathname
, pdev
, exclude
, level
+ 1, onedev
) < 0)
133 ret
= unlink(pathname
);
135 __do_close
int fd
= -EBADF
;
137 fd
= open(pathname
, O_RDONLY
| O_CLOEXEC
| O_NONBLOCK
);
139 /* The file might be marked immutable. */
141 ret
= ioctl(fd
, FS_IOC_GETFLAGS
, &attr
);
143 SYSERROR("Failed to retrieve file flags");
144 attr
&= ~FS_IMMUTABLE_FL
;
145 ret
= ioctl(fd
, FS_IOC_SETFLAGS
, &attr
);
147 SYSERROR("Failed to set file flags");
150 ret
= unlink(pathname
);
152 SYSERROR("Failed to delete \"%s\"", pathname
);
159 if (rmdir(dirname
) < 0 && !btrfs_try_remove_subvol(dirname
) && !hadexclude
) {
160 SYSERROR("Failed to delete \"%s\"", dirname
);
164 return failed
? -1 : 0;
168 * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the
169 * lxc_rmdir_onedev().
171 static inline bool is_native_overlayfs(const char *path
)
173 return has_fs_type(path
, OVERLAY_SUPER_MAGIC
) ||
174 has_fs_type(path
, OVERLAYFS_SUPER_MAGIC
);
177 /* returns 0 on success, -1 if there were any failures */
178 extern int lxc_rmdir_onedev(const char *path
, const char *exclude
)
183 if (is_native_overlayfs(path
))
186 if (lstat(path
, &mystat
) < 0) {
190 return log_error_errno(-1, errno
, "Failed to stat \"%s\"", path
);
193 return _recursive_rmdir(path
, mystat
.st_dev
, exclude
, 0, onedev
);
196 /* borrowed from iproute2 */
197 extern int get_u16(unsigned short *val
, const char *arg
, int base
)
203 return ret_errno(EINVAL
);
206 res
= strtoul(arg
, &ptr
, base
);
207 if (!ptr
|| ptr
== arg
|| *ptr
|| res
> 0xFFFF || errno
!= 0)
208 return ret_errno(ERANGE
);
215 int mkdir_p(const char *dir
, mode_t mode
)
217 const char *tmp
= dir
;
218 const char *orig
= dir
;
220 if (access(dir
, F_OK
) != -1)
224 __do_free
char *makeme
= NULL
;
227 dir
= tmp
+ strspn(tmp
, "/");
228 tmp
= dir
+ strcspn(dir
, "/");
230 makeme
= strndup(orig
, dir
- orig
);
232 return ret_set_errno(-1, ENOMEM
);
234 ret
= mkdir(makeme
, mode
);
235 if (ret
< 0 && errno
!= EEXIST
)
236 return log_error_errno(-1, errno
, "Failed to create directory \"%s\"", makeme
);
238 } while (tmp
!= dir
);
243 char *get_rundir(void)
245 __do_free
char *rundir
= NULL
;
252 if (stat(RUNTIME_PATH
, &sb
) < 0)
255 if (geteuid() == sb
.st_uid
|| getegid() == sb
.st_gid
)
256 return strdup(RUNTIME_PATH
);
258 static_rundir
= getenv("XDG_RUNTIME_DIR");
260 return strdup(static_rundir
);
262 INFO("XDG_RUNTIME_DIR isn't set in the environment");
263 homedir
= getenv("HOME");
265 return log_error(NULL
, "HOME isn't set in the environment");
267 len
= strlen(homedir
) + 17;
268 rundir
= malloc(sizeof(char) * len
);
272 ret
= strnprintf(rundir
, len
, "%s/.cache/lxc/run/", homedir
);
274 return ret_set_errno(NULL
, EIO
);
276 return move_ptr(rundir
);
279 int wait_for_pid(pid_t pid
)
284 ret
= waitpid(pid
, &status
, 0);
295 if (!WIFEXITED(status
) || WEXITSTATUS(status
) != 0)
301 int wait_for_pidfd(int pidfd
)
309 ret
= waitid(P_PIDFD
, pidfd
, &info
, __WALL
| WEXITED
);
310 } while (ret
< 0 && errno
== EINTR
);
312 return !ret
&& WIFEXITED(info
.si_status
) && WEXITSTATUS(info
.si_status
) == 0;
315 int lxc_wait_for_pid_status(pid_t pid
)
320 ret
= waitpid(pid
, &status
, 0);
334 bool wait_exited(pid_t pid
)
338 status
= lxc_wait_for_pid_status(pid
);
340 return log_error(false, "Failed to reap on child process %d", pid
);
341 if (WIFSIGNALED(status
))
342 return log_error(false, "Child process %d terminated by signal %d", pid
, WTERMSIG(status
));
343 if (!WIFEXITED(status
))
344 return log_error(false, "Child did not termiate correctly");
345 if (WEXITSTATUS(status
))
346 return log_error(false, "Child terminated with error %d", WEXITSTATUS(status
));
348 TRACE("Reaped child process %d", pid
);
353 #include <openssl/evp.h>
355 static int do_sha1_hash(const char *buf
, int buflen
, unsigned char *md_value
,
356 unsigned int *md_len
)
361 md
= EVP_get_digestbyname("sha1");
363 return log_error(-1, "Unknown message digest: sha1\n");
365 mdctx
= EVP_MD_CTX_create();
366 EVP_DigestInit_ex(mdctx
, md
, NULL
);
367 EVP_DigestUpdate(mdctx
, buf
, buflen
);
368 EVP_DigestFinal_ex(mdctx
, md_value
, md_len
);
369 EVP_MD_CTX_destroy(mdctx
);
374 int sha1sum_file(char *fnam
, unsigned char *digest
, unsigned int *md_len
)
376 __do_free
char *buf
= NULL
;
377 __do_fclose
FILE *f
= NULL
;
385 f
= fopen_cloexec(fnam
, "r");
387 return log_error_errno(-1, errno
, "Failed to open template \"%s\"", fnam
);
389 if (fseek(f
, 0, SEEK_END
) < 0)
390 return log_error_errno(-1, errno
, "Failed to seek to end of template");
394 return log_error_errno(-1, errno
, "Failed to tell size of template");
396 if (fseek(f
, 0, SEEK_SET
) < 0)
397 return log_error_errno(-1, errno
, "Failed to seek to start of template");
399 buf
= malloc(flen
+ 1);
401 return log_error_errno(-1, ENOMEM
, "Out of memory");
403 nbytes
= fread(buf
, 1, flen
, f
);
404 if (nbytes
< 0 || nbytes
!= flen
)
405 return log_error_errno(-1, errno
, "Failed to read template");
408 ret
= do_sha1_hash(buf
, flen
, (void *)digest
, md_len
);
413 struct lxc_popen_FILE
*lxc_popen(const char *command
)
418 struct lxc_popen_FILE
*fp
= NULL
;
420 ret
= pipe2(pipe_fds
, O_CLOEXEC
);
433 /* duplicate stdout */
434 if (pipe_fds
[1] != STDOUT_FILENO
)
435 ret
= dup2(pipe_fds
[1], STDOUT_FILENO
);
437 ret
= fcntl(pipe_fds
[1], F_SETFD
, 0);
443 /* duplicate stderr */
444 if (pipe_fds
[1] != STDERR_FILENO
)
445 ret
= dup2(pipe_fds
[1], STDERR_FILENO
);
447 ret
= fcntl(pipe_fds
[1], F_SETFD
, 0);
452 /* unblock all signals */
453 ret
= sigfillset(&mask
);
457 ret
= pthread_sigmask(SIG_UNBLOCK
, &mask
, NULL
);
461 /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */
462 if (file_exists("/bin/sh"))
463 execl("/bin/sh", "sh", "-c", command
, (char *)NULL
);
465 execl("/system/bin/sh", "sh", "-c", command
, (char *)NULL
);
473 fp
= malloc(sizeof(*fp
));
477 memset(fp
, 0, sizeof(*fp
));
479 fp
->child_pid
= child_pid
;
480 fp
->pipe
= pipe_fds
[0];
482 /* From now on, closing fp->f will also close fp->pipe. So only ever
483 * call fclose(fp->f).
485 fp
->f
= fdopen(pipe_fds
[0], "r");
492 /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't
493 * called yet. Otherwise the fd belongs to the file opened by fdopen()
494 * since it isn't dup()ed.
496 if (fp
&& !fp
->f
&& pipe_fds
[0] >= 0)
499 if (pipe_fds
[1] >= 0)
511 int lxc_pclose(struct lxc_popen_FILE
*fp
)
520 wait_pid
= waitpid(fp
->child_pid
, &wstatus
, 0);
521 } while (wait_pid
< 0 && errno
== EINTR
);
532 int randseed(bool srand_it
)
534 __do_fclose
FILE *f
= NULL
;
536 * srand pre-seed function based on /dev/urandom
538 unsigned int seed
= time(NULL
) + getpid();
540 f
= fopen("/dev/urandom", "re");
542 int ret
= fread(&seed
, sizeof(seed
), 1, f
);
544 SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed");
553 uid_t
get_ns_uid(uid_t orig
)
555 __do_free
char *line
= NULL
;
556 __do_fclose
FILE *f
= NULL
;
558 uid_t nsid
, hostid
, range
;
560 f
= fopen("/proc/self/uid_map", "re");
562 return log_error_errno(0, errno
, "Failed to open uid_map");
564 while (getline(&line
, &sz
, f
) != -1) {
565 if (sscanf(line
, "%u %u %u", &nsid
, &hostid
, &range
) != 3)
568 if (hostid
<= orig
&& hostid
+ range
> orig
)
569 return nsid
+= orig
- hostid
;
572 return LXC_INVALID_UID
;
575 gid_t
get_ns_gid(gid_t orig
)
577 __do_free
char *line
= NULL
;
578 __do_fclose
FILE *f
= NULL
;
580 gid_t nsid
, hostid
, range
;
582 f
= fopen("/proc/self/gid_map", "re");
584 return log_error_errno(0, errno
, "Failed to open gid_map");
586 while (getline(&line
, &sz
, f
) != -1) {
587 if (sscanf(line
, "%u %u %u", &nsid
, &hostid
, &range
) != 3)
590 if (hostid
<= orig
&& hostid
+ range
> orig
)
591 return nsid
+= orig
- hostid
;
594 return LXC_INVALID_GID
;
597 bool dir_exists(const char *path
)
599 return exists_dir_at(-1, path
);
602 /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
603 * FNV has good anti collision properties and we're not worried
604 * about pre-image resistance or one-way-ness, we're just trying to make
605 * the name unique in the 108 bytes of space we have.
607 uint64_t fnv_64a_buf(void *buf
, size_t len
, uint64_t hval
)
611 for(bp
= buf
; bp
< (unsigned char *)buf
+ len
; bp
++) {
612 /* xor the bottom with the current octet */
613 hval
^= (uint64_t)*bp
;
616 * multiply by the 64 bit FNV magic prime mod 2^64
618 hval
+= (hval
<< 1) + (hval
<< 4) + (hval
<< 5) +
619 (hval
<< 7) + (hval
<< 8) + (hval
<< 40);
625 bool is_shared_mountpoint(const char *path
)
627 __do_fclose
FILE *f
= NULL
;
628 __do_free
char *line
= NULL
;
632 f
= fopen("/proc/self/mountinfo", "re");
636 while (getline(&line
, &len
, f
) > 0) {
637 char *slider1
, *slider2
;
639 for (slider1
= line
, i
= 0; slider1
&& i
< 4; i
++)
640 slider1
= strchr(slider1
+ 1, ' ');
645 slider2
= strchr(slider1
+ 1, ' ');
650 if (strequal(slider1
+ 1, path
)) {
651 /* This is the path. Is it shared? */
652 slider1
= strchr(slider2
+ 1, ' ');
653 if (slider1
&& strstr(slider1
, "shared:"))
662 * Detect whether / is mounted MS_SHARED. The only way I know of to
663 * check that is through /proc/self/mountinfo.
664 * I'm only checking for /. If the container rootfs or mount location
665 * is MS_SHARED, but not '/', then you're out of luck - figuring that
666 * out would be too much work to be worth it.
668 int detect_shared_rootfs(void)
670 if (is_shared_mountpoint("/"))
676 bool switch_to_ns(pid_t pid
, const char *ns
)
678 __do_close
int fd
= -EBADF
;
680 char nspath
[STRLITERALLEN("/proc//ns/")
681 + INTTYPE_TO_STRLEN(pid_t
)
682 + LXC_NAMESPACE_NAME_MAX
];
684 /* Switch to new ns */
685 ret
= strnprintf(nspath
, sizeof(nspath
), "/proc/%d/ns/%s", pid
, ns
);
689 fd
= open(nspath
, O_RDONLY
| O_CLOEXEC
);
691 return log_error_errno(false, errno
, "Failed to open \"%s\"", nspath
);
695 return log_error_errno(false, errno
, "Failed to set process %d to \"%s\" of %d", pid
, ns
, fd
);
701 * looking at fs/proc_namespace.c, it appears we can
702 * actually expect the rootfs entry to very specifically contain
703 * " - rootfs rootfs "
704 * IIUC, so long as we've chrooted so that rootfs is not our root,
705 * the rootfs entry should always be skipped in mountinfo contents.
707 bool detect_ramfs_rootfs(void)
709 __do_free
char *line
= NULL
;
710 __do_free
void *fopen_cache
= NULL
;
711 __do_fclose
FILE *f
= NULL
;
714 f
= fopen_cached("/proc/self/mountinfo", "re", &fopen_cache
);
718 while (getline(&line
, &len
, f
) != -1) {
722 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
723 p
= strchr(p
+ 1, ' ');
727 p2
= strchr(p
+ 1, ' ');
731 if (strequal(p
+ 1, "/")) {
732 /* This is '/'. Is it the ramfs? */
733 p
= strchr(p2
+ 1, '-');
734 if (p
&& strnequal(p
, "- rootfs ", 9))
742 char *on_path(const char *cmd
, const char *rootfs
)
744 __do_free
char *path
= NULL
;
746 char cmdpath
[PATH_MAX
];
749 path
= getenv("PATH");
757 lxc_iterate_parts(entry
, path
, ":") {
759 ret
= strnprintf(cmdpath
, sizeof(cmdpath
), "%s/%s/%s", rootfs
, entry
, cmd
);
761 ret
= strnprintf(cmdpath
, sizeof(cmdpath
), "%s/%s", entry
, cmd
);
765 if (access(cmdpath
, X_OK
) == 0)
766 return strdup(cmdpath
);
772 /* historically lxc-init has been under /usr/lib/lxc and under
773 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
775 char *choose_init(const char *rootfs
)
778 const char *empty
= "",
780 int ret
, env_set
= 0;
782 if (!getenv("PATH")) {
783 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
784 SYSERROR("Failed to setenv");
789 retv
= on_path("init.lxc", rootfs
);
792 if (unsetenv("PATH"))
793 SYSERROR("Failed to unsetenv");
798 retv
= malloc(PATH_MAX
);
807 ret
= strnprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, SBINDIR
, "/init.lxc");
809 ERROR("The name of path is too long");
813 if (access(retv
, X_OK
) == 0)
816 ret
= strnprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, LXCINITDIR
, "/lxc/lxc-init");
818 ERROR("The name of path is too long");
822 if (access(retv
, X_OK
) == 0)
825 ret
= strnprintf(retv
, PATH_MAX
, "%s/usr/lib/lxc/lxc-init", tmp
);
827 ERROR("The name of path is too long");
831 if (access(retv
, X_OK
) == 0)
834 ret
= strnprintf(retv
, PATH_MAX
, "%s/sbin/lxc-init", tmp
);
836 ERROR("The name of path is too long");
840 if (access(retv
, X_OK
) == 0)
844 * Last resort, look for the statically compiled init.lxc which we
845 * hopefully bind-mounted in.
846 * If we are called during container setup, and we get to this point,
847 * then the init.lxc.static from the host will need to be bind-mounted
848 * in. So we return NULL here to indicate that.
853 ret
= strnprintf(retv
, PATH_MAX
, "/init.lxc.static");
855 WARN("Nonsense - name /lxc.init.static too long");
859 if (access(retv
, X_OK
) == 0)
868 * Given the '-t' template option to lxc-create, figure out what to
869 * do. If the template is a full executable path, use that. If it
870 * is something like 'sshd', then return $templatepath/lxc-sshd.
871 * On success return the template, on error return NULL.
873 char *get_template_path(const char *t
)
879 if (access(t
, X_OK
) == 0) {
882 SYSERROR("Bad template pathname: %s", t
);
887 len
= strlen(LXCTEMPLATEDIR
) + strlen(t
) + strlen("/lxc-") + 1;
893 ret
= strnprintf(tpath
, len
, "%s/lxc-%s", LXCTEMPLATEDIR
, t
);
899 if (access(tpath
, X_OK
) < 0) {
900 SYSERROR("bad template: %s", t
);
909 * @path: a pathname where / replaced with '\0'.
910 * @offsetp: pointer to int showing which path segment was last seen.
911 * Updated on return to reflect the next segment.
912 * @fulllen: full original path length.
913 * Returns a pointer to the next path segment, or NULL if done.
915 static char *get_nextpath(char *path
, int *offsetp
, int fulllen
)
917 int offset
= *offsetp
;
919 if (offset
>= fulllen
)
922 while (offset
< fulllen
&& path
[offset
] != '\0')
925 while (offset
< fulllen
&& path
[offset
] == '\0')
930 return (offset
< fulllen
) ? &path
[offset
] : NULL
;
934 * Check that @subdir is a subdir of @dir. @len is the length of
935 * @dir (to avoid having to recalculate it).
937 static bool is_subdir(const char *subdir
, const char *dir
, size_t len
)
939 size_t subdirlen
= strlen(subdir
);
944 if (!strnequal(subdir
, dir
, len
))
947 if (dir
[len
-1] == '/')
950 if (subdir
[len
] == '/' || subdirlen
== len
)
957 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
958 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
960 static int check_symlink(int fd
)
965 ret
= fstat(fd
, &sb
);
969 if (S_ISLNK(sb
.st_mode
))
976 * Open a file or directory, provided that it contains no symlinks.
978 * CAVEAT: This function must not be used for other purposes than container
979 * setup before executing the container's init
981 static int open_if_safe(int dirfd
, const char *nextpath
)
983 int newfd
= openat(dirfd
, nextpath
, O_RDONLY
| O_NOFOLLOW
);
984 if (newfd
>= 0) /* Was not a symlink, all good. */
990 if (errno
== EPERM
|| errno
== EACCES
) {
991 /* We're not root (cause we got EPERM) so try opening with
994 newfd
= openat(dirfd
, nextpath
, O_PATH
| O_NOFOLLOW
);
996 /* O_PATH will return an fd for symlinks. We know
997 * nextpath wasn't a symlink at last openat, so if fd is
998 * now a link, then something * fishy is going on.
1000 int ret
= check_symlink(newfd
);
1012 * Open a path intending for mounting, ensuring that the final path
1013 * is inside the container's rootfs.
1015 * CAVEAT: This function must not be used for other purposes than container
1016 * setup before executing the container's init
1018 * @target: path to be opened
1019 * @prefix_skip: a part of @target in which to ignore symbolic links. This
1020 * would be the container's rootfs.
1022 * Return an open fd for the path, or <0 on error.
1024 static int open_without_symlink(const char *target
, const char *prefix_skip
)
1026 int curlen
= 0, dirfd
, fulllen
, i
;
1029 fulllen
= strlen(target
);
1031 /* make sure prefix-skip makes sense */
1032 if (prefix_skip
&& strlen(prefix_skip
) > 0) {
1033 curlen
= strlen(prefix_skip
);
1034 if (!is_subdir(target
, prefix_skip
, curlen
)) {
1035 ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
1036 target
, prefix_skip
);
1041 * get_nextpath() expects the curlen argument to be
1042 * on a (turned into \0) / or before it, so decrement
1043 * curlen to make sure that happens
1052 /* Make a copy of target which we can hack up, and tokenize it */
1053 if ((dup
= strdup(target
)) == NULL
) {
1054 ERROR("Out of memory checking for symbolic link");
1058 for (i
= 0; i
< fulllen
; i
++) {
1063 dirfd
= open(prefix_skip
, O_RDONLY
);
1065 SYSERROR("Failed to open path \"%s\"", prefix_skip
);
1070 int newfd
, saved_errno
;
1073 if ((nextpath
= get_nextpath(dup
, &curlen
, fulllen
)) == NULL
)
1076 newfd
= open_if_safe(dirfd
, nextpath
);
1077 saved_errno
= errno
;
1082 errno
= saved_errno
;
1084 SYSERROR("%s in %s was a symbolic link!", nextpath
, target
);
1095 int __safe_mount_beneath_at(int beneath_fd
, const char *src
, const char *dst
, const char *fstype
,
1096 unsigned int flags
, const void *data
)
1098 __do_close
int source_fd
= -EBADF
, target_fd
= -EBADF
;
1099 struct open_how how
= {
1100 .flags
= PROTECT_OPATH_DIRECTORY
,
1101 .resolve
= PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS
,
1104 char src_buf
[LXC_PROC_PID_FD_LEN
], tgt_buf
[LXC_PROC_PID_FD_LEN
];
1109 if ((flags
& MS_BIND
) && src
&& src
[0] != '/') {
1110 source_fd
= openat2(beneath_fd
, src
, &how
, sizeof(how
));
1113 ret
= strnprintf(src_buf
, sizeof(src_buf
), "/proc/self/fd/%d", source_fd
);
1120 target_fd
= openat2(beneath_fd
, dst
, &how
, sizeof(how
));
1122 return log_error_errno(-errno
, errno
, "Failed to open %d(%s)", beneath_fd
, dst
);
1123 ret
= strnprintf(tgt_buf
, sizeof(tgt_buf
), "/proc/self/fd/%d", target_fd
);
1127 if (!is_empty_string(src_buf
))
1128 ret
= mount(src_buf
, tgt_buf
, fstype
, flags
, data
);
1130 ret
= mount(src
, tgt_buf
, fstype
, flags
, data
);
1135 int safe_mount_beneath(const char *beneath
, const char *src
, const char *dst
, const char *fstype
,
1136 unsigned int flags
, const void *data
)
1138 __do_close
int beneath_fd
= -EBADF
;
1139 const char *path
= beneath
? beneath
: "/";
1141 beneath_fd
= openat(-1, path
, PROTECT_OPATH_DIRECTORY
);
1143 return log_error_errno(-errno
, errno
, "Failed to open %s", path
);
1145 return __safe_mount_beneath_at(beneath_fd
, src
, dst
, fstype
, flags
, data
);
1148 int safe_mount_beneath_at(int beneath_fd
, const char *src
, const char *dst
, const char *fstype
,
1149 unsigned int flags
, const void *data
)
1151 return __safe_mount_beneath_at(beneath_fd
, src
, dst
, fstype
, flags
, data
);
1155 * Safely mount a path into a container, ensuring that the mount target
1156 * is under the container's @rootfs. (If @rootfs is NULL, then the container
1157 * uses the host's /)
1159 * CAVEAT: This function must not be used for other purposes than container
1160 * setup before executing the container's init
1162 int safe_mount(const char *src
, const char *dest
, const char *fstype
,
1163 unsigned long flags
, const void *data
, const char *rootfs
)
1165 int destfd
, ret
, saved_errno
;
1166 /* Only needs enough for /proc/self/fd/<fd>. */
1167 char srcbuf
[50], destbuf
[50];
1169 const char *mntsrc
= src
;
1174 /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
1175 if (flags
& MS_BIND
&& src
&& src
[0] != '/') {
1176 INFO("This is a relative bind mount");
1178 srcfd
= open_without_symlink(src
, NULL
);
1182 ret
= strnprintf(srcbuf
, sizeof(srcbuf
), "/proc/self/fd/%d", srcfd
);
1185 ERROR("Out of memory");
1191 destfd
= open_without_symlink(dest
, rootfs
);
1194 saved_errno
= errno
;
1196 errno
= saved_errno
;
1202 ret
= strnprintf(destbuf
, sizeof(destbuf
), "/proc/self/fd/%d", destfd
);
1208 ERROR("Out of memory");
1212 ret
= mount(mntsrc
, destbuf
, fstype
, flags
, data
);
1213 saved_errno
= errno
;
1219 errno
= saved_errno
;
1220 SYSERROR("Failed to mount \"%s\" onto \"%s\"", src
? src
: "(null)", dest
);
1227 int open_devnull(void)
1229 int fd
= open("/dev/null", O_RDWR
);
1231 SYSERROR("Can't open /dev/null");
1236 int set_stdfds(int fd
)
1243 ret
= dup2(fd
, STDIN_FILENO
);
1247 ret
= dup2(fd
, STDOUT_FILENO
);
1251 ret
= dup2(fd
, STDERR_FILENO
);
1258 int null_stdfds(void)
1263 fd
= open_devnull();
1265 ret
= set_stdfds(fd
);
1272 /* Check whether a signal is blocked by a process. */
1273 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1274 #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1)
1275 bool task_blocks_signal(pid_t pid
, int signal
)
1277 __do_free
char *line
= NULL
;
1278 __do_fclose
FILE *f
= NULL
;
1280 char status
[__PROC_STATUS_LEN
] = {0};
1281 uint64_t sigblk
= 0, one
= 1;
1285 ret
= strnprintf(status
, sizeof(status
), "/proc/%d/status", pid
);
1289 f
= fopen(status
, "re");
1293 while (getline(&line
, &n
, f
) != -1) {
1296 if (!strnequal(line
, "SigBlk:", 7))
1299 numstr
= lxc_trim_whitespace_in_place(line
+ 7);
1300 ret
= lxc_safe_uint64(numstr
, &sigblk
, 16);
1307 if (sigblk
& (one
<< (signal
- 1)))
1313 int lxc_preserve_ns(const int pid
, const char *ns
)
1316 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1317 #define __NS_PATH_LEN 50
1318 char path
[__NS_PATH_LEN
];
1320 /* This way we can use this function to also check whether namespaces
1321 * are supported by the kernel by passing in the NULL or the empty
1324 ret
= strnprintf(path
, sizeof(path
), "/proc/%d/ns%s%s", pid
,
1325 !ns
|| strequal(ns
, "") ? "" : "/",
1326 !ns
|| strequal(ns
, "") ? "" : ns
);
1328 return ret_errno(EIO
);
1330 return open(path
, O_RDONLY
| O_CLOEXEC
);
1333 bool lxc_switch_uid_gid(uid_t uid
, gid_t gid
)
1337 if (gid
!= LXC_INVALID_GID
) {
1338 ret
= setresgid(gid
, gid
, gid
);
1340 SYSERROR("Failed to switch to gid %d", gid
);
1343 NOTICE("Switched to gid %d", gid
);
1346 if (uid
!= LXC_INVALID_UID
) {
1347 ret
= setresuid(uid
, uid
, uid
);
1349 SYSERROR("Failed to switch to uid %d", uid
);
1352 NOTICE("Switched to uid %d", uid
);
1358 /* Simple convenience function which enables uniform logging. */
1359 bool lxc_drop_groups(void)
1363 ret
= setgroups(0, NULL
);
1365 return log_error_errno(false, errno
, "Failed to drop supplimentary groups");
1367 NOTICE("Dropped supplimentary groups");
1371 bool lxc_setgroups(gid_t list
[], size_t size
)
1375 ret
= setgroups(size
, list
);
1377 return log_error_errno(false, errno
, "Failed to set supplimentary groups");
1379 if (size
> 0 && lxc_log_trace()) {
1380 for (size_t i
= 0; i
< size
; i
++)
1381 TRACE("Setting supplimentary group %d", list
[i
]);
1384 NOTICE("Set supplimentary groups");
1388 static int lxc_get_unused_loop_dev_legacy(char *loop_name
)
1391 struct loop_info64 lo64
;
1393 int dfd
= -1, fd
= -1, ret
= -1;
1395 dir
= opendir("/dev");
1397 SYSERROR("Failed to open \"/dev\"");
1401 while ((dp
= readdir(dir
))) {
1402 if (!strnequal(dp
->d_name
, "loop", 4))
1409 fd
= openat(dfd
, dp
->d_name
, O_RDWR
);
1413 ret
= ioctl(fd
, LOOP_GET_STATUS64
, &lo64
);
1415 if (ioctl(fd
, LOOP_GET_STATUS64
, &lo64
) == 0 ||
1423 ret
= strnprintf(loop_name
, LO_NAME_SIZE
, "/dev/%s", dp
->d_name
);
1441 static int lxc_get_unused_loop_dev(char *name_loop
)
1444 int fd_ctl
= -1, fd_tmp
= -1;
1446 fd_ctl
= open("/dev/loop-control", O_RDWR
| O_CLOEXEC
);
1448 SYSERROR("Failed to open loop control");
1452 loop_nr
= ioctl(fd_ctl
, LOOP_CTL_GET_FREE
);
1454 SYSERROR("Failed to get loop control");
1458 ret
= strnprintf(name_loop
, LO_NAME_SIZE
, "/dev/loop%d", loop_nr
);
1462 fd_tmp
= open(name_loop
, O_RDWR
| O_CLOEXEC
);
1464 /* on Android loop devices are moved under /dev/block, give it a shot */
1465 ret
= strnprintf(name_loop
, LO_NAME_SIZE
, "/dev/block/loop%d", loop_nr
);
1469 fd_tmp
= open(name_loop
, O_RDWR
| O_CLOEXEC
);
1471 SYSERROR("Failed to open loop \"%s\"", name_loop
);
1479 int lxc_prepare_loop_dev(const char *source
, char *loop_dev
, int flags
)
1482 struct loop_info64 lo64
;
1483 int fd_img
= -1, fret
= -1, fd_loop
= -1;
1485 fd_loop
= lxc_get_unused_loop_dev(loop_dev
);
1487 if (fd_loop
!= -ENODEV
)
1490 fd_loop
= lxc_get_unused_loop_dev_legacy(loop_dev
);
1495 fd_img
= open(source
, O_RDWR
| O_CLOEXEC
);
1497 SYSERROR("Failed to open source \"%s\"", source
);
1501 ret
= ioctl(fd_loop
, LOOP_SET_FD
, fd_img
);
1503 SYSERROR("Failed to set loop fd");
1507 memset(&lo64
, 0, sizeof(lo64
));
1508 lo64
.lo_flags
= flags
;
1510 strlcpy((char *)lo64
.lo_file_name
, source
, LO_NAME_SIZE
);
1512 ret
= ioctl(fd_loop
, LOOP_SET_STATUS64
, &lo64
);
1514 SYSERROR("Failed to set loop status64");
1524 if (fret
< 0 && fd_loop
>= 0) {
1532 int lxc_unstack_mountpoint(const char *path
, bool lazy
)
1538 ret
= umount2(path
, lazy
? MNT_DETACH
: 0);
1540 /* We consider anything else than EINVAL deadly to prevent going
1541 * into an infinite loop. (The other alternative is constantly
1542 * parsing /proc/self/mountinfo which is yucky and probably
1545 if (errno
!= EINVAL
)
1548 /* Just stop counting when this happens. That'd just be so
1549 * stupid that we won't even bother trying to report back the
1550 * correct value anymore.
1552 if (umounts
!= INT_MAX
)
1555 /* We succeeded in umounting. Make sure that there's no other
1556 * mountpoint stacked underneath.
1564 static int run_command_internal(char *buf
, size_t buf_size
, int (*child_fn
)(void *), void *args
, bool wait_status
)
1567 int ret
, fret
, pipefd
[2];
1570 /* Make sure our callers do not receive uninitialized memory. */
1571 if (buf_size
> 0 && buf
)
1574 if (pipe(pipefd
) < 0) {
1575 SYSERROR("Failed to create pipe");
1579 child
= lxc_raw_clone(0, NULL
);
1583 SYSERROR("Failed to create new process");
1588 /* Close the read-end of the pipe. */
1591 /* Redirect std{err,out} to write-end of the
1594 ret
= dup2(pipefd
[1], STDOUT_FILENO
);
1596 ret
= dup2(pipefd
[1], STDERR_FILENO
);
1598 /* Close the write-end of the pipe. */
1602 SYSERROR("Failed to duplicate std{err,out} file descriptor");
1603 _exit(EXIT_FAILURE
);
1606 /* Does not return. */
1608 ERROR("Failed to exec command");
1609 _exit(EXIT_FAILURE
);
1612 /* close the write-end of the pipe */
1615 if (buf
&& buf_size
> 0) {
1616 bytes
= lxc_read_nointr(pipefd
[0], buf
, buf_size
- 1);
1618 buf
[bytes
- 1] = '\0';
1622 fret
= lxc_wait_for_pid_status(child
);
1624 fret
= wait_for_pid(child
);
1626 /* close the read-end of the pipe */
1632 int run_command(char *buf
, size_t buf_size
, int (*child_fn
)(void *), void *args
)
1634 return run_command_internal(buf
, buf_size
, child_fn
, args
, false);
1637 int run_command_status(char *buf
, size_t buf_size
, int (*child_fn
)(void *), void *args
)
1639 return run_command_internal(buf
, buf_size
, child_fn
, args
, true);
1642 bool lxc_nic_exists(char *nic
)
1644 #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1
1645 char path
[__LXC_SYS_CLASS_NET_LEN
];
1649 if (strequal(nic
, "none"))
1652 ret
= strnprintf(path
, sizeof(path
), "/sys/class/net/%s", nic
);
1656 ret
= stat(path
, &sb
);
1663 uint64_t lxc_find_next_power2(uint64_t n
)
1665 /* 0 is not valid input. We return 0 to the caller since 0 is not a
1666 * valid power of two.
1681 static int process_dead(/* takes */ int status_fd
)
1683 __do_close
int dupfd
= -EBADF
;
1684 __do_free
char *line
= NULL
;
1685 __do_fclose
FILE *f
= NULL
;
1689 dupfd
= dup(status_fd
);
1693 if (fd_cloexec(dupfd
, true) < 0)
1696 f
= fdopen(dupfd
, "re");
1700 /* Transfer ownership of fd. */
1704 while (getline(&line
, &n
, f
) != -1) {
1707 if (!strnequal(line
, "State:", 6))
1710 state
= lxc_trim_whitespace_in_place(line
+ 6);
1711 /* only check whether process is dead or zombie for now */
1712 if (*state
== 'X' || *state
== 'Z')
1719 int lxc_set_death_signal(int signal
, pid_t parent
, int parent_status_fd
)
1724 ret
= prctl(PR_SET_PDEATHSIG
, prctl_arg(signal
), prctl_arg(0),
1725 prctl_arg(0), prctl_arg(0));
1727 /* verify that we haven't been orphaned in the meantime */
1728 ppid
= (pid_t
)syscall(SYS_getppid
);
1729 if (ppid
== 0) { /* parent outside our pidns */
1730 if (parent_status_fd
< 0)
1733 if (process_dead(parent_status_fd
) == 1)
1734 return raise(SIGKILL
);
1735 } else if (ppid
!= parent
) {
1736 return raise(SIGKILL
);
1745 int lxc_rm_rf(const char *dirname
)
1747 __do_closedir
DIR *dir
= NULL
;
1750 struct dirent
*direntp
;
1752 dir
= opendir(dirname
);
1754 return log_error_errno(-1, errno
, "Failed to open dir \"%s\"", dirname
);
1756 while ((direntp
= readdir(dir
))) {
1757 __do_free
char *pathname
= NULL
;
1760 if (strequal(direntp
->d_name
, ".") ||
1761 strequal(direntp
->d_name
, ".."))
1764 pathname
= must_make_path(dirname
, direntp
->d_name
, NULL
);
1765 ret
= lstat(pathname
, &mystat
);
1768 SYSWARN("Failed to stat \"%s\"", pathname
);
1774 if (!S_ISDIR(mystat
.st_mode
))
1777 ret
= lxc_rm_rf(pathname
);
1782 ret
= rmdir(dirname
);
1784 return log_warn_errno(-1, errno
, "Failed to delete \"%s\"", dirname
);
1789 bool lxc_can_use_pidfd(int pidfd
)
1794 return log_trace(false, "Kernel does not support pidfds");
1797 * We don't care whether or not children were in a waitable state. We
1798 * just care whether waitid() recognizes P_PIDFD.
1800 * Btw, while I have your attention, the above waitid() code is an
1801 * excellent example of how _not_ to do flag-based kernel APIs. So if
1802 * you ever go into kernel development or are already and you add this
1803 * kind of flag potpourri even though you have read this comment shame
1804 * on you. May the gods of operating system development have mercy on
1805 * your soul because I won't.
1807 ret
= waitid(P_PIDFD
, pidfd
, NULL
,
1808 /* Type of children to wait for. */
1810 /* How to wait for them. */
1812 /* What state to wait for. */
1813 WEXITED
| WSTOPPED
| WCONTINUED
);
1815 return log_error_errno(false, errno
, "Kernel does not support waiting on processes through pidfds");
1817 ret
= lxc_raw_pidfd_send_signal(pidfd
, 0, NULL
, 0);
1819 return log_error_errno(false, errno
, "Kernel does not support sending singals through pidfds");
1821 return log_trace(true, "Kernel supports pidfds");
1824 int fix_stdio_permissions(uid_t uid
)
1826 __do_close
int devnull_fd
= -EBADF
;
1828 int std_fds
[] = {STDIN_FILENO
, STDOUT_FILENO
, STDERR_FILENO
};
1830 struct stat st
, st_null
;
1832 devnull_fd
= open_devnull();
1834 return log_trace_errno(-1, errno
, "Failed to open \"/dev/null\"");
1836 ret
= fstat(devnull_fd
, &st_null
);
1838 return log_trace_errno(-errno
, errno
, "Failed to stat \"/dev/null\"");
1840 for (size_t i
= 0; i
< ARRAY_SIZE(std_fds
); i
++) {
1841 ret
= fstat(std_fds
[i
], &st
);
1843 SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds
[i
]);
1848 if (st
.st_rdev
== st_null
.st_rdev
)
1851 ret
= fchown(std_fds
[i
], uid
, st
.st_gid
);
1853 SYSTRACE("Failed to chown standard I/O file descriptor %d to uid %d and gid %d",
1854 std_fds
[i
], uid
, st
.st_gid
);
1859 ret
= fchmod(std_fds
[i
], 0700);
1861 SYSTRACE("Failed to chmod standard I/O file descriptor %d", std_fds
[i
]);
1869 bool multiply_overflow(int64_t base
, uint64_t mult
, int64_t *res
)
1871 if (base
> 0 && base
> (int64_t)(INT64_MAX
/ mult
))
1874 if (base
< 0 && base
< (int64_t)(INT64_MIN
/ mult
))
1877 *res
= (int64_t)(base
* mult
);
1881 int print_r(int fd
, const char *path
)
1883 __do_close
int dfd
= -EBADF
, dfd_dup
= -EBADF
;
1884 __do_closedir
DIR *dir
= NULL
;
1886 struct dirent
*direntp
;
1889 if (is_empty_string(path
)) {
1890 char buf
[LXC_PROC_SELF_FD_LEN
];
1892 ret
= strnprintf(buf
, sizeof(buf
), "/proc/self/fd/%d", fd
);
1894 return ret_errno(EIO
);
1897 * O_PATH file descriptors can't be used so we need to re-open
1900 dfd
= openat(-EBADF
, buf
, O_CLOEXEC
| O_DIRECTORY
, 0);
1902 dfd
= openat(fd
, path
, O_CLOEXEC
| O_DIRECTORY
, 0);
1907 dfd_dup
= dup_cloexec(dfd
);
1911 dir
= fdopendir(dfd
);
1914 /* Transfer ownership to fdopendir(). */
1917 while ((direntp
= readdir(dir
))) {
1918 if (!strcmp(direntp
->d_name
, ".") ||
1919 !strcmp(direntp
->d_name
, ".."))
1922 ret
= fstatat(dfd_dup
, direntp
->d_name
, &st
, AT_SYMLINK_NOFOLLOW
);
1923 if (ret
< 0 && errno
!= ENOENT
)
1927 if (S_ISDIR(st
.st_mode
))
1928 ret
= print_r(dfd_dup
, direntp
->d_name
);
1930 INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n",
1931 (st
.st_mode
& ~S_IFMT
), st
.st_uid
, st
.st_gid
, dfd_dup
,
1933 if (ret
< 0 && errno
!= ENOENT
)
1937 if (is_empty_string(path
))
1938 ret
= fstatat(fd
, "", &st
, AT_NO_AUTOMOUNT
| AT_SYMLINK_NOFOLLOW
| AT_EMPTY_PATH
);
1940 ret
= fstatat(fd
, path
, &st
, AT_NO_AUTOMOUNT
| AT_SYMLINK_NOFOLLOW
);
1944 INFO("mode(%o):uid(%d):gid(%d) -> %s",
1945 (st
.st_mode
& ~S_IFMT
), st
.st_uid
, st
.st_gid
, maybe_empty(path
));