2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
38 #include <sys/mount.h>
39 #include <sys/param.h>
40 #include <sys/prctl.h>
42 #include <sys/types.h>
48 #include "namespace.h"
56 #define PR_SET_MM_MAP 14
77 #define O_PATH 010000000
81 #define O_NOFOLLOW 00400000
84 lxc_log_define(lxc_utils
, lxc
);
87 * if path is btrfs, tries to remove it and any subvolumes beneath it
89 extern bool btrfs_try_remove_subvol(const char *path
);
91 static int _recursive_rmdir(char *dirname
, dev_t pdev
,
92 const char *exclude
, int level
, bool onedev
)
94 struct dirent
*direntp
;
97 char pathname
[MAXPATHLEN
];
98 bool hadexclude
= false;
100 dir
= opendir(dirname
);
102 ERROR("%s: failed to open %s", __func__
, dirname
);
106 while ((direntp
= readdir(dir
))) {
113 if (!strcmp(direntp
->d_name
, ".") ||
114 !strcmp(direntp
->d_name
, ".."))
117 rc
= snprintf(pathname
, MAXPATHLEN
, "%s/%s", dirname
, direntp
->d_name
);
118 if (rc
< 0 || rc
>= MAXPATHLEN
) {
119 ERROR("pathname too long");
124 if (!level
&& exclude
&& !strcmp(direntp
->d_name
, exclude
)) {
125 ret
= rmdir(pathname
);
129 INFO("Not deleting snapshot %s", pathname
);
133 ret
= unlink(pathname
);
135 INFO("%s: failed to remove %s", __func__
, pathname
);
138 SYSERROR("%s: failed to rmdir %s", __func__
, pathname
);
146 ret
= lstat(pathname
, &mystat
);
148 ERROR("%s: failed to stat %s", __func__
, pathname
);
152 if (onedev
&& mystat
.st_dev
!= pdev
) {
153 /* TODO should we be checking /proc/self/mountinfo for
154 * pathname and not doing this if found? */
155 if (btrfs_try_remove_subvol(pathname
))
156 INFO("Removed btrfs subvolume at %s\n", pathname
);
159 if (S_ISDIR(mystat
.st_mode
)) {
160 if (_recursive_rmdir(pathname
, pdev
, exclude
, level
+1, onedev
) < 0)
163 if (unlink(pathname
) < 0) {
164 SYSERROR("%s: failed to delete %s", __func__
, pathname
);
170 if (rmdir(dirname
) < 0 && !btrfs_try_remove_subvol(dirname
) && !hadexclude
) {
171 ERROR("%s: failed to delete %s", __func__
, dirname
);
177 ERROR("%s: failed to close directory %s", __func__
, dirname
);
181 return failed
? -1 : 0;
184 /* we have two different magic values for overlayfs, yay */
185 #define OVERLAYFS_SUPER_MAGIC 0x794c764f
186 #define OVERLAY_SUPER_MAGIC 0x794c7630
188 * In overlayfs, st_dev is unreliable. so on overlayfs we don't do
189 * the lxc_rmdir_onedev()
191 static bool is_native_overlayfs(const char *path
)
195 if (statfs(path
, &sb
) < 0)
197 if (sb
.f_type
== OVERLAYFS_SUPER_MAGIC
||
198 sb
.f_type
== OVERLAY_SUPER_MAGIC
)
203 /* returns 0 on success, -1 if there were any failures */
204 extern int lxc_rmdir_onedev(char *path
, const char *exclude
)
209 if (is_native_overlayfs(path
)) {
213 if (lstat(path
, &mystat
) < 0) {
216 ERROR("%s: failed to stat %s", __func__
, path
);
220 return _recursive_rmdir(path
, mystat
.st_dev
, exclude
, 0, onedev
);
223 /* borrowed from iproute2 */
224 extern int get_u16(unsigned short *val
, const char *arg
, int base
)
233 res
= strtoul(arg
, &ptr
, base
);
234 if (!ptr
|| ptr
== arg
|| *ptr
|| res
> 0xFFFF || errno
!= 0)
242 extern int mkdir_p(const char *dir
, mode_t mode
)
244 const char *tmp
= dir
;
245 const char *orig
= dir
;
249 dir
= tmp
+ strspn(tmp
, "/");
250 tmp
= dir
+ strcspn(dir
, "/");
251 makeme
= strndup(orig
, dir
- orig
);
253 if (mkdir(makeme
, mode
) && errno
!= EEXIST
) {
254 SYSERROR("failed to create directory '%s'", makeme
);
270 if (geteuid() == 0) {
271 rundir
= strdup(RUNTIME_PATH
);
275 rundir
= getenv("XDG_RUNTIME_DIR");
277 rundir
= strdup(rundir
);
281 INFO("XDG_RUNTIME_DIR isn't set in the environment.");
282 homedir
= getenv("HOME");
284 ERROR("HOME isn't set in the environment.");
288 rundir
= malloc(sizeof(char) * (17 + strlen(homedir
)));
289 sprintf(rundir
, "%s/.cache/lxc/run/", homedir
);
294 int wait_for_pid(pid_t pid
)
299 ret
= waitpid(pid
, &status
, 0);
307 if (!WIFEXITED(status
) || WEXITSTATUS(status
) != 0)
312 int lxc_wait_for_pid_status(pid_t pid
)
317 ret
= waitpid(pid
, &status
, 0);
/*
 * Write up to @count bytes from @buf to @fd, restarting the write(2) call
 * whenever it fails with EINTR.
 *
 * Returns the value of the last write(2) call: the number of bytes written
 * (possibly fewer than @count), or a negative value with errno set for any
 * failure other than EINTR.
 */
ssize_t lxc_write_nointr(int fd, const void* buf, size_t count)
{
	ssize_t ret;

	/* Retry only on EINTR; every other outcome is handed to the caller. */
	do {
		ret = write(fd, buf, count);
	} while (ret < 0 && errno == EINTR);

	return ret;
}
/*
 * Read up to @count bytes from @fd into @buf, restarting the read(2) call
 * whenever it fails with EINTR.
 *
 * Returns the value of the last read(2) call: the number of bytes read
 * (0 at end of file), or a negative value with errno set for any failure
 * other than EINTR.
 */
ssize_t lxc_read_nointr(int fd, void* buf, size_t count)
{
	ssize_t ret;

	/* Retry only on EINTR; every other outcome is handed to the caller. */
	do {
		ret = read(fd, buf, count);
	} while (ret < 0 && errno == EINTR);

	return ret;
}
348 ssize_t
lxc_read_nointr_expect(int fd
, void* buf
, size_t count
, const void* expected_buf
)
351 ret
= lxc_read_nointr(fd
, buf
, count
);
354 if ((size_t)ret
!= count
)
356 if (expected_buf
&& memcmp(buf
, expected_buf
, count
) != 0) {
364 #include <gnutls/gnutls.h>
365 #include <gnutls/crypto.h>
367 __attribute__((constructor
))
368 static void gnutls_lxc_init(void)
370 gnutls_global_init();
373 int sha1sum_file(char *fnam
, unsigned char *digest
)
382 f
= fopen_cloexec(fnam
, "r");
384 SYSERROR("Error opening template");
387 if (fseek(f
, 0, SEEK_END
) < 0) {
388 SYSERROR("Error seeking to end of template");
392 if ((flen
= ftell(f
)) < 0) {
393 SYSERROR("Error telling size of template");
397 if (fseek(f
, 0, SEEK_SET
) < 0) {
398 SYSERROR("Error seeking to start of template");
402 if ((buf
= malloc(flen
+1)) == NULL
) {
403 SYSERROR("Out of memory");
407 if (fread(buf
, 1, flen
, f
) != flen
) {
408 SYSERROR("Failure reading template");
414 SYSERROR("Failre closing template");
419 ret
= gnutls_hash_fast(GNUTLS_DIG_SHA1
, buf
, flen
, (void *)digest
);
425 char** lxc_va_arg_list_to_argv(va_list ap
, size_t skip
, int do_strdup
)
428 size_t count
= 1 + skip
;
431 /* first determine size of argument list, we don't want to reallocate
436 char* arg
= va_arg(ap2
, char*);
443 result
= calloc(count
, sizeof(char*));
448 char* arg
= va_arg(ap
, char*);
451 arg
= do_strdup
? strdup(arg
) : arg
;
454 result
[count
++] = arg
;
457 /* calloc has already set last element to NULL*/
465 const char** lxc_va_arg_list_to_argv_const(va_list ap
, size_t skip
)
467 return (const char**)lxc_va_arg_list_to_argv(ap
, skip
, 0);
470 extern struct lxc_popen_FILE
*lxc_popen(const char *command
)
472 struct lxc_popen_FILE
*fp
= NULL
;
473 int parent_end
= -1, child_end
= -1;
477 int r
= pipe2(pipe_fds
, O_CLOEXEC
);
480 ERROR("pipe2 failure");
484 parent_end
= pipe_fds
[0];
485 child_end
= pipe_fds
[1];
489 if (child_pid
== 0) {
491 int child_std_end
= STDOUT_FILENO
;
493 if (child_end
!= child_std_end
) {
494 /* dup2() doesn't dup close-on-exec flag */
495 dup2(child_end
, child_std_end
);
497 /* it's safe not to close child_end here
498 * as it's marked close-on-exec anyway
502 * The descriptor is already the one we will use.
503 * But it must not be marked close-on-exec.
506 if (fcntl(child_end
, F_SETFD
, 0) != 0) {
507 SYSERROR("Failed to remove FD_CLOEXEC from fd.");
514 * This is the main/only reason
515 * why we do our lousy popen() emulation.
520 sigprocmask(SIG_UNBLOCK
, &mask
, NULL
);
523 execl("/bin/sh", "sh", "-c", command
, (char *) NULL
);
533 ERROR("fork failure");
537 fp
= calloc(1, sizeof(*fp
));
539 ERROR("failed to allocate memory");
543 fp
->f
= fdopen(parent_end
, "r");
545 ERROR("fdopen failure");
549 fp
->child_pid
= child_pid
;
558 parent_end
= -1; /* so we do not close it second time */
564 if (parent_end
!= -1)
570 extern int lxc_pclose(struct lxc_popen_FILE
*fp
)
579 child_pid
= fp
->child_pid
;
580 /* free memory (we still need to close file stream) */
585 if (!f
|| fclose(f
)) {
586 ERROR("fclose failure");
591 wait_pid
= waitpid(child_pid
, &wstatus
, 0);
592 } while (wait_pid
== -1 && errno
== EINTR
);
594 if (wait_pid
== -1) {
595 ERROR("waitpid failure");
602 char *lxc_string_replace(const char *needle
, const char *replacement
, const char *haystack
)
604 ssize_t len
= -1, saved_len
= -1;
606 size_t replacement_len
= strlen(replacement
);
607 size_t needle_len
= strlen(needle
);
609 /* should be executed exactly twice */
610 while (len
== -1 || result
== NULL
) {
616 result
= calloc(1, len
+ 1);
624 for (last_p
= (char *)haystack
, p
= strstr(last_p
, needle
); p
; last_p
= p
, p
= strstr(last_p
, needle
)) {
625 part_len
= (ssize_t
)(p
- last_p
);
626 if (result
&& part_len
> 0)
627 memcpy(&result
[len
], last_p
, part_len
);
629 if (result
&& replacement_len
> 0)
630 memcpy(&result
[len
], replacement
, replacement_len
);
631 len
+= replacement_len
;
634 part_len
= strlen(last_p
);
635 if (result
&& part_len
> 0)
636 memcpy(&result
[len
], last_p
, part_len
);
640 /* make sure we did the same thing twice,
641 * once for calculating length, the other
642 * time for copying data */
643 if (saved_len
!= len
) {
647 /* make sure we didn't overwrite any buffer,
648 * due to calloc the string should be 0-terminated */
649 if (result
[len
] != '\0') {
/*
 * Test whether @needle occurs in the NULL-terminated string array @haystack.
 *
 * A NULL @haystack is treated as an empty array. A NULL @needle never
 * matches (same convention as lxc_string_in_list()), which also keeps
 * strcmp() from being called with a NULL argument.
 *
 * Returns true on an exact strcmp() match, false otherwise.
 */
bool lxc_string_in_array(const char *needle, const char **haystack)
{
	if (!needle)
		return false;

	for (; haystack && *haystack; haystack++)
		if (!strcmp(needle, *haystack))
			return true;

	return false;
}
665 char *lxc_string_join(const char *sep
, const char **parts
, bool use_as_prefix
)
669 size_t sep_len
= strlen(sep
);
670 size_t result_len
= use_as_prefix
* sep_len
;
672 /* calculate new string length */
673 for (p
= (char **)parts
; *p
; p
++)
674 result_len
+= (p
> (char **)parts
) * sep_len
+ strlen(*p
);
676 result
= calloc(result_len
+ 1, 1);
682 for (p
= (char **)parts
; *p
; p
++) {
683 if (p
> (char **)parts
)
691 char **lxc_normalize_path(const char *path
)
695 size_t components_len
= 0;
698 components
= lxc_string_split(path
, '/');
701 for (p
= components
; *p
; p
++)
704 /* resolve '.' and '..' */
705 for (pos
= 0; pos
< components_len
; ) {
706 if (!strcmp(components
[pos
], ".") || (!strcmp(components
[pos
], "..") && pos
== 0)) {
707 /* eat this element */
708 free(components
[pos
]);
709 memmove(&components
[pos
], &components
[pos
+1], sizeof(char *) * (components_len
- pos
));
711 } else if (!strcmp(components
[pos
], "..")) {
712 /* eat this and the previous element */
713 free(components
[pos
- 1]);
714 free(components
[pos
]);
715 memmove(&components
[pos
-1], &components
[pos
+1], sizeof(char *) * (components_len
- pos
));
726 bool lxc_deslashify(char **path
)
733 parts
= lxc_normalize_path(*path
);
737 /* We'll end up here if path == "///" or path == "". */
744 n
= strcspn(*path
, "/");
756 p
= lxc_string_join("/", (const char **)parts
, **path
== '/');
765 lxc_free_array((void **)parts
, free
);
769 char *lxc_append_paths(const char *first
, const char *second
)
771 size_t len
= strlen(first
) + strlen(second
) + 1;
772 const char *pattern
= "%s%s";
775 if (second
[0] != '/') {
780 result
= calloc(1, len
);
784 snprintf(result
, len
, pattern
, first
, second
);
788 bool lxc_string_in_list(const char *needle
, const char *haystack
, char _sep
)
790 char *token
, *str
, *saveptr
= NULL
;
791 char sep
[2] = { _sep
, '\0' };
793 if (!haystack
|| !needle
)
796 str
= alloca(strlen(haystack
)+1);
797 strcpy(str
, haystack
);
798 for (; (token
= strtok_r(str
, sep
, &saveptr
)); str
= NULL
) {
799 if (strcmp(needle
, token
) == 0)
806 char **lxc_string_split(const char *string
, char _sep
)
808 char *token
, *str
, *saveptr
= NULL
;
809 char sep
[2] = {_sep
, '\0'};
810 char **tmp
= NULL
, **result
= NULL
;
811 size_t result_capacity
= 0;
812 size_t result_count
= 0;
816 return calloc(1, sizeof(char *));
818 str
= alloca(strlen(string
) + 1);
820 for (; (token
= strtok_r(str
, sep
, &saveptr
)); str
= NULL
) {
821 r
= lxc_grow_array((void ***)&result
, &result_capacity
, result_count
+ 1, 16);
824 result
[result_count
] = strdup(token
);
825 if (!result
[result_count
])
830 /* if we allocated too much, reduce it */
831 tmp
= realloc(result
, (result_count
+ 1) * sizeof(char *));
835 /* Make sure we don't return uninitialized memory. */
836 if (result_count
== 0)
841 lxc_free_array((void **)result
, free
);
846 char **lxc_string_split_and_trim(const char *string
, char _sep
)
848 char *token
, *str
, *saveptr
= NULL
;
849 char sep
[2] = { _sep
, '\0' };
850 char **result
= NULL
;
851 size_t result_capacity
= 0;
852 size_t result_count
= 0;
857 return calloc(1, sizeof(char *));
859 str
= alloca(strlen(string
)+1);
861 for (; (token
= strtok_r(str
, sep
, &saveptr
)); str
= NULL
) {
862 while (token
[0] == ' ' || token
[0] == '\t')
865 while (i
> 0 && (token
[i
- 1] == ' ' || token
[i
- 1] == '\t')) {
869 r
= lxc_grow_array((void ***)&result
, &result_capacity
, result_count
+ 1, 16);
872 result
[result_count
] = strdup(token
);
873 if (!result
[result_count
])
878 /* if we allocated too much, reduce it */
879 return realloc(result
, (result_count
+ 1) * sizeof(char *));
882 lxc_free_array((void **)result
, free
);
887 void lxc_free_array(void **array
, lxc_free_fn element_free_fn
)
890 for (p
= array
; p
&& *p
; p
++)
895 int lxc_grow_array(void ***array
, size_t* capacity
, size_t new_size
, size_t capacity_increment
)
900 /* first time around, catch some trivial mistakes of the user
901 * only initializing one of these */
902 if (!*array
|| !*capacity
) {
907 new_capacity
= *capacity
;
908 while (new_size
+ 1 > new_capacity
)
909 new_capacity
+= capacity_increment
;
910 if (new_capacity
!= *capacity
) {
911 /* we have to reallocate */
912 new_array
= realloc(*array
, new_capacity
* sizeof(void *));
915 memset(&new_array
[*capacity
], 0, (new_capacity
- (*capacity
)) * sizeof(void *));
917 *capacity
= new_capacity
;
920 /* array has sufficient elements */
/*
 * Count the entries of the NULL-terminated pointer array @array.
 *
 * The terminating NULL is not counted. A NULL @array yields 0.
 */
size_t lxc_array_len(void **array)
{
	void **p;
	size_t result = 0;

	for (p = array; p && *p; p++)
		result++;

	return result;
}
935 int lxc_write_to_file(const char *filename
, const void* buf
, size_t count
, bool add_newline
)
940 fd
= open(filename
, O_WRONLY
| O_TRUNC
| O_CREAT
| O_CLOEXEC
, 0666);
943 ret
= lxc_write_nointr(fd
, buf
, count
);
946 if ((size_t)ret
!= count
)
949 ret
= lxc_write_nointr(fd
, "\n", 1);
963 int lxc_read_from_file(const char *filename
, void* buf
, size_t count
)
965 int fd
= -1, saved_errno
;
968 fd
= open(filename
, O_RDONLY
| O_CLOEXEC
);
972 if (!buf
|| !count
) {
975 while ((ret
= read(fd
, buf2
, 100)) > 0)
980 memset(buf
, 0, count
);
981 ret
= read(fd
, buf
, count
);
985 ERROR("read %s: %s", filename
, strerror(errno
));
993 void **lxc_append_null_to_array(void **array
, size_t count
)
997 /* Append NULL to the array */
999 temp
= realloc(array
, (count
+ 1) * sizeof(*array
));
1002 for (i
= 0; i
< count
; i
++)
1008 array
[count
] = NULL
;
1013 int randseed(bool srand_it
)
1016 srand pre-seed function based on /dev/urandom
1018 unsigned int seed
= time(NULL
) + getpid();
1021 f
= fopen("/dev/urandom", "r");
1023 int ret
= fread(&seed
, sizeof(seed
), 1, f
);
1025 DEBUG("unable to fread /dev/urandom, %s, fallback to time+pid rand seed", strerror(errno
));
1035 uid_t
get_ns_uid(uid_t orig
)
1039 uid_t nsid
, hostid
, range
;
1040 FILE *f
= fopen("/proc/self/uid_map", "r");
1044 while (getline(&line
, &sz
, f
) != -1) {
1045 if (sscanf(line
, "%u %u %u", &nsid
, &hostid
, &range
) != 3)
1047 if (hostid
<= orig
&& hostid
+ range
> orig
) {
1048 nsid
+= orig
- hostid
;
/*
 * Report whether @path names a directory.
 *
 * Returns true only when stat() succeeds and the mode is a directory.
 * Any stat() failure is reported as false, whatever the errno.
 */
bool dir_exists(const char *path)
{
	struct stat sb;
	int ret;

	ret = stat(path, &sb);
	if (ret < 0)
		/* could be something other than ENOENT, just say no */
		return false;

	return S_ISDIR(sb.st_mode);
}
/* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS.
 * FNV has good anti collision properties and we're not worried
 * about pre-image resistance or one-way-ness, we're just trying to make
 * the name unique in the 108 bytes of space we have.
 *
 * @buf:  start of the data to hash
 * @len:  number of bytes to hash
 * @hval: running hash value; it is xor-ed with each octet and then
 *        multiplied, so hashing can be chained across several buffers
 *
 * Returns the updated hash value. With @len == 0, @hval is returned
 * unchanged.
 */
uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	unsigned char *bp;

	for(bp = buf; bp < (unsigned char *)buf + len; bp++)
	{
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)*bp;

		/*
		 * multiply by the 64 bit FNV magic prime mod 2^64
		 * (shift-and-add form of hval *= 0x100000001b3)
		 */
		hval += (hval << 1) + (hval << 4) + (hval << 5) +
			(hval << 7) + (hval << 8) + (hval << 40);
	}

	return hval;
}
1097 * Detect whether / is mounted MS_SHARED. The only way I know of to
1098 * check that is through /proc/self/mountinfo.
1099 * I'm only checking for /. If the container rootfs or mount location
1100 * is MS_SHARED, but not '/', then you're out of luck - figuring that
1101 * out would be too much work to be worth it.
1103 int detect_shared_rootfs(void)
1105 char buf
[LXC_LINELEN
], *p
;
1110 f
= fopen("/proc/self/mountinfo", "r");
1113 while (fgets(buf
, LXC_LINELEN
, f
)) {
1114 for (p
= buf
, i
= 0; p
&& i
< 4; i
++)
1115 p
= strchr(p
+ 1, ' ');
1118 p2
= strchr(p
+ 1, ' ');
1122 if (strcmp(p
+ 1, "/") == 0) {
1123 // this is '/'. is it shared?
1124 p
= strchr(p2
+ 1, ' ');
1125 if (p
&& strstr(p
, "shared:")) {
1135 bool switch_to_ns(pid_t pid
, const char *ns
) {
1137 char nspath
[MAXPATHLEN
];
1139 /* Switch to new ns */
1140 ret
= snprintf(nspath
, MAXPATHLEN
, "/proc/%d/ns/%s", pid
, ns
);
1141 if (ret
< 0 || ret
>= MAXPATHLEN
)
1144 fd
= open(nspath
, O_RDONLY
);
1146 SYSERROR("failed to open %s", nspath
);
1152 SYSERROR("failed to set process %d to %s of %d.", pid
, ns
, fd
);
1161 * looking at fs/proc_namespace.c, it appears we can
1162 * actually expect the rootfs entry to very specifically contain
1163 * " - rootfs rootfs "
1164 * IIUC, so long as we've chrooted so that rootfs is not our root,
1165 * the rootfs entry should always be skipped in mountinfo contents.
1167 bool detect_ramfs_rootfs(void)
1175 f
= fopen("/proc/self/mountinfo", "r");
1179 while (getline(&line
, &len
, f
) != -1) {
1180 for (p
= line
, i
= 0; p
&& i
< 4; i
++)
1181 p
= strchr(p
+ 1, ' ');
1184 p2
= strchr(p
+ 1, ' ');
1188 if (strcmp(p
+ 1, "/") == 0) {
1189 // this is '/'. is it the ramfs?
1190 p
= strchr(p2
+ 1, '-');
1191 if (p
&& strncmp(p
, "- rootfs rootfs ", 16) == 0) {
1203 char *on_path(const char *cmd
, const char *rootfs
) {
1206 char *saveptr
= NULL
;
1207 char cmdpath
[MAXPATHLEN
];
1210 path
= getenv("PATH");
1214 path
= strdup(path
);
1218 entry
= strtok_r(path
, ":", &saveptr
);
1221 ret
= snprintf(cmdpath
, MAXPATHLEN
, "%s/%s/%s", rootfs
, entry
, cmd
);
1223 ret
= snprintf(cmdpath
, MAXPATHLEN
, "%s/%s", entry
, cmd
);
1225 if (ret
< 0 || ret
>= MAXPATHLEN
)
1228 if (access(cmdpath
, X_OK
) == 0) {
1230 return strdup(cmdpath
);
1234 entry
= strtok_r(NULL
, ":", &saveptr
);
/*
 * Report whether stat() succeeds on @f.
 *
 * Returns true when stat() returns 0 and false on any stat() failure,
 * so an entry that cannot be stat()ed is reported as missing.
 */
bool file_exists(const char *f)
{
	struct stat statbuf;

	return stat(f, &statbuf) == 0;
}
1248 bool cgns_supported(void)
1250 return file_exists("/proc/self/ns/cgroup");
1253 /* historically lxc-init has been under /usr/lib/lxc and under
1254 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
1256 char *choose_init(const char *rootfs
)
1259 const char *empty
= "",
1261 int ret
, env_set
= 0;
1264 if (!getenv("PATH")) {
1265 if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
1266 SYSERROR("Failed to setenv");
1270 retv
= on_path("init.lxc", rootfs
);
1273 if (unsetenv("PATH"))
1274 SYSERROR("Failed to unsetenv");
1280 retv
= malloc(PATH_MAX
);
1289 ret
= snprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, SBINDIR
, "/init.lxc");
1290 if (ret
< 0 || ret
>= PATH_MAX
) {
1291 ERROR("pathname too long");
1295 ret
= stat(retv
, &mystat
);
1299 ret
= snprintf(retv
, PATH_MAX
, "%s/%s/%s", tmp
, LXCINITDIR
, "/lxc/lxc-init");
1300 if (ret
< 0 || ret
>= PATH_MAX
) {
1301 ERROR("pathname too long");
1305 ret
= stat(retv
, &mystat
);
1309 ret
= snprintf(retv
, PATH_MAX
, "%s/usr/lib/lxc/lxc-init", tmp
);
1310 if (ret
< 0 || ret
>= PATH_MAX
) {
1311 ERROR("pathname too long");
1314 ret
= stat(retv
, &mystat
);
1318 ret
= snprintf(retv
, PATH_MAX
, "%s/sbin/lxc-init", tmp
);
1319 if (ret
< 0 || ret
>= PATH_MAX
) {
1320 ERROR("pathname too long");
1323 ret
= stat(retv
, &mystat
);
1328 * Last resort, look for the statically compiled init.lxc which we
1329 * hopefully bind-mounted in.
1330 * If we are called during container setup, and we get to this point,
1331 * then the init.lxc.static from the host will need to be bind-mounted
1332 * in. So we return NULL here to indicate that.
1337 ret
= snprintf(retv
, PATH_MAX
, "/init.lxc.static");
1338 if (ret
< 0 || ret
>= PATH_MAX
) {
1339 WARN("Nonsense - name /lxc.init.static too long");
1342 ret
= stat(retv
, &mystat
);
1351 int print_to_file(const char *file
, const char *content
)
1356 f
= fopen(file
, "w");
1359 if (fprintf(f
, "%s", content
) != strlen(content
))
/*
 * Integer-returning directory test.
 *
 * Returns 1 when stat() succeeds on @path and the mode is a directory,
 * 0 in every other case (including any stat() failure).
 */
int is_dir(const char *path)
{
	struct stat statbuf;
	int ret = stat(path, &statbuf);

	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;

	return 0;
}
1375 * Given the '-t' template option to lxc-create, figure out what to
1376 * do. If the template is a full executable path, use that. If it
1377 * is something like 'sshd', then return $templatepath/lxc-sshd.
1378 * On success return the template, on error return NULL.
1380 char *get_template_path(const char *t
)
1385 if (t
[0] == '/' && access(t
, X_OK
) == 0) {
1390 len
= strlen(LXCTEMPLATEDIR
) + strlen(t
) + strlen("/lxc-") + 1;
1391 tpath
= malloc(len
);
1394 ret
= snprintf(tpath
, len
, "%s/lxc-%s", LXCTEMPLATEDIR
, t
);
1395 if (ret
< 0 || ret
>= len
) {
1399 if (access(tpath
, X_OK
) < 0) {
1400 SYSERROR("bad template: %s", t
);
1409 * Sets the process title to the specified title. Note that this may fail if
1410 * the kernel doesn't support PR_SET_MM_MAP (kernels <3.18).
1412 int setproctitle(char *title
)
1414 static char *proctitle
= NULL
;
1415 char buf
[2048], *tmp
;
1417 int i
, len
, ret
= 0;
1419 /* We don't really need to know all of this stuff, but unfortunately
1420 * PR_SET_MM_MAP requires us to set it all at once, so we have to
1421 * figure it out anyway.
1423 unsigned long start_data
, end_data
, start_brk
, start_code
, end_code
,
1424 start_stack
, arg_start
, arg_end
, env_start
, env_end
,
1426 struct prctl_mm_map prctl_map
;
1428 f
= fopen_cloexec("/proc/self/stat", "r");
1433 tmp
= fgets(buf
, sizeof(buf
), f
);
1439 /* Skip the first 25 fields, column 26-28 are start_code, end_code,
1440 * and start_stack */
1441 tmp
= strchr(buf
, ' ');
1442 for (i
= 0; i
< 24; i
++) {
1445 tmp
= strchr(tmp
+1, ' ');
1450 i
= sscanf(tmp
, "%lu %lu %lu", &start_code
, &end_code
, &start_stack
);
1454 /* Skip the next 19 fields, column 45-51 are start_data to arg_end */
1455 for (i
= 0; i
< 19; i
++) {
1458 tmp
= strchr(tmp
+1, ' ');
1464 i
= sscanf(tmp
, "%lu %lu %lu %*u %*u %lu %lu",
1473 /* Include the null byte here, because in the calculations below we
1474 * want to have room for it. */
1475 len
= strlen(title
) + 1;
1477 proctitle
= realloc(proctitle
, len
);
1481 arg_start
= (unsigned long) proctitle
;
1482 arg_end
= arg_start
+ len
;
1484 brk_val
= syscall(__NR_brk
, 0);
1486 prctl_map
= (struct prctl_mm_map
) {
1487 .start_code
= start_code
,
1488 .end_code
= end_code
,
1489 .start_stack
= start_stack
,
1490 .start_data
= start_data
,
1491 .end_data
= end_data
,
1492 .start_brk
= start_brk
,
1494 .arg_start
= arg_start
,
1496 .env_start
= env_start
,
1503 ret
= prctl(PR_SET_MM
, PR_SET_MM_MAP
, (long) &prctl_map
, sizeof(prctl_map
), 0);
1505 strcpy((char*)arg_start
, title
);
1507 INFO("setting cmdline failed - %s", strerror(errno
));
/*
 * @path:    a pathname where / replaced with '\0'.
 * @offsetp: pointer to int showing which path segment was last seen.
 *           Updated on return to reflect the next segment.
 * @fulllen: full original path length.
 * Returns a pointer to the next path segment, or NULL if done.
 *
 * When *@offsetp is already >= @fulllen, NULL is returned and *@offsetp is
 * left untouched.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int offset = *offsetp;

	if (offset >= fulllen)
		return NULL;

	/*
	 * Test the bound before dereferencing so that path[fulllen] is never
	 * read; the scan then works even on a buffer of exactly @fulllen bytes.
	 */

	/* skip the remainder of the current segment */
	while (offset < fulllen && path[offset] != '\0')
		offset++;

	/* skip the separator(s) that follow it */
	while (offset < fulllen && path[offset] == '\0')
		offset++;

	*offsetp = offset;
	return (offset < fulllen) ? &path[offset] : NULL;
}
/*
 * Check that @subdir is a subdir of @dir. @len is the length of
 * @dir (to avoid having to recalculate it).
 *
 * @subdir qualifies when its first @len bytes equal @dir and the match ends
 * on a path-component boundary: @dir itself ends in '/', @subdir continues
 * with '/', or the two strings have the same length.
 *
 * With @len == 0 there is no last byte of @dir to inspect, so that test is
 * skipped; the result is then true only if @subdir is empty or starts
 * with '/'.
 */
static bool is_subdir(const char *subdir, const char *dir, size_t len)
{
	size_t subdirlen = strlen(subdir);

	if (subdirlen < len)
		return false;

	if (strncmp(subdir, dir, len) != 0)
		return false;

	/* guard len == 0: dir[len - 1] would index before the string */
	if (len > 0 && dir[len - 1] == '/')
		return true;

	/* subdirlen >= len here, so subdir[len] is at worst the terminator */
	if (subdir[len] == '/' || subdirlen == len)
		return true;

	return false;
}
1555 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
1556 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
1558 static int check_symlink(int fd
)
1561 int ret
= fstat(fd
, &sb
);
1564 if (S_ISLNK(sb
.st_mode
))
1570 * Open a file or directory, provided that it contains no symlinks.
1572 * CAVEAT: This function must not be used for other purposes than container
1573 * setup before executing the container's init
1575 static int open_if_safe(int dirfd
, const char *nextpath
)
1577 int newfd
= openat(dirfd
, nextpath
, O_RDONLY
| O_NOFOLLOW
);
1578 if (newfd
>= 0) // was not a symlink, all good
1584 if (errno
== EPERM
|| errno
== EACCES
) {
1585 /* we're not root (cause we got EPERM) so
1586 try opening with O_PATH */
1587 newfd
= openat(dirfd
, nextpath
, O_PATH
| O_NOFOLLOW
);
1589 /* O_PATH will return an fd for symlinks. We know
1590 * nextpath wasn't a symlink at last openat, so if fd
1591 * is now a link, then something fishy is going on
1593 int ret
= check_symlink(newfd
);
1605 * Open a path intending for mounting, ensuring that the final path
1606 * is inside the container's rootfs.
1608 * CAVEAT: This function must not be used for other purposes than container
1609 * setup before executing the container's init
1611 * @target: path to be opened
1612 * @prefix_skip: a part of @target in which to ignore symbolic links. This
1613 * would be the container's rootfs.
1615 * Return an open fd for the path, or <0 on error.
1617 static int open_without_symlink(const char *target
, const char *prefix_skip
)
1619 int curlen
= 0, dirfd
, fulllen
, i
;
1622 fulllen
= strlen(target
);
1624 /* make sure prefix-skip makes sense */
1625 if (prefix_skip
&& strlen(prefix_skip
) > 0) {
1626 curlen
= strlen(prefix_skip
);
1627 if (!is_subdir(target
, prefix_skip
, curlen
)) {
1628 ERROR("WHOA there - target '%s' didn't start with prefix '%s'",
1629 target
, prefix_skip
);
1633 * get_nextpath() expects the curlen argument to be
1634 * on a (turned into \0) / or before it, so decrement
1635 * curlen to make sure that happens
1644 /* Make a copy of target which we can hack up, and tokenize it */
1645 if ((dup
= strdup(target
)) == NULL
) {
1646 SYSERROR("Out of memory checking for symbolic link");
1649 for (i
= 0; i
< fulllen
; i
++) {
1654 dirfd
= open(prefix_skip
, O_RDONLY
);
1658 int newfd
, saved_errno
;
1661 if ((nextpath
= get_nextpath(dup
, &curlen
, fulllen
)) == NULL
)
1663 newfd
= open_if_safe(dirfd
, nextpath
);
1664 saved_errno
= errno
;
1668 errno
= saved_errno
;
1670 SYSERROR("%s in %s was a symbolic link!", nextpath
, target
);
1681 * Safely mount a path into a container, ensuring that the mount target
1682 * is under the container's @rootfs. (If @rootfs is NULL, then the container
1683 * uses the host's /)
1685 * CAVEAT: This function must not be used for other purposes than container
1686 * setup before executing the container's init
1688 int safe_mount(const char *src
, const char *dest
, const char *fstype
,
1689 unsigned long flags
, const void *data
, const char *rootfs
)
1691 int srcfd
= -1, destfd
, ret
, saved_errno
;
1692 char srcbuf
[50], destbuf
[50]; // only needs enough for /proc/self/fd/<fd>
1693 const char *mntsrc
= src
;
1698 /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
1699 if (flags
& MS_BIND
&& src
&& src
[0] != '/') {
1700 INFO("this is a relative bind mount");
1701 srcfd
= open_without_symlink(src
, NULL
);
1704 ret
= snprintf(srcbuf
, 50, "/proc/self/fd/%d", srcfd
);
1705 if (ret
< 0 || ret
> 50) {
1707 ERROR("Out of memory");
1713 destfd
= open_without_symlink(dest
, rootfs
);
1716 saved_errno
= errno
;
1718 errno
= saved_errno
;
1723 ret
= snprintf(destbuf
, 50, "/proc/self/fd/%d", destfd
);
1724 if (ret
< 0 || ret
> 50) {
1728 ERROR("Out of memory");
1732 ret
= mount(mntsrc
, destbuf
, fstype
, flags
, data
);
1733 saved_errno
= errno
;
1738 errno
= saved_errno
;
1739 SYSERROR("Failed to mount %s onto %s", src
, dest
);
1747 * Mount a proc under @rootfs if proc self points to a pid other than
1748 * my own. This is needed to have a known-good proc mount for setting
1749 * up LSMs both at container startup and attach.
1751 * @rootfs : the rootfs where proc should be mounted
1753 * Returns < 0 on failure, 0 if the correct proc was already mounted
1754 * and 1 if a new proc was mounted.
1756 * NOTE: not to be called from inside the container namespace!
1758 int lxc_mount_proc_if_needed(const char *rootfs
)
1760 char path
[MAXPATHLEN
];
1761 int link_to_pid
, linklen
, mypid
, ret
;
1762 char link
[LXC_NUMSTRLEN64
] = {0};
1764 ret
= snprintf(path
, MAXPATHLEN
, "%s/proc/self", rootfs
);
1765 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1766 SYSERROR("proc path name too long");
1770 linklen
= readlink(path
, link
, LXC_NUMSTRLEN64
);
1772 ret
= snprintf(path
, MAXPATHLEN
, "%s/proc", rootfs
);
1773 if (ret
< 0 || ret
>= MAXPATHLEN
) {
1774 SYSERROR("proc path name too long");
1778 /* /proc not mounted */
1780 if (mkdir(path
, 0755) && errno
!= EEXIST
)
1783 } else if (linklen
>= LXC_NUMSTRLEN64
) {
1784 link
[linklen
- 1] = '\0';
1785 ERROR("readlink returned truncated content: \"%s\"", link
);
1790 INFO("I am %d, /proc/self points to \"%s\"", mypid
, link
);
1792 if (lxc_safe_int(link
, &link_to_pid
) < 0)
1795 /* correct procfs is already mounted */
1796 if (link_to_pid
== mypid
)
1799 ret
= umount2(path
, MNT_DETACH
);
1801 WARN("failed to umount \"%s\" with MNT_DETACH", path
);
1804 /* rootfs is NULL */
1805 if (!strcmp(rootfs
, ""))
1806 ret
= mount("proc", path
, "proc", 0, NULL
);
1808 ret
= safe_mount("proc", path
, "proc", 0, NULL
, rootfs
);
1812 INFO("mounted /proc in container for security transition");
1816 int open_devnull(void)
1818 int fd
= open("/dev/null", O_RDWR
);
1821 SYSERROR("Can't open /dev/null");
1826 int set_stdfds(int fd
)
1831 if (dup2(fd
, 0) < 0)
1833 if (dup2(fd
, 1) < 0)
1835 if (dup2(fd
, 2) < 0)
1841 int null_stdfds(void)
1844 int fd
= open_devnull();
1847 ret
= set_stdfds(fd
);
1855 * Return the number of lines in file @fn, or -1 on error
1857 int lxc_count_file_lines(const char *fn
)
1864 f
= fopen_cloexec(fn
, "r");
1868 while (getline(&line
, &sz
, f
) != -1) {
1876 void *lxc_strmmap(void *addr
, size_t length
, int prot
, int flags
, int fd
,
1879 void *tmp
= NULL
, *overlap
= NULL
;
1881 /* We establish an anonymous mapping that is one byte larger than the
1882 * underlying file. The pages handed to us are zero filled. */
1883 tmp
= mmap(addr
, length
+ 1, PROT_READ
, MAP_PRIVATE
| MAP_ANONYMOUS
, -1, 0);
1884 if (tmp
== MAP_FAILED
)
1887 /* Now we establish a fixed-address mapping starting at the address we
1888 * received from our anonymous mapping and replace all bytes excluding
1889 * the additional \0-byte with the file. This allows us to use normal
1890 * string-handling functions. */
1891 overlap
= mmap(tmp
, length
, prot
, MAP_FIXED
| flags
, fd
, offset
);
1892 if (overlap
== MAP_FAILED
)
1893 munmap(tmp
, length
+ 1);
1898 int lxc_strmunmap(void *addr
, size_t length
)
1900 return munmap(addr
, length
+ 1);
1903 /* Check whether a signal is blocked by a process. */
1904 /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */
1905 #define __PROC_STATUS_LEN (5 + (LXC_NUMSTRLEN64) + 7 + 1)
1906 bool task_blocking_signal(pid_t pid
, int signal
)
1910 long unsigned int sigblk
= 0;
1915 char status
[__PROC_STATUS_LEN
];
1917 ret
= snprintf(status
, __PROC_STATUS_LEN
, "/proc/%d/status", pid
);
1918 if (ret
< 0 || ret
>= __PROC_STATUS_LEN
)
1921 f
= fopen(status
, "r");
1925 while (getline(&line
, &n
, f
) != -1) {
1926 if (!strncmp(line
, "SigBlk:\t", 8))
1927 if (sscanf(line
+ 8, "%lx", &sigblk
) != 1)
1931 if (sigblk
& signal
)
/* Grow the NULL-terminated pointer array *@list by one slot.
 *
 * *@list may be NULL, in which case a fresh array is allocated.
 *
 * On success *@list is updated, both the new slot and the slot after it are
 * set to NULL, and the index of the new slot is returned so the caller can
 * store its entry there. On allocation failure -1 is returned and *@list is
 * left untouched.
 */
static int lxc_append_null_to_list(void ***list)
{
	int newentry = 0;
	void **tmp;

	/* Find the current terminator. */
	if (*list)
		while ((*list)[newentry])
			newentry++;

	/* One slot for the new entry plus one for the terminator. */
	tmp = realloc(*list, ((size_t)newentry + 2) * sizeof(*tmp));
	if (!tmp)
		return -1;

	/* realloc() does not initialise fresh memory: when *@list was NULL the
	 * slot at @newentry would otherwise hold garbage until the caller
	 * fills it in.
	 */
	tmp[newentry] = NULL;
	tmp[newentry + 1] = NULL;
	*list = tmp;

	return newentry;
}
/* Append a copy of @entry to the NULL-terminated string array *@list.
 *
 * *@list may be NULL. The array is grown with lxc_append_null_to_list() and
 * the new element is a strdup()'ed copy, so the caller keeps ownership of
 * @entry and owns the copy through the list.
 *
 * Returns 0 on success, -1 if @entry is NULL or memory could not be
 * allocated; in the failure case *@list is not modified.
 */
int lxc_append_string(char ***list, char *entry)
{
	char *copy;
	int newentry;

	if (!entry)
		return -1;

	/* Duplicate first: if this fails the list has not been grown yet and
	 * is therefore never left with a slot that has no string in it.
	 */
	copy = strdup(entry);
	if (!copy)
		return -1;

	newentry = lxc_append_null_to_list((void ***)list);
	if (newentry < 0) {
		free(copy);
		return -1;
	}

	(*list)[newentry] = copy;

	return 0;
}
1978 int lxc_preserve_ns(const int pid
, const char *ns
)
1981 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
1982 #define __NS_PATH_LEN 50
1983 char path
[__NS_PATH_LEN
];
1985 /* This way we can use this function to also check whether namespaces
1986 * are supported by the kernel by passing in the NULL or the empty
1989 ret
= snprintf(path
, __NS_PATH_LEN
, "/proc/%d/ns%s%s", pid
,
1990 !ns
|| strcmp(ns
, "") == 0 ? "" : "/",
1991 !ns
|| strcmp(ns
, "") == 0 ? "" : ns
);
1992 if (ret
< 0 || (size_t)ret
>= __NS_PATH_LEN
)
1995 return open(path
, O_RDONLY
| O_CLOEXEC
);
/* Convert the string @numstr to an unsigned int.
 *
 * Leading whitespace is skipped. The base is detected by strtoul() (prefix
 * "0x" for hexadecimal, "0" for octal, decimal otherwise). The whole rest of
 * the string must be consumed by the number.
 *
 * Returns 0 and stores the value in *@converted on success. Returns -EINVAL
 * for a negative number, an empty string or trailing garbage, and -ERANGE if
 * the value does not fit into an unsigned int. *@converted is only written
 * on success.
 */
int lxc_safe_uint(const char *numstr, unsigned int *converted)
{
	char *err = NULL;
	unsigned long int uli;

	/* The <ctype.h> functions are only defined for values representable
	 * as unsigned char (or EOF), hence the cast.
	 */
	while (isspace((unsigned char)*numstr))
		numstr++;

	/* strtoul() would silently negate the value. */
	if (*numstr == '-')
		return -EINVAL;

	errno = 0;
	uli = strtoul(numstr, &err, 0);
	if (errno == ERANGE && uli == ULONG_MAX)
		return -ERANGE;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	if (uli > UINT_MAX)
		return -ERANGE;

	*converted = (unsigned int)uli;
	return 0;
}
/* Convert the string @numstr to an unsigned long.
 *
 * Leading whitespace is skipped. The base is detected by strtoul() (prefix
 * "0x" for hexadecimal, "0" for octal, decimal otherwise). The whole rest of
 * the string must be consumed by the number.
 *
 * Returns 0 and stores the value in *@converted on success. Returns -EINVAL
 * for a negative number, an empty string or trailing garbage, and -ERANGE if
 * the value does not fit into an unsigned long. *@converted is only written
 * on success.
 */
int lxc_safe_ulong(const char *numstr, unsigned long *converted)
{
	char *err = NULL;
	unsigned long int uli;

	/* The <ctype.h> functions are only defined for values representable
	 * as unsigned char (or EOF), hence the cast.
	 */
	while (isspace((unsigned char)*numstr))
		numstr++;

	/* strtoul() would silently negate the value. */
	if (*numstr == '-')
		return -EINVAL;

	errno = 0;
	uli = strtoul(numstr, &err, 0);
	if (errno == ERANGE && uli == ULONG_MAX)
		return -ERANGE;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	*converted = uli;
	return 0;
}
/* Convert the string @numstr to an int.
 *
 * Parsing is done by strtol() with automatic base detection, so leading
 * whitespace and a sign are accepted. The whole rest of the string must be
 * consumed by the number.
 *
 * Returns 0 and stores the value in *@converted on success. Returns -EINVAL
 * for an empty string or trailing garbage and -ERANGE if the value does not
 * fit into an int. *@converted is only written on success.
 */
int lxc_safe_int(const char *numstr, int *converted)
{
	char *err = NULL;
	signed long int sli;

	errno = 0;
	sli = strtol(numstr, &err, 0);
	if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
		return -ERANGE;

	if (errno != 0 && sli == 0)
		return -EINVAL;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	/* long may be wider than int. */
	if (sli > INT_MAX || sli < INT_MIN)
		return -ERANGE;

	*converted = (int)sli;
	return 0;
}
/* Convert the string @numstr to a long.
 *
 * Parsing is done by strtol() with automatic base detection, so leading
 * whitespace and a sign are accepted. The whole rest of the string must be
 * consumed by the number.
 *
 * Returns 0 and stores the value in *@converted on success. Returns -EINVAL
 * for an empty string or trailing garbage and -ERANGE if the value does not
 * fit into a long. *@converted is only written on success.
 */
int lxc_safe_long(const char *numstr, long int *converted)
{
	char *err = NULL;
	signed long int sli;

	errno = 0;
	sli = strtol(numstr, &err, 0);
	if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN))
		return -ERANGE;

	if (errno != 0 && sli == 0)
		return -EINVAL;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	*converted = sli;
	return 0;
}
/* Switch the calling process to @gid and then to @uid.
 *
 * The group id is changed first. Both steps are logged.
 *
 * Returns 0 on success or the negated errno of the failing setgid()/setuid()
 * call.
 */
int lxc_switch_uid_gid(uid_t uid, gid_t gid)
{
	int saved_errno;

	if (setgid(gid) < 0) {
		/* Logging may touch errno; remember it before it is lost. */
		saved_errno = errno;
		SYSERROR("Failed to switch to gid %d.", (int)gid);
		return -saved_errno;
	}
	NOTICE("Switched to gid %d.", (int)gid);

	if (setuid(uid) < 0) {
		saved_errno = errno;
		SYSERROR("Failed to switch to uid %d.", (int)uid);
		return -saved_errno;
	}
	NOTICE("Switched to uid %d.", (int)uid);

	return 0;
}
/* Simple convenience function which enables uniform logging.
 *
 * Calls setgroups() with @size and @list.
 *
 * Returns 0 on success or the negated errno of the failing setgroups() call.
 */
int lxc_setgroups(int size, gid_t list[])
{
	if (setgroups(size, list) < 0) {
		/* Logging may touch errno; remember it before it is lost. */
		int saved_errno = errno;

		SYSERROR("Failed to setgroups().");
		return -saved_errno;
	}
	NOTICE("Dropped additional groups.");

	return 0;
}
2119 static int lxc_get_unused_loop_dev_legacy(char *loop_name
)
2122 struct loop_info64 lo64
;
2124 int dfd
= -1, fd
= -1, ret
= -1;
2126 dir
= opendir("/dev");
2130 while ((dp
= readdir(dir
))) {
2134 if (strncmp(dp
->d_name
, "loop", 4) != 0)
2141 fd
= openat(dfd
, dp
->d_name
, O_RDWR
);
2145 ret
= ioctl(fd
, LOOP_GET_STATUS64
, &lo64
);
2147 if (ioctl(fd
, LOOP_GET_STATUS64
, &lo64
) == 0 ||
2155 ret
= snprintf(loop_name
, LO_NAME_SIZE
, "/dev/%s", dp
->d_name
);
2156 if (ret
< 0 || ret
>= LO_NAME_SIZE
) {
2173 static int lxc_get_unused_loop_dev(char *name_loop
)
2176 int fd_ctl
= -1, fd_tmp
= -1;
2178 fd_ctl
= open("/dev/loop-control", O_RDWR
| O_CLOEXEC
);
2182 loop_nr
= ioctl(fd_ctl
, LOOP_CTL_GET_FREE
);
2186 ret
= snprintf(name_loop
, LO_NAME_SIZE
, "/dev/loop%d", loop_nr
);
2187 if (ret
< 0 || ret
>= LO_NAME_SIZE
)
2190 fd_tmp
= open(name_loop
, O_RDWR
| O_CLOEXEC
);
2199 int lxc_prepare_loop_dev(const char *source
, char *loop_dev
, int flags
)
2202 struct loop_info64 lo64
;
2203 int fd_img
= -1, fret
= -1, fd_loop
= -1;
2205 fd_loop
= lxc_get_unused_loop_dev(loop_dev
);
2207 if (fd_loop
== -ENODEV
)
2208 fd_loop
= lxc_get_unused_loop_dev_legacy(loop_dev
);
2213 fd_img
= open(source
, O_RDWR
| O_CLOEXEC
);
2217 ret
= ioctl(fd_loop
, LOOP_SET_FD
, fd_img
);
2221 memset(&lo64
, 0, sizeof(lo64
));
2222 lo64
.lo_flags
= flags
;
2224 ret
= ioctl(fd_loop
, LOOP_SET_STATUS64
, &lo64
);
2234 if (fret
< 0 && fd_loop
>= 0) {
/* Unmount everything that is mounted at @path, one layer after the other.
 *
 * umount2() is called on @path (with MNT_DETACH when @lazy is set) until it
 * fails with EINVAL.
 *
 * Returns the number of successful umounts (capped at INT_MAX), or the
 * negated errno if umount2() fails with anything other than EINVAL.
 */
int lxc_unstack_mountpoint(const char *path, bool lazy)
{
	int umounts = 0;
	const int flags = lazy ? MNT_DETACH : 0;

	for (;;) {
		if (umount2(path, flags) < 0) {
			/* We consider anything else than EINVAL deadly to
			 * prevent going into an infinite loop. (The other
			 * alternative is constantly parsing
			 * /proc/self/mountinfo which is yucky and probably
			 * racy.)
			 */
			if (errno != EINVAL)
				return -errno;

			break;
		}

		/* Just stop counting when this happens. That'd just be so
		 * stupid that we won't even bother trying to report back the
		 * correct value anymore.
		 */
		if (umounts != INT_MAX)
			umounts++;

		/* We succeeded in umounting. Go around again to make sure
		 * that there's no other mountpoint stacked underneath.
		 */
	}

	return umounts;
}
2273 int run_command(char *buf
, size_t buf_size
, int (*child_fn
)(void *), void *args
)
2276 int ret
, fret
, pipefd
[2];
2279 /* Make sure our callers do not receive unitialized memory. */
2280 if (buf_size
> 0 && buf
)
2283 if (pipe(pipefd
) < 0) {
2284 SYSERROR("failed to create pipe");
2292 SYSERROR("failed to create new process");
2297 /* Close the read-end of the pipe. */
2300 /* Redirect std{err,out} to write-end of the
2303 ret
= dup2(pipefd
[1], STDOUT_FILENO
);
2305 ret
= dup2(pipefd
[1], STDERR_FILENO
);
2307 /* Close the write-end of the pipe. */
2311 SYSERROR("failed to duplicate std{err,out} file descriptor");
2315 /* Does not return. */
2317 ERROR("failed to exec command");
2321 /* close the write-end of the pipe */
2324 bytes
= read(pipefd
[0], buf
, (buf_size
> 0) ? (buf_size
- 1) : 0);
2326 buf
[bytes
- 1] = '\0';
2328 fret
= wait_for_pid(child
);
2329 /* close the read-end of the pipe */