1 /* SPDX-License-Identifier: LGPL-2.1+ */
10 #ifndef FUSE_USE_VERSION
11 #define FUSE_USE_VERSION 30
14 #ifndef FUSE_USE_VERSION
15 #define FUSE_USE_VERSION 26
19 #define _FILE_OFFSET_BITS 64
27 #include <sys/mount.h>
29 #include <sys/types.h>
34 #include "../memory_utils.h"
36 #include "cgroup_utils.h"
/*
 * get_cgroup_version() - classify a line by cgroup filesystem version.
 *
 * @line: text handed unchanged to is_cgroupfs_v1() and is_cgroupfs_v2().
 *
 * Returns CGROUP_SUPER_MAGIC when is_cgroupfs_v1() accepts @line and
 * CGROUP2_SUPER_MAGIC when is_cgroupfs_v2() accepts it; v1 is tested first.
 * NOTE(review): the return value for a line matching neither test is not
 * visible in this excerpt.
 */
38 int get_cgroup_version(char *line
)
40 if (is_cgroupfs_v1(line
))
41 return CGROUP_SUPER_MAGIC
;
43 if (is_cgroupfs_v2(line
))
44 return CGROUP2_SUPER_MAGIC
;
/*
 * is_cgroupfs_v1() - test whether a line names a cgroup v1 filesystem.
 *
 * @line: NUL-terminated text; presumably one entry of a mountinfo file, where
 *        " - " precedes the filesystem type (TODO confirm against callers).
 *
 * Returns true when the first " - " separator in @line is immediately
 * followed by "cgroup ", and false otherwise, including when @line contains
 * no separator at all.
 */
bool is_cgroupfs_v1(char *line)
{
	static const char marker[] = " - cgroup ";
	char *p = strstr(line, " - ");

	/* strstr() yields NULL without a separator; never pass that to strncmp(). */
	if (!p)
		return false;

	return strncmp(p, marker, sizeof(marker) - 1) == 0;
}
/*
 * is_cgroupfs_v2() - test whether a line names a cgroup v2 filesystem.
 *
 * @line: NUL-terminated text; presumably one entry of a mountinfo file, where
 *        " - " precedes the filesystem type (TODO confirm against callers).
 *
 * Returns true when the first " - " separator in @line is immediately
 * followed by "cgroup2 ", and false otherwise, including when @line contains
 * no separator at all.
 */
bool is_cgroupfs_v2(char *line)
{
	static const char marker[] = " - cgroup2 ";
	char *p = strstr(line, " - ");

	/* strstr() yields NULL without a separator; never pass that to strncmp(). */
	if (!p)
		return false;

	return strncmp(p, marker, sizeof(marker) - 1) == 0;
}
/*
 * unified_cgroup_hierarchy() - probe DEFAULT_CGROUP_MOUNTPOINT for cgroup2.
 *
 * Calls statfs() on DEFAULT_CGROUP_MOUNTPOINT and returns CGROUP2_SUPER_MAGIC
 * when is_fs_type() reports that filesystem type for the mount.
 * NOTE(review): the handling of a failed statfs() and the value returned for
 * any other filesystem type are not visible in this excerpt.
 */
66 int unified_cgroup_hierarchy(void)
72 ret
= statfs(DEFAULT_CGROUP_MOUNTPOINT
, &fs
);
76 if (is_fs_type(&fs
, CGROUP2_SUPER_MAGIC
))
77 return CGROUP2_SUPER_MAGIC
;
/*
 * is_cgroup_fd() - check whether a descriptor lives on a cgroup filesystem.
 *
 * @fd: descriptor queried with fstatfs().
 *
 * The visible test accepts either CGROUP2_SUPER_MAGIC or CGROUP_SUPER_MAGIC
 * as the filesystem type, via is_fs_type().
 * NOTE(review): the return statements and the handling of an fstatfs()
 * failure are not visible in this excerpt.
 */
82 bool is_cgroup_fd(int fd
)
88 ret
= fstatfs(fd
, &fs
);
92 if (is_fs_type(&fs
, CGROUP2_SUPER_MAGIC
) ||
93 is_fs_type(&fs
, CGROUP_SUPER_MAGIC
))
/*
 * must_realloc() - resize an allocation through realloc().
 *
 * @orig: existing allocation; other functions in this file pass NULL to
 *        obtain a fresh buffer.
 * @sz:   requested size in bytes.
 *
 * NOTE(review): only the realloc() call is visible here. The name suggests
 * the function does not return on failure or retries until it succeeds;
 * confirm against the full source before relying on a non-NULL result.
 */
99 void *must_realloc(void *orig
, size_t sz
)
104 ret
= realloc(orig
, sz
);
/*
 * must_make_path() - build a path from a NULL-terminated list of components.
 *
 * @first: first component; copied with must_copy_string() to seed the result.
 * @...:   further char * components, terminated by a NULL argument.
 *
 * For each further component the buffer is grown with must_realloc() to
 * full_len + 1 bytes and the component is appended with memcpy(); a "/" is
 * copied in front of a component in at least some cases. The result is
 * NUL-terminated at cur_len.
 * NOTE(review): the bookkeeping of full_len and cur_len, the condition that
 * decides whether "/" is inserted, va_end() and the return statement are not
 * visible in this excerpt.
 */
110 char *must_make_path(const char *first
, ...)
114 size_t full_len
= strlen(first
);
118 dest
= must_copy_string(first
);
121 va_start(args
, first
);
122 while ((cur
= va_arg(args
, char *)) != NULL
) {
123 buf_len
= strlen(cur
);
129 dest
= must_realloc(dest
, full_len
+ 1);
132 memcpy(dest
+ cur_len
, "/", 1);
136 memcpy(dest
+ cur_len
, cur
, buf_len
);
141 dest
[cur_len
] = '\0';
145 bool is_fs_type(const struct statfs
*fs
, fs_type_magic magic_val
)
147 return (fs
->f_type
== (fs_type_magic
)magic_val
);
/*
 * must_copy_string() - only the signature is visible in this excerpt.
 *
 * @entry: source string.
 *
 * NOTE(review): must_make_path() in this file passes the returned pointer to
 * must_realloc(), so the result is presumably a heap-allocated copy of @entry
 * owned by the caller; confirm against the full source.
 */
150 char *must_copy_string(const char *entry
)
/*
 * lxc_string_join() - concatenate an array of strings with a separator.
 *
 * @sep:           separator placed between consecutive parts.
 * @parts:         array of strings terminated by a NULL entry.
 * @use_as_prefix: contributes one extra separator length to the size
 *                 computation (use_as_prefix * sep_len).
 *
 * A first pass sums the length of every part plus one separator for each
 * part after the first. The buffer of result_len + 1 bytes comes from
 * calloc(); parts and separators are then appended with strlcat(), bounded
 * by buf_len.
 * NOTE(review): the condition guarding the initial strlcpy() of @sep
 * (presumably use_as_prefix), the handling of a failed calloc() and the
 * return statement are not visible in this excerpt.
 */
164 char *lxc_string_join(const char *sep
, const char **parts
, bool use_as_prefix
)
168 size_t sep_len
= strlen(sep
);
169 size_t result_len
= use_as_prefix
* sep_len
;
172 /* calculate new string length */
173 for (p
= (char **)parts
; *p
; p
++)
174 result_len
+= (p
> (char **)parts
) * sep_len
+ strlen(*p
);
176 buf_len
= result_len
+ 1;
177 result
= calloc(buf_len
, 1);
182 (void)strlcpy(result
, sep
, buf_len
);
184 for (p
= (char **)parts
; *p
; p
++) {
185 if (p
> (char **)parts
)
186 (void)strlcat(result
, sep
, buf_len
);
188 (void)strlcat(result
, *p
, buf_len
);
/*
 * lxc_count_file_lines() - iterate over the lines of a file.
 *
 * @fn: path opened read-only through fopen_cloexec().
 *
 * Lines are consumed with getline() until it returns -1. The __do_fclose and
 * __do_free annotations are project macros; presumably they release f and
 * line automatically when the function returns (TODO confirm).
 * NOTE(review): the counter, the handling of a failed open and the return
 * statement are not visible in this excerpt.
 */
194 int lxc_count_file_lines(const char *fn
)
196 __do_fclose
FILE *f
= NULL
;
197 __do_free
char *line
= NULL
;
201 f
= fopen_cloexec(fn
, "r");
205 while (getline(&line
, &sz
, f
) != -1)
/*
 * dir_exists() - report whether a path refers to a directory.
 *
 * @path: path passed to stat(), so symbolic links are followed.
 *
 * Returns S_ISDIR() of the resulting st_mode.
 * NOTE(review): the branch taken when stat() fails is not visible here; the
 * existing comment indicates the answer is then "no".
 */
211 bool dir_exists(const char *path
)
216 ret
= stat(path
, &sb
);
218 /* Could be something other than eexist, just say "no". */
221 return S_ISDIR(sb
.st_mode
);
225 * @path: a pathname where / replaced with '\0'.
226 * @offsetp: pointer to int showing which path segment was last seen.
227 * Updated on return to reflect the next segment.
228 * @fulllen: full original path length.
229 * Returns a pointer to the next path segment, or NULL if done.
/*
 * get_nextpath() - locate the next segment in a NUL-tokenized path.
 *
 * Starts from the offset stored in *offsetp. Two scans bounded by @fulllen
 * are visible: one over bytes that are not NUL, then one over bytes that are
 * NUL. The result is &path[offset] while offset < fulllen and NULL otherwise.
 * NOTE(review): the loop bodies, the outcome of the initial
 * offset >= fulllen check and the write-back to *offsetp are not visible in
 * this excerpt.
 */
231 static char *get_nextpath(char *path
, int *offsetp
, int fulllen
)
233 int offset
= *offsetp
;
235 if (offset
>= fulllen
)
238 while (offset
< fulllen
&& path
[offset
] != '\0')
241 while (offset
< fulllen
&& path
[offset
] == '\0')
246 return (offset
< fulllen
) ? &path
[offset
] : NULL
;
250 * Check that @subdir is a subdir of @dir. @len is the length of
251 * @dir (to avoid having to recalculate it).
/*
 * is_subdir() - prefix test between two paths.
 *
 * @subdir: candidate path.
 * @dir:    directory that should contain @subdir.
 * @len:    length of @dir; must be at least 1 because dir[len - 1] is read.
 *
 * Visible checks, in order: the first @len bytes of both strings are compared
 * with strncmp(); @dir is tested for a trailing '/'; finally subdir[len] is
 * tested for '/' or @subdir for being exactly @len bytes long.
 * NOTE(review): the use made of subdirlen between these checks and the value
 * returned from each branch are not visible in this excerpt.
 */
253 static bool is_subdir(const char *subdir
, const char *dir
, size_t len
)
255 size_t subdirlen
= strlen(subdir
);
260 if (strncmp(subdir
, dir
, len
) != 0)
263 if (dir
[len
-1] == '/')
266 if (subdir
[len
] == '/' || subdirlen
== len
)
273 * Check if the open fd is a symlink. Return -ELOOP if it is. Return
274 * -ENOENT if we couldn't fstat. Return 0 if the fd is ok.
/*
 * check_symlink() - inspect an open descriptor for being a symbolic link.
 *
 * @fd: descriptor passed to fstat().
 *
 * The file mode from fstat() is tested with S_ISLNK().
 * NOTE(review): the return statements are not visible in this excerpt; the
 * accompanying description in the file gives -ELOOP for a link, -ENOENT when
 * fstat() fails and 0 otherwise.
 */
276 static int check_symlink(int fd
)
281 ret
= fstat(fd
, &sb
);
285 if (S_ISLNK(sb
.st_mode
))
292 * Open a file or directory, provided that it contains no symlinks.
294 * CAVEAT: This function must not be used for other purposes than container
295 * setup before executing the container's init
/*
 * open_if_safe() - open one path component without following a symlink.
 *
 * @dirfd:    directory descriptor the lookup is relative to.
 * @nextpath: name passed to openat().
 *
 * First attempt: openat() with O_RDONLY | O_CLOEXEC | O_NOFOLLOW; a
 * non-negative descriptor is returned immediately through move_fd().
 * If errno is EPERM or EACCES, a second openat() uses O_PATH | O_NOFOLLOW
 * and the resulting descriptor is examined with check_symlink().
 * The final visible statement returns move_fd(newfd).
 * NOTE(review): the handling of other errno values, of a failed second
 * openat() and of the check_symlink() result is not visible in this excerpt.
 */
297 static int open_if_safe(int dirfd
, const char *nextpath
)
299 __do_close
int newfd
= -EBADF
;
301 newfd
= openat(dirfd
, nextpath
, O_RDONLY
| O_CLOEXEC
| O_NOFOLLOW
);
302 if (newfd
>= 0) /* Was not a symlink, all good. */
303 return move_fd(newfd
);
308 if (errno
== EPERM
|| errno
== EACCES
) {
309 /* We're not root (cause we got EPERM) so try opening with
312 newfd
= openat(dirfd
, nextpath
, O_PATH
| O_NOFOLLOW
);
314 /* O_PATH will return an fd for symlinks. We know
315 * nextpath wasn't a symlink at last openat, so if fd is
316 * now a link, then something * fishy is going on.
318 int ret
= check_symlink(newfd
);
324 return move_fd(newfd
);
328 * Open a path intending for mounting, ensuring that the final path
329 * is inside the container's rootfs.
331 * CAVEAT: This function must not be used for other purposes than container
332 * setup before executing the container's init
334 * @target: path to be opened
335 * @prefix_skip: a part of @target in which to ignore symbolic links. This
336 * would be the container's rootfs.
338 * Return an open fd for the path, or <0 on error.
/*
 * open_without_symlink() - walk a path component by component.
 *
 * @target:      path to open.
 * @prefix_skip: optional leading part of @target; when non-empty, @target
 *               must pass is_subdir() against it, and it is opened directly
 *               with open(O_RDONLY) as the starting directory.
 *
 * @target is duplicated with strdup() (ENOMEM through ret_errno() on
 * failure) and processed in a loop over its bytes before get_nextpath()
 * yields successive segments. Each segment is opened with open_if_safe()
 * relative to the current dirfd, which is then closed through
 * close_prot_errno_disarm(). Both visible exits return move_fd(dirfd).
 * NOTE(review): the body of the tokenizing loop, the adjustment of curlen,
 * the choice of starting directory when no prefix is given and every error
 * path other than ENOMEM are not visible in this excerpt.
 */
340 static int open_without_symlink(const char *target
, const char *prefix_skip
)
342 __do_close
int dirfd
= -EBADF
;
343 __do_free
char *dup
= NULL
;
344 int curlen
= 0, fulllen
, i
;
346 fulllen
= strlen(target
);
348 /* make sure prefix-skip makes sense */
349 if (prefix_skip
&& strlen(prefix_skip
) > 0) {
350 curlen
= strlen(prefix_skip
);
351 if (!is_subdir(target
, prefix_skip
, curlen
))
355 * get_nextpath() expects the curlen argument to be
356 * on a (turned into \0) / or before it, so decrement
357 * curlen to make sure that happens
366 /* Make a copy of target which we can hack up, and tokenize it */
367 dup
= strdup(target
);
369 return ret_errno(ENOMEM
);
371 for (i
= 0; i
< fulllen
; i
++) {
376 dirfd
= open(prefix_skip
, O_RDONLY
);
384 nextpath
= get_nextpath(dup
, &curlen
, fulllen
);
386 return move_fd(dirfd
);
388 newfd
= open_if_safe(dirfd
, nextpath
);
389 close_prot_errno_disarm(dirfd
);
395 return move_fd(dirfd
);
399 * Safely mount a path into a container, ensuring that the mount target
400 * is under the container's @rootfs. (If @rootfs is NULL, then the container
403 * CAVEAT: This function must not be used for other purposes than container
404 * setup before executing the container's init
/*
 * safe_mount() - mount onto a target reached without following symlinks.
 *
 * @src, @fstype, @flags, @data: forwarded to mount(); @src is the initial
 *                               mount source.
 * @dest:   mount target, opened through open_without_symlink().
 * @rootfs: passed as the prefix argument when opening @dest.
 *
 * For a bind mount (MS_BIND set) whose @src is non-NULL and relative, @src is
 * opened with open_without_symlink(src, NULL) and "/proc/self/fd/<srcfd>" is
 * formatted into srcbuf. @dest is always opened and formatted into destbuf
 * the same way; a failed or truncated snprintf() there yields
 * ret_errno(EINVAL). mount() is then called with destbuf as the target.
 * NOTE(review): whether mntsrc is switched to srcbuf, the handling of failed
 * opens and the interpretation of the mount() result are not visible in this
 * excerpt.
 */
406 int safe_mount(const char *src
, const char *dest
, const char *fstype
,
407 unsigned long flags
, const void *data
, const char *rootfs
)
409 __do_close
int destfd
= -EBADF
, srcfd
= -EBADF
;
411 /* Only needs enough for /proc/self/fd/<fd>. */
412 char srcbuf
[50], destbuf
[50];
413 const char *mntsrc
= src
;
418 /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
419 if (flags
& MS_BIND
&& src
&& src
[0] != '/') {
421 srcfd
= open_without_symlink(src
, NULL
);
425 ret
= snprintf(srcbuf
, sizeof(srcbuf
), "/proc/self/fd/%d", srcfd
);
426 if (ret
< 0 || ret
>= (int)sizeof(srcbuf
))
431 destfd
= open_without_symlink(dest
, rootfs
);
435 ret
= snprintf(destbuf
, sizeof(destbuf
), "/proc/self/fd/%d", destfd
);
436 if (ret
< 0 || ret
>= (int)sizeof(destbuf
))
437 return ret_errno(EINVAL
);
439 ret
= mount(mntsrc
, destbuf
, fstype
, flags
, data
);
/*
 * strlcpy() - size-bounded string copy with guaranteed termination.
 *
 * @dest: destination buffer of at least @size bytes.
 * @src:  NUL-terminated source string.
 * @size: total size of @dest, including room for the terminator.
 *
 * Copies at most @size - 1 bytes from @src and always NUL-terminates @dest
 * when @size is non-zero. With @size == 0 nothing is written, which also
 * keeps the "size - 1" computation from wrapping around.
 *
 * Returns strlen(@src); a return value >= @size means the copy was truncated.
 */
size_t strlcpy(char *dest, const char *src, size_t size)
{
	size_t ret = strlen(src);

	if (size) {
		size_t len = (ret >= size) ? size - 1 : ret;

		memcpy(dest, src, len);
		dest[len] = '\0';
	}

	return ret;
}
/*
 * strlcat() - size-bounded string concatenation.
 *
 * @d: destination buffer holding a string.
 * @s: string to append.
 * @n: total size of @d.
 *
 * The current length of @d is measured with strnlen(), so it never exceeds
 * @n. The append itself is delegated to strlcpy() on the remaining n - l
 * bytes, and the return value is l plus the length of @s.
 * NOTE(review): the condition guarding the first return (presumably l == n,
 * i.e. no terminator found within @n bytes) is not visible in this excerpt.
 */
462 size_t strlcat(char *d
, const char *s
, size_t n
)
464 size_t l
= strnlen(d
, n
);
466 return l
+ strlen(s
);
468 return l
+ strlcpy(d
+ l
, s
, n
- l
);
/*
 * fopen_cloexec() - fopen() replacement built on open() and fdopen().
 *
 * @path: file to open.
 * @mode: fopen-style mode string.
 *
 * The leading characters of @mode select the open() flags. Visible mappings:
 *   r  -> O_RDONLY
 *   w+ -> O_RDWR | O_TRUNC | O_CREAT
 *   w  -> O_WRONLY | O_TRUNC | O_CREAT
 *   a+ -> O_RDWR | O_CREAT | O_APPEND
 *   a  -> O_WRONLY | O_CREAT | O_APPEND
 * The remaining mode characters are scanned for 'x', O_CLOEXEC is ORed into
 * the flags, the file is opened with permission bits 0660 and the descriptor
 * is wrapped with fdopen() using the original @mode. The stream is returned
 * through move_ptr().
 * NOTE(review): the flags chosen for "r+", the effect of 'x', whether
 * O_CLOEXEC is applied unconditionally (a line is missing right before it)
 * and the error paths are not visible in this excerpt.
 */
472 FILE *fopen_cloexec(const char *path
, const char *mode
)
474 __do_close
int fd
= -EBADF
;
475 __do_fclose
FILE *ret
= NULL
;
479 if (!strncmp(mode
, "r+", 2)) {
482 } else if (!strncmp(mode
, "r", 1)) {
483 open_mode
= O_RDONLY
;
485 } else if (!strncmp(mode
, "w+", 2)) {
486 open_mode
= O_RDWR
| O_TRUNC
| O_CREAT
;
488 } else if (!strncmp(mode
, "w", 1)) {
489 open_mode
= O_WRONLY
| O_TRUNC
| O_CREAT
;
491 } else if (!strncmp(mode
, "a+", 2)) {
492 open_mode
= O_RDWR
| O_CREAT
| O_APPEND
;
494 } else if (!strncmp(mode
, "a", 1)) {
495 open_mode
= O_WRONLY
| O_CREAT
| O_APPEND
;
498 for (; mode
[step
]; step
++)
499 if (mode
[step
] == 'x')
501 open_mode
|= O_CLOEXEC
;
503 fd
= open(path
, open_mode
, 0660);
507 ret
= fdopen(fd
, mode
);
512 return move_ptr(ret
);
515 /* Given a multi-line string, return a null-terminated copy of the current line. */
/*
 * copy_to_eol() - duplicate the text from @p up to the next newline.
 *
 * @p: start of the current line inside a larger NUL-terminated string.
 *
 * The end of the line is located with strchr(p, '\n'); a buffer of len + 1
 * bytes is obtained from must_realloc(NULL, ...) and len bytes are copied
 * into it.
 * NOTE(review): the computation of len, the behaviour when no newline is
 * found, the terminating NUL and the return statement are not visible in
 * this excerpt.
 */
516 static char *copy_to_eol(char *p
)
518 char *p2
= strchr(p
, '\n'), *sret
;
525 sret
= must_realloc(NULL
, len
+ 1);
526 memcpy(sret
, p
, len
);
531 static void batch_realloc(char **mem
, size_t oldlen
, size_t newlen
)
533 int newbatches
= (newlen
/ BATCH_SIZE
) + 1;
534 int oldbatches
= (oldlen
/ BATCH_SIZE
) + 1;
536 if (!*mem
|| newbatches
> oldbatches
) {
537 *mem
= must_realloc(*mem
, newbatches
* BATCH_SIZE
);
/*
 * append_line() - append a chunk of text to a growing buffer.
 *
 * @dest:   address of the buffer pointer; grown through batch_realloc().
 * @oldlen: number of bytes already stored in *dest.
 * @new:    text to append; newlen + 1 bytes are read from it, so it is
 *          presumably NUL-terminated at new[newlen] (confirm against callers).
 * @newlen: length of @new without the trailing byte.
 */
void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* One extra byte is reserved and copied beyond the newlen payload. */
	const size_t copy_bytes = newlen + 1;

	batch_realloc(dest, oldlen, oldlen + copy_bytes);
	memcpy(&(*dest)[oldlen], new, copy_bytes);
}
/*
 * drop_trailing_newlines() - strip every '\n' at the end of a string in place.
 *
 * @s: NUL-terminated, writable string; NULL is accepted and ignored so that
 *     callers can pass the result of a read that may have failed.
 *
 * Newlines in the middle of the string are left untouched.
 */
static inline void drop_trailing_newlines(char *s)
{
	size_t l;

	if (!s)
		return;

	for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--)
		s[l - 1] = '\0';
}
558 /* Slurp in a whole file */
/*
 * read_file() - read a file line by line into one buffer.
 *
 * @fnam: path opened with fopen() in mode "re".
 *
 * Each line returned by getline() is appended to buf through append_line().
 * NOTE(review): the handling of a failed fopen(), the update of fulllen
 * inside the loop and the return statement are not visible in this excerpt.
 * Ownership of the returned buffer presumably passes to the caller.
 */
559 char *read_file(const char *fnam
)
561 __do_free
char *line
= NULL
;
562 __do_fclose
FILE *f
= NULL
;
565 size_t len
= 0, fulllen
= 0;
567 f
= fopen(fnam
, "re");
570 while ((linelen
= getline(&line
, &len
, f
)) != -1) {
571 append_line(&buf
, fulllen
, line
, linelen
);
/*
 * read_file_strip_newline() - read_file() followed by newline trimming.
 *
 * @fnam: path forwarded to read_file().
 *
 * The buffer from read_file() is passed to drop_trailing_newlines().
 * NOTE(review): any check of the read_file() result before trimming and the
 * return statement are not visible in this excerpt.
 */
577 char *read_file_strip_newline(const char *fnam
)
581 buf
= read_file(fnam
);
583 drop_trailing_newlines(buf
);
587 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
/*
 * cg_unified_get_current_cgroup() - extract the cgroup2 path of a process.
 *
 * @pid: process to inspect; values <= 0 are replaced by 1.
 *
 * Reads "/proc/<pid>/cgroup" with read_file(), searches it for "0::/" and
 * advances the match by 3 bytes so that it points at the '/' starting the
 * path. The remainder of that line is returned as a copy made by
 * copy_to_eol().
 * NOTE(review): the handling of a failed read and of a missing "0::/" entry
 * is not visible in this excerpt.
 */
588 char *cg_unified_get_current_cgroup(pid_t pid
)
590 __do_free
char *basecginfo
= NULL
;
591 char path
[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t
) + 1];
594 snprintf(path
, sizeof(path
), "/proc/%d/cgroup", pid
> 0 ? pid
: 1);
595 basecginfo
= read_file(path
);
599 base_cgroup
= strstr(basecginfo
, "0::/");
603 base_cgroup
= base_cgroup
+ 3;
604 return copy_to_eol(base_cgroup
);
607 /* cgline: pointer to character after the first ':' in a line in a \n-terminated
608 * /proc/self/cgroup file. Check whether controller c is present.
/*
 * controller_in_clist() - look for a controller name in a controller list.
 *
 * @cgline: start of the controller field of a cgroup line; the field is
 *          delimited by the next ':' found with strchr().
 * @c:      controller name to look for.
 *
 * The field is copied into a temporary buffer of len + 1 bytes, split on ","
 * with lxc_iterate_parts() and every token is compared to @c with strcmp().
 * NOTE(review): the computation of len, the behaviour when no ':' is found,
 * the termination of the copy and the return statements are not visible in
 * this excerpt.
 */
610 static bool controller_in_clist(char *cgline
, const char *c
)
612 __do_free
char *tmp
= NULL
;
616 eol
= strchr(cgline
, ':');
621 tmp
= must_realloc(NULL
, len
+ 1);
622 memcpy(tmp
, cgline
, len
);
625 lxc_iterate_parts(tok
, tmp
, ",")
626 if (strcmp(tok
, c
) == 0)
632 /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for
/*
 * cg_hybrid_get_current_cgroup() - find the cgroup path for a controller.
 *
 * @basecginfo: contents of a "/proc/<pid>/cgroup" file.
 * @controller: controller name matched through controller_in_clist(); may be
 *              NULL, in which case only the cgroup2 entry can match.
 * @type:       CGROUP2_SUPER_MAGIC selects the cgroup2 entry, recognised by a
 *              leading '0' at the current position.
 *
 * When the current entry is the cgroup2 base entry or lists @controller, the
 * rest of the line is returned as a copy made by copy_to_eol().
 * NOTE(review): the loop that advances p through the entries and fields, and
 * the return value when nothing matches, are not visible in this excerpt.
 */
635 char *cg_hybrid_get_current_cgroup(char *basecginfo
, const char *controller
, int type
)
637 char *p
= basecginfo
;
640 bool is_cgv2_base_cgroup
= false;
642 /* cgroup v2 entry in "/proc/<pid>/cgroup": "0::/some/path" */
643 if ((type
== CGROUP2_SUPER_MAGIC
) && (*p
== '0'))
644 is_cgv2_base_cgroup
= true;
651 if (is_cgv2_base_cgroup
|| (controller
&& controller_in_clist(p
, controller
))) {
656 return copy_to_eol(p
);
/*
 * cg_legacy_get_current_cgroup() - cgroup path of a process for a controller.
 *
 * @pid:        process to inspect; values <= 0 are replaced by 1.
 * @controller: controller name forwarded to cg_hybrid_get_current_cgroup().
 *
 * Reads "/proc/<pid>/cgroup" with read_file() and delegates the lookup to
 * cg_hybrid_get_current_cgroup(). One visible error path returns NULL with
 * errno set to ENOMEM through ret_set_errno().
 * NOTE(review): the condition guarding that error path (presumably a failed
 * read) and the final argument of the delegated call are not visible in this
 * excerpt.
 */
666 char *cg_legacy_get_current_cgroup(pid_t pid
, const char *controller
)
668 __do_free
char *basecginfo
= NULL
;
669 char path
[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t
) + 1];
671 snprintf(path
, sizeof(path
), "/proc/%d/cgroup", pid
> 0 ? pid
: 1);
672 basecginfo
= read_file(path
);
674 return ret_set_errno(NULL
, ENOMEM
);
676 return cg_hybrid_get_current_cgroup(basecginfo
, controller
,
/*
 * readat_file() - read a file relative to a directory descriptor.
 *
 * @dirfd: directory descriptor the lookup is relative to.
 * @path:  file name opened with O_NOFOLLOW | O_RDONLY | O_CLOEXEC.
 *
 * The descriptor is wrapped with fdopen() in mode "re", every line returned
 * by getline() is appended to buf through append_line(), and the collected
 * text is passed to drop_trailing_newlines().
 * NOTE(review): the error handling after openat() and fdopen(), the transfer
 * of fd ownership to the stream, the update of fulllen and the return
 * statement are not visible in this excerpt.
 */
681 char *readat_file(int dirfd
, const char *path
)
683 __do_close
int fd
= -EBADF
;
684 __do_free
char *line
= NULL
;
685 __do_fclose
FILE *f
= NULL
;
687 size_t len
= 0, fulllen
= 0;
690 fd
= openat(dirfd
, path
, O_NOFOLLOW
| O_RDONLY
| O_CLOEXEC
);
694 f
= fdopen(fd
, "re");
697 /* Transfer ownership of fd */
700 while ((linelen
= getline(&line
, &len
, f
)) != -1) {
701 append_line(&buf
, fulllen
, line
, linelen
);
706 drop_trailing_newlines(buf
);
/*
 * mkdir_p() - create a directory together with its missing parents.
 *
 * @dir:  path to create.
 * @mode: permission bits passed to every mkdir() call.
 *
 * The path is consumed one component at a time: strspn() skips a run of '/',
 * strcspn() finds the end of the next component, and strndup() copies the
 * prefix of the original path up to the start of that component. mkdir() is
 * called on the copy; EEXIST is tolerated, any other failure is reported
 * through lxcfs_error().
 * NOTE(review): the enclosing loop and its termination condition, the
 * release of makeme and the return values are not visible in this excerpt.
 */
711 bool mkdir_p(const char *dir
, mode_t mode
)
713 const char *tmp
= dir
;
714 const char *orig
= dir
;
718 dir
= tmp
+ strspn(tmp
, "/");
719 tmp
= dir
+ strcspn(dir
, "/");
720 makeme
= strndup(orig
, dir
- orig
);
723 if (mkdir(makeme
, mode
) && errno
!= EEXIST
) {
724 lxcfs_error("Failed to create directory '%s': %s.\n",
725 makeme
, strerror(errno
));
/*
 * same_file() - check whether two descriptors refer to the same file.
 *
 * @fd1, @fd2: descriptors to compare.
 *
 * Returns true when both fstat() calls succeed and report the same device
 * and inode numbers; false otherwise, including when either call fails.
 */
static bool same_file(int fd1, int fd2)
{
	struct stat first, second;

	if (fstat(fd1, &first) < 0)
		return false;

	if (fstat(fd2, &second) < 0)
		return false;

	if (first.st_dev != second.st_dev)
		return false;

	return first.st_ino == second.st_ino;
}
746 * cgroup_walkup_to_root() - Walk upwards to cgroup root to find valid value
748 * @cgroup2_root_fd: File descriptor for the cgroup2 root mount point.
749 * @hierarchy_fd: File descriptor for the hierarchy.
750 * @cgroup: A cgroup directory relative to @hierarchy_fd.
751 * @file: The file in @cgroup from which to read a value.
752 * @value: Return argument to store value read from @file.
754 * This function tries to read a valid value from @file in @cgroup in
755 * @hierarchy_fd. If it is a legacy cgroup hierarchy and we fail to find a
756 * valid value we terminate early and report an error.
757 * The cgroup2 hierarchy however, has different semantics. In a few controller
758 * files it will show the value "max" or simply leave it completely empty
759 * thereby indicating that no limit has been set for this particular cgroup.
760 * However, that doesn't mean that there's no limit. A cgroup further up the
761 * hierarchy could have a limit set that also applies to the cgroup we are
762 * interested in. So for the unified cgroup hierarchy we need to keep walking
763 * towards the cgroup2 root cgroup and try to parse a valid value.
765 * Returns: 0 if a limit was found, 1 if no limit was set or "max" was set,
766 * -errno if an error occurred.
768 int cgroup_walkup_to_root(int cgroup2_root_fd
, int hierarchy_fd
,
769 const char *cgroup
, const char *file
, char **value
)
771 __do_close
int dir_fd
= -EBADF
;
772 __do_free
char *val
= NULL
;
774 /* Look in our current cgroup for a valid value. */
775 dir_fd
= openat(hierarchy_fd
, cgroup
, O_DIRECTORY
| O_PATH
| O_CLOEXEC
);
779 val
= readat_file(dir_fd
, file
);
780 if (!is_empty_string(val
) && strcmp(val
, "max") != 0) {
781 *value
= move_ptr(val
);
786 * Legacy cgroup hierarchies should always show a valid value in the
787 * file of the cgroup. So no need to do this upwards walking crap.
789 if (cgroup2_root_fd
< 0)
791 else if (same_file(cgroup2_root_fd
, dir_fd
))
796 * Set an arbitrary hard-coded limit to prevent us from ending
797 * up in an endless loop. There really shouldn't be any cgroup
798 * tree that is 1000 levels deep. That would be insane in
799 * principle and performance-wise.
801 for (int i
= 0; i
< 1000; i
++) {
802 __do_close
int inner_fd
= -EBADF
;
803 __do_free
char *new_val
= NULL
;
805 inner_fd
= move_fd(dir_fd
);
806 dir_fd
= openat(inner_fd
, "..", O_DIRECTORY
| O_PATH
| O_CLOEXEC
);
811 * We're at the root of the cgroup2 tree so stop walking
813 * Since we walked up the whole tree we haven't found an actual
814 * limit anywhere apparently.
816 * Note that we're not checking the root cgroup itself simply
817 * because a lot of the controllers don't expose files with
818 * limits to the root cgroup.
820 if (same_file(cgroup2_root_fd
, dir_fd
))
823 /* We found a valid value. Terminate walk. */
824 new_val
= readat_file(dir_fd
, file
);
825 if (!is_empty_string(new_val
) && strcmp(new_val
, "max") != 0) {
826 *value
= move_ptr(new_val
);
831 return log_error_errno(-ELOOP
, ELOOP
, "To many nested cgroups or invalid mount tree. Terminating walk");