]>
git.proxmox.com Git - mirror_lxcfs.git/blob - pam/pam_cgfs.c
3 * Copyright © 2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 * Author: Christian Brauner <christian.brauner@ubuntu.com>
7 * When a user logs in, this pam module will create cgroups which the user may
8 * administer. It handles both pure cgroupfs v1 and pure cgroupfs v2, as well as
9 * mixed mounts, where some controllers are mounted in a standard cgroupfs v1
10 * hierarchy location (/sys/fs/cgroup/<controller>) and others are in the
11 * cgroupfs v2 hierarchy.
12 * Writeable cgroups are either created for all controllers or, if specified,
13 * for any controllers listed on the command line.
14 * The cgroup created will be "user/$user/0" for the first session,
15 * "user/$user/1" for the second, etc.
17 * Systems with a systemd init system are treated specially, both with respect
18 * to cgroupfs v1 and cgroupfs v2. For both cgroupfs v1 and cgroupfs v2, we
19 * check whether systemd already placed us in a cgroup it created:
21 * user.slice/user-uid.slice/session-n.scope
23 * by checking whether uid == our uid. If it did, we simply chown the last
24 * part (session-n.scope). If it did not we create a cgroup as outlined above
25 * (user/$user/n) and chown it to our uid.
26 * The same holds for cgroupfs v2 where this assumption becomes crucial:
27 * We __have to__ be placed under the cgroup systemd created for us on
28 * login, otherwise things like starting an xserver or similar will not work.
30 * All requested cgroups must be mounted under /sys/fs/cgroup/$controller,
31 * no messing around with finding mountpoints.
33 * See COPYING file for details.
49 #include <linux/unistd.h>
50 #include <sys/mount.h>
51 #include <sys/param.h>
53 #include <sys/types.h>
56 #define PAM_SM_SESSION
57 #include <security/_pam_macros.h>
58 #include <security/pam_modules.h>
62 #ifndef CGROUP_SUPER_MAGIC
63 #define CGROUP_SUPER_MAGIC 0x27e0eb
66 #ifndef CGROUP2_SUPER_MAGIC
67 #define CGROUP2_SUPER_MAGIC 0x63677270
70 /* Taken over, with modifications, from the kernel sources. */
71 #define NBITS 32 /* bits in uint32_t */
72 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
73 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS)
75 static enum cg_mount_mode
{
80 CGROUP_UNINITIALIZED
= 3,
81 } cg_mount_mode
= CGROUP_UNINITIALIZED
;
83 /* Common helper prototypes. */
84 static void append_line(char **dest
, size_t oldlen
, char *new, size_t newlen
);
85 static int append_null_to_list(void ***list
);
86 static void batch_realloc(char **mem
, size_t oldlen
, size_t newlen
);
/* Clear bit number @bit in @bitarr, an array of NBITS-wide uint32_t words.
 * The word index is @bit / NBITS and the position inside that word is
 * @bit % NBITS; no bounds check is done on @bitarr.
 */
static inline void clear_bit(unsigned bit, uint32_t *bitarr)
{
	/* Shift an unsigned 32-bit one: "1 << 31" on a signed int is undefined. */
	bitarr[bit / NBITS] &= ~((uint32_t)1 << (bit % NBITS));
}
91 static char *copy_to_eol(char *s
);
92 static bool file_exists(const char *f
);
93 static void free_string_list(char **list
);
94 static char *get_mountpoint(char *line
);
95 static bool get_uid_gid(const char *user
, uid_t
*uid
, gid_t
*gid
);
96 static int handle_login(const char *user
, uid_t uid
, gid_t gid
);
/* Return true if bit number @bit is set in @bitarr, an array of NBITS-wide
 * uint32_t words. The word index is @bit / NBITS and the position inside that
 * word is @bit % NBITS; no bounds check is done on @bitarr.
 */
static inline bool is_set(unsigned bit, uint32_t *bitarr)
{
	/* Shift an unsigned 32-bit one: "1 << 31" on a signed int is undefined. */
	return (bitarr[bit / NBITS] & ((uint32_t)1 << (bit % NBITS))) != 0;
}
101 /* __typeof__ should be safe to use with all compilers. */
102 typedef __typeof__(((struct statfs
*)NULL
)->f_type
) fs_type_magic
;
103 static bool has_fs_type(const struct statfs
*fs
, fs_type_magic magic_val
);
104 static bool is_lxcfs(const char *line
);
105 static bool is_cgv1(char *line
);
106 static bool is_cgv2(char *line
);
107 static bool mkdir_p(const char *root
, char *path
);
108 static void *must_alloc(size_t sz
);
109 static void must_add_to_list(char ***clist
, char *entry
);
110 static void must_append_controller(char **klist
, char **nlist
, char ***clist
,
112 static void must_append_string(char ***list
, char *entry
);
113 static char *must_copy_string(const char *entry
);
114 static char *must_make_path(const char *first
, ...) __attribute__((sentinel
));
115 static void *must_realloc(void *orig
, size_t sz
);
116 static void mysyslog(int err
, const char *format
, ...) __attribute__((sentinel
));
117 static char *read_file(char *fnam
);
118 static int read_from_file(const char *filename
, void* buf
, size_t count
);
119 static int recursive_rmdir(char *dirname
);
/* Set bit number @bit in @bitarr, an array of NBITS-wide uint32_t words.
 * The word index is @bit / NBITS and the position inside that word is
 * @bit % NBITS; no bounds check is done on @bitarr.
 */
static inline void set_bit(unsigned bit, uint32_t *bitarr)
{
	/* Shift an unsigned 32-bit one: "1 << 31" on a signed int is undefined. */
	bitarr[bit / NBITS] |= ((uint32_t)1 << (bit % NBITS));
}
124 static bool string_in_list(char **list
, const char *entry
);
125 char *string_join(const char *sep
, const char **parts
, bool use_as_prefix
);
126 static void trim(char *s
);
127 static bool write_int(char *path
, int v
);
128 ssize_t
write_nointr(int fd
, const void* buf
, size_t count
);
130 /* cgroupfs prototypes. */
131 static bool cg_belongs_to_uid_gid(const char *path
, uid_t uid
, gid_t gid
);
132 static uint32_t *cg_cpumask(char *buf
, size_t nbits
);
133 static bool cg_copy_parent_file(char *path
, char *file
);
134 static char *cg_cpumask_to_cpulist(uint32_t *bitarr
, size_t nbits
);
135 static bool cg_enter(const char *cgroup
);
136 static void cg_escape(void);
137 static bool cg_filter_and_set_cpus(char *path
, bool am_initialized
);
138 static ssize_t
cg_get_max_cpus(char *cpulist
);
139 static int cg_get_version_of_mntpt(const char *path
);
140 static bool cg_init(uid_t uid
, gid_t gid
);
141 static void cg_mark_to_make_rw(const char *cstring
);
142 static void cg_prune_empty_cgroups(const char *user
);
143 static bool cg_systemd_created_user_slice(const char *base_cgroup
,
144 const char *init_cgroup
,
145 const char *in
, uid_t uid
);
146 static bool cg_systemd_chown_existing_cgroup(const char *mountpoint
,
147 const char *base_cgroup
, uid_t uid
,
149 bool systemd_user_slice
);
150 static bool cg_systemd_under_user_slice_1(const char *in
, uid_t uid
);
151 static bool cg_systemd_under_user_slice_2(const char *base_cgroup
,
152 const char *init_cgroup
, uid_t uid
);
153 static void cg_systemd_prune_init_scope(char *cg
);
154 int cg_write_to_file(const char *filename
, const void *buf
, size_t count
,
156 static bool is_lxcfs(const char *line
);
158 /* cgroupfs v1 prototypes. */
159 struct cgv1_hierarchy
{
165 bool create_rw_cgroup
;
166 bool systemd_user_slice
;
169 static struct cgv1_hierarchy
**cgv1_hierarchies
;
171 static void cgv1_add_controller(char **clist
, char *mountpoint
,
172 char *base_cgroup
, char *init_cgroup
);
173 static bool cgv1_controller_in_clist(char *cgline
, char *c
);
174 static bool cgv1_controller_lists_intersect(char **l1
, char **l2
);
175 static bool cgv1_controller_list_is_dup(struct cgv1_hierarchy
**hlist
,
177 static bool cgv1_create(const char *cgroup
, uid_t uid
, gid_t gid
,
179 static bool cgv1_create_one(struct cgv1_hierarchy
*h
, const char *cgroup
,
180 uid_t uid
, gid_t gid
, bool *existed
);
181 static bool cgv1_enter(const char *cgroup
);
182 static void cgv1_escape(void);
183 static bool cgv1_get_controllers(char ***klist
, char ***nlist
);
184 static char *cgv1_get_current_cgroup(char *basecginfo
, char *controller
);
185 static char **cgv1_get_proc_mountinfo_controllers(char **klist
, char **nlist
,
187 static bool cgv1_handle_cpuset_hierarchy(struct cgv1_hierarchy
*h
,
189 static bool cgv1_handle_root_cpuset_hierarchy(struct cgv1_hierarchy
*h
);
190 static bool cgv1_init(uid_t uid
, gid_t gid
);
191 static void cgv1_mark_to_make_rw(char **clist
);
192 static char *cgv1_must_prefix_named(char *entry
);
193 static bool cgv1_prune_empty_cgroups(const char *user
);
194 static bool cgv1_remove_one(struct cgv1_hierarchy
*h
, const char *cgroup
);
195 static bool is_cgv1(char *line
);
197 /* cgroupfs v2 prototypes. */
198 struct cgv2_hierarchy
{
204 bool create_rw_cgroup
;
205 bool systemd_user_slice
;
208 /* Actually this should only be a single hierarchy. But for the sake of
209 * parallelism and because the layout of the cgroupfs v2 is still somewhat
210 * changing, we'll leave it as an array of structs.
212 static struct cgv2_hierarchy
**cgv2_hierarchies
;
214 static void cgv2_add_controller(char **clist
, char *mountpoint
,
215 char *base_cgroup
, char *init_cgroup
,
216 bool systemd_user_slice
);
217 static bool cgv2_create(const char *cgroup
, uid_t uid
, gid_t gid
,
219 static bool cgv2_enter(const char *cgroup
);
220 static void cgv2_escape(void);
221 static char *cgv2_get_current_cgroup(int pid
);
222 static bool cgv2_init(uid_t uid
, gid_t gid
);
223 static void cgv2_mark_to_make_rw(char **clist
);
224 static bool cgv2_prune_empty_cgroups(const char *user
);
225 static bool cgv2_remove(const char *cgroup
);
226 static bool is_cgv2(char *line
);
228 /* Common helper functions. */
229 static void mysyslog(int err
, const char *format
, ...)
233 va_start(args
, format
);
234 openlog("PAM-CGFS", LOG_CONS
| LOG_PID
, LOG_AUTH
);
235 vsyslog(err
, format
, args
);
240 /* realloc() pointer; do not fail. */
241 static void *must_realloc(void *orig
, size_t sz
)
246 ret
= realloc(orig
, sz
);
/* realloc() pointer in batch sizes; do not fail.
 *
 * *@mem is resized through must_realloc() only when it is still NULL or when
 * @newlen falls into a later BATCH_SIZE-sized batch than @oldlen did. The new
 * size is always a whole number of batches.
 */
#define BATCH_SIZE 50
static void batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	/* Keep the batch counts in size_t so large lengths are not narrowed. */
	size_t newbatches = (newlen / BATCH_SIZE) + 1;
	size_t oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches)
		*mem = must_realloc(*mem, newbatches * BATCH_SIZE);
}
/* Append lines as is to pointer; do not fail.
 *
 * Copies @newlen + 1 bytes from @new to offset @oldlen of *@dest, after
 * batch_realloc() has been asked to make room for @oldlen + @newlen + 1 bytes.
 */
static void append_line(char **dest, size_t oldlen, char *new, size_t newlen)
{
	/* The "+ 1" accounts for the byte that follows the @newlen payload. */
	size_t needed = oldlen + newlen + 1;
	char *tail;

	batch_realloc(dest, oldlen, needed);

	tail = *dest + oldlen;
	memcpy(tail, new, newlen + 1);
}
273 /* Read in whole file and return allocated pointer. */
274 static char *read_file(char *fnam
)
278 char *line
= NULL
, *buf
= NULL
;
279 size_t len
= 0, fulllen
= 0;
281 f
= fopen(fnam
, "r");
285 while ((linelen
= getline(&line
, &len
, f
)) != -1) {
286 append_line(&buf
, fulllen
, line
, linelen
);
296 /* Given a pointer to a null-terminated array of pointers, realloc to add one
297 * entry, and point the new entry to NULL. Do not fail. Return the index to the
298 * second-to-last entry - that is, the one which is now available for use
299 * (keeping the list null-terminated).
301 static int append_null_to_list(void ***list
)
306 for (; (*list
)[newentry
]; newentry
++) {
310 *list
= must_realloc(*list
, (newentry
+ 2) * sizeof(void **));
311 (*list
)[newentry
+ 1] = NULL
;
316 /* Make allocated copy of string; do not fail. */
317 static char *must_copy_string(const char *entry
)
331 /* Append new entry to null-terminated array of pointers; make sure that array of
332 * pointers will still be null-terminated.
334 static void must_append_string(char ***list
, char *entry
)
339 newentry
= append_null_to_list((void ***)list
);
340 copy
= must_copy_string(entry
);
341 (*list
)[newentry
] = copy
;
344 /* Remove newlines from string. */
345 static void trim(char *s
)
347 size_t len
= strlen(s
);
349 while (s
[len
- 1] == '\n')
/* Allocate @sz bytes by passing a NULL pointer to must_realloc(); do not fail. */
static void *must_alloc(size_t sz)
{
	void *fresh = must_realloc(NULL, sz);

	return fresh;
}
359 /* Make allocated copy of string. End of string is taken to be '\n'. */
360 static char *copy_to_eol(char *s
)
362 char *newline
, *sret
;
365 newline
= strchr(s
, '\n');
370 sret
= must_alloc(len
+ 1);
371 memcpy(sret
, s
, len
);
377 /* Check if given entry under /proc/<pid>/mountinfo is a fuse.lxcfs mount. */
378 static bool is_lxcfs(const char *line
)
380 char *p
= strstr(line
, " - ");
384 return strncmp(p
, " - fuse.lxcfs ", 14) == 0;
387 /* Check if given entry under /proc/<pid>/mountinfo is a cgroupfs v1 mount. */
388 static bool is_cgv1(char *line
)
390 char *p
= strstr(line
, " - ");
394 return strncmp(p
, " - cgroup ", 10) == 0;
397 /* Check if given entry under /proc/<pid>/mountinfo is a cgroupfs v2 mount. */
398 static bool is_cgv2(char *line
)
400 char *p
= strstr(line
, " - ");
404 return strncmp(p
, " - cgroup2 ", 11) == 0;
407 /* Given a null-terminated array of strings, check whether @entry is one of the
410 static bool string_in_list(char **list
, const char *entry
)
414 for (it
= list
; it
&& *it
; it
++)
415 if (strcmp(*it
, entry
) == 0)
421 /* Free null-terminated array of strings. */
422 static void free_string_list(char **list
)
426 for (it
= list
; it
&& *it
; it
++)
431 /* Concatenate all passed-in strings into one path. Do not fail. If any piece
432 * is not prefixed with '/', add a '/'. Does not remove duplicate '///' from the
435 static char *must_make_path(const char *first
, ...)
441 full_len
= strlen(first
);
443 dest
= must_copy_string(first
);
445 va_start(args
, first
);
446 while ((cur
= va_arg(args
, char *)) != NULL
) {
447 full_len
+= strlen(cur
);
452 dest
= must_realloc(dest
, full_len
+ 1);
464 /* Write single integer to file. */
465 static bool write_int(char *path
, int v
)
470 f
= fopen(path
, "w");
474 if (fprintf(f
, "%d\n", v
) < 0)
483 /* Check if a given file exists. */
484 static bool file_exists(const char *f
)
488 return stat(f
, &statbuf
) == 0;
491 /* Create directory and (if necessary) its parents. */
492 static bool mkdir_p(const char *root
, char *path
)
496 if (strlen(path
) < strlen(root
))
499 if (strlen(path
) == strlen(root
))
502 b
= path
+ strlen(root
) + 1;
504 while (*b
&& (*b
== '/'))
510 while (*e
&& *e
!= '/')
517 if (file_exists(path
))
520 if (mkdir(path
, 0755) < 0) {
521 lxcfs_debug("Failed to create %s: %m.\n", path
);
536 /* Recursively remove directory and its subdirectories. */
537 static int recursive_rmdir(char *dirname
)
539 struct dirent
*direntp
;
543 dir
= opendir(dirname
);
547 while ((direntp
= readdir(dir
))) {
554 if (!strcmp(direntp
->d_name
, ".") ||
555 !strcmp(direntp
->d_name
, ".."))
558 pathname
= must_make_path(dirname
, direntp
->d_name
, NULL
);
560 if (lstat(pathname
, &st
)) {
562 lxcfs_debug("Failed to stat %s.\n", pathname
);
567 if (!S_ISDIR(st
.st_mode
))
570 if (recursive_rmdir(pathname
) < 0)
576 if (rmdir(dirname
) < 0) {
578 lxcfs_debug("Failed to delete %s: %m.\n", dirname
);
582 if (closedir(dir
) < 0) {
584 lxcfs_debug("Failed to delete %s: %m.\n", dirname
);
591 /* Add new entry to null-terminated array of pointers. Make sure array is still
594 static void must_add_to_list(char ***clist
, char *entry
)
598 newentry
= append_null_to_list((void ***)clist
);
599 (*clist
)[newentry
] = must_copy_string(entry
);
602 /* Get mountpoint from a /proc/<pid>/mountinfo line. */
603 static char *get_mountpoint(char *line
)
611 for (i
= 0; i
< 4; i
++) {
623 sret
= must_alloc(len
+ 1);
624 memcpy(sret
, p
, len
);
630 /* Create list of cgroupfs v1 controller found under /proc/self/cgroup. Skips
631 * the 0::/some/path cgroupfs v2 hierarchy listed. Splits controllers into
632 * kernel controllers (@klist) and named controllers (@nlist).
634 static bool cgv1_get_controllers(char ***klist
, char ***nlist
)
640 f
= fopen("/proc/self/cgroup", "r");
644 while (getline(&line
, &len
, f
) != -1) {
646 char *saveptr
= NULL
;
648 p
= strchr(line
, ':');
658 /* Skip the v2 hierarchy. */
662 for (tok
= strtok_r(p
, ",", &saveptr
); tok
;
663 tok
= strtok_r(NULL
, ",", &saveptr
)) {
664 if (strncmp(tok
, "name=", 5) == 0)
665 must_append_string(nlist
, tok
);
667 must_append_string(klist
, tok
);
677 /* Get list of controllers for cgroupfs v2 hierarchy by looking at
678 * cgroup.controllers and/or cgroup.subtree_control of a given (parent) cgroup.
679 static bool cgv2_get_controllers(char ***klist)
685 /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */
686 static char *cgv2_get_current_cgroup(int pid
)
690 char *current_cgroup
;
692 /* The largest integer that can fit into long int is 2^64. This is a
693 * 20-digit number. */
694 #define __PIDLEN /* /proc */ 5 + /* /pid-to-str */ 21 + /* /cgroup */ 7 + /* \0 */ 1
697 ret
= snprintf(path
, __PIDLEN
, "/proc/%d/cgroup", pid
);
698 if (ret
< 0 || ret
>= __PIDLEN
)
701 cgroups_v2
= read_file(path
);
705 current_cgroup
= strstr(cgroups_v2
, "0::/");
709 current_cgroup
= current_cgroup
+ 3;
710 copy
= copy_to_eol(current_cgroup
);
722 /* Given two null-terminated lists of strings, return true if any string is in
725 static bool cgv1_controller_lists_intersect(char **l1
, char **l2
)
732 for (it
= l1
; it
&& *it
; it
++)
733 if (string_in_list(l2
, *it
))
739 /* For a null-terminated list of controllers @clist, return true if any of those
740 * controllers is already listed in the null-terminated list of hierarchies @hlist.
741 * Realistically, if one is present, all must be present.
743 static bool cgv1_controller_list_is_dup(struct cgv1_hierarchy
**hlist
, char **clist
)
745 struct cgv1_hierarchy
**it
;
747 for (it
= hlist
; it
&& *it
; it
++)
748 if ((*it
)->controllers
)
749 if (cgv1_controller_lists_intersect((*it
)->controllers
, clist
))
755 /* Set boolean to mark controllers under which we are supposed create a
758 static void cgv1_mark_to_make_rw(char **clist
)
760 struct cgv1_hierarchy
**it
;
762 for (it
= cgv1_hierarchies
; it
&& *it
; it
++)
763 if ((*it
)->controllers
)
764 if (cgv1_controller_lists_intersect((*it
)->controllers
, clist
))
765 (*it
)->create_rw_cgroup
= true;
768 /* Set boolean to mark whether we are supposed to create a writeable cgroup in
769 * the cgroupfs v2 hierarchy.
771 static void cgv2_mark_to_make_rw(char **clist
)
773 if (string_in_list(clist
, "unified"))
774 if (cgv2_hierarchies
)
775 (*cgv2_hierarchies
)->create_rw_cgroup
= true;
778 /* Wrapper around cgv{1,2}_mark_to_make_rw(). */
779 static void cg_mark_to_make_rw(const char *cstring
)
782 char *saveptr
= NULL
;
785 copy
= must_copy_string(cstring
);
787 for (tok
= strtok_r(copy
, ",", &saveptr
); tok
;
788 tok
= strtok_r(NULL
, ",", &saveptr
))
789 must_add_to_list(&clist
, tok
);
793 cgv1_mark_to_make_rw(clist
);
794 cgv2_mark_to_make_rw(clist
);
796 free_string_list(clist
);
799 /* Prefix any named controllers with "name=", e.g. "name=systemd". */
800 static char *cgv1_must_prefix_named(char *entry
)
807 s
= must_alloc(len
+ 6);
809 ret
= snprintf(s
, len
+ 6, "name=%s", entry
);
810 if (ret
< 0 || (size_t)ret
>= (len
+ 6))
816 /* Append kernel controller in @klist or named controller in @nlist to @clist */
817 static void must_append_controller(char **klist
, char **nlist
, char ***clist
, char *entry
)
822 if (string_in_list(klist
, entry
) && string_in_list(nlist
, entry
))
825 newentry
= append_null_to_list((void ***)clist
);
827 if (strncmp(entry
, "name=", 5) == 0)
828 copy
= must_copy_string(entry
);
829 else if (string_in_list(klist
, entry
))
830 copy
= must_copy_string(entry
);
832 copy
= cgv1_must_prefix_named(entry
);
834 (*clist
)[newentry
] = copy
;
837 /* Get the controllers from a mountinfo line. There are other ways we could get
838 * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we
839 * could parse the mount options. But we simply assume that the mountpoint must
840 * be /sys/fs/cgroup/controller-list
842 static char **cgv1_get_proc_mountinfo_controllers(char **klist
, char **nlist
, char *line
)
846 char *saveptr
= NULL
;
851 for (i
= 0; i
< 4; i
++) {
860 if (strncmp(p
, "/sys/fs/cgroup/", 15) != 0)
870 for (tok
= strtok_r(p
, ",", &saveptr
); tok
;
871 tok
= strtok_r(NULL
, ",", &saveptr
))
872 must_append_controller(klist
, nlist
, &aret
, tok
);
877 /* Check if a cgroupfs v1 controller is present in the string @cgline. */
878 static bool cgv1_controller_in_clist(char *cgline
, char *c
)
881 char *tok
, *eol
, *tmp
;
882 char *saveptr
= NULL
;
884 eol
= strchr(cgline
, ':');
889 tmp
= alloca(len
+ 1);
890 memcpy(tmp
, cgline
, len
);
893 for (tok
= strtok_r(tmp
, ",", &saveptr
); tok
;
894 tok
= strtok_r(NULL
, ",", &saveptr
)) {
895 if (strcmp(tok
, c
) == 0)
901 /* Get current cgroup from the /proc/<pid>/cgroup file passed in via @basecginfo
902 * of a given cgv1 controller passed in via @controller.
904 static char *cgv1_get_current_cgroup(char *basecginfo
, char *controller
)
916 if (cgv1_controller_in_clist(p
, controller
)) {
922 return copy_to_eol(p
);
934 /* Remove /init.scope from string @cg. This will mostly affect systemd-based
937 #define INIT_SCOPE "/init.scope"
938 static void cg_systemd_prune_init_scope(char *cg
)
945 point
= cg
+ strlen(cg
) - strlen(INIT_SCOPE
);
949 if (strcmp(point
, INIT_SCOPE
) == 0) {
957 /* Add new info about a mounted cgroupfs v1 hierarchy. Includes the controllers
958 * mounted into that hierarchy (e.g. cpu,cpuacct), the mountpoint of that
959 * hierarchy (/sys/fs/cgroup/<controller>, the base cgroup of the current
960 * process gathered from /proc/self/cgroup, and the init cgroup of PID1 gathered
961 * from /proc/1/cgroup.
963 static void cgv1_add_controller(char **clist
, char *mountpoint
, char *base_cgroup
, char *init_cgroup
)
965 struct cgv1_hierarchy
*new;
968 new = must_alloc(sizeof(*new));
969 new->controllers
= clist
;
970 new->mountpoint
= mountpoint
;
971 new->base_cgroup
= base_cgroup
;
972 new->fullcgpath
= NULL
;
973 new->create_rw_cgroup
= false;
974 new->init_cgroup
= init_cgroup
;
975 new->systemd_user_slice
= false;
977 newentry
= append_null_to_list((void ***)&cgv1_hierarchies
);
978 cgv1_hierarchies
[newentry
] = new;
981 /* Add new info about the mounted cgroupfs v2 hierarchy. Can (but doesn't
982 * currently) include the controllers mounted into the hierarchy (e.g. memory,
983 * pids, blkio), the mountpoint of that hierarchy (Should usually be
984 * /sys/fs/cgroup but some init systems seem to think it might be a good idea
985 * to also mount empty cgroupfs v2 hierarchies at /sys/fs/cgroup/systemd.), the
986 * base cgroup of the current process gathered from /proc/self/cgroup, and the
987 * init cgroup of PID1 gathered from /proc/1/cgroup.
989 static void cgv2_add_controller(char **clist
, char *mountpoint
, char *base_cgroup
, char *init_cgroup
, bool systemd_user_slice
)
991 struct cgv2_hierarchy
*new;
994 new = must_alloc(sizeof(*new));
995 new->controllers
= clist
;
996 new->mountpoint
= mountpoint
;
997 new->base_cgroup
= base_cgroup
;
998 new->fullcgpath
= NULL
;
999 new->create_rw_cgroup
= false;
1000 new->init_cgroup
= init_cgroup
;
1001 new->systemd_user_slice
= systemd_user_slice
;
1003 newentry
= append_null_to_list((void ***)&cgv2_hierarchies
);
1004 cgv2_hierarchies
[newentry
] = new;
1007 /* In Ubuntu 14.04, the paths created for us were
1008 * '/user/$uid.user/$something.session' This can be merged better with
1009 * systemd_created_slice_for_us(), but keeping it separate makes it easier to
1010 * reason about the correctness.
1012 static bool cg_systemd_under_user_slice_1(const char *in
, uid_t uid
)
1020 copy
= must_copy_string(in
);
1021 if (strlen(copy
) < strlen("/user/1.user/1.session"))
1023 p
= copy
+ strlen(copy
) - 1;
1025 /* skip any trailing '/' (shouldn't be any, but be sure) */
1026 while (p
>= copy
&& *p
== '/')
1031 /* Get last path element */
1032 while (p
>= copy
&& *p
!= '/')
1036 /* make sure it is something.session */
1037 len
= strlen(p
+ 1);
1038 if (len
< strlen("1.session") ||
1039 strncmp(p
+ 1 + len
- 8, ".session", 8) != 0)
1042 /* ok last path piece checks out, now check the second to last */
1044 while (p
>= copy
&& *(--p
) != '/')
1046 if (sscanf(p
+ 1, "%d.user/", &id
) != 1)
1059 /* So long as our path relative to init starts with /user.slice/user-$uid.slice,
1060 * assume it belongs to $uid and chown it
1062 static bool cg_systemd_under_user_slice_2(const char *base_cgroup
,
1063 const char *init_cgroup
, uid_t uid
)
1067 size_t curlen
, initlen
;
1069 curlen
= strlen(base_cgroup
);
1070 initlen
= strlen(init_cgroup
);
1071 if (curlen
<= initlen
)
1074 if (strncmp(base_cgroup
, init_cgroup
, initlen
) != 0)
1077 ret
= snprintf(buf
, 100, "/user.slice/user-%d.slice/", (int)uid
);
1078 if (ret
< 0 || ret
>= 100)
1082 initlen
= 0; // skip the '/'
1084 return strncmp(base_cgroup
+ initlen
, buf
, strlen(buf
)) == 0;
1087 /* The systemd-created path is: user-$uid.slice/session-c$session.scope. If that
1088 * is not the end of our systemd path, then we're not part of the PAM call that
1089 * created that path.
1091 * The last piece is chowned to $uid, the user- part not.
1092 * Note: If the user creates paths that look like what we're looking for to
1094 * - they fool us, we create new cgroups, and they get auto-logged-out.
1095 * - they fool a root sudo, systemd cgroup is not changed but chowned, and they
1096 * lose ownership of their cgroups
1098 static bool cg_systemd_created_user_slice(const char *base_cgroup
,
1099 const char *init_cgroup
,
1100 const char *in
, uid_t uid
)
1108 copy
= must_copy_string(in
);
1110 /* An old version of systemd has already created a cgroup for us. */
1111 if (cg_systemd_under_user_slice_1(in
, uid
))
1114 /* A new version of systemd has already created a cgroup for us. */
1115 if (cg_systemd_under_user_slice_2(base_cgroup
, init_cgroup
, uid
))
1118 if (strlen(copy
) < strlen("/user-0.slice/session-0.scope"))
1121 p
= copy
+ strlen(copy
) - 1;
1122 /* Skip any trailing '/' (shouldn't be any, but be sure). */
1123 while (p
>= copy
&& *p
== '/')
1129 /* Get last path element */
1130 while (p
>= copy
&& *p
!= '/')
1136 /* Make sure it is session-something.scope. */
1137 len
= strlen(p
+ 1);
1138 if (strncmp(p
+ 1, "session-", strlen("session-")) != 0 ||
1139 strncmp(p
+ 1 + len
- 6, ".scope", 6) != 0)
1142 /* Ok last path piece checks out, now check the second to last. */
1144 while (p
>= copy
&& *(--p
) != '/')
1147 if (sscanf(p
+ 1, "user-%d.slice/", &id
) != 1)
1160 /* Chown existing cgroup that systemd has already created for us. */
1161 static bool cg_systemd_chown_existing_cgroup(const char *mountpoint
,
1162 const char *base_cgroup
, uid_t uid
,
1163 gid_t gid
, bool systemd_user_slice
)
1167 if (!systemd_user_slice
)
1170 path
= must_make_path(mountpoint
, base_cgroup
, NULL
);
1172 /* A cgroup within name=systemd has already been created. So we only
1175 if (chown(path
, uid
, gid
) < 0)
1176 mysyslog(LOG_WARNING
, "Failed to chown %s to %d:%d: %m.\n",
1177 path
, (int)uid
, (int)gid
, NULL
);
1183 /* Detect and store information about cgroupfs v1 hierarchies. */
1184 static bool cgv1_init(uid_t uid
, gid_t gid
)
1187 struct cgv1_hierarchy
**it
;
1190 char **klist
= NULL
, **nlist
= NULL
;
1193 basecginfo
= read_file("/proc/self/cgroup");
1197 f
= fopen("/proc/self/mountinfo", "r");
1203 cgv1_get_controllers(&klist
, &nlist
);
1205 while (getline(&line
, &len
, f
) != -1) {
1206 char **controller_list
= NULL
;
1207 char *mountpoint
, *base_cgroup
;
1209 if (is_lxcfs(line
) || !is_cgv1(line
))
1212 controller_list
= cgv1_get_proc_mountinfo_controllers(klist
, nlist
, line
);
1213 if (!controller_list
)
1216 if (cgv1_controller_list_is_dup(cgv1_hierarchies
,
1218 free(controller_list
);
1222 mountpoint
= get_mountpoint(line
);
1224 free_string_list(controller_list
);
1228 base_cgroup
= cgv1_get_current_cgroup(basecginfo
, controller_list
[0]);
1230 free_string_list(controller_list
);
1235 lxcfs_debug("Detected cgroupfs v1 controller \"%s\" with "
1236 "mountpoint \"%s\" and cgroup \"%s\".\n",
1237 controller_list
[0], mountpoint
, base_cgroup
);
1238 cgv1_add_controller(controller_list
, mountpoint
, base_cgroup
,
1241 free_string_list(klist
);
1242 free_string_list(nlist
);
1247 /* Retrieve init cgroup path for all controllers. */
1248 basecginfo
= read_file("/proc/1/cgroup");
1252 for (it
= cgv1_hierarchies
; it
&& *it
; it
++) {
1253 if ((*it
)->controllers
) {
1254 char *init_cgroup
, *user_slice
;
1255 /* We've already stored the controller and received its
1256 * current cgroup. If we now fail to retrieve its init
1257 * cgroup, we should probably fail.
1259 init_cgroup
= cgv1_get_current_cgroup(basecginfo
, (*it
)->controllers
[0]);
1264 cg_systemd_prune_init_scope(init_cgroup
);
1265 (*it
)->init_cgroup
= init_cgroup
;
1266 lxcfs_debug("cgroupfs v1 controller \"%s\" has init "
1268 (*(*it
)->controllers
), init_cgroup
);
1269 /* Check whether systemd has already created a cgroup
1272 user_slice
= must_make_path((*it
)->mountpoint
, (*it
)->base_cgroup
, NULL
);
1273 if (cg_systemd_created_user_slice((*it
)->base_cgroup
, (*it
)->init_cgroup
, user_slice
, uid
))
1274 (*it
)->systemd_user_slice
= true;
1282 /* __typeof__ should be safe to use with all compilers. */
1283 typedef __typeof__(((struct statfs
*)NULL
)->f_type
) fs_type_magic
;
1284 /* Check whether given mountpoint has mount type specified via @magic_val. */
1285 static bool has_fs_type(const struct statfs
*fs
, fs_type_magic magic_val
)
1287 return (fs
->f_type
== (fs_type_magic
)magic_val
);
1290 /* Check whether @path is a cgroupfs v1 or cgroupfs v2 mount. Returns -1 if
1291 * statfs fails. If @path is null /sys/fs/cgroup is checked.
1293 static int cg_get_version_of_mntpt(const char *path
)
1299 ret
= statfs(path
, &sb
);
1301 ret
= statfs("/sys/fs/cgroup", &sb
);
1306 if (has_fs_type(&sb
, CGROUP_SUPER_MAGIC
))
1308 else if (has_fs_type(&sb
, CGROUP2_SUPER_MAGIC
))
1314 /* Detect and store information about the cgroupfs v2 hierarchy. Currently only
1315 * deals with the empty v2 hierarchy as we do not retrieve enabled controllers.
1317 static bool cgv2_init(uid_t uid
, gid_t gid
)
1322 char *current_cgroup
= NULL
, *init_cgroup
= NULL
;
1326 current_cgroup
= cgv2_get_current_cgroup(getpid());
1327 if (!current_cgroup
) {
1328 /* No v2 hierarchy present. We're done. */
1333 init_cgroup
= cgv2_get_current_cgroup(1);
1335 /* If we're here and didn't fail already above, then something's
1336 * certainly wrong, so error this time.
1340 cg_systemd_prune_init_scope(init_cgroup
);
1342 /* Check if the v2 hierarchy is mounted at its standard location.
1343 * If so we can skip the rest of the work here. Although the unified
1344 * hierarchy can be mounted multiple times, each of those mountpoints
1345 * will expose identical information.
1347 if (cg_get_version_of_mntpt("/sys/fs/cgroup") == 2) {
1349 bool has_user_slice
= false;
1351 mountpoint
= must_copy_string("/sys/fs/cgroup");
1355 user_slice
= must_make_path(mountpoint
, current_cgroup
, NULL
);
1356 if (cg_systemd_created_user_slice(current_cgroup
, init_cgroup
, user_slice
, uid
))
1357 has_user_slice
= true;
1360 cgv2_add_controller(NULL
, mountpoint
, current_cgroup
, init_cgroup
, has_user_slice
);
1366 f
= fopen("/proc/self/mountinfo", "r");
1370 /* we support simple cgroup mounts and lxcfs mounts */
1371 while (getline(&line
, &len
, f
) != -1) {
1373 bool has_user_slice
= false;
1377 mountpoint
= get_mountpoint(line
);
1381 user_slice
= must_make_path(mountpoint
, current_cgroup
, NULL
);
1382 if (cg_systemd_created_user_slice(current_cgroup
, init_cgroup
, user_slice
, uid
))
1383 has_user_slice
= true;
1386 cgv2_add_controller(NULL
, mountpoint
, current_cgroup
, init_cgroup
, has_user_slice
);
1387 /* Although the unified hierarchy can be mounted multiple times,
1388 * each of those mountpoints will expose identical information.
1389 * So let the first mountpoint we find, win.
1394 lxcfs_debug("Detected cgroupfs v2 hierarchy at mountpoint \"%s\" with "
1395 "current cgroup \"%s\" and init cgroup \"%s\".\n",
1396 mountpoint
, current_cgroup
, init_cgroup
);
1406 /* Detect and store information about mounted cgroupfs v1 hierarchies and the
1407 * cgroupfs v2 hierarchy.
1408 * Detect whether we are on a pure cgroupfs v1, cgroupfs v2, or mixed system,
1409 * where some controllers are mounted into their standard cgroupfs v1 locations
1410 * (/sys/fs/cgroup/<controller>) and others are mounted into the cgroupfs v2
1411 * hierarchy (/sys/fs/cgroup).
1413 static bool cg_init(uid_t uid
, gid_t gid
)
1415 if (!cgv1_init(uid
, gid
))
1418 if (!cgv2_init(uid
, gid
))
1421 if (cgv1_hierarchies
&& cgv2_hierarchies
) {
1422 cg_mount_mode
= CGROUP_MIXED
;
1423 lxcfs_debug("%s\n", "Detected cgroupfs v1 and v2 hierarchies.");
1424 } else if (cgv1_hierarchies
&& !cgv2_hierarchies
) {
1425 cg_mount_mode
= CGROUP_PURE_V1
;
1426 lxcfs_debug("%s\n", "Detected cgroupfs v1 hierarchies.");
1427 } else if (cgv2_hierarchies
&& !cgv1_hierarchies
) {
1428 cg_mount_mode
= CGROUP_PURE_V2
;
1429 lxcfs_debug("%s\n", "Detected cgroupfs v2 hierarchies.");
1431 cg_mount_mode
= CGROUP_UNKNOWN
;
1432 mysyslog(LOG_ERR
, "Could not detect cgroupfs hierarchy.\n", NULL
);
1435 if (cg_mount_mode
== CGROUP_UNKNOWN
)
1441 /* Try to move/migrate us into @cgroup in a cgroupfs v1 hierarchy. */
1442 static bool cgv1_enter(const char *cgroup
)
1444 struct cgv1_hierarchy
**it
;
1446 for (it
= cgv1_hierarchies
; it
&& *it
; it
++) {
1448 bool entered
= false;
1450 if (!(*it
)->controllers
|| !(*it
)->mountpoint
||
1451 !(*it
)->init_cgroup
|| !(*it
)->create_rw_cgroup
)
1454 for (controller
= (*it
)->controllers
; controller
&& *controller
;
1458 /* We've already been placed in a user slice, so we
1459 * don't need to enter the cgroup again.
1461 if ((*it
)->systemd_user_slice
) {
1466 path
= must_make_path((*it
)->mountpoint
,
1471 if (!file_exists(path
)) {
1473 path
= must_make_path((*it
)->mountpoint
,
1479 lxcfs_debug("Attempting to enter cgroupfs v1 hierarchy in \"%s\" cgroup.\n", path
);
1480 entered
= write_int(path
, (int)getpid());
1485 lxcfs_debug("Failed to enter cgroupfs v1 hierarchy in \"%s\" cgroup.\n", path
);
1495 /* Try to move/migrate us into @cgroup in the cgroupfs v2 hierarchy. */
1496 static bool cgv2_enter(const char *cgroup
)
1498 struct cgv2_hierarchy
*v2
;
1500 bool entered
= false;
1502 if (!cgv2_hierarchies
)
1505 v2
= *cgv2_hierarchies
;
1507 if (!v2
->mountpoint
|| !v2
->base_cgroup
)
1510 if (!v2
->create_rw_cgroup
|| v2
->systemd_user_slice
)
1513 path
= must_make_path(v2
->mountpoint
, v2
->base_cgroup
, cgroup
,
1514 "/cgroup.procs", NULL
);
1515 lxcfs_debug("Attempting to enter cgroupfs v2 hierarchy in cgroup \"%s\".\n", path
);
1516 entered
= write_int(path
, (int)getpid());
1518 lxcfs_debug("Failed to enter cgroupfs v2 hierarchy in cgroup \"%s\".\n", path
);
1528 /* Wrapper around cgv{1,2}_enter(). */
1529 static bool cg_enter(const char *cgroup
)
1531 if (!cgv1_enter(cgroup
)) {
1532 mysyslog(LOG_WARNING
, "cgroupfs v1: Failed to enter cgroups.\n", NULL
);
1536 if (!cgv2_enter(cgroup
)) {
1537 mysyslog(LOG_WARNING
, "cgroupfs v2: Failed to enter cgroups.\n", NULL
);
1544 /* Escape to root cgroup in all detected cgroupfs v1 hierarchies. */
1545 static void cgv1_escape(void)
1547 struct cgv1_hierarchy
**it
;
1549 /* In case systemd hasn't already placed us in a user slice for the
1550 * cpuset v1 controller we will reside in the root cgroup. This means
1551 * that cgroup.clone_children will not have been initialized for us so
1554 for (it
= cgv1_hierarchies
; it
&& *it
; it
++)
1555 if (!cgv1_handle_root_cpuset_hierarchy(*it
))
1556 mysyslog(LOG_WARNING
, "cgroupfs v1: Failed to initialize cpuset.\n", NULL
);
1558 if (!cgv1_enter("/"))
1559 mysyslog(LOG_WARNING
, "cgroupfs v1: Failed to escape to init's cgroup.\n", NULL
);
1562 /* Escape to root cgroup in the cgroupfs v2 hierarchy. */
1563 static void cgv2_escape(void)
1565 if (!cgv2_enter("/"))
1566 mysyslog(LOG_WARNING
, "cgroupfs v2: Failed to escape to init's cgroup.\n", NULL
);
1569 /* Wrapper around cgv{1,2}_escape(). */
1570 static void cg_escape(void)
1576 /* Get uid and gid for @user. */
1577 static bool get_uid_gid(const char *user
, uid_t
*uid
, gid_t
*gid
)
1579 struct passwd
*pwent
;
1581 pwent
= getpwnam(user
);
1585 *uid
= pwent
->pw_uid
;
1586 *gid
= pwent
->pw_gid
;
1591 /* Check if cgroup belongs to our uid and gid. If so, reuse it. */
1592 static bool cg_belongs_to_uid_gid(const char *path
, uid_t uid
, gid_t gid
)
1594 struct stat statbuf
;
1596 if (stat(path
, &statbuf
) < 0)
1599 if (!(statbuf
.st_uid
== uid
) || !(statbuf
.st_gid
== gid
))
1605 /* Create cpumask from cpulist aka turn:
1613 static uint32_t *cg_cpumask(char *buf
, size_t nbits
)
1616 char *saveptr
= NULL
;
1617 size_t arrlen
= BITS_TO_LONGS(nbits
);
1618 uint32_t *bitarr
= calloc(arrlen
, sizeof(uint32_t));
1622 for (; (token
= strtok_r(buf
, ",", &saveptr
)); buf
= NULL
) {
1624 unsigned start
= strtoul(token
, NULL
, 0);
1625 unsigned end
= start
;
1627 char *range
= strchr(token
, '-');
1629 end
= strtoul(range
+ 1, NULL
, 0);
1630 if (!(start
<= end
)) {
1640 while (start
<= end
)
1641 set_bit(start
++, bitarr
);
/*
 * Join the NULL-terminated string array @parts into one newly allocated
 * string, placing @sep between consecutive elements. When @use_as_prefix is
 * true, @sep is additionally written in front of the first element.
 *
 * An empty array yields "" (or just @sep when @use_as_prefix is set).
 *
 * Returns the joined string, which the caller must free(), or NULL when @sep
 * or @parts is NULL or when the allocation fails.
 */
char *string_join(const char *sep, const char **parts, bool use_as_prefix)
{
	const char **p;
	char *result, *cur;
	size_t sep_len, result_len;

	/* A caller may hand us an empty (NULL) list; do not dereference it. */
	if (!sep || !parts)
		return NULL;

	sep_len = strlen(sep);
	result_len = use_as_prefix ? sep_len : 0;

	/* calculate new string length */
	for (p = parts; *p; p++) {
		if (p > parts)
			result_len += sep_len;
		result_len += strlen(*p);
	}

	result = calloc(result_len + 1, 1);
	if (!result)
		return NULL;

	/* Append through a cursor so the buffer is never rescanned from the
	 * start, as repeated strcat() calls would do.
	 */
	cur = result;
	if (use_as_prefix) {
		memcpy(cur, sep, sep_len);
		cur += sep_len;
	}

	for (p = parts; *p; p++) {
		size_t part_len = strlen(*p);

		if (p > parts) {
			memcpy(cur, sep, sep_len);
			cur += sep_len;
		}

		memcpy(cur, *p, part_len);
		cur += part_len;
	}

	/* calloc() zeroed the buffer, so the terminating NUL is in place. */
	return result;
}
/* The largest value of a 64-bit unsigned integer is 2^64 - 1, which is a
 * 20-digit number; one more byte is needed for the terminating NUL byte.
 */
1676 #define __IN_TO_STR_LEN 21
1677 /* Turn cpumask into simple, comma-separated cpulist. */
1678 static char *cg_cpumask_to_cpulist(uint32_t *bitarr
, size_t nbits
)
1682 char numstr
[__IN_TO_STR_LEN
] = {0};
1683 char **cpulist
= NULL
;
1685 for (i
= 0; i
<= nbits
; i
++) {
1686 if (is_set(i
, bitarr
)) {
1687 ret
= snprintf(numstr
, __IN_TO_STR_LEN
, "%zu", i
);
1688 if (ret
< 0 || (size_t)ret
>= __IN_TO_STR_LEN
) {
1689 free_string_list(cpulist
);
1692 must_append_string(&cpulist
, numstr
);
1695 return string_join(",", (const char **)cpulist
, false);
/*
 * Return the cpu number that follows the last ',' or '-' in @cpulist, or the
 * leading number if the list contains neither separator. For a cpulist whose
 * entries are in ascending order (e.g. "0-3,5,7-9") this is the highest cpu
 * number in the list.
 *
 * The number is parsed with strtoul() using base 0, as before.
 *
 * Returns the parsed number, or -1 if strtoul() reports an error via errno.
 */
static ssize_t cg_get_max_cpus(char *cpulist)
{
	const char *last = cpulist;
	const char *comma, *dash;
	unsigned long cpus;

	comma = strrchr(cpulist, ',');
	if (comma)
		last = comma + 1;

	/* Only pointers into the same string are ordered against each other
	 * here; a NULL result from strrchr() is never used in a relational
	 * comparison.
	 */
	dash = strrchr(cpulist, '-');
	if (dash && dash + 1 > last)
		last = dash + 1;

	errno = 0;
	cpus = strtoul(last, NULL, 0);
	if (errno != 0)
		return -1;

	return (ssize_t)cpus;
}
1733 ssize_t
write_nointr(int fd
, const void* buf
, size_t count
)
1737 ret
= write(fd
, buf
, count
);
1738 if (ret
< 0 && errno
== EINTR
)
1743 int cg_write_to_file(const char *filename
, const void* buf
, size_t count
, bool add_newline
)
1745 int fd
, saved_errno
;
1748 fd
= open(filename
, O_WRONLY
| O_TRUNC
| O_CREAT
| O_CLOEXEC
, 0666);
1751 ret
= write_nointr(fd
, buf
, count
);
1754 if ((size_t)ret
!= count
)
1757 ret
= write_nointr(fd
, "\n", 1);
1765 saved_errno
= errno
;
1767 errno
= saved_errno
;
1771 static bool cg_filter_and_set_cpus(char *path
, bool am_initialized
)
1773 char *lastslash
, *fpath
, oldv
;
1777 ssize_t maxposs
= 0, maxisol
= 0;
1778 char *cpulist
= NULL
, *posscpus
= NULL
, *isolcpus
= NULL
;
1779 uint32_t *possmask
= NULL
, *isolmask
= NULL
;
1782 lastslash
= strrchr(path
, '/');
1783 if (!lastslash
) { // bug... this shouldn't be possible
1784 lxcfs_debug("cgfsng:copy_parent_file: bad path %s", path
);
1789 fpath
= must_make_path(path
, "cpuset.cpus", NULL
);
1790 posscpus
= read_file(fpath
);
1794 /* Get maximum number of cpus found in possible cpuset. */
1795 maxposs
= cg_get_max_cpus(posscpus
);
1799 isolcpus
= read_file("/sys/devices/system/cpu/isolated");
1802 if (!isdigit(isolcpus
[0])) {
1804 /* No isolated cpus but we weren't already initialized by
1805 * someone. We should simply copy the parents cpuset.cpus
1808 if (!am_initialized
)
1810 /* No isolated cpus but we were already initialized by someone.
1811 * Nothing more to do for us.
1817 /* Get maximum number of cpus found in isolated cpuset. */
1818 maxisol
= cg_get_max_cpus(isolcpus
);
1822 if (maxposs
< maxisol
)
1826 possmask
= cg_cpumask(posscpus
, maxposs
);
1830 isolmask
= cg_cpumask(isolcpus
, maxposs
);
1834 for (i
= 0; i
<= maxposs
; i
++) {
1835 if (is_set(i
, isolmask
) && is_set(i
, possmask
)) {
1836 clear_bit(i
, possmask
);
1840 cpulist
= cg_cpumask_to_cpulist(possmask
, maxposs
);
1841 if (!cpulist
) /* Bug */
1846 fpath
= must_make_path(path
, "cpuset.cpus", NULL
);
1847 ret
= cg_write_to_file(fpath
, cpulist
, strlen(cpulist
), false);
1857 if (posscpus
!= cpulist
)
1865 int read_from_file(const char *filename
, void* buf
, size_t count
)
1867 int fd
= -1, saved_errno
;
1870 fd
= open(filename
, O_RDONLY
| O_CLOEXEC
);
1874 if (!buf
|| !count
) {
1877 while ((ret
= read(fd
, buf2
, 100)) > 0)
1882 memset(buf
, 0, count
);
1883 ret
= read(fd
, buf
, count
);
1887 lxcfs_debug("read %s: %s", filename
, strerror(errno
));
1889 saved_errno
= errno
;
1891 errno
= saved_errno
;
1895 /* Copy contents of parent(@path)/@file to @path/@file */
1896 static bool cg_copy_parent_file(char *path
, char *file
)
1898 char *lastslash
, *value
= NULL
, *fpath
, oldv
;
1902 lastslash
= strrchr(path
, '/');
1903 if (!lastslash
) { // bug... this shouldn't be possible
1904 lxcfs_debug("cgfsng:copy_parent_file: bad path %s", path
);
1909 fpath
= must_make_path(path
, file
, NULL
);
1910 len
= read_from_file(fpath
, NULL
, 0);
1913 value
= must_alloc(len
+ 1);
1914 if (read_from_file(fpath
, value
, len
) != len
)
1918 fpath
= must_make_path(path
, file
, NULL
);
1919 ret
= cg_write_to_file(fpath
, value
, len
, false);
1921 lxcfs_debug("Unable to write %s to %s", value
, fpath
);
1927 lxcfs_debug("Error reading '%s'", fpath
);
1933 /* In case systemd hasn't already placed us in a user slice for the cpuset v1
1934 * controller we will reside in the root cgroup. This means that
1935 * cgroup.clone_children will not have been initialized for us so we need to do
1938 static bool cgv1_handle_root_cpuset_hierarchy(struct cgv1_hierarchy
*h
)
1940 char *clonechildrenpath
, v
;
1942 if (!string_in_list(h
->controllers
, "cpuset"))
1945 clonechildrenpath
= must_make_path(h
->mountpoint
, "cgroup.clone_children", NULL
);
1947 if (read_from_file(clonechildrenpath
, &v
, 1) < 0) {
1948 lxcfs_debug("Failed to read '%s'", clonechildrenpath
);
1949 free(clonechildrenpath
);
1953 if (v
== '1') { /* already set for us by someone else */
1954 free(clonechildrenpath
);
1958 if (cg_write_to_file(clonechildrenpath
, "1", 1, false) < 0) {
1959 /* Set clone_children so children inherit our settings */
1960 lxcfs_debug("Failed to write 1 to %s", clonechildrenpath
);
1961 free(clonechildrenpath
);
1964 free(clonechildrenpath
);
1969 * Initialize the cpuset hierarchy in first directory of @gname and
1970 * set cgroup.clone_children so that children inherit settings.
1971 * Since the h->base_path is populated by init or ourselves, we know
1972 * it is already initialized.
1974 static bool cgv1_handle_cpuset_hierarchy(struct cgv1_hierarchy
*h
,
1977 char *cgpath
, *clonechildrenpath
, v
, *slash
;
1979 if (!string_in_list(h
->controllers
, "cpuset"))
1984 slash
= strchr(cgroup
, '/');
1988 cgpath
= must_make_path(h
->mountpoint
, h
->base_cgroup
, cgroup
, NULL
);
1991 if (mkdir(cgpath
, 0755) < 0 && errno
!= EEXIST
) {
1992 lxcfs_debug("Failed to create '%s'", cgpath
);
1996 clonechildrenpath
= must_make_path(cgpath
, "cgroup.clone_children", NULL
);
1997 if (!file_exists(clonechildrenpath
)) { /* unified hierarchy doesn't have clone_children */
1998 free(clonechildrenpath
);
2002 if (read_from_file(clonechildrenpath
, &v
, 1) < 0) {
2003 lxcfs_debug("Failed to read '%s'", clonechildrenpath
);
2004 free(clonechildrenpath
);
2009 /* Make sure any isolated cpus are removed from cpuset.cpus. */
2010 if (!cg_filter_and_set_cpus(cgpath
, v
== '1'))
2013 if (v
== '1') { /* already set for us by someone else */
2014 free(clonechildrenpath
);
2019 /* copy parent's settings */
2020 if (!cg_copy_parent_file(cgpath
, "cpuset.mems")) {
2022 free(clonechildrenpath
);
2027 if (cg_write_to_file(clonechildrenpath
, "1", 1, false) < 0) {
2028 /* Set clone_children so children inherit our settings */
2029 lxcfs_debug("Failed to write 1 to %s", clonechildrenpath
);
2030 free(clonechildrenpath
);
2033 free(clonechildrenpath
);
2037 /* Create and chown @cgroup for all given controllers in a cgroupfs v1 hierarchy
2038 * (For example, create @cgroup for the cpu and cpuacct controller mounted into
2039 * /sys/fs/cgroup/cpu,cpuacct). Check if the path already exists and report back
2040 * to the caller in @existed.
2042 #define __PAM_CGFS_USER "/user/"
2043 #define __PAM_CGFS_USER_LEN 6
2044 static bool cgv1_create_one(struct cgv1_hierarchy
*h
, const char *cgroup
, uid_t uid
, gid_t gid
, bool *existed
)
2046 char *clean_base_cgroup
, *path
;
2048 struct cgv1_hierarchy
*it
;
2049 bool created
= false;
2053 for (controller
= it
->controllers
; controller
&& *controller
;
2057 if (!cgv1_handle_cpuset_hierarchy(it
, cgroup
))
2060 /* If systemd has already created a cgroup for us, keep using
2063 if (cg_systemd_chown_existing_cgroup(it
->mountpoint
,
2064 it
->base_cgroup
, uid
, gid
,
2065 it
->systemd_user_slice
)) {
2069 /* We need to make sure that we do not create an endless chain
2070 * of sub-cgroups. So we check if we have already logged in
2071 * somehow (sudo -i, su, etc.) and have created a
2072 * /user/PAM_user/idx cgroup. If so, we skip that part. For most
2073 * cgroups this is unnecessary since we use the init_cgroup
2074 * anyway, but for controllers which have an existing systemd
2075 * cgroup that does not match the current uid, this is pretty
2078 if (strncmp(it
->base_cgroup
, __PAM_CGFS_USER
, __PAM_CGFS_USER_LEN
) == 0) {
2079 free(it
->base_cgroup
);
2080 it
->base_cgroup
= must_copy_string("/");
2083 strstr(it
->base_cgroup
, __PAM_CGFS_USER
);
2084 if (clean_base_cgroup
)
2085 *clean_base_cgroup
= '\0';
2088 path
= must_make_path(it
->mountpoint
, it
->init_cgroup
, cgroup
, NULL
);
2089 lxcfs_debug("Constructing path: %s.\n", path
);
2090 if (file_exists(path
)) {
2091 bool our_cg
= cg_belongs_to_uid_gid(path
, uid
, gid
);
2092 lxcfs_debug("%s existed and does %s have our uid and gid.\n", path
, our_cg
? "" : "not");
2100 created
= mkdir_p(it
->mountpoint
, path
);
2105 if (chown(path
, uid
, gid
) < 0)
2106 lxcfs_debug("Failed to chown %s to %d:%d: %m.\n", path
,
2107 (int)uid
, (int)gid
);
2118 /* Try to remove @cgroup for all given controllers in a cgroupfs v1 hierarchy
2119 * (For example, try to remove @cgroup for the cpu and cpuacct controller
2120 * mounted into /sys/fs/cgroup/cpu,cpuacct). Ignores failures.
2122 static bool cgv1_remove_one(struct cgv1_hierarchy
*h
, const char *cgroup
)
2127 /* Better safe than sorry. */
2128 if (!h
->controllers
)
2131 /* Cgroups created by systemd for us which we re-use won't be removed
2132 * here, since we're using init_cgroup + cgroup as path instead of
2133 * base_cgroup + cgroup.
2135 path
= must_make_path(h
->mountpoint
, h
->init_cgroup
, cgroup
, NULL
);
2136 (void)recursive_rmdir(path
);
2142 /* Try to remove @cgroup the cgroupfs v2 hierarchy. */
2143 static bool cgv2_remove(const char *cgroup
)
2145 struct cgv2_hierarchy
*v2
;
2148 if (!cgv2_hierarchies
)
2151 v2
= *cgv2_hierarchies
;
2153 /* If we reused an already existing cgroup, don't bother trying to
2154 * remove (a potentially wrong)/the path.
2155 * Cgroups created by systemd for us which we re-use would be removed
2156 * here, since we're using base_cgroup + cgroup as path.
2158 if (v2
->systemd_user_slice
)
2161 path
= must_make_path(v2
->mountpoint
, v2
->base_cgroup
, cgroup
, NULL
);
2162 (void)recursive_rmdir(path
);
2168 /* Create @cgroup in all detected cgroupfs v1 hierarchy. If the creation fails
2169 * for any cgroupfs v1 hierarchy, remove all we have created so far. Report
2170 * back, to the caller if the creation failed due to @cgroup already existing
2173 static bool cgv1_create(const char *cgroup
, uid_t uid
, gid_t gid
, bool *existed
)
2175 struct cgv1_hierarchy
**it
, **rev_it
;
2176 bool all_created
= true;
2178 for (it
= cgv1_hierarchies
; it
&& *it
; it
++) {
2179 if (!(*it
)->controllers
|| !(*it
)->mountpoint
||
2180 !(*it
)->init_cgroup
|| !(*it
)->create_rw_cgroup
)
2183 if (!cgv1_create_one(*it
, cgroup
, uid
, gid
, existed
)) {
2184 all_created
= false;
2192 for (rev_it
= cgv1_hierarchies
; rev_it
&& *rev_it
&& (*rev_it
!= *it
);
2194 cgv1_remove_one(*rev_it
, cgroup
);
2199 /* Create @cgroup in the cgroupfs v2 hierarchy. Report back, to the caller if
2200 * the creation failed due to @cgroup already existing via @existed.
2202 static bool cgv2_create(const char *cgroup
, uid_t uid
, gid_t gid
, bool *existed
)
2204 char *clean_base_cgroup
;
2206 struct cgv2_hierarchy
*v2
;
2207 bool created
= false;
2211 if (!cgv2_hierarchies
|| !(*cgv2_hierarchies
)->create_rw_cgroup
)
2214 v2
= *cgv2_hierarchies
;
2216 /* We can't be placed under init's cgroup for the v2 hierarchy. We need
2217 * to be placed under our current cgroup.
2219 if (cg_systemd_chown_existing_cgroup(v2
->mountpoint
,
2220 v2
->base_cgroup
, uid
, gid
,
2221 v2
->systemd_user_slice
))
/* We need to make sure that we do not create an endless chain of
 * sub-cgroups. So we check if we have already logged in somehow (sudo
 * -i, su, etc.) and have created a /user/PAM_user/idx cgroup. If so, we
 * skip that part.
 */
2229 if (strncmp(v2
->base_cgroup
, __PAM_CGFS_USER
, __PAM_CGFS_USER_LEN
) == 0) {
2230 free(v2
->base_cgroup
);
2231 v2
->base_cgroup
= must_copy_string("/");
2233 clean_base_cgroup
= strstr(v2
->base_cgroup
, __PAM_CGFS_USER
);
2234 if (clean_base_cgroup
)
2235 *clean_base_cgroup
= '\0';
2238 path
= must_make_path(v2
->mountpoint
, v2
->base_cgroup
, cgroup
, NULL
);
2239 lxcfs_debug("Constructing path \"%s\".\n", path
);
2240 if (file_exists(path
)) {
2241 bool our_cg
= cg_belongs_to_uid_gid(path
, uid
, gid
);
2242 lxcfs_debug("%s existed and does %s have our uid and gid.\n", path
, our_cg
? "" : "not");
2251 created
= mkdir_p(v2
->mountpoint
, path
);
2257 if (chown(path
, uid
, gid
) < 0)
2258 mysyslog(LOG_WARNING
, "Failed to chown %s to %d:%d: %m.\n",
2259 path
, (int)uid
, (int)gid
, NULL
);
2265 /* Create writeable cgroups for @user at login. Details can be found in the
2266 * preamble/license at the top of this file.
2268 static int handle_login(const char *user
, uid_t uid
, gid_t gid
)
2272 char cg
[MAXPATHLEN
];
2277 ret
= snprintf(cg
, MAXPATHLEN
, "/user/%s/%d", user
, idx
);
2278 if (ret
< 0 || ret
>= MAXPATHLEN
) {
2279 mysyslog(LOG_ERR
, "Username too long.\n", NULL
);
2280 return PAM_SESSION_ERR
;
2284 if (!cgv2_create(cg
, uid
, gid
, &existed
)) {
2290 mysyslog(LOG_ERR
, "Failed to create a cgroup for user %s.\n", user
, NULL
);
2291 return PAM_SESSION_ERR
;
2295 if (!cgv1_create(cg
, uid
, gid
, &existed
)) {
2301 mysyslog(LOG_ERR
, "Failed to create a cgroup for user %s.\n", user
, NULL
);
2302 return PAM_SESSION_ERR
;
2305 if (!cg_enter(cg
)) {
2306 mysyslog( LOG_ERR
, "Failed to enter user cgroup %s for user %s.\n", cg
, user
, NULL
);
2307 return PAM_SESSION_ERR
;
2315 /* Try to prune cgroups we created and that now are empty from all cgroupfs v1
2318 static bool cgv1_prune_empty_cgroups(const char *user
)
2320 bool controller_removed
= true;
2321 bool all_removed
= true;
2322 struct cgv1_hierarchy
**it
;
2324 for (it
= cgv1_hierarchies
; it
&& *it
; it
++) {
2326 char *path_base
, *path_init
;
2329 if (!(*it
)->controllers
|| !(*it
)->mountpoint
||
2330 !(*it
)->init_cgroup
|| !(*it
)->create_rw_cgroup
)
2333 for (controller
= (*it
)->controllers
; controller
&& *controller
;
2335 bool path_base_rm
, path_init_rm
;
2337 path_base
= must_make_path((*it
)->mountpoint
, (*it
)->base_cgroup
, "/user", user
, NULL
);
2338 lxcfs_debug("cgroupfs v1: Trying to prune \"%s\".\n", path_base
);
2339 ret
= recursive_rmdir(path_base
);
2340 if (ret
== -ENOENT
|| ret
>= 0)
2341 path_base_rm
= true;
2343 path_base_rm
= false;
2346 path_init
= must_make_path((*it
)->mountpoint
, (*it
)->init_cgroup
, "/user", user
, NULL
);
2347 lxcfs_debug("cgroupfs v1: Trying to prune \"%s\".\n", path_init
);
2348 ret
= recursive_rmdir(path_init
);
2349 if (ret
== -ENOENT
|| ret
>= 0)
2350 path_init_rm
= true;
2352 path_init_rm
= false;
2355 if (!path_base_rm
&& !path_init_rm
) {
2356 controller_removed
= false;
2360 controller_removed
= true;
2363 if (!controller_removed
)
2364 all_removed
= false;
2370 /* Try to prune cgroup we created and that now is empty from the cgroupfs v2
2373 static bool cgv2_prune_empty_cgroups(const char *user
)
2376 struct cgv2_hierarchy
*v2
;
2377 char *path_base
, *path_init
;
2378 bool path_base_rm
, path_init_rm
;
2380 if (!cgv2_hierarchies
)
2383 v2
= *cgv2_hierarchies
;
2385 path_base
= must_make_path(v2
->mountpoint
, v2
->base_cgroup
, "/user", user
, NULL
);
2386 lxcfs_debug("cgroupfs v2: Trying to prune \"%s\".\n", path_base
);
2387 ret
= recursive_rmdir(path_base
);
2388 if (ret
== -ENOENT
|| ret
>= 0)
2389 path_base_rm
= true;
2391 path_base_rm
= false;
2394 path_init
= must_make_path(v2
->mountpoint
, v2
->init_cgroup
, "/user", user
, NULL
);
2395 lxcfs_debug("cgroupfs v2: Trying to prune \"%s\".\n", path_init
);
2396 ret
= recursive_rmdir(path_init
);
2397 if (ret
== -ENOENT
|| ret
>= 0)
2398 path_init_rm
= true;
2400 path_init_rm
= false;
2403 if (!path_base_rm
&& !path_init_rm
)
2409 /* Wrapper around cgv{1,2}_prune_empty_cgroups(). */
2410 static void cg_prune_empty_cgroups(const char *user
)
2412 (void)cgv1_prune_empty_cgroups(user
);
2413 (void)cgv2_prune_empty_cgroups(user
);
2416 /* Free allocated information for detected cgroupfs v1 hierarchies. */
2417 static void cgv1_free_hierarchies(void)
2419 struct cgv1_hierarchy
**it
;
2421 if (!cgv1_hierarchies
)
2424 for (it
= cgv1_hierarchies
; it
&& *it
; it
++) {
2425 if ((*it
)->controllers
) {
2427 for (tmp
= (*it
)->controllers
; tmp
&& *tmp
; tmp
++)
2430 free((*it
)->controllers
);
2432 free((*it
)->mountpoint
);
2433 free((*it
)->base_cgroup
);
2434 free((*it
)->fullcgpath
);
2435 free((*it
)->init_cgroup
);
2437 free(cgv1_hierarchies
);
2440 /* Free allocated information for the detected cgroupfs v2 hierarchy. */
2441 static void cgv2_free_hierarchies(void)
2443 struct cgv2_hierarchy
**it
;
2445 if (!cgv2_hierarchies
)
2448 for (it
= cgv2_hierarchies
; it
&& *it
; it
++) {
2449 if ((*it
)->controllers
) {
2451 for (tmp
= (*it
)->controllers
; tmp
&& *tmp
; tmp
++)
2454 free((*it
)->controllers
);
2456 free((*it
)->mountpoint
);
2457 free((*it
)->base_cgroup
);
2458 free((*it
)->fullcgpath
);
2459 free((*it
)->init_cgroup
);
2461 free(cgv2_hierarchies
);
2464 /* Wrapper around cgv{1,2}_free_hierarchies(). */
2465 static void cg_exit(void)
2467 cgv1_free_hierarchies();
2468 cgv2_free_hierarchies();
2471 int pam_sm_open_session(pam_handle_t
*pamh
, int flags
, int argc
,
2477 const char *PAM_user
= NULL
;
2479 ret
= pam_get_user(pamh
, &PAM_user
, NULL
);
2480 if (ret
!= PAM_SUCCESS
) {
2481 mysyslog(LOG_ERR
, "PAM-CGFS: couldn't get user\n", NULL
);
2482 return PAM_SESSION_ERR
;
2485 if (!get_uid_gid(PAM_user
, &uid
, &gid
)) {
2486 mysyslog(LOG_ERR
, "Failed to get uid and gid for %s.\n", PAM_user
, NULL
);
2487 return PAM_SESSION_ERR
;
2490 if (!cg_init(uid
, gid
)) {
2491 mysyslog(LOG_ERR
, "Failed to get list of controllers\n", NULL
);
2492 return PAM_SESSION_ERR
;
2495 /* Try to prune cgroups, that are actually empty but were still marked
2496 * as busy by the kernel so we couldn't remove them on session close.
2498 cg_prune_empty_cgroups(PAM_user
);
2500 if (cg_mount_mode
== CGROUP_UNKNOWN
)
2501 return PAM_SESSION_ERR
;
2503 if (argc
> 1 && strcmp(argv
[0], "-c") == 0)
2504 cg_mark_to_make_rw(argv
[1]);
2506 return handle_login(PAM_user
, uid
, gid
);
2509 int pam_sm_close_session(pam_handle_t
*pamh
, int flags
, int argc
,
2515 const char *PAM_user
= NULL
;
2517 ret
= pam_get_user(pamh
, &PAM_user
, NULL
);
2518 if (ret
!= PAM_SUCCESS
) {
2519 mysyslog(LOG_ERR
, "PAM-CGFS: couldn't get user\n", NULL
);
2520 return PAM_SESSION_ERR
;
2523 if (!get_uid_gid(PAM_user
, &uid
, &gid
)) {
2524 mysyslog(LOG_ERR
, "Failed to get uid and gid for %s.\n", PAM_user
, NULL
);
2525 return PAM_SESSION_ERR
;
2528 if (cg_mount_mode
== CGROUP_UNINITIALIZED
) {
2529 if (!cg_init(uid
, gid
))
2530 mysyslog(LOG_ERR
, "Failed to get list of controllers\n", NULL
);
2532 if (argc
> 1 && strcmp(argv
[0], "-c") == 0)
2533 cg_mark_to_make_rw(argv
[1]);
2536 cg_prune_empty_cgroups(PAM_user
);