]> git.proxmox.com Git - mirror_lxcfs.git/blame - src/cgroup_fuse.c
tree-wide: use a single fuse header
[mirror_lxcfs.git] / src / cgroup_fuse.c
CommitLineData
db0463bf 1/* SPDX-License-Identifier: LGPL-2.1+ */
580fe4df 2
f834b6bf
SP
3#include "config.h"
4
580fe4df
CB
5#include <dirent.h>
6#include <errno.h>
7#include <fcntl.h>
580fe4df
CB
8#include <inttypes.h>
9#include <libgen.h>
10#include <pthread.h>
11#include <sched.h>
12#include <stdarg.h>
13#include <stdbool.h>
14#include <stdint.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <time.h>
19#include <unistd.h>
20#include <wait.h>
21#include <linux/magic.h>
22#include <linux/sched.h>
23#include <sys/epoll.h>
24#include <sys/mman.h>
25#include <sys/mount.h>
26#include <sys/param.h>
27#include <sys/socket.h>
28#include <sys/syscall.h>
29#include <sys/sysinfo.h>
30#include <sys/vfs.h>
31
e01afbb7
CB
32#include "cgroup_fuse.h"
33
580fe4df 34#include "bindings.h"
580fe4df
CB
35#include "cgroups/cgroup.h"
36#include "cgroups/cgroup_utils.h"
ec2043ed 37#include "lxcfs_fuse_compat.h"
580fe4df
CB
38#include "memory_utils.h"
39#include "utils.h"
40
/* Name, ownership and mode of a single cgroup file or directory. */
struct cgfs_files {
	char *name;	/* heap-allocated entry name, released by free_key() */
	uint32_t uid;	/* owner uid taken from stat */
	uint32_t gid;	/* owner gid taken from stat */
	uint32_t mode;	/* st_mode taken from stat */
};
46
/* Arguments handed to pid_ns_clone_wrapper() through lxcfs_clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe fd pair; the child acks on cpipe[1] */
	int sock;	/* first argument forwarded to @wrapped */
	pid_t tpid;	/* second argument forwarded to @wrapped */
	int (*wrapped)(int, pid_t);	/* pid_from_ns or pid_to_ns */
};
54
8a03c08b
CB
/*
 * Look up the cgroup fd for @controller. The short name "systemd" is
 * translated to the named hierarchy "name=systemd" before the lookup.
 */
static inline int get_cgroup_fd_handle_named(const char *controller)
{
	const char *lookup = controller;

	if (lookup && !strcmp(lookup, "systemd"))
		lookup = "name=systemd";

	return get_cgroup_fd(lookup);
}
62
/*
 * Return get_pid_cgroup() for @pid in @controller. The short name "systemd"
 * is translated to the named hierarchy "name=systemd" before the lookup.
 */
static char *get_pid_cgroup_handle_named(pid_t pid, const char *controller)
{
	const char *lookup = controller;

	if (lookup && !strcmp(lookup, "systemd"))
		lookup = "name=systemd";

	return get_pid_cgroup(pid, lookup);
}
70
71static bool get_cgroup_handle_named(struct cgroup_ops *ops,
72 const char *controller, const char *cgroup,
73 const char *file, char **value)
74{
988c21e3 75 if (controller && strcmp(controller, "systemd") == 0)
8a03c08b
CB
76 return cgroup_ops->get(ops, "name=systemd", cgroup, file, value);
77
78 return cgroup_ops->get(cgroup_ops, controller, cgroup, file, value);
79}
80
580fe4df
CB
81/*
82 * given /cgroup/freezer/a/b, return "freezer".
83 * the returned char* should NOT be freed.
84 */
85static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
86{
87 const char *p1;
88 char *contr, *slash;
89
90 if (strlen(path) < 9) {
91 errno = EACCES;
92 return NULL;
93 }
94 if (*(path + 7) != '/') {
95 errno = EINVAL;
96 return NULL;
97 }
98 p1 = path + 8;
99 contr = strdupa(p1);
100 if (!contr) {
101 errno = ENOMEM;
102 return NULL;
103 }
104 slash = strstr(contr, "/");
105 if (slash)
106 *slash = '\0';
107
108 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
109 if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0)
110 return (*h)->__controllers;
111 }
112 errno = ENOENT;
113 return NULL;
114}
115
/*
 * Locate the cgroup part of /cgroup/controller/the/cgroup/path, i.e. the
 * text after the first '/' that follows the controller name. The result
 * points into @path and may include a trailing file (key) name.
 *
 * Returns NULL with errno EACCES for a path shorter than 9 characters and
 * NULL with errno EINVAL when nothing follows the controller. On success
 * errno is reset to 0.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	sep = strchr(path + 8, '/');
	if (sep) {
		errno = 0;
		return sep + 1;
	}

	errno = EINVAL;
	return NULL;
}
136
/*
 * Split the last path element off the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg, truncated at its last '/';
 * the caller frees it. @last receives a pointer to that last '/' inside @cg
 * (not allocated, do not free), or NULL when @cg has no '/', in which case
 * @dir is a full copy of @cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;
	char *tail;

	/* Keep retrying until the copy could be allocated. */
	for (copy = strdup(cg); !copy; copy = strdup(cg))
		;
	*dir = copy;

	tail = strrchr(cg, '/');
	*last = tail;
	if (!tail)
		return;

	/* The last '/' sits at the same offset in the copy. */
	copy[tail - cg] = '\0';
}
156
3c5e8230
CB
157static bool is_child_cgroup(const char *controller, const char *cgroup,
158 const char *file)
580fe4df 159{
3c5e8230
CB
160 __do_free char *path = NULL;
161 int cfd, ret;
580fe4df
CB
162 struct stat sb;
163
8a03c08b 164 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
165 if (cfd < 0)
166 return false;
167
3c5e8230
CB
168 path = must_make_path_relative(cgroup, file, NULL);
169 ret = fstatat(cfd, path, &sb, 0);
580fe4df
CB
170 if (ret < 0 || !S_ISDIR(sb.st_mode))
171 return false;
172
173 return true;
174}
175
/*
 * Decide whether @pid may see the cgroup directory @cg of @contrl.
 *
 * If pid is in /a/b/c, they may see that /a exists, but not /b or /a/c.
 * The root ("/" or "./") is always visible, and a task in the root cgroup
 * sees everything. Otherwise @cg must equal the task's cgroup, be one of its
 * ancestors, or lie below it.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	size_t target_len, task_len;
	const char *task_cg;
	char *pid_cg;
	bool visible;

	if (!strcmp(cg, "/") || !strcmp(cg, "./"))
		return true;

	pid_cg = get_pid_cgroup_handle_named(pid, contrl);
	if (!pid_cg)
		return false;
	prune_init_slice(pid_cg);

	/* Compare without the leading character of the task's cgroup. */
	task_cg = pid_cg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0 || strcmp(cg, task_cg) == 0) {
		/* Task is in the root cgroup, or @cg is exactly its cgroup. */
		visible = true;
	} else if (target_len < task_len) {
		/* Looking up a parent dir. */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* Looking up a child dir. */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		visible = false;
	}

	free(pid_cg);
	return visible;
}
226
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * @taskcg must be strictly longer than @querycg, otherwise an error is
 * logged and NULL is returned. For @querycg "/" or "./" one leading character
 * of @taskcg is skipped; otherwise strlen(@querycg) + 1 characters are
 * skipped. The remainder is copied and cut at its first '/'.
 *
 * The result is newly allocated and must be freed by the caller; NULL is
 * also returned when the copy cannot be allocated.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *sep;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (!strcmp(querycg, "/") || !strcmp(querycg, "./"))
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	sep = strchr(next, '/');
	if (sep)
		*sep = '\0';

	return next;
}
252
/*
 * If pid is in /a/b/c/d, they may only act on things under cg=/a/b/c/d.
 * If pid is in /a, they may act on /a/b, but not on /b.
 *
 * Returns true when the task's cgroup is a string prefix of @cg. When the
 * answer is false and @nextcg is not NULL, *@nextcg is set to the result of
 * get_next_cgroup_dir() (possibly NULL), which the caller must free.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *pid_cg, *linecmp;
	bool inside;

	pid_cg = get_pid_cgroup_handle_named(pid, contrl);
	if (!pid_cg)
		return false;
	prune_init_slice(pid_cg);

	/*
	 * Callers pass in '/' or './' (openat()) for the root cgroup,
	 * otherwise a cgroup without leading '/'. In the latter case the
	 * first character of the task's cgroup is skipped for the comparison.
	 * (The original author left a TODO questioning this increment.)
	 */
	linecmp = (*cg == '/' || strncmp(cg, "./", 2) == 0) ? pid_cg : pid_cg + 1;

	inside = strncmp(linecmp, cg, strlen(linecmp)) == 0;
	if (!inside && nextcg)
		*nextcg = get_next_cgroup_dir(linecmp, cg);

	free(pid_cg);
	return inside;
}
296
297static struct cgfs_files *cgfs_get_key(const char *controller,
298 const char *cgroup, const char *file)
299{
1281cd28 300 __do_free char *path = NULL;
580fe4df 301 struct cgfs_files *newkey;
1281cd28
CB
302 int cfd, ret;
303 struct stat sb;
580fe4df 304
8a03c08b 305 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
306 if (cfd < 0)
307 return false;
308
309 if (file && *file == '/')
310 file++;
311
312 if (file && strchr(file, '/'))
313 return NULL;
314
580fe4df 315 if (file)
1281cd28
CB
316 path = must_make_path_relative(cgroup, file, NULL);
317 else
318 path = must_make_path_relative(cgroup, NULL);
319 ret = fstatat(cfd, path, &sb, 0);
580fe4df
CB
320 if (ret < 0)
321 return NULL;
322
1281cd28
CB
323 newkey = must_realloc(NULL, sizeof(struct cgfs_files));
324
580fe4df
CB
325 if (file)
326 newkey->name = must_copy_string(file);
327 else if (strrchr(cgroup, '/'))
328 newkey->name = must_copy_string(strrchr(cgroup, '/'));
329 else
330 newkey->name = must_copy_string(cgroup);
331 newkey->uid = sb.st_uid;
332 newkey->gid = sb.st_gid;
333 newkey->mode = sb.st_mode;
334
335 return newkey;
336}
337
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not parse as
 * three unsigned numbers are skipped. Returns the mapped id, or -1 when no
 * range contains @in_id or a range wraps around.
 */
static int convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int nsuid;	/* range base in the idfile's namespace */
		unsigned int hostuid;	/* range base in the caller's namespace */
		unsigned int count;	/* number of ids in this range */

		if (sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count) != 3)
			continue;

		/* A wrapping range is unexpected in a procfile, so bail. */
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    nsuid, hostuid, count, line);
			return -1;
		}

		if (in_id < hostuid || in_id >= hostuid + count)
			continue;

		/*
		 * hostuid <= in_id < hostuid + count and neither range
		 * wraps, so this sum cannot wrap either.
		 */
		return (in_id - hostuid) + nsuid;
	}

	/* No range matched. */
	return -1;
}
380
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether the
 * calling uid is required to be root in their namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the buffer holding the /proc/<pid>/uid_map path. */
#define PROCLEN 100
390
391static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
392{
3cf1e562 393 FILE *f;
580fe4df
CB
394 char fpath[PROCLEN];
395 int ret;
396 bool answer = false;
397 uid_t nsuid;
398
3cf1e562 399 if (victim == (uid_t)-1 || uid == (uid_t)-1)
580fe4df
CB
400 return false;
401
402 /*
403 * If the request is one not requiring root in the namespace,
404 * then having the same uid suffices. (i.e. uid 1000 has write
405 * access to files owned by uid 1000
406 */
407 if (!req_ns_root && uid == victim)
408 return true;
409
410 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
411 if (ret < 0 || ret >= PROCLEN)
412 return false;
3cf1e562
CB
413
414 f = fopen(fpath, "re");
580fe4df
CB
415 if (!f)
416 return false;
417
86521450 418 /* if caller's not root in their namespace, reject */
580fe4df
CB
419 nsuid = convert_id_to_ns(f, uid);
420 if (nsuid)
421 goto out;
422
423 /*
424 * If victim is not mapped into caller's ns, reject.
425 * XXX I'm not sure this check is needed given that fuse
426 * will be sending requests where the vfs has converted
427 */
428 nsuid = convert_id_to_ns(f, victim);
3cf1e562 429 if (nsuid == (uid_t)-1)
580fe4df
CB
430 goto out;
431
432 answer = true;
433
434out:
435 fclose(f);
436 return answer;
437}
438
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the O_ACCMODE part of @req_mode. Callers shift owner or group
 * bits into the "other" position before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY,
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	const mode_t access = req_mode & O_ACCMODE;
	mode_t need;

	if (access == O_RDONLY)
		need = S_IROTH;
	else if (access == O_WRONLY)
		need = S_IWOTH;
	else if (access == O_RDWR)
		need = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & need) == need;
}
458
459static void free_key(struct cgfs_files *k)
460{
1281cd28
CB
461 if (k) {
462 free_disarm(k->name);
463 free_disarm(k);
464 }
580fe4df
CB
465}
466
467/*
468 * check whether a fuse context may access a cgroup dir or file
469 *
470 * If file is not null, it is a cgroup file to check under cg.
471 * If file is null, then we are checking perms on cg itself.
472 *
473 * For files we can check the mode of the list_keys result.
474 * For cgroups, we must make assumptions based on the files under the
475 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
476 * yet.
477 */
478static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
479{
480 struct cgfs_files *k = NULL;
481 bool ret = false;
482
483 k = cgfs_get_key(contrl, cg, file);
484 if (!k)
485 return false;
486
487 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
488 if (perms_include(k->mode >> 6, mode)) {
489 ret = true;
490 goto out;
491 }
492 }
493 if (fc->gid == k->gid) {
494 if (perms_include(k->mode >> 3, mode)) {
495 ret = true;
496 goto out;
497 }
498 }
499 ret = perms_include(k->mode, mode);
500
501out:
502 free_key(k);
503 return ret;
504}
505
2d7bcab7 506__lxcfs_fuse_ops int cg_getattr(const char *path, struct stat *sb)
580fe4df
CB
507{
508 struct timespec now;
509 struct fuse_context *fc = fuse_get_context();
510 char * cgdir = NULL;
511 char *last = NULL, *path1, *path2;
512 struct cgfs_files *k = NULL;
513 const char *cgroup;
514 const char *controller = NULL;
515 int ret = -ENOENT;
516
cbfc55fd
CB
517 if (!liblxcfs_functional())
518 return -EIO;
580fe4df
CB
519
520 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
521 return -EIO;
522
523 memset(sb, 0, sizeof(struct stat));
524
525 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
526 return -EINVAL;
527
528 sb->st_uid = sb->st_gid = 0;
529 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
530 sb->st_size = 0;
531
532 if (strcmp(path, "/cgroup") == 0) {
533 sb->st_mode = S_IFDIR | 00755;
534 sb->st_nlink = 2;
535 return 0;
536 }
537
538 controller = pick_controller_from_path(fc, path);
539 if (!controller)
540 return -errno;
541 cgroup = find_cgroup_in_path(path);
542 if (!cgroup) {
543 /* this is just /cgroup/controller, return it as a dir */
544 sb->st_mode = S_IFDIR | 00755;
545 sb->st_nlink = 2;
546 return 0;
547 }
548
549 get_cgdir_and_path(cgroup, &cgdir, &last);
550
551 if (!last) {
552 path1 = "/";
553 path2 = cgdir;
554 } else {
555 path1 = cgdir;
556 path2 = last;
557 }
558
559 pid_t initpid = lookup_initpid_in_store(fc->pid);
560 if (initpid <= 1 || is_shared_pidns(initpid))
561 initpid = fc->pid;
562 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
563 * Then check that caller's cgroup is under path if last is a child
564 * cgroup, or cgdir if last is a file */
565
566 if (is_child_cgroup(controller, path1, path2)) {
567 if (!caller_may_see_dir(initpid, controller, cgroup)) {
568 ret = -ENOENT;
569 goto out;
570 }
571 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
572 /* this is just /cgroup/controller, return it as a dir */
573 sb->st_mode = S_IFDIR | 00555;
574 sb->st_nlink = 2;
575 ret = 0;
576 goto out;
577 }
578 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
579 ret = -EACCES;
580 goto out;
581 }
582
583 // get uid, gid, from '/tasks' file and make up a mode
584 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
585 sb->st_mode = S_IFDIR | 00755;
586 k = cgfs_get_key(controller, cgroup, NULL);
587 if (!k) {
588 sb->st_uid = sb->st_gid = 0;
589 } else {
590 sb->st_uid = k->uid;
591 sb->st_gid = k->gid;
592 }
593 free_key(k);
594 sb->st_nlink = 2;
595 ret = 0;
596 goto out;
597 }
598
599 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
600 sb->st_mode = S_IFREG | k->mode;
601 sb->st_nlink = 1;
602 sb->st_uid = k->uid;
603 sb->st_gid = k->gid;
25982f5d 604 sb->st_size = 4096;
580fe4df
CB
605 free_key(k);
606 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
607 ret = -ENOENT;
608 goto out;
609 }
610 ret = 0;
611 }
612
613out:
614 free(cgdir);
615 return ret;
616}
617
618/*
619 * Chown all the files in the cgroup directory. We do this when we create a
620 * cgroup on behalf of a user.
621 */
622static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
623{
624 struct dirent *direntp;
625 char path[MAXPATHLEN];
626 size_t len;
627 DIR *d;
628 int fd1, ret;
629
630 len = strlen(dirname);
631 if (len >= MAXPATHLEN) {
632 lxcfs_error("Pathname too long: %s\n", dirname);
633 return;
634 }
635
636 fd1 = openat(fd, dirname, O_DIRECTORY);
637 if (fd1 < 0)
638 return;
639
640 d = fdopendir(fd1);
641 if (!d) {
642 lxcfs_error("Failed to open %s\n", dirname);
643 return;
644 }
645
646 while ((direntp = readdir(d))) {
647 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
648 continue;
649 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
650 if (ret < 0 || ret >= MAXPATHLEN) {
651 lxcfs_error("Pathname too long under %s\n", dirname);
652 continue;
653 }
654 if (fchownat(fd, path, uid, gid, 0) < 0)
655 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
656 }
657 closedir(d);
658}
659
660static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
661{
7c554745 662 __do_free char *path = NULL;
580fe4df 663 int cfd;
580fe4df 664
8a03c08b 665 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
666 if (cfd < 0)
667 return -EINVAL;
668
7c554745
CB
669 path = must_make_path_relative(cg, NULL);
670 if (mkdirat(cfd, path, 0755) < 0)
580fe4df
CB
671 return -errno;
672
673 if (uid == 0 && gid == 0)
674 return 0;
675
7c554745 676 if (fchownat(cfd, path, uid, gid, 0) < 0)
580fe4df
CB
677 return -errno;
678
7c554745 679 chown_all_cgroup_files(path, uid, gid, cfd);
580fe4df
CB
680
681 return 0;
682}
683
2d7bcab7 684__lxcfs_fuse_ops int cg_mkdir(const char *path, mode_t mode)
580fe4df
CB
685{
686 struct fuse_context *fc = fuse_get_context();
687 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
688 const char *cgroup;
689 int ret;
690
cbfc55fd
CB
691 if (!liblxcfs_functional())
692 return -EIO;
693
580fe4df
CB
694 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
695 return -EIO;
696
697 controller = pick_controller_from_path(fc, path);
698 if (!controller)
699 return errno == ENOENT ? -EPERM : -errno;
700
701 cgroup = find_cgroup_in_path(path);
702 if (!cgroup)
703 return -errno;
704
705 get_cgdir_and_path(cgroup, &cgdir, &last);
706 if (!last)
707 path1 = "/";
708 else
709 path1 = cgdir;
710
711 pid_t initpid = lookup_initpid_in_store(fc->pid);
712 if (initpid <= 1 || is_shared_pidns(initpid))
713 initpid = fc->pid;
714 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
715 if (!next)
716 ret = -EINVAL;
717 else if (last && strcmp(next, last) == 0)
718 ret = -EEXIST;
719 else
720 ret = -EPERM;
721 goto out;
722 }
723
724 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
725 ret = -EACCES;
726 goto out;
727 }
728 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
729 ret = -EACCES;
730 goto out;
731 }
732
733 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
734
735out:
736 free(cgdir);
737 free(next);
738 return ret;
739}
740
741static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
742{
f58d0c0d
CB
743 __do_close int dupfd = -EBADF;
744 __do_closedir DIR *dir = NULL;
580fe4df 745 bool ret = false;
f58d0c0d 746 struct dirent *direntp;
580fe4df 747 char pathname[MAXPATHLEN];
580fe4df 748
f58d0c0d 749 dupfd = dup(fd);
580fe4df
CB
750 if (dupfd < 0)
751 return false;
752
753 dir = fdopendir(dupfd);
754 if (!dir) {
755 lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
580fe4df
CB
756 return false;
757 }
f58d0c0d 758 move_fd(dupfd);
580fe4df
CB
759
760 while ((direntp = readdir(dir))) {
761 struct stat mystat;
762 int rc;
763
764 if (!strcmp(direntp->d_name, ".") ||
765 !strcmp(direntp->d_name, ".."))
766 continue;
767
768 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
769 if (rc < 0 || rc >= MAXPATHLEN) {
770 lxcfs_error("%s\n", "Pathname too long.");
771 continue;
772 }
773
774 rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
775 if (rc) {
776 lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
777 continue;
778 }
779 if (S_ISDIR(mystat.st_mode))
780 if (!recursive_rmdir(pathname, fd, cfd))
781 lxcfs_debug("Error removing %s.\n", pathname);
782 }
783
784 ret = true;
580fe4df
CB
785
786 if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
787 lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
788 ret = false;
789 }
790
580fe4df
CB
791 return ret;
792}
793
bb33f974 794static bool cgfs_remove(const char *controller, const char *cgroup)
580fe4df 795{
bb33f974
CB
796 __do_close int fd = -EBADF;
797 __do_free char *path = NULL;
798 int cfd;
580fe4df 799
8a03c08b 800 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
801 if (cfd < 0)
802 return false;
803
bb33f974
CB
804 path = must_make_path_relative(cgroup, NULL);
805 fd = openat(cfd, path, O_DIRECTORY);
580fe4df
CB
806 if (fd < 0)
807 return false;
808
bb33f974 809 return recursive_rmdir(path, fd, cfd);
580fe4df
CB
810}
811
2d7bcab7 812__lxcfs_fuse_ops int cg_rmdir(const char *path)
580fe4df
CB
813{
814 struct fuse_context *fc = fuse_get_context();
815 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
816 const char *cgroup;
817 int ret;
818
cbfc55fd
CB
819 if (!liblxcfs_functional())
820 return -EIO;
821
580fe4df
CB
822 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
823 return -EIO;
824
825 controller = pick_controller_from_path(fc, path);
826 if (!controller) /* Someone's trying to delete "/cgroup". */
827 return -EPERM;
828
829 cgroup = find_cgroup_in_path(path);
830 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
831 return -EPERM;
832
833 get_cgdir_and_path(cgroup, &cgdir, &last);
834 if (!last) {
835 /* Someone's trying to delete a cgroup on the same level as the
836 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
837 * rmdir "/cgroup/blkio/init.slice".
838 */
839 ret = -EPERM;
840 goto out;
841 }
842
843 pid_t initpid = lookup_initpid_in_store(fc->pid);
844 if (initpid <= 1 || is_shared_pidns(initpid))
845 initpid = fc->pid;
846 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
847 if (!last || (next && (strcmp(next, last) == 0)))
848 ret = -EBUSY;
849 else
850 ret = -ENOENT;
851 goto out;
852 }
853
854 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
855 ret = -EACCES;
856 goto out;
857 }
858 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
859 ret = -EACCES;
860 goto out;
861 }
862
863 if (!cgfs_remove(controller, cgroup)) {
864 ret = -EINVAL;
865 goto out;
866 }
867
868 ret = 0;
869
870out:
871 free(cgdir);
872 free(next);
873 return ret;
874}
875
876static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
877{
7dc3d74c 878 __do_free char *path = NULL;
580fe4df 879 int cfd;
580fe4df 880
8a03c08b 881 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
882 if (cfd < 0)
883 return false;
884
7dc3d74c
CB
885 path = must_make_path_relative(file, NULL);
886 if (fchmodat(cfd, path, mode, 0) < 0)
580fe4df 887 return false;
7dc3d74c 888
580fe4df
CB
889 return true;
890}
891
2d7bcab7 892__lxcfs_fuse_ops int cg_chmod(const char *path, mode_t mode)
580fe4df
CB
893{
894 struct fuse_context *fc = fuse_get_context();
895 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
896 struct cgfs_files *k = NULL;
897 const char *cgroup;
898 int ret;
899
cbfc55fd
CB
900 if (!liblxcfs_functional())
901 return -EIO;
902
580fe4df
CB
903 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
904 return -EIO;
905
906 if (strcmp(path, "/cgroup") == 0)
907 return -EPERM;
908
909 controller = pick_controller_from_path(fc, path);
910 if (!controller)
911 return errno == ENOENT ? -EPERM : -errno;
912
913 cgroup = find_cgroup_in_path(path);
914 if (!cgroup)
915 /* this is just /cgroup/controller */
916 return -EPERM;
917
918 get_cgdir_and_path(cgroup, &cgdir, &last);
919
920 if (!last) {
921 path1 = "/";
922 path2 = cgdir;
923 } else {
924 path1 = cgdir;
925 path2 = last;
926 }
927
928 if (is_child_cgroup(controller, path1, path2)) {
929 // get uid, gid, from '/tasks' file and make up a mode
930 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
931 k = cgfs_get_key(controller, cgroup, "tasks");
932
933 } else
934 k = cgfs_get_key(controller, path1, path2);
935
936 if (!k) {
937 ret = -EINVAL;
938 goto out;
939 }
940
941 /*
942 * This being a fuse request, the uid and gid must be valid
943 * in the caller's namespace. So we can just check to make
86521450 944 * sure that the caller is root in their uid, and privileged
580fe4df
CB
945 * over the file's current owner.
946 */
947 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
948 ret = -EPERM;
949 goto out;
950 }
951
952 if (!cgfs_chmod_file(controller, cgroup, mode)) {
953 ret = -EINVAL;
954 goto out;
955 }
956
957 ret = 0;
958out:
959 free_key(k);
960 free(cgdir);
961 return ret;
962}
963
/* Report whether @path, relative to @dirfd, can be stat'ed and is a directory. */
static inline bool is_dir(int dirfd, const char *path)
{
	struct stat st;

	if (fstatat(dirfd, path, &st, 0) != 0)
		return false;

	return S_ISDIR(st.st_mode);
}
969
df637155 970static int chown_tasks_files(int dirfd, const char *dirname, uid_t uid, gid_t gid)
580fe4df 971{
df637155 972 __do_free char *path;
580fe4df 973
df637155
CB
974 path = must_make_path_relative(dirname, "tasks", NULL);
975 if (fchownat(dirfd, path, uid, gid, 0) != 0)
580fe4df 976 return -errno;
df637155
CB
977
978 free_disarm(path);
979 path = must_make_path_relative(dirname, "cgroup.procs", NULL);
980 if (fchownat(dirfd, path, uid, gid, 0) != 0)
580fe4df 981 return -errno;
df637155 982
580fe4df
CB
983 return 0;
984}
985
df637155 986static int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
580fe4df 987{
df637155 988 __do_free char *path = NULL;
580fe4df 989 int cfd;
580fe4df 990
8a03c08b 991 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
992 if (cfd < 0)
993 return false;
994
df637155
CB
995 path = must_make_path_relative(file, NULL);
996 if (fchownat(cfd, path, uid, gid, 0) < 0)
580fe4df
CB
997 return -errno;
998
df637155
CB
999 if (is_dir(cfd, path))
1000 return chown_tasks_files(cfd, path, uid, gid);
580fe4df
CB
1001
1002 return 0;
1003}
1004
2d7bcab7 1005__lxcfs_fuse_ops int cg_chown(const char *path, uid_t uid, gid_t gid)
580fe4df
CB
1006{
1007 struct fuse_context *fc = fuse_get_context();
1008 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1009 struct cgfs_files *k = NULL;
1010 const char *cgroup;
1011 int ret;
1012
cbfc55fd
CB
1013 if (!liblxcfs_functional())
1014 return -EIO;
1015
580fe4df
CB
1016 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1017 return -EIO;
1018
1019 if (strcmp(path, "/cgroup") == 0)
1020 return -EPERM;
1021
1022 controller = pick_controller_from_path(fc, path);
1023 if (!controller)
1024 return errno == ENOENT ? -EPERM : -errno;
1025
1026 cgroup = find_cgroup_in_path(path);
1027 if (!cgroup)
1028 /* this is just /cgroup/controller */
1029 return -EPERM;
1030
1031 get_cgdir_and_path(cgroup, &cgdir, &last);
1032
1033 if (!last) {
1034 path1 = "/";
1035 path2 = cgdir;
1036 } else {
1037 path1 = cgdir;
1038 path2 = last;
1039 }
1040
1041 if (is_child_cgroup(controller, path1, path2)) {
1042 // get uid, gid, from '/tasks' file and make up a mode
1043 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1044 k = cgfs_get_key(controller, cgroup, "tasks");
1045
1046 } else
1047 k = cgfs_get_key(controller, path1, path2);
1048
1049 if (!k) {
1050 ret = -EINVAL;
1051 goto out;
1052 }
1053
1054 /*
1055 * This being a fuse request, the uid and gid must be valid
1056 * in the caller's namespace. So we can just check to make
86521450 1057 * sure that the caller is root in their uid, and privileged
580fe4df
CB
1058 * over the file's current owner.
1059 */
1060 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1061 ret = -EACCES;
1062 goto out;
1063 }
1064
1065 ret = cgfs_chown_file(controller, cgroup, uid, gid);
1066
1067out:
1068 free_key(k);
1069 free(cgdir);
1070
1071 return ret;
1072}
1073
2d7bcab7 1074__lxcfs_fuse_ops int cg_open(const char *path, struct fuse_file_info *fi)
580fe4df
CB
1075{
1076 const char *cgroup;
1077 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1078 struct cgfs_files *k = NULL;
1079 struct file_info *file_info;
1080 struct fuse_context *fc = fuse_get_context();
1081 int ret;
1082
cbfc55fd
CB
1083 if (!liblxcfs_functional())
1084 return -EIO;
1085
580fe4df
CB
1086 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1087 return -EIO;
1088
1089 controller = pick_controller_from_path(fc, path);
1090 if (!controller)
1091 return -errno;
1092 cgroup = find_cgroup_in_path(path);
1093 if (!cgroup)
1094 return -errno;
1095
1096 get_cgdir_and_path(cgroup, &cgdir, &last);
1097 if (!last) {
1098 path1 = "/";
1099 path2 = cgdir;
1100 } else {
1101 path1 = cgdir;
1102 path2 = last;
1103 }
1104
1105 k = cgfs_get_key(controller, path1, path2);
1106 if (!k) {
1107 ret = -EINVAL;
1108 goto out;
1109 }
1110 free_key(k);
1111
1112 pid_t initpid = lookup_initpid_in_store(fc->pid);
1113 if (initpid <= 1 || is_shared_pidns(initpid))
1114 initpid = fc->pid;
1115 if (!caller_may_see_dir(initpid, controller, path1)) {
1116 ret = -ENOENT;
1117 goto out;
1118 }
1119 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1120 ret = -EACCES;
1121 goto out;
1122 }
1123
1124 /* we'll free this at cg_release */
1125 file_info = malloc(sizeof(*file_info));
1126 if (!file_info) {
1127 ret = -ENOMEM;
1128 goto out;
1129 }
1130 file_info->controller = must_copy_string(controller);
1131 file_info->cgroup = must_copy_string(path1);
1132 file_info->file = must_copy_string(path2);
1133 file_info->type = LXC_TYPE_CGFILE;
1134 file_info->buf = NULL;
1135 file_info->buflen = 0;
1136
99b183fb 1137 fi->fh = PTR_TO_UINT64(file_info);
580fe4df
CB
1138 ret = 0;
1139
1140out:
1141 free(cgdir);
1142 return ret;
1143}
1144
/* epoll event mask: readable, hang-up, or peer closed its writing end. */
#define POLLIN_SET (EPOLLIN | EPOLLHUP | EPOLLRDHUP)
1146
580fe4df
CB
1147/*
1148 * pid_to_ns - reads pids from a ucred over a socket, then writes the
1149 * int value back over the socket. This shifts the pid from the
1150 * sender's pidns into tpid's pidns.
1151 */
1152static int pid_to_ns(int sock, pid_t tpid)
1153{
1154 char v = '0';
dac3dc93
CB
1155 struct ucred cred = {
1156 .pid = -1,
1157 .uid = -1,
1158 .gid = -1,
1159 };
580fe4df
CB
1160
1161 while (recv_creds(sock, &cred, &v)) {
1162 if (v == '1')
1163 return 0;
1164
dac3dc93 1165 if (write_nointr(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
580fe4df
CB
1166 return 1;
1167 }
1168
1169 return 0;
1170}
1171
1172/*
1173 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
1174 * with clone(). This simply writes '1' as ACK back to the parent
1175 * before calling the actual wrapped function.
1176 */
1177static int pid_ns_clone_wrapper(void *arg) {
1178 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
1179 char b = '1';
1180
1181 close(args->cpipe[0]);
1182 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
1183 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
1184 close(args->cpipe[1]);
1185 return args->wrapped(args->sock, args->tpid);
1186}
1187
1188/*
1189 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
1190 * in your old pidns. Only children which you clone will be in the target
1191 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
1192 * actually convert pids.
1193 *
1194 * Note: glibc's fork() does not respect pidns, which can lead to failed
1195 * assertions inside glibc (and thus failed forks) if the child's pid in
1196 * the pidns and the parent pid outside are identical. Using clone prevents
1197 * this issue.
1198 */
1199static void pid_to_ns_wrapper(int sock, pid_t tpid)
1200{
1201 int newnsfd = -1, ret, cpipe[2];
1202 char fnam[100];
1203 pid_t cpid;
1204 char v;
1205
1206 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
3cf1e562 1207 if (ret < 0 || (size_t)ret >= sizeof(fnam))
580fe4df
CB
1208 _exit(1);
1209 newnsfd = open(fnam, O_RDONLY);
1210 if (newnsfd < 0)
1211 _exit(1);
1212 if (setns(newnsfd, 0) < 0)
1213 _exit(1);
1214 close(newnsfd);
1215
1216 if (pipe(cpipe) < 0)
1217 _exit(1);
1218
1219 struct pid_ns_clone_args args = {
1220 .cpipe = cpipe,
1221 .sock = sock,
1222 .tpid = tpid,
1223 .wrapped = &pid_to_ns
1224 };
580fe4df 1225
6abff455 1226 cpid = lxcfs_clone(pid_ns_clone_wrapper, &args, 0);
580fe4df
CB
1227 if (cpid < 0)
1228 _exit(1);
1229
1230 /* Give the child 1 second to be done forking and write its ack. */
1231 if (!wait_for_sock(cpipe[0], 1))
1232 _exit(1);
1233 ret = read(cpipe[0], &v, 1);
1234 if (ret != sizeof(char) || v != '1')
1235 _exit(1);
1236
1237 if (!wait_for_pid(cpid))
1238 _exit(1);
1239 _exit(0);
1240}
1241
/*
 * Append a pid, followed by a newline, to *src.
 * src: pointer to the char* to which the pid is appended.
 * sz:  number of characters printed so far, not counting the trailing \0.
 * asz: the allocated size so far.
 * pid: the pid to append.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	const int value = (int)pid;

	must_strcat(src, sz, asz, "%d\n", value);
}
1253
1254/*
1255 * To read cgroup files with a particular pid, we will setns into the child
1256 * pidns, open a pipe, fork a child - which will be the first to really be in
1257 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1258 */
1259static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg,
1260 const char *file, char **d)
1261{
1262 int sock[2] = {-1, -1};
1263 char *tmpdata = NULL;
1264 int ret;
1265 pid_t qpid, cpid = -1;
1266 bool answer = false;
1267 char v = '0';
1268 struct ucred cred;
1269 size_t sz = 0, asz = 0;
1270
8a03c08b 1271 if (!get_cgroup_handle_named(cgroup_ops, contrl, cg, file, &tmpdata))
580fe4df
CB
1272 return false;
1273
1274 /*
1275 * Now we read the pids from returned data one by one, pass
1276 * them into a child in the target namespace, read back the
1277 * translated pids, and put them into our to-return data
1278 */
1279
1280 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1281 perror("socketpair");
1282 free(tmpdata);
1283 return false;
1284 }
1285
1286 cpid = fork();
1287 if (cpid == -1)
1288 goto out;
1289
1290 if (!cpid) // child - exits when done
1291 pid_to_ns_wrapper(sock[1], tpid);
1292
1293 char *ptr = tmpdata;
1294 cred.uid = 0;
1295 cred.gid = 0;
1296 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1297 cred.pid = qpid;
1298 ret = send_creds(sock[0], &cred, v, true);
1299
1300 if (ret == SEND_CREDS_NOTSK)
1301 goto next;
1302 if (ret == SEND_CREDS_FAIL)
1303 goto out;
1304
1305 // read converted results
1306 if (!wait_for_sock(sock[0], 2)) {
1307 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
1308 goto out;
1309 }
1310 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1311 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
1312 goto out;
1313 }
1314 must_strcat_pid(d, &sz, &asz, qpid);
1315next:
1316 ptr = strchr(ptr, '\n');
1317 if (!ptr)
1318 break;
1319 ptr++;
1320 }
1321
1322 cred.pid = getpid();
1323 v = '1';
1324 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1325 // failed to ask child to exit
1326 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
1327 goto out;
1328 }
1329
1330 answer = true;
1331
1332out:
1333 free(tmpdata);
1334 if (cpid != -1)
1335 wait_for_pid(cpid);
1336 if (sock[0] != -1) {
1337 close(sock[0]);
1338 close(sock[1]);
1339 }
1340 return answer;
1341}
1342
2d7bcab7
CB
1343__lxcfs_fuse_ops int cg_read(const char *path, char *buf, size_t size,
1344 off_t offset, struct fuse_file_info *fi)
580fe4df
CB
1345{
1346 struct fuse_context *fc = fuse_get_context();
99b183fb 1347 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
580fe4df
CB
1348 struct cgfs_files *k = NULL;
1349 char *data = NULL;
3cf1e562
CB
1350 int ret;
1351 size_t s;
580fe4df
CB
1352 bool r;
1353
cbfc55fd
CB
1354 if (!liblxcfs_functional())
1355 return -EIO;
1356
580fe4df
CB
1357 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1358 return -EIO;
1359
1360 if (f->type != LXC_TYPE_CGFILE) {
1361 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
1362 return -EIO;
1363 }
1364
1365 if (offset)
1366 return 0;
1367
1368 if (!f->controller)
1369 return -EINVAL;
1370
1371 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1372 return -EINVAL;
1373 }
1374 free_key(k);
1375
1376
1377 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
1378 ret = -EACCES;
1379 goto out;
1380 }
1381
1382 if (strcmp(f->file, "tasks") == 0 ||
1383 strcmp(f->file, "/tasks") == 0 ||
1384 strcmp(f->file, "/cgroup.procs") == 0 ||
1385 strcmp(f->file, "cgroup.procs") == 0)
1386 // special case - we have to translate the pids
1387 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1388 else
8a03c08b 1389 r = get_cgroup_handle_named(cgroup_ops, f->controller, f->cgroup, f->file, &data);
580fe4df
CB
1390
1391 if (!r) {
1392 ret = -EINVAL;
1393 goto out;
1394 }
1395
1396 if (!data) {
1397 ret = 0;
1398 goto out;
1399 }
1400 s = strlen(data);
1401 if (s > size)
1402 s = size;
1403 memcpy(buf, data, s);
3cf1e562 1404 if ((s > 0) && (s < size) && (data[s - 1] != '\n'))
580fe4df
CB
1405 buf[s++] = '\n';
1406
1407 ret = s;
1408
1409out:
1410 free(data);
1411 return ret;
1412}
1413
2d7bcab7 1414__lxcfs_fuse_ops int cg_opendir(const char *path, struct fuse_file_info *fi)
580fe4df
CB
1415{
1416 struct fuse_context *fc = fuse_get_context();
1417 const char *cgroup;
1418 struct file_info *dir_info;
1419 char *controller = NULL;
1420
cbfc55fd
CB
1421 if (!liblxcfs_functional())
1422 return -EIO;
1423
580fe4df
CB
1424 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1425 return -EIO;
1426
1427 if (strcmp(path, "/cgroup") == 0) {
1428 cgroup = NULL;
1429 controller = NULL;
1430 } else {
1431 // return list of keys for the controller, and list of child cgroups
1432 controller = pick_controller_from_path(fc, path);
1433 if (!controller)
1434 return -errno;
1435
1436 cgroup = find_cgroup_in_path(path);
1437 if (!cgroup) {
1438 /* this is just /cgroup/controller, return its contents */
1439 cgroup = "/";
1440 }
1441 }
1442
1443 pid_t initpid = lookup_initpid_in_store(fc->pid);
1444 if (initpid <= 1 || is_shared_pidns(initpid))
1445 initpid = fc->pid;
1446 if (cgroup) {
1447 if (!caller_may_see_dir(initpid, controller, cgroup))
1448 return -ENOENT;
1449 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1450 return -EACCES;
1451 }
1452
1453 /* we'll free this at cg_releasedir */
1454 dir_info = malloc(sizeof(*dir_info));
1455 if (!dir_info)
1456 return -ENOMEM;
1457 dir_info->controller = must_copy_string(controller);
1458 dir_info->cgroup = must_copy_string(cgroup);
1459 dir_info->type = LXC_TYPE_CGDIR;
1460 dir_info->buf = NULL;
1461 dir_info->file = NULL;
1462 dir_info->buflen = 0;
1463
99b183fb 1464 fi->fh = PTR_TO_UINT64(dir_info);
580fe4df
CB
1465 return 0;
1466}
1467
2d7bcab7 1468__lxcfs_fuse_ops int cg_release(const char *path, struct fuse_file_info *fi)
580fe4df
CB
1469{
1470 do_release_file_info(fi);
1471 return 0;
1472}
1473
2d7bcab7 1474__lxcfs_fuse_ops int cg_releasedir(const char *path, struct fuse_file_info *fi)
580fe4df
CB
1475{
1476 do_release_file_info(fi);
1477 return 0;
1478}
1479
1480static FILE *open_pids_file(const char *controller, const char *cgroup)
1481{
ee3a127b
CB
1482 __do_close int fd = -EBADF;
1483 __do_free char *path = NULL;
1484 int cfd;
1485 FILE *f;
580fe4df 1486
8a03c08b 1487 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
1488 if (cfd < 0)
1489 return false;
1490
ee3a127b
CB
1491 path = must_make_path_relative(cgroup, "cgroup.procs", NULL);
1492 fd = openat(cfd, path, O_WRONLY | O_CLOEXEC);
580fe4df
CB
1493 if (fd < 0)
1494 return NULL;
1495
ee3a127b
CB
1496 f = fdopen(fd, "we");
1497 if (!f)
1498 return NULL;
1499 /* Transfer ownership of fd to fdopen(). */
1500 move_fd(fd);
1501
1502 return f;
580fe4df
CB
1503}
1504
1505static int pid_from_ns(int sock, pid_t tpid)
1506{
1507 pid_t vpid;
1508 struct ucred cred;
1509 char v;
1510 int ret;
1511
1512 cred.uid = 0;
1513 cred.gid = 0;
1514 while (1) {
1515 if (!wait_for_sock(sock, 2)) {
1516 lxcfs_error("%s\n", "Timeout reading from parent.");
1517 return 1;
1518 }
1519 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1520 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
1521 return 1;
1522 }
1523 if (vpid == -1) // done
1524 break;
1525 v = '0';
1526 cred.pid = vpid;
1527 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1528 v = '1';
1529 cred.pid = getpid();
1530 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1531 return 1;
1532 }
1533 }
1534 return 0;
1535}
1536
1537static void pid_from_ns_wrapper(int sock, pid_t tpid)
1538{
1539 int newnsfd = -1, ret, cpipe[2];
1540 char fnam[100];
1541 pid_t cpid;
1542 char v;
1543
1544 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
3cf1e562 1545 if (ret < 0 || (size_t)ret >= sizeof(fnam))
580fe4df
CB
1546 _exit(1);
1547 newnsfd = open(fnam, O_RDONLY);
1548 if (newnsfd < 0)
1549 _exit(1);
1550 if (setns(newnsfd, 0) < 0)
1551 _exit(1);
1552 close(newnsfd);
1553
1554 if (pipe(cpipe) < 0)
1555 _exit(1);
1556
1557 struct pid_ns_clone_args args = {
1558 .cpipe = cpipe,
1559 .sock = sock,
1560 .tpid = tpid,
1561 .wrapped = &pid_from_ns
1562 };
580fe4df 1563
6abff455 1564 cpid = lxcfs_clone(pid_ns_clone_wrapper, &args, 0);
580fe4df
CB
1565 if (cpid < 0)
1566 _exit(1);
1567
1568 // give the child 1 second to be done forking and
1569 // write its ack
1570 if (!wait_for_sock(cpipe[0], 1))
1571 _exit(1);
1572 ret = read(cpipe[0], &v, 1);
1573 if (ret != sizeof(char) || v != '1')
1574 _exit(1);
1575
1576 if (!wait_for_pid(cpid))
1577 _exit(1);
1578 _exit(0);
1579}
1580
/*
 * get_pid_creds: get the real uid and gid of @pid from /proc/<pid>/status.
 * (XXX should we use euid here?)
 *
 * @pid: process to inspect
 * @uid: set to the first value on the "Uid:" line
 * @gid: set to the first value on the "Gid:" line
 *
 * Both outputs are preset to -1. A value is left at -1 when the status file
 * cannot be opened or its line was not parsed before the function returned.
 */
static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	/* Holds the path first, then each line read from the file. */
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;

	f = fopen(line, "re");
	if (!f) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				/* pid_t is signed, so print it with %d. */
				lxcfs_error("bad uid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}

	fclose(f);
}
1619
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * using /proc/<pid>/uid_map.
 *
 * @answer receives the result of convert_id_to_ns().
 *
 * Returns true if a mapping was found; false if the map could not be
 * opened or the conversion yielded (uid_t)-1.
 */
static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);

	map = fopen(map_path, "re");
	if (!map)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
1641
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's user namespace and @v's uid is mapped
 *     into that namespace as well.
 */
static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* The requestor has to be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid has to be mapped there too. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
1667
1668static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl,
1669 const char *cg, const char *file, const char *buf)
1670{
1671 int sock[2] = {-1, -1};
1672 pid_t qpid, cpid = -1;
1673 FILE *pids_file = NULL;
1674 bool answer = false, fail = false;
1675
1676 pids_file = open_pids_file(contrl, cg);
1677 if (!pids_file)
1678 return false;
1679
1680 /*
1681 * write the pids to a socket, have helper in writer's pidns
1682 * call movepid for us
1683 */
1684 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1685 perror("socketpair");
1686 goto out;
1687 }
1688
1689 cpid = fork();
1690 if (cpid == -1)
1691 goto out;
1692
1693 if (!cpid) { // child
1694 fclose(pids_file);
1695 pid_from_ns_wrapper(sock[1], tpid);
1696 }
1697
1698 const char *ptr = buf;
1699 while (sscanf(ptr, "%d", &qpid) == 1) {
1700 struct ucred cred;
1701 char v;
1702
1703 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1704 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
1705 goto out;
1706 }
1707
1708 if (recv_creds(sock[0], &cred, &v)) {
1709 if (v == '0') {
1710 if (!may_move_pid(tpid, tuid, cred.pid)) {
1711 fail = true;
1712 break;
1713 }
1714 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
1715 fail = true;
1716 }
1717 }
1718
1719 ptr = strchr(ptr, '\n');
1720 if (!ptr)
1721 break;
1722 ptr++;
1723 }
1724
1725 /* All good, write the value */
1726 qpid = -1;
1727 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1728 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
1729
1730 if (!fail)
1731 answer = true;
1732
1733out:
1734 if (cpid != -1)
1735 wait_for_pid(cpid);
1736 if (sock[0] != -1) {
1737 close(sock[0]);
1738 close(sock[1]);
1739 }
1740 if (pids_file) {
1741 if (fclose(pids_file) != 0)
1742 answer = false;
1743 }
1744 return answer;
1745}
1746
580fe4df
CB
1747static bool cgfs_set_value(const char *controller, const char *cgroup,
1748 const char *file, const char *value)
1749{
2f543378
CB
1750 __do_close int fd = -EBADF;
1751 __do_free char *path = NULL;
1752 int cfd;
580fe4df 1753 size_t len;
3cf1e562 1754 ssize_t ret;
580fe4df 1755
8a03c08b 1756 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
1757 if (cfd < 0)
1758 return false;
1759
2f543378 1760 path = must_make_path_relative(cgroup, file, NULL);
580fe4df 1761
2f543378 1762 fd = openat(cfd, path, O_WRONLY | O_CLOEXEC);
580fe4df
CB
1763 if (fd < 0)
1764 return false;
1765
2f543378 1766 len = strlen(value);
3cf1e562
CB
1767 ret = write_nointr(fd, value, len);
1768 if (ret < 0)
1769 return false;
1770
1771 return (size_t)ret == len;
580fe4df
CB
1772}
1773
2d7bcab7
CB
1774__lxcfs_fuse_ops int cg_write(const char *path, const char *buf, size_t size,
1775 off_t offset, struct fuse_file_info *fi)
580fe4df
CB
1776{
1777 struct fuse_context *fc = fuse_get_context();
1778 char *localbuf = NULL;
1779 struct cgfs_files *k = NULL;
99b183fb 1780 struct file_info *f = INTTYPE_TO_PTR(fi->fh);
580fe4df
CB
1781 bool r;
1782
cbfc55fd
CB
1783 if (!liblxcfs_functional())
1784 return -EIO;
1785
580fe4df
CB
1786 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1787 return -EIO;
1788
1789 if (f->type != LXC_TYPE_CGFILE) {
1790 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
1791 return -EIO;
1792 }
1793
1794 if (offset)
1795 return 0;
1796
1797 localbuf = alloca(size+1);
1798 localbuf[size] = '\0';
1799 memcpy(localbuf, buf, size);
1800
1801 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1802 size = -EINVAL;
1803 goto out;
1804 }
1805
1806 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1807 size = -EACCES;
1808 goto out;
1809 }
1810
1811 if (strcmp(f->file, "tasks") == 0 ||
1812 strcmp(f->file, "/tasks") == 0 ||
1813 strcmp(f->file, "/cgroup.procs") == 0 ||
1814 strcmp(f->file, "cgroup.procs") == 0)
1815 // special case - we have to translate the pids
1816 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1817 else
1818 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1819
1820 if (!r)
1821 size = -EINVAL;
1822
1823out:
1824 free_key(k);
1825 return size;
1826}
1827
1828static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup,
1829 bool directories, void ***list, size_t typesize,
1830 void *(*iterator)(const char *, const char *, const char *))
1831{
35511b72
CB
1832 __do_close int fd = -EBADF;
1833 __do_free char *path = NULL;
1834 __do_closedir DIR *dir = NULL;
580fe4df 1835 size_t sz = 0, asz = 0;
35511b72 1836 int cfd;
580fe4df 1837 struct dirent *dirent;
580fe4df 1838
8a03c08b 1839 cfd = get_cgroup_fd_handle_named(controller);
580fe4df
CB
1840 *list = NULL;
1841 if (cfd < 0)
1842 return false;
1843
35511b72
CB
1844 path = must_make_path_relative(cgroup, NULL);
1845 fd = openat(cfd, path, O_DIRECTORY | O_CLOEXEC);
580fe4df
CB
1846 if (fd < 0)
1847 return false;
1848
1849 dir = fdopendir(fd);
1850 if (!dir)
1851 return false;
35511b72
CB
1852 /* Transfer ownership of fd to fdopendir(). */
1853 move_fd(fd);
580fe4df
CB
1854
1855 while ((dirent = readdir(dir))) {
35511b72
CB
1856 int ret;
1857 char pathname[MAXPATHLEN];
580fe4df
CB
1858 struct stat mystat;
1859
35511b72 1860 if (strcmp(dirent->d_name, ".") == 0)
580fe4df
CB
1861 continue;
1862
35511b72
CB
1863 if (strcmp(dirent->d_name, "..") == 0)
1864 continue;
1865
1866 ret = snprintf(pathname, sizeof(pathname), "%s/%s", path, dirent->d_name);
3cf1e562 1867 if (ret < 0 || (size_t)ret >= sizeof(pathname)) {
35511b72 1868 lxcfs_error("Pathname too long under %s\n", path);
580fe4df
CB
1869 continue;
1870 }
1871
1872 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1873 if (ret) {
1874 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1875 continue;
1876 }
35511b72
CB
1877
1878 if (!directories && !S_ISREG(mystat.st_mode))
580fe4df
CB
1879 continue;
1880
35511b72
CB
1881 if (directories && !S_ISDIR(mystat.st_mode))
1882 continue;
1883
7981fe3d
CB
1884 if (sz + 2 >= asz) {
1885 asz += BATCH_SIZE;
35511b72 1886 *list = must_realloc(*list, asz * typesize);
7981fe3d 1887 }
35511b72
CB
1888 (*list)[sz] = (*iterator)(controller, path, dirent->d_name);
1889 (*list)[sz + 1] = NULL;
580fe4df
CB
1890 sz++;
1891 }
35511b72 1892
580fe4df
CB
1893 return true;
1894}
1895
/*
 * Iterator callback for cgfs_list_keys(): look up the key for @dir_entry
 * in @cgroup of @controller.
 *
 * Returns the key, or NULL (after logging an error) if the lookup failed.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup,
				 const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Failed to retrieve files under %s:%s\n",
		    controller, cgroup);
	return NULL;
}
1907
1908static bool cgfs_list_keys(const char *controller, const char *cgroup,
1909 struct cgfs_files ***keys)
1910{
1911 return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys,
1912 sizeof(*keys), &make_key_list_entry);
1913}
1914
/*
 * Iterator callback for cgfs_list_children(): return a heap copy of
 * @dir_entry. @controller and @cgroup are not used.
 */
static void *make_children_list_entry(const char *controller,
				      const char *cgroup, const char *dir_entry)
{
	char *copy = strdup(dir_entry);

	return copy;
}
1920
1921static bool cgfs_list_children(const char *controller, const char *cgroup,
1922 char ***list)
1923{
1924 return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list,
1925 sizeof(*list), &make_children_list_entry);
1926}
1927
/*
 * Release a NULL-terminated key list: every element via free_key(), then
 * the array itself. A NULL @keys is accepted and ignored.
 */
static void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);

	free_disarm(keys);
}
1938
2d7bcab7
CB
1939__lxcfs_fuse_ops int cg_readdir(const char *path, void *buf,
1940 fuse_fill_dir_t filler, off_t offset,
1941 struct fuse_file_info *fi)
580fe4df 1942{
99b183fb 1943 struct file_info *d = INTTYPE_TO_PTR(fi->fh);
580fe4df
CB
1944 struct cgfs_files **list = NULL;
1945 int i, ret;
1946 char *nextcg = NULL;
1947 struct fuse_context *fc = fuse_get_context();
1948 char **clist = NULL;
1949
cbfc55fd
CB
1950 if (!liblxcfs_functional())
1951 return -EIO;
1952
580fe4df
CB
1953 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
1954 return -EIO;
1955
f834b6bf 1956 if (DIR_FILLER(filler, buf, ".", NULL, 0) != 0 || DIR_FILLER(filler, buf, "..", NULL, 0) != 0)
580fe4df
CB
1957 return -EIO;
1958
1959 if (d->type != LXC_TYPE_CGDIR) {
1960 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1961 return -EIO;
1962 }
1963 if (!d->cgroup && !d->controller) {
1964 /*
1965 * ls /var/lib/lxcfs/cgroup - just show list of controllers.
1966 * This only works with the legacy hierarchy.
1967 */
1968 for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) {
1969 if (is_unified_hierarchy(*h))
1970 continue;
1971
f834b6bf 1972 if ((*h)->__controllers && DIR_FILLER(filler, buf, (*h)->__controllers, NULL, 0))
580fe4df
CB
1973 return -EIO;
1974 }
1975
1976 return 0;
1977 }
1978
1979 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1980 // not a valid cgroup
1981 ret = -EINVAL;
1982 goto out;
1983 }
1984
1985 pid_t initpid = lookup_initpid_in_store(fc->pid);
1986 if (initpid <= 1 || is_shared_pidns(initpid))
1987 initpid = fc->pid;
1988 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1989 if (nextcg) {
f834b6bf 1990 ret = DIR_FILLER(filler, buf, nextcg, NULL, 0);
580fe4df
CB
1991 free(nextcg);
1992 if (ret != 0) {
1993 ret = -EIO;
1994 goto out;
1995 }
1996 }
1997 ret = 0;
1998 goto out;
1999 }
2000
2001 for (i = 0; list && list[i]; i++) {
f834b6bf 2002 if (DIR_FILLER(filler, buf, list[i]->name, NULL, 0) != 0) {
580fe4df
CB
2003 ret = -EIO;
2004 goto out;
2005 }
2006 }
2007
2008 // now get the list of child cgroups
2009
2010 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
2011 ret = 0;
2012 goto out;
2013 }
2014 if (clist) {
2015 for (i = 0; clist[i]; i++) {
f834b6bf 2016 if (DIR_FILLER(filler, buf, clist[i], NULL, 0) != 0) {
580fe4df
CB
2017 ret = -EIO;
2018 goto out;
2019 }
2020 }
2021 }
2022 ret = 0;
2023
2024out:
2025 free_keys(list);
2026 if (clist) {
2027 for (i = 0; clist[i]; i++)
2028 free(clist[i]);
2029 free(clist);
2030 }
2031 return ret;
2032}
2033
2d7bcab7 2034__lxcfs_fuse_ops int cg_access(const char *path, int mode)
580fe4df
CB
2035{
2036 int ret;
2037 const char *cgroup;
2038 char *path1, *path2, *controller;
2039 char *last = NULL, *cgdir = NULL;
2040 struct cgfs_files *k = NULL;
2041 struct fuse_context *fc = fuse_get_context();
2042
cbfc55fd
CB
2043 if (!liblxcfs_functional())
2044 return -EIO;
2045
580fe4df
CB
2046 if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
2047 return -EIO;
2048
2049 if (strcmp(path, "/cgroup") == 0)
2050 return 0;
2051
2052 controller = pick_controller_from_path(fc, path);
2053 if (!controller)
2054 return -errno;
2055 cgroup = find_cgroup_in_path(path);
2056 if (!cgroup) {
2057 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
2058 if ((mode & W_OK) == 0)
2059 return 0;
2060 return -EACCES;
2061 }
2062
2063 get_cgdir_and_path(cgroup, &cgdir, &last);
2064 if (!last) {
2065 path1 = "/";
2066 path2 = cgdir;
2067 } else {
2068 path1 = cgdir;
2069 path2 = last;
2070 }
2071
2072 k = cgfs_get_key(controller, path1, path2);
2073 if (!k) {
2074 if ((mode & W_OK) == 0)
2075 ret = 0;
2076 else
2077 ret = -EACCES;
2078 goto out;
2079 }
2080 free_key(k);
2081
2082 pid_t initpid = lookup_initpid_in_store(fc->pid);
2083 if (initpid <= 1 || is_shared_pidns(initpid))
2084 initpid = fc->pid;
2085 if (!caller_may_see_dir(initpid, controller, path1)) {
2086 ret = -ENOENT;
2087 goto out;
2088 }
2089 if (!fc_may_access(fc, controller, path1, path2, mode)) {
2090 ret = -EACCES;
2091 goto out;
2092 }
2093
2094 ret = 0;
2095
2096out:
2097 free(cgdir);
2098 return ret;
2099}