]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
Merge branch 'readtasks.pass.creds.2' into m.2
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
9/*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13#define FUSE_USE_VERSION 26
14
2183082c 15#include <stdio.h>
758ad80c
SH
16#include <dirent.h>
17#include <fcntl.h>
18#include <fuse.h>
19#include <unistd.h>
20#include <errno.h>
21#include <stdbool.h>
22#include <time.h>
23#include <string.h>
24#include <stdlib.h>
25#include <libgen.h>
41bb9357
SH
26#include <sched.h>
27#include <linux/sched.h>
a05660a6 28#include <sys/socket.h>
41bb9357
SH
29#include <sys/mount.h>
30#include <wait.h>
758ad80c
SH
31
32#include <nih/alloc.h>
33#include <nih/string.h>
34
35#include "cgmanager.h"
36
/* State that the LXCFS_DATA macro reads back from the FUSE context's private_data. */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted cgroup
	 * subsystems.  Detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
45
a05660a6
SH
/*
 * Block until child `pid` has been reaped.
 *
 * Interrupted waits (EINTR) are retried, as are waits that return some
 * other pid.
 *
 * Returns 0 if the child exited normally with status 0, -1 if waitpid()
 * failed or the child exited abnormally / with a non-zero status.
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		pid_t reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) ? 0 : -1;
}
63
053a659d
SH
/*
 * Translate an id that is valid in the caller's namespace into the
 * namespace described by an open /proc/pid/{u,g}id_map stream.
 *
 * idfile: open, seekable stream over the id map; it is rewound on each call.
 * in_id:  id valid in the caller's namespace.
 *
 * Each line is parsed as "<ns base> <host base> <count>"; lines that do not
 * parse are skipped.
 *
 * Returns the mapped id, or (unsigned int) -1 if no range contains in_id or
 * if a range in the file wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u", &nsuid, &hostuid, &count) != 3)
			continue;

		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * The range wrapped around - unexpected as this is a
			 * procfile, so just bail.  `line` normally still
			 * carries its own newline, so none is added here.
			 */
			fprintf(stderr, "id range wraparound at entry %u %u %u in %s",
				nsuid, hostuid, count, line);
			return -1;
		}

		if (hostuid <= in_id && in_id < hostuid + count) {
			/*
			 * hostuid <= in_id < hostuid+count and neither
			 * hostuid+count nor nsuid+count wraps, so
			 * nsuid + (in_id - hostuid) < nsuid + count cannot
			 * wrap either.
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
107
341b21ad
SH
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether the
 * calling uid is required to be root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false
115
116static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 117{
053a659d
SH
118 nih_local char *fpath = NULL;
119 bool answer = false;
120 uid_t nsuid;
121
341b21ad
SH
122 if (victim == -1 || uid == -1)
123 return false;
124
125 /*
126 * If the request is one not requiring root in the namespace,
127 * then having the same uid suffices. (i.e. uid 1000 has write
128 * access to files owned by uid 1000
129 */
130 if (!req_ns_root && uid == victim)
758ad80c
SH
131 return true;
132
053a659d
SH
133 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
134 FILE *f = fopen(fpath, "r");
135 if (!f)
136 return false;
137
341b21ad 138 /* if caller's not root in his namespace, reject */
053a659d
SH
139 nsuid = convert_id_to_ns(f, uid);
140 if (nsuid)
141 goto out;
142
341b21ad
SH
143 /*
144 * If victim is not mapped into caller's ns, reject.
145 * XXX I'm not sure this check is needed given that fuse
146 * will be sending requests where the vfs has converted
147 */
053a659d
SH
148 nsuid = convert_id_to_ns(f, victim);
149 if (nsuid == -1)
150 goto out;
151
152 answer = true;
153
154out:
155 fclose(f);
156 return answer;
758ad80c
SH
157}
158
/*
 * Check whether the "other" permission bits of fmode grant the access mode
 * requested in req_mode (an open(2)-style O_RDONLY / O_WRONLY / O_RDWR).
 *
 * Callers shift fmode right so that the owner or group bits land in the
 * "other" position before calling.
 *
 * Returns false for an unrecognised access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted = req_mode & O_ACCMODE;
	mode_t need;

	if (wanted == O_RDONLY)
		need = S_IROTH;
	else if (wanted == O_WRONLY)
		need = S_IWOTH;
	else if (wanted == O_RDWR)
		need = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & need) == need;
}
178
3db25a35
SH
/*
 * Given a task's cgroup path and a shorter query path, return the single
 * path component of taskcg that follows querycg.
 *
 * When querycg is "/" only the leading slash of taskcg is skipped;
 * otherwise strlen(querycg) + 1 characters are skipped.
 *
 * Returns a nih-allocated string, or NULL (after logging to stderr) when
 * taskcg is not longer than querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *component, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	component = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';
	return component;
}
197
758ad80c
SH
198/*
199 * check whether a fuse context may access a cgroup dir or file
200 *
201 * If file is not null, it is a cgroup file to check under cg.
202 * If file is null, then we are checking perms on cg itself.
203 *
204 * For files we can check the mode of the list_keys result.
205 * For cgroups, we must make assumptions based on the files under the
206 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
207 * yet.
208 */
209static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
210{
211 nih_local struct cgm_keys **list = NULL;
212 int i;
213
214 if (!file)
215 file = "tasks";
216
217 if (*file == '/')
218 file++;
219
220 if (!cgm_list_keys(contrl, cg, &list))
221 return false;
222 for (i = 0; list[i]; i++) {
223 if (strcmp(list[i]->name, file) == 0) {
224 struct cgm_keys *k = list[i];
341b21ad 225 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
226 if (perms_include(k->mode >> 6, mode))
227 return true;
228 }
229 if (fc->gid == k->gid) {
230 if (perms_include(k->mode >> 3, mode))
231 return true;
232 }
233 return perms_include(k->mode, mode);
234 }
235 }
236
237 return false;
238}
239
3db25a35
SH
/* Remove one trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
246
247/*
248 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
249 * If caller is in /a, he may act on /a/b, but not on /b.
250 * if the answer is false and nextcg is not NULL, then *nextcg will point
251 * to a nih_alloc'd string containing the next cgroup directory under cg
252 */
253static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
254{
255 nih_local char *fnam = NULL;
256 FILE *f;
257 bool answer = false;
258 char *line = NULL;
259 size_t len = 0;
260
261 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
262 if (!(f = fopen(fnam, "r")))
263 return false;
264
265 while (getline(&line, &len, f) != -1) {
266 char *c1, *c2, *linecmp;
267 if (!line[0])
268 continue;
269 c1 = strchr(line, ':');
270 if (!c1)
271 goto out;
272 c1++;
273 c2 = strchr(c1, ':');
274 if (!c2)
275 goto out;
276 *c2 = '\0';
277 if (strcmp(c1, contrl) != 0)
278 continue;
279 c2++;
280 stripnewline(c2);
281 /*
282 * callers pass in '/' for root cgroup, otherwise they pass
283 * in a cgroup without leading '/'
284 */
285 linecmp = *cg == '/' ? c2 : c2+1;
286 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
287 if (nextcg)
288 *nextcg = get_next_cgroup_dir(linecmp, cg);
289 goto out;
290 }
291 answer = true;
292 goto out;
293 }
294
295out:
296 fclose(f);
297 free(line);
298 return answer;
299}
300
758ad80c
SH
301/*
302 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
303 * and needs to be nih_freed.
304 */
305static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
306{
307 const char *p1;
308 char *ret, *slash;
309
310 if (strlen(path) < 9)
311 return NULL;
312 p1 = path+8;
313 ret = nih_strdup(NULL, p1);
314 if (!ret)
315 return ret;
316 slash = strstr(ret, "/");
317 if (slash)
318 *slash = '\0';
319
320 /* verify that it is a subsystem */
321 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
322 int i;
323 if (!list) {
324 nih_free(ret);
325 return NULL;
326 }
327 for (i = 0; list[i]; i++) {
328 if (strcmp(list[i], ret) == 0)
329 return ret;
330 }
331 nih_free(ret);
332 return NULL;
333}
334
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into path just past the first '/' found at or after
 * offset 8, or NULL if path is shorter than 9 characters or has no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
350
351static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
352{
353 nih_local char **list = NULL;
354 int i;
355
356 if (!f)
357 return false;
358 if (*f == '/')
359 f++;
360
361 if (!cgm_list_children(contr, dir, &list))
362 return false;
363 for (i = 0; list[i]; i++) {
364 if (strcmp(list[i], f) == 0)
365 return true;
366 }
367
368 return false;
369}
370
371static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
372{
373 nih_local struct cgm_keys **list = NULL;
374 struct cgm_keys *k;
375 int i;
376
377 if (!f)
378 return NULL;
379 if (*f == '/')
380 f++;
381 if (!cgm_list_keys(contr, dir, &list))
382 return NULL;
383 for (i = 0; list[i]; i++) {
384 if (strcmp(list[i]->name, f) == 0) {
385 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
386 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
387 k->uid = list[i]->uid;
388 k->gid = list[i]->gid;
389 k->mode = list[i]->mode;
390 return k;
391 }
392 }
393
394 return NULL;
395}
396
/*
 * Split cg at its last '/'.
 *
 * *dir  receives a nih-allocated copy of cg, truncated at the last '/'
 *       (the whole of cg when it contains no '/').
 * *file receives a pointer into cg itself at the last '/' (so it still
 *       begins with that '/'), or NULL when cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	const char *sep = strrchr(cg, '/');

	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = (char *) sep;
	if (!sep)
		return;

	/* the copy has its last '/' at the same offset as cg does */
	(*dir)[sep - cg] = '\0';
}
410
99978832
SH
411static size_t get_file_size(const char *contrl, const char *cg, const char *f)
412{
413 nih_local char *data = NULL;
414 size_t s;
415 if (!cgm_get_value(contrl, cg, f, &data))
416 return -EINVAL;
417 s = strlen(data);
418 return s;
419}
2ad6d2bd 420
758ad80c 421/*
2ad6d2bd 422 * FUSE ops for /cgroup
758ad80c 423 */
2ad6d2bd 424
758ad80c
SH
425static int cg_getattr(const char *path, struct stat *sb)
426{
427 struct timespec now;
428 struct fuse_context *fc = fuse_get_context();
429 nih_local char * cgdir = NULL;
430 char *fpath = NULL, *path1, *path2;
431 nih_local struct cgm_keys *k = NULL;
432 const char *cgroup;
433 nih_local char *controller = NULL;
434
435
436 if (!fc)
437 return -EIO;
438
439 memset(sb, 0, sizeof(struct stat));
440
441 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
442 return -EINVAL;
443
444 sb->st_uid = sb->st_gid = 0;
445 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
446 sb->st_size = 0;
447
448 if (strcmp(path, "/cgroup") == 0) {
449 sb->st_mode = S_IFDIR | 00755;
450 sb->st_nlink = 2;
451 return 0;
452 }
453
454 controller = pick_controller_from_path(fc, path);
455 if (!controller)
456 return -EIO;
758ad80c
SH
457 cgroup = find_cgroup_in_path(path);
458 if (!cgroup) {
459 /* this is just /cgroup/controller, return it as a dir */
460 sb->st_mode = S_IFDIR | 00755;
461 sb->st_nlink = 2;
462 return 0;
463 }
341b21ad 464
758ad80c
SH
465 get_cgdir_and_path(cgroup, &cgdir, &fpath);
466
467 if (!fpath) {
468 path1 = "/";
469 path2 = cgdir;
470 } else {
471 path1 = cgdir;
472 path2 = fpath;
473 }
474
758ad80c
SH
475 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
476 * Then check that caller's cgroup is under path if fpath is a child
477 * cgroup, or cgdir if fpath is a file */
478
479 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
480 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
481 /* this is just /cgroup/controller, return it as a dir */
482 sb->st_mode = S_IFDIR | 00555;
483 sb->st_nlink = 2;
484 return 0;
485 }
758ad80c 486 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 487 return -EACCES;
758ad80c 488
053a659d
SH
489 // get uid, gid, from '/tasks' file and make up a mode
490 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
491 sb->st_mode = S_IFDIR | 00755;
492 k = get_cgroup_key(controller, cgroup, "tasks");
493 if (!k) {
053a659d
SH
494 sb->st_uid = sb->st_gid = 0;
495 } else {
053a659d
SH
496 sb->st_uid = k->uid;
497 sb->st_gid = k->gid;
498 }
758ad80c
SH
499 sb->st_nlink = 2;
500 return 0;
501 }
502
503 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
504 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
505 return -ENOENT;
758ad80c 506 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 507 return -EACCES;
758ad80c 508
758ad80c 509 sb->st_mode = S_IFREG | k->mode;
053a659d 510 sb->st_nlink = 1;
758ad80c
SH
511 sb->st_uid = k->uid;
512 sb->st_gid = k->gid;
99978832 513 sb->st_size = get_file_size(controller, path1, path2);
758ad80c
SH
514 return 0;
515 }
516
ab54b798 517 return -ENOENT;
758ad80c 518}
2183082c 519
7f163b71
SH
520/*
521 * TODO - cache these results in a table for use in opendir, free
522 * in releasedir
523 */
758ad80c 524static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 525{
7f163b71
SH
526 struct fuse_context *fc = fuse_get_context();
527 nih_local struct cgm_keys **list = NULL;
528 const char *cgroup;
529 nih_local char *controller = NULL;
7f163b71
SH
530 nih_local char *nextcg = NULL;
531
532 if (!fc)
533 return -EIO;
534
535 if (strcmp(path, "/cgroup") == 0)
536 return 0;
537
538 // return list of keys for the controller, and list of child cgroups
539 controller = pick_controller_from_path(fc, path);
540 if (!controller)
541 return -EIO;
542
543 cgroup = find_cgroup_in_path(path);
544 if (!cgroup) {
545 /* this is just /cgroup/controller, return its contents */
546 cgroup = "/";
547 }
548
549 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
550 return -EACCES;
758ad80c
SH
551 return 0;
552}
553
758ad80c
SH
554static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
555 struct fuse_file_info *fi)
556{
557 struct fuse_context *fc = fuse_get_context();
558
559 if (!fc)
560 return -EIO;
561
562 if (strcmp(path, "/cgroup") == 0) {
563 // get list of controllers
564 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
565 int i;
566
567 if (!list)
568 return -EIO;
7f163b71 569
758ad80c
SH
570 for (i = 0; list[i]; i++) {
571 if (filler(buf, list[i], NULL, 0) != 0) {
572 return -EIO;
573 }
574 }
575 return 0;
576 }
577
578 // return list of keys for the controller, and list of child cgroups
579 nih_local struct cgm_keys **list = NULL;
580 const char *cgroup;
581 nih_local char *controller = NULL;
582 int i;
3db25a35 583 nih_local char *nextcg = NULL;
758ad80c
SH
584
585 controller = pick_controller_from_path(fc, path);
586 if (!controller)
587 return -EIO;
588
589 cgroup = find_cgroup_in_path(path);
590 if (!cgroup) {
591 /* this is just /cgroup/controller, return its contents */
592 cgroup = "/";
593 }
594
595 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 596 return -EACCES;
758ad80c
SH
597
598 if (!cgm_list_keys(controller, cgroup, &list))
3db25a35 599 // not a valid cgroup
758ad80c 600 return -EINVAL;
3db25a35
SH
601
602 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &nextcg)) {
603 if (nextcg) {
604 int ret;
605 ret = filler(buf, nextcg, NULL, 0);
606 if (ret != 0)
607 return -EIO;
608 }
609 return 0;
610 }
611
758ad80c 612 for (i = 0; list[i]; i++) {
758ad80c
SH
613 if (filler(buf, list[i]->name, NULL, 0) != 0) {
614 return -EIO;
615 }
616 }
617
618 // now get the list of child cgroups
619 nih_local char **clist;
620
621 if (!cgm_list_children(controller, cgroup, &clist))
622 return 0;
623 for (i = 0; clist[i]; i++) {
758ad80c
SH
624 if (filler(buf, clist[i], NULL, 0) != 0) {
625 return -EIO;
626 }
627 }
628 return 0;
629}
630
/* FUSE releasedir for /cgroup: nothing to release; always returns 0. */
static int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void) path;
	(void) fi;

	return 0;
}
635
26faa701
SH
636/*
637 * TODO - cache info here for read/write, release in cg_release.
638 */
99978832
SH
639static int cg_open(const char *path, struct fuse_file_info *fi)
640{
641 nih_local char *controller = NULL;
642 const char *cgroup;
643 char *fpath = NULL, *path1, *path2;
644 nih_local char * cgdir = NULL;
645 nih_local struct cgm_keys *k = NULL;
646 struct fuse_context *fc = fuse_get_context();
647
648 if (!fc)
649 return -EIO;
650
651 controller = pick_controller_from_path(fc, path);
652 if (!controller)
653 return -EIO;
654 cgroup = find_cgroup_in_path(path);
655 if (!cgroup)
656 return -EINVAL;
657
658 get_cgdir_and_path(cgroup, &cgdir, &fpath);
659 if (!fpath) {
660 path1 = "/";
661 path2 = cgdir;
662 } else {
663 path1 = cgdir;
664 path2 = fpath;
665 }
666
667 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
668 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
f9a05025
SH
669 // should never get here
670 return -EACCES;
99978832 671
99978832
SH
672 return 0;
673 }
674
675 return -EINVAL;
676}
677
a05660a6
SH
/*
 * Wait up to 2 seconds in select() for sockfd, then do a non-blocking
 * recv() of at most len bytes into buf.
 *
 * Returns -1 if select() fails, otherwise whatever recv() returns (the
 * recv is attempted even when select() timed out).
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) < 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
692
693static bool send_creds(int sock, struct ucred *cred, char v)
694{
695 struct msghdr msg = { 0 };
696 struct iovec iov;
697 struct cmsghdr *cmsg;
698 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
699 char buf[1];
700 buf[0] = 'p';
701
702 if (msgrecv(sock, buf, 1) != 1) {
703 printf("%s: Error getting reply from server over socketpair",
704 __func__);
705 return false;
706 }
707
708 msg.msg_control = cmsgbuf;
709 msg.msg_controllen = sizeof(cmsgbuf);
710
711 cmsg = CMSG_FIRSTHDR(&msg);
712 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
713 cmsg->cmsg_level = SOL_SOCKET;
714 cmsg->cmsg_type = SCM_CREDENTIALS;
715 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
716
717 msg.msg_name = NULL;
718 msg.msg_namelen = 0;
719
720 buf[0] = v;
721 iov.iov_base = buf;
722 iov.iov_len = sizeof(buf);
723 msg.msg_iov = &iov;
724 msg.msg_iovlen = 1;
725
726 if (sendmsg(sock, &msg, 0) < 0) {
727 printf("%s: failed at sendmsg: %s", __func__,
728 strerror(errno));
729 if (errno == 3)
730 return true;
731 return false;
732 }
733
734 return true;
735}
736
737static bool recv_creds(int sock, struct ucred *cred, char *v)
738{
739 struct msghdr msg = { 0 };
740 struct iovec iov;
741 struct cmsghdr *cmsg;
742 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
743 char buf[1];
744 int ret;
745 int optval = 1;
746
747 *v = '1';
748
749 cred->pid = -1;
750 cred->uid = -1;
751 cred->gid = -1;
752
753 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
754 printf("Failed to set passcred: %s", strerror(errno));
755 return false;
756 }
757 buf[0] = '1';
758 if (write(sock, buf, 1) != 1) {
759 printf("Failed to start write on scm fd: %s", strerror(errno));
760 return false;
761 }
762
763 msg.msg_name = NULL;
764 msg.msg_namelen = 0;
765 msg.msg_control = cmsgbuf;
766 msg.msg_controllen = sizeof(cmsgbuf);
767
768 iov.iov_base = buf;
769 iov.iov_len = sizeof(buf);
770 msg.msg_iov = &iov;
771 msg.msg_iovlen = 1;
772
773 // retry logic is not ideal, especially as we are not
774 // threaded. Sleep at most 1 second waiting for the client
775 // to send us the scm_cred
776 ret = recvmsg(sock, &msg, 0);
777 if (ret < 0) {
778 printf("Failed to receive scm_cred: %s",
779 strerror(errno));
780 return false;
781 }
782
783 cmsg = CMSG_FIRSTHDR(&msg);
784
785 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
786 cmsg->cmsg_level == SOL_SOCKET &&
787 cmsg->cmsg_type == SCM_CREDENTIALS) {
788 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
789 }
790 *v = buf[0];
791
792 return true;
793}
794
795
796/*
797 * pidreader - reads pids from a ucred over a socket, then writes the
798 * int value back over the socket
799 */
800static void pidreader(int sock, pid_t tpid)
801{
802 char v = '0';
803 struct ucred cred;
804
805 while (recv_creds(sock, &cred, &v)) {
806 if (v == '1')
807 exit(0);
808 printf("CCC: child received %d\n", cred.pid);
809 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
810 exit(1);
811 }
812 exit(0);
813}
814
/*
 * pidreader_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pidreader_wrapper does the setns into tpid's pid
 * namespace, then forks a child which runs pidreader() to actually
 * convert pids.
 *
 * Never returns: exits 0 if the forked child exited cleanly, 1 on any
 * failure (open, setns, fork, or an unclean child exit).
 */
static void pidreader_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int newnsfd;
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int) tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();
	if (cpid < 0)
		exit(1);
	if (!cpid)
		pidreader(sock, tpid);

	/* wait_for_pid() returns 0 on a clean exit and -1 otherwise */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
845
846/*
847 * To read cgroup files with a particular pid, we will setns into the child
848 * pidns, open a pipe, fork a child - which will be the first to really be in
849 * the child ns - which does the cgm_get_value and writes the data to the pipe.
850 */
851static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
852{
853 int sock[2] = {-1, -1};
854 nih_local char *tmpdata = NULL;
855 int ret;
856 pid_t qpid, cpid = -1;
857 bool answer = false;
858 char v = '0';
859 struct ucred cred;
860 struct timeval tv;
861 fd_set s;
862
863 if (!cgm_get_value(contrl, cg, file, &tmpdata))
864 return false;
865
866 /*
867 * Now we read the pids from returned data one by one, pass
868 * them into a child in the target namespace, read back the
869 * translated pids, and put them into our to-return data
870 */
871
872 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
873 perror("socketpair");
874 exit(1);
875 }
876
877 cpid = fork();
878 if (cpid == -1)
879 goto out;
880
881 if (!cpid) // child
882 pidreader_wrapper(sock[1], tpid);
883
884 char *ptr = tmpdata;
885 cred.uid = 0;
886 cred.gid = 0;
887 while (sscanf(ptr, "%d\n", &qpid) == 1) {
888 cred.pid = qpid;
889 printf("AAA: sending %d\n", qpid);
890 if (!send_creds(sock[0], &cred, v))
891 goto out;
892
893 // read converted results
894 FD_ZERO(&s);
895 FD_SET(sock[0], &s);
896 tv.tv_sec = 1;
897 tv.tv_usec = 0;
898 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
899 if (ret <= 0) {
900 kill(cpid, SIGTERM);
901 goto out;
902 }
903 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
904 kill(cpid, SIGTERM);
905 perror("read");
906 goto out;
907 }
908 printf("BBB: read %d\n", qpid);
909 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
910 ptr = strchr(ptr, '\n');
911 if (!ptr)
912 break;
913 ptr++;
914 }
915
916 cred.pid = getpid();
917 v = '1';
918 if (!send_creds(sock[0], &cred, v)) {
919 // failed to ask child to exit
920 kill(cpid, SIGTERM);
921 goto out;
922 }
923
924 answer = true;
925
926out:
927 if (cpid != -1)
928 wait_for_pid(cpid);
929 if (sock[0] != -1) {
930 close(sock[0]);
931 close(sock[1]);
932 }
933 return answer;
934}
935
99978832
SH
936static int cg_read(const char *path, char *buf, size_t size, off_t offset,
937 struct fuse_file_info *fi)
938{
939 nih_local char *controller = NULL;
940 const char *cgroup;
941 char *fpath = NULL, *path1, *path2;
942 struct fuse_context *fc = fuse_get_context();
943 nih_local char * cgdir = NULL;
944 nih_local struct cgm_keys *k = NULL;
945
946 if (offset)
947 return -EIO;
948
949 if (!fc)
950 return -EIO;
951
952 controller = pick_controller_from_path(fc, path);
953 if (!controller)
f9a05025 954 return -EINVAL;
99978832
SH
955 cgroup = find_cgroup_in_path(path);
956 if (!cgroup)
957 return -EINVAL;
958
959 get_cgdir_and_path(cgroup, &cgdir, &fpath);
960 if (!fpath) {
961 path1 = "/";
962 path2 = cgdir;
963 } else {
964 path1 = cgdir;
965 path2 = fpath;
966 }
967
968 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
969 nih_local char *data = NULL;
a05660a6 970 int s, ret;
99978832 971
2ad6d2bd 972 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025
SH
973 // should never get here
974 return -EACCES;
99978832 975
a05660a6
SH
976 printf("XXX path2 is .%s.\n", path2);
977 if (strcmp(path2, "tasks") == 0 ||
978 strcmp(path2, "/tasks") == 0 ||
979 strcmp(path2, "/cgroup.procs") == 0 ||
980 strcmp(path2, "cgroup.procs") == 0)
981 // special case - we have to translate the pids
982 ret = do_read_pids(fc->pid, controller, path1, path2, &data);
983 else
984 ret = cgm_get_value(controller, path1, path2, &data);
985
986 if (ret == 0)
99978832
SH
987 return -EINVAL;
988
989 s = strlen(data);
990 if (s > size)
991 s = size;
992 memcpy(buf, data, s);
993
99978832
SH
994 return s;
995 }
996
997 return -EINVAL;
998}
999
2ad6d2bd
SH
1000int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1001 struct fuse_file_info *fi)
1002{
1003 nih_local char *controller = NULL;
1004 const char *cgroup;
1005 char *fpath = NULL, *path1, *path2;
1006 struct fuse_context *fc = fuse_get_context();
1007 nih_local char * cgdir = NULL;
1008 nih_local struct cgm_keys *k = NULL;
1009
2ad6d2bd 1010 if (offset)
f9a05025 1011 return -EINVAL;
2ad6d2bd
SH
1012
1013 if (!fc)
1014 return -EIO;
1015
1016 controller = pick_controller_from_path(fc, path);
1017 if (!controller)
f9a05025 1018 return -EINVAL;
2ad6d2bd
SH
1019 cgroup = find_cgroup_in_path(path);
1020 if (!cgroup)
1021 return -EINVAL;
1022
1023 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1024 if (!fpath) {
1025 path1 = "/";
1026 path2 = cgdir;
1027 } else {
1028 path1 = cgdir;
1029 path2 = fpath;
1030 }
1031
1032 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
1033 if (!fc_may_access(fc, controller, path1, path2, O_WRONLY))
f9a05025 1034 return -EACCES;
2ad6d2bd
SH
1035
1036 if (!cgm_set_value(controller, path1, path2, buf))
1037 return -EINVAL;
1038
1039 return size;
1040 }
1041
1042 return -EINVAL;
1043}
1044
341b21ad
SH
1045int cg_chown(const char *path, uid_t uid, gid_t gid)
1046{
1047 struct fuse_context *fc = fuse_get_context();
1048 nih_local char * cgdir = NULL;
1049 char *fpath = NULL, *path1, *path2;
1050 nih_local struct cgm_keys *k = NULL;
1051 const char *cgroup;
1052 nih_local char *controller = NULL;
1053
1054
1055 if (!fc)
1056 return -EIO;
1057
1058 if (strcmp(path, "/cgroup") == 0)
1059 return -EINVAL;
1060
1061 controller = pick_controller_from_path(fc, path);
1062 if (!controller)
f9a05025 1063 return -EINVAL;
341b21ad
SH
1064 cgroup = find_cgroup_in_path(path);
1065 if (!cgroup)
1066 /* this is just /cgroup/controller */
1067 return -EINVAL;
1068
1069 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1070
1071 if (!fpath) {
1072 path1 = "/";
1073 path2 = cgdir;
1074 } else {
1075 path1 = cgdir;
1076 path2 = fpath;
1077 }
1078
1079 if (is_child_cgroup(controller, path1, path2)) {
1080 // get uid, gid, from '/tasks' file and make up a mode
1081 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1082 k = get_cgroup_key(controller, cgroup, "tasks");
1083
1084 } else
1085 k = get_cgroup_key(controller, path1, path2);
1086
1087 if (!k)
1088 return -EINVAL;
1089
1090 /*
1091 * This being a fuse request, the uid and gid must be valid
1092 * in the caller's namespace. So we can just check to make
1093 * sure that the caller is root in his uid, and privileged
1094 * over the file's current owner.
1095 */
1096 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1097 return -EACCES;
341b21ad
SH
1098
1099 if (!cgm_chown_file(controller, cgroup, uid, gid))
1100 return -EINVAL;
1101 return 0;
1102}
2ad6d2bd 1103
fd2e4e03
SH
1104int cg_chmod(const char *path, mode_t mode)
1105{
0a1bb5ea
SH
1106 struct fuse_context *fc = fuse_get_context();
1107 nih_local char * cgdir = NULL;
1108 char *fpath = NULL, *path1, *path2;
1109 nih_local struct cgm_keys *k = NULL;
1110 const char *cgroup;
1111 nih_local char *controller = NULL;
1112
1113 if (!fc)
1114 return -EIO;
1115
1116 if (strcmp(path, "/cgroup") == 0)
1117 return -EINVAL;
1118
1119 controller = pick_controller_from_path(fc, path);
1120 if (!controller)
f9a05025 1121 return -EINVAL;
0a1bb5ea
SH
1122 cgroup = find_cgroup_in_path(path);
1123 if (!cgroup)
1124 /* this is just /cgroup/controller */
1125 return -EINVAL;
1126
1127 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1128
1129 if (!fpath) {
1130 path1 = "/";
1131 path2 = cgdir;
1132 } else {
1133 path1 = cgdir;
1134 path2 = fpath;
1135 }
1136
1137 if (is_child_cgroup(controller, path1, path2)) {
1138 // get uid, gid, from '/tasks' file and make up a mode
1139 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1140 k = get_cgroup_key(controller, cgroup, "tasks");
1141
1142 } else
1143 k = get_cgroup_key(controller, path1, path2);
1144
1145 if (!k)
1146 return -EINVAL;
1147
1148 /*
1149 * This being a fuse request, the uid and gid must be valid
1150 * in the caller's namespace. So we can just check to make
1151 * sure that the caller is root in his uid, and privileged
1152 * over the file's current owner.
1153 */
1154 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1155 return -EPERM;
1156
1157 if (!cgm_chmod_file(controller, cgroup, mode))
1158 return -EINVAL;
1159 return 0;
fd2e4e03
SH
1160}
1161
ab54b798
SH
1162int cg_mkdir(const char *path, mode_t mode)
1163{
1164 struct fuse_context *fc = fuse_get_context();
1165 nih_local struct cgm_keys **list = NULL;
1166 char *fpath = NULL, *path1;
1167 nih_local char * cgdir = NULL;
1168 const char *cgroup;
1169 nih_local char *controller = NULL;
1170
ab54b798
SH
1171 if (!fc)
1172 return -EIO;
1173
1174
1175 controller = pick_controller_from_path(fc, path);
1176 if (!controller)
f9a05025 1177 return -EINVAL;
ab54b798
SH
1178
1179 cgroup = find_cgroup_in_path(path);
1180 if (!cgroup)
f9a05025 1181 return -EINVAL;
ab54b798
SH
1182
1183 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1184 if (!fpath)
1185 path1 = "/";
1186 else
1187 path1 = cgdir;
1188
1189 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1190 return -EACCES;
ab54b798
SH
1191
1192
1193 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1194 return -EINVAL;
1195
1196 return 0;
1197}
1198
50d8d5b5
SH
1199static int cg_rmdir(const char *path)
1200{
1201 struct fuse_context *fc = fuse_get_context();
1202 nih_local struct cgm_keys **list = NULL;
1203 char *fpath = NULL;
1204 nih_local char * cgdir = NULL;
1205 const char *cgroup;
1206 nih_local char *controller = NULL;
1207
1208 if (!fc)
1209 return -EIO;
1210
1211
1212 controller = pick_controller_from_path(fc, path);
1213 if (!controller)
f9a05025 1214 return -EINVAL;
50d8d5b5
SH
1215
1216 cgroup = find_cgroup_in_path(path);
1217 if (!cgroup)
f9a05025 1218 return -EINVAL;
50d8d5b5
SH
1219
1220 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1221 if (!fpath)
1222 return -EINVAL;
1223
1224 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1225 return -EACCES;
50d8d5b5
SH
1226
1227 if (!cgm_remove(controller, cgroup))
1228 return -EINVAL;
1229
1230 return 0;
1231}
1232
2dc17609
SH
/* Return true if line begins with pref (always true for an empty pref). */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1239
/*
 * Extract the cached-memory figure from the text of a memory.stat key.
 *
 * memstat is scanned line by line for the first line beginning with
 * "total_cache"; the number following it is divided by 1024 and stored
 * in *v.  *v is 0 if no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1257
1258static char *get_pid_cgroup(pid_t pid, const char *contrl)
1259{
1260 nih_local char *fnam = NULL;
1261 FILE *f;
1262 char *answer = NULL;
1263 char *line = NULL;
1264 size_t len = 0;
1265
1266 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1267 if (!(f = fopen(fnam, "r")))
1268 return false;
1269
1270 while (getline(&line, &len, f) != -1) {
1271 char *c1, *c2;
1272 if (!line[0])
1273 continue;
1274 c1 = strchr(line, ':');
1275 if (!c1)
1276 goto out;
1277 c1++;
1278 c2 = strchr(c1, ':');
1279 if (!c2)
1280 goto out;
1281 *c2 = '\0';
1282 if (strcmp(c1, contrl) != 0)
1283 continue;
1284 c2++;
1285 stripnewline(c2);
1286 answer = NIH_MUST( nih_strdup(NULL, c2) );
1287 goto out;
1288 }
1289
1290out:
1291 fclose(f);
1292 free(line);
1293 return answer;
1294}
1295
758ad80c 1296/*
2ad6d2bd 1297 * FUSE ops for /proc
758ad80c 1298 */
758ad80c 1299
23ce2127
SH
1300static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1301 struct fuse_file_info *fi)
1302{
2dc17609
SH
1303 struct fuse_context *fc = fuse_get_context();
1304 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1305 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1306 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1307 char *line = NULL;
1308 size_t linelen = 0, total_len = 0;
1309 FILE *f;
1310
1311 if (offset)
1312 return -EINVAL;
1313
1314 if (!cg)
1315 return 0;
1316
1317 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1318 return 0;
1319 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1320 return 0;
1321 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1322 return 0;
1323 memlimit = strtoul(memlimit_str, NULL, 10);
1324 memusage = strtoul(memusage_str, NULL, 10);
1325 memlimit /= 1024;
1326 memusage /= 1024;
1327 get_mem_cached(memstat_str, &cached);
1328
1329 f = fopen("/proc/meminfo", "r");
1330 if (!f)
1331 return 0;
1332
1333 while (getline(&line, &linelen, f) != -1) {
1334 size_t l;
1335 char *printme, lbuf[100];
1336
1337 memset(lbuf, 0, 100);
1338 if (startswith(line, "MemTotal:")) {
1339 sscanf(line+14, "%lu", &hosttotal);
1340 if (hosttotal < memlimit)
1341 memlimit = hosttotal;
1342 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1343 printme = lbuf;
1344 } else if (startswith(line, "MemFree:")) {
1345 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1346 printme = lbuf;
1347 } else if (startswith(line, "MemAvailable:")) {
1348 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1349 printme = lbuf;
1350 } else if (startswith(line, "Buffers:")) {
1351 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1352 printme = lbuf;
1353 } else if (startswith(line, "Cached:")) {
1354 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1355 printme = lbuf;
1356 } else if (startswith(line, "SwapCached:")) {
1357 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1358 printme = lbuf;
1359 } else
1360 printme = line;
1361 l = snprintf(buf, size, "%s", printme);
1362 buf += l;
1363 size -= l;
1364 total_len += l;
1365 }
1366
1367 return total_len;
23ce2127
SH
1368}
1369
/*
 * Fetch the cpuset.cpus value of cgroup cg from the cpuset controller.
 * Returns the string handed back by cgm_get_value() (the original comment
 * states it is nih-allocated), or NULL when the lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1382
/*
 * Helper for walking a comma-delimited cpuset string.
 *
 * Given a pointer c to the start of a token, return a pointer to the
 * character following the next ',' found after the first character of c,
 * or NULL when there is no further token. An empty string yields NULL
 * instead of scanning beyond its terminator.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	if (*c == '\0')
		return NULL;

	r = strchr(c + 1, ',');
	return r ? r + 1 : NULL;
}
1393
/*
 * Parse the token at c as "A-B" or as a lone "A".
 * Returns the sscanf() conversion count: 2 when both *a and *b were set,
 * 1 when only *a was set, and 0 or EOF when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1401
/*
 * Report whether cpu is a member of cpuset.
 *
 * cpusets are in format "1,2-3,4", iow comma-delimited entries where each
 * entry is either a single cpu number or an inclusive range.
 *
 * Every entry is examined in turn: a single-number entry that does not match
 * does not end the search, so later entries are still considered. An entry
 * that cannot be parsed at all means the cpuset is malformed and the result
 * is false. A NULL cpuset also yields false.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c = cpuset;

	while (c) {
		const char *comma;
		int a, b;
		int ret = sscanf(c, "%d-%d", &a, &b);

		if (ret < 1)	/* bad cpuset! */
			return false;
		if (ret == 1 && cpu == a)	/* single cpu, e.g. "1" */
			return true;
		if (ret == 2 && cpu >= a && cpu <= b)	/* range, e.g. "2-3" */
			return true;

		/* move on to the entry after the next comma, if any */
		comma = strchr(c, ',');
		c = comma ? comma + 1 : NULL;
	}

	return false;
}
1424
aeb56147
SH
/*
 * Given a "processor : N" line, report whether cpu N belongs to cpuset.
 * Lines that do not parse as a processor line yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	bool parsed = sscanf(line, "processor : %d", &cpu) == 1;

	return parsed && cpu_in_cpuset(cpu, cpuset);
}
1433
23ce2127
SH
/*
 * Check whether line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether a cpu number can be parsed from it with that pattern.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
1445
23ce2127
SH
1446static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1447 struct fuse_file_info *fi)
1448{
1449 struct fuse_context *fc = fuse_get_context();
1450 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1451 nih_local char *cpuset = NULL;
1452 char *line = NULL;
1453 size_t linelen = 0, total_len = 0;
1454 bool am_printing = false;
1455 int curcpu = -1;
1456 FILE *f;
1457
1458 if (offset)
1459 return -EINVAL;
1460
1461 if (!cg)
1462 return 0;
1463
1464 cpuset = get_cpuset(cg);
1465 if (!cpuset)
1466 return 0;
1467
1468 f = fopen("/proc/cpuinfo", "r");
1469 if (!f)
1470 return 0;
1471
1472 while (getline(&line, &linelen, f) != -1) {
1473 size_t l;
1474 if (is_processor_line(line)) {
aeb56147 1475 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1476 if (am_printing) {
1477 curcpu ++;
1478 l = snprintf(buf, size, "processor : %d\n", curcpu);
1479 buf += l;
1480 size -= l;
1481 total_len += l;
1482 }
1483 continue;
1484 }
1485 if (am_printing) {
1486 l = snprintf(buf, size, "%s", line);
1487 buf += l;
1488 size -= l;
1489 total_len += l;
1490 }
1491 }
1492
1493 return total_len;
1494}
1495
1496static int proc_stat_read(char *buf, size_t size, off_t offset,
1497 struct fuse_file_info *fi)
1498{
aeb56147
SH
1499 struct fuse_context *fc = fuse_get_context();
1500 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1501 nih_local char *cpuset = NULL;
1502 char *line = NULL;
1503 size_t linelen = 0, total_len = 0;
1504 int curcpu = 0;
1505 FILE *f;
1506
1507 if (offset)
1508 return -EINVAL;
1509
1510 if (!cg)
1511 return 0;
1512
1513 cpuset = get_cpuset(cg);
1514 if (!cpuset)
1515 return 0;
1516
1517 f = fopen("/proc/stat", "r");
1518 if (!f)
1519 return 0;
1520
1521 while (getline(&line, &linelen, f) != -1) {
1522 size_t l;
1523 int cpu;
1524 char *c;
1525
1526 if (sscanf(line, "cpu%d", &cpu) != 1) {
1527 /* not a ^cpu line, just print it */
1528 l = snprintf(buf, size, "%s", line);
1529 buf += l;
1530 size -= l;
1531 total_len += l;
1532 continue;
1533 }
1534 if (!cpu_in_cpuset(cpu, cpuset))
1535 continue;
1536 curcpu ++;
1537
1538 c = strchr(line, ' ');
1539 if (!c)
1540 continue;
1541 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1542 buf += l;
1543 size -= l;
1544 total_len += l;
1545 }
1546
1547 return total_len;
23ce2127
SH
1548}
1549
7bbf2246
SH
1550/*
1551 * How to guess what to present for uptime?
1552 * One thing we could do would be to take the date on the caller's
1553 * memory.usage_in_bytes file, which should equal the time of creation
1554 * of his cgroup. However, a task could be in a sub-cgroup of the
1555 * container. The same problem exists if we try to look at the ages
1556 * of processes in the caller's cgroup.
1557 *
1558 * So we'll fork a task that will enter the caller's pidns, mount a
1559 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1560 *
1561 * For the second uptime #, we'll do as Stéphane had done, just copy
1562 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1563 * time. Maybe someone can come up with a good algorithm and submit a
1564 * patch. Maybe something based on cpushare info?
1565 */
41bb9357
SH
1566
/*
 * Return the age, in seconds, of the reaper (pid 1) of the pid namespace
 * that task pid lives in, taken from the ctime of its /proc/1 directory.
 *
 * Steps: unshare the mount namespace, setns() into pid's pid namespace,
 * then fork. The forking process only waits for its child and exits with
 * status 0 - it never returns to the caller. The forked child detaches
 * /proc, mounts a fresh procfs there and stats /proc/1; only that child
 * returns the value. getreaperage() therefore runs this in a process it
 * forked itself.
 *
 * Returns 0 on any failure.
 */
static long int get_pid1_time(pid_t pid)
{
	char nspath[100];
	struct stat st;
	pid_t child;
	int nsfd;

	if (unshare(CLONE_NEWNS) != 0)
		return 0;

	sprintf(nspath, "/proc/%d/ns/pid", pid);
	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(nsfd, 0) != 0) {
		perror("get_pid1_time setns 1");
		close(nsfd);
		return 0;
	}
	close(nsfd);

	child = fork();
	if (child < 0)
		return 0;

	if (child > 0) {
		/* the child reports the result; we only reap it */
		wait_for_pid(child);
		exit(0);
	}

	umount2("/proc", MNT_DETACH);

	if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
		perror("get_pid1_time mount");
		return 0;
	}
	if (lstat("/proc/1", &st) != 0) {
		perror("get_pid1_time lstat");
		return 0;
	}

	return time(NULL) - st.st_ctime;
}
1614
/*
 * Return the age in seconds of the reaper of qpid's pid namespace.
 *
 * A child process is forked to run get_pid1_time(qpid) and send the result
 * back over a pipe; this process waits up to one second for the answer and
 * then reaps the child.
 *
 * Returns 0 when the pipe or fork cannot be created, when the wait times
 * out, or when the answer cannot be read.
 */
static long int getreaperage(pid_t qpid)
{
	int mypipe[2], ret;
	pid_t pid;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe))
		return 0;

	pid = fork();
	if (pid < 0) {
		/*
		 * No child exists: release the pipe and bail out. Continuing
		 * would call waitpid(-1), which waits for any child.
		 */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	/* parent: keep only the read end and wait at most 1s for data */
	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret == -1) {
		perror("select");
		goto out;
	}
	if (!ret) {
		printf("timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1660
/*
 * Return the second field of /proc/uptime (the idle time), truncated to
 * whole seconds, or 0 when the file cannot be opened or parsed.
 *
 * Both fields carry a fractional part (e.g. "350735.47 234388.90"), so they
 * are read as floating point; an integer conversion stops at the first '.'
 * and then fails to match the second field.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int nread;

	if (!f)
		return 0;

	nread = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);

	if (nread != 2)
		return 0;
	return (long int)idle;
}
1671
1672/*
1673 * We read /proc/uptime and reuse its second field.
1674 * For the first field, we use the mtime for the reaper for
1675 * the calling pid as returned by getreaperage
1676 */
23ce2127
SH
1677static int proc_uptime_read(char *buf, size_t size, off_t offset,
1678 struct fuse_file_info *fi)
1679{
41bb9357
SH
1680 struct fuse_context *fc = fuse_get_context();
1681 long int reaperage = getreaperage(fc->pid);;
1682 long int idletime = getprocidle();
1683
1684 if (offset)
1685 return -EINVAL;
1686 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
23ce2127
SH
1687}
1688
/*
 * Return the total number of bytes in the file named which, obtained by
 * reading it line by line, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t len = 0;
	ssize_t sz, answer = 0;

	if (!f)
		return 0;

	while ((sz = getline(&line, &len, f)) != -1)
		answer += sz;

	fclose(f);
	free(line);	/* getline() allocated this buffer */

	return answer;
}
1704
758ad80c
SH
/*
 * FUSE getattr handler for /proc and the files beneath it.
 *
 * sb is zeroed, owned by uid/gid 0 and stamped with the current time.
 * "/proc" is a 0555 directory; meminfo, cpuinfo, uptime and stat are 0444
 * regular files whose size is whatever get_procfile_size() reports for the
 * same path.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read, and -ENOENT
 * for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) != 0)
			continue;
		sb->st_size = get_procfile_size(path);
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
1732
1733static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1734 struct fuse_file_info *fi)
1735{
1736 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
1737 filler(buf, "meminfo", NULL, 0) != 0 ||
1738 filler(buf, "stat", NULL, 0) != 0 ||
1739 filler(buf, "uptime", NULL, 0) != 0)
758ad80c 1740 return -EINVAL;
758ad80c
SH
1741 return 0;
1742}
1743
35629743
SH
/*
 * FUSE open handler for /proc files. Succeeds (returns 0) for meminfo,
 * cpuinfo, uptime and stat; any other path yields -ENOENT. No per-open
 * state is recorded in fi.
 */
static int proc_open(const char *path, struct fuse_file_info *fi)
{
	static const char *const known[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	size_t i;

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		if (strcmp(path, known[i]) == 0)
			return 0;
	}
	return -ENOENT;
}
1753
35629743
SH
/*
 * FUSE read handler for /proc files: hands the request to the reader that
 * matches path. Returns that reader's result, or -EINVAL for unknown paths.
 */
static int proc_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	int rc = -EINVAL;

	if (strcmp(path, "/proc/meminfo") == 0)
		rc = proc_meminfo_read(buf, size, offset, fi);
	else if (strcmp(path, "/proc/cpuinfo") == 0)
		rc = proc_cpuinfo_read(buf, size, offset, fi);
	else if (strcmp(path, "/proc/uptime") == 0)
		rc = proc_uptime_read(buf, size, offset, fi);
	else if (strcmp(path, "/proc/stat") == 0)
		rc = proc_stat_read(buf, size, offset, fi);

	return rc;
}
1767
2ad6d2bd
SH
1768/*
1769 * FUSE ops for /
1770 * these just delegate to the /proc and /cgroup ops as
1771 * needed
1772 */
758ad80c
SH
1773
1774static int lxcfs_getattr(const char *path, struct stat *sb)
1775{
1776 if (strcmp(path, "/") == 0) {
1777 sb->st_mode = S_IFDIR | 00755;
1778 sb->st_nlink = 2;
1779 return 0;
1780 }
1781 if (strncmp(path, "/cgroup", 7) == 0) {
1782 return cg_getattr(path, sb);
1783 }
35629743 1784 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
1785 return proc_getattr(path, sb);
1786 }
1787 return -EINVAL;
1788}
1789
/*
 * Top-level FUSE opendir. "/" and "/proc" need no state and always succeed;
 * paths starting with "/cgroup" go to cg_opendir(). Anything else yields
 * -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
1802
1803static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1804 struct fuse_file_info *fi)
1805{
1806 if (strcmp(path, "/") == 0) {
1807 if (filler(buf, "proc", NULL, 0) != 0 ||
1808 filler(buf, "cgroup", NULL, 0) != 0)
1809 return -EINVAL;
1810 return 0;
1811 }
35629743 1812 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 1813 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
1814 if (strcmp(path, "/proc") == 0)
1815 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
1816 return -EINVAL;
1817}
1818
/*
 * Top-level FUSE releasedir. Nothing to release for "/" and "/proc"; paths
 * starting with "/cgroup" go to cg_releasedir(). Anything else yields
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
1830
99978832
SH
/*
 * Top-level FUSE open: delegates to cg_open() for paths starting with
 * "/cgroup" and to proc_open() for paths starting with "/proc"; -EINVAL
 * otherwise.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_open(path, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;
	return proc_open(path, fi);
}
1840
/*
 * Top-level FUSE read: delegates to cg_read() for paths starting with
 * "/cgroup" and to proc_read() for paths starting with "/proc"; -EINVAL
 * otherwise.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;
	return proc_read(path, buf, size, offset, fi);
}
1851
2ad6d2bd
SH
/*
 * Top-level FUSE write: only paths starting with "/cgroup" are writable and
 * are handled by cg_write(); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
1861
99978832
SH
/* FUSE flush handler: there is nothing to flush, so always report success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
1866
/* FUSE release handler: no per-open state is kept, so always report success. */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
1871
/* FUSE fsync handler: nothing is buffered here, so always report success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
1876
ab54b798
SH
/*
 * Top-level FUSE mkdir: directories can only be created under "/cgroup",
 * via cg_mkdir(); everything else yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
1884
341b21ad
SH
/*
 * Top-level FUSE chown: only paths starting with "/cgroup" are supported,
 * via cg_chown(); everything else yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
1892
2ad6d2bd
SH
/*
 * Top-level FUSE truncate.
 *
 * cat truncates a file before calling ops->write, which is meaningless for
 * cgroup files. For paths starting with "/cgroup" the request is therefore
 * accepted and ignored; every other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
1904
50d8d5b5
SH
/*
 * Top-level FUSE rmdir: only paths starting with "/cgroup" are supported,
 * via cg_rmdir(); everything else yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
1911
fd2e4e03
SH
/*
 * Top-level FUSE chmod: only paths starting with "/cgroup" are supported,
 * via cg_chmod(); everything else yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
1918
758ad80c
SH
1919const struct fuse_operations lxcfs_ops = {
1920 .getattr = lxcfs_getattr,
1921 .readlink = NULL,
1922 .getdir = NULL,
1923 .mknod = NULL,
ab54b798 1924 .mkdir = lxcfs_mkdir,
758ad80c 1925 .unlink = NULL,
50d8d5b5 1926 .rmdir = lxcfs_rmdir,
758ad80c
SH
1927 .symlink = NULL,
1928 .rename = NULL,
1929 .link = NULL,
fd2e4e03 1930 .chmod = lxcfs_chmod,
341b21ad 1931 .chown = lxcfs_chown,
2ad6d2bd 1932 .truncate = lxcfs_truncate,
758ad80c 1933 .utime = NULL,
99978832
SH
1934
1935 .open = lxcfs_open,
1936 .read = lxcfs_read,
1937 .release = lxcfs_release,
2ad6d2bd 1938 .write = lxcfs_write,
99978832 1939
758ad80c 1940 .statfs = NULL,
99978832
SH
1941 .flush = lxcfs_flush,
1942 .fsync = lxcfs_fsync,
758ad80c
SH
1943
1944 .setxattr = NULL,
1945 .getxattr = NULL,
1946 .listxattr = NULL,
1947 .removexattr = NULL,
1948
1949 .opendir = lxcfs_opendir,
1950 .readdir = lxcfs_readdir,
1951 .releasedir = lxcfs_releasedir,
1952
1953 .fsyncdir = NULL,
1954 .init = NULL,
1955 .destroy = NULL,
1956 .access = NULL,
1957 .create = NULL,
1958 .ftruncate = NULL,
1959 .fgetattr = NULL,
1960};
1961
/* Print the command-line synopsis for program me to stderr and exit(1). */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);
	exit(1);
}
1969
/* Return true when w is one of the recognised spellings of a help request. */
static bool is_help(char *w)
{
	static const char *const spellings[] = { "-h", "--help", "-help", "help" };
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}
	return false;
}
1979
1980int main(int argc, char *argv[])
1981{
1982 int ret;
1983 struct lxcfs_state *d;
1984
1985 if (argc < 2 || is_help(argv[1]))
1986 usage(argv[0]);
1987
1988 d = malloc(sizeof(*d));
1989 if (!d)
1990 return -1;
1991
1992 if (!cgm_escape_cgroup())
1993 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
1994
1995 if (!cgm_get_controllers(&d->subsystems))
1996 return -1;
1997
1998 ret = fuse_main(argc, argv, &lxcfs_ops, d);
1999
2000 return ret;
2183082c 2001}