]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
fix up pid conversion algorithm
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
9/*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13#define FUSE_USE_VERSION 26
14
2183082c 15#include <stdio.h>
758ad80c
SH
16#include <dirent.h>
17#include <fcntl.h>
18#include <fuse.h>
19#include <unistd.h>
20#include <errno.h>
21#include <stdbool.h>
22#include <time.h>
23#include <string.h>
24#include <stdlib.h>
25#include <libgen.h>
41bb9357
SH
26#include <sched.h>
27#include <linux/sched.h>
a05660a6 28#include <sys/socket.h>
41bb9357
SH
29#include <sys/mount.h>
30#include <wait.h>
758ad80c
SH
31
32#include <nih/alloc.h>
33#include <nih/string.h>
34
35#include "cgmanager.h"
36
/*
 * Global lxcfs state; reached through the private_data pointer of the
 * current FUSE context (see LXCFS_DATA).
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted
	 * subsystems.  Detected once at startup.
	 */
	char **subsystems;
};
/* The lxcfs_state stored as private_data in the current FUSE context. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
45
4775fba1
SH
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal
 * or reports some other pid.
 *
 * Returns 0 only when the child exited normally with status 0, and -1
 * when waitpid() failed or the child exited abnormally / non-zero.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
67
053a659d
SH
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open FILE * on
 * /proc/pid/{u,g}id_map.  The file is rewound before it is scanned, so
 * the same stream can be used for several lookups.
 *
 * Returns the mapped id, or (unsigned int)-1 when no range contains
 * @in_id or when an entry's range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile) != NULL) {
		unsigned int ns_base;   /* first id of the range in idfile's namespace */
		unsigned int host_base; /* first id of the range in the caller's namespace */
		unsigned int span;      /* number of ids in the range */

		/* lines that do not parse as three numbers are skipped */
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &span) != 3)
			continue;

		if (host_base + span < host_base || ns_base + span < ns_base) {
			/*
			 * The range wrapped around - unexpected for a
			 * procfile, so give up.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, span, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + span)
			continue;

		/*
		 * host_base <= in_id < host_base+span and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
111
341b21ad
SH
112/*
113 * for is_privileged_over,
114 * specify whether we require the calling uid to be root in his
115 * namespace
116 */
117#define NS_ROOT_REQD true
118#define NS_ROOT_OPT false
119
120static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 121{
053a659d
SH
122 nih_local char *fpath = NULL;
123 bool answer = false;
124 uid_t nsuid;
125
341b21ad
SH
126 if (victim == -1 || uid == -1)
127 return false;
128
129 /*
130 * If the request is one not requiring root in the namespace,
131 * then having the same uid suffices. (i.e. uid 1000 has write
132 * access to files owned by uid 1000
133 */
134 if (!req_ns_root && uid == victim)
758ad80c
SH
135 return true;
136
053a659d
SH
137 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
138 FILE *f = fopen(fpath, "r");
139 if (!f)
140 return false;
141
341b21ad 142 /* if caller's not root in his namespace, reject */
053a659d
SH
143 nsuid = convert_id_to_ns(f, uid);
144 if (nsuid)
145 goto out;
146
341b21ad
SH
147 /*
148 * If victim is not mapped into caller's ns, reject.
149 * XXX I'm not sure this check is needed given that fuse
150 * will be sending requests where the vfs has converted
151 */
053a659d
SH
152 nsuid = convert_id_to_ns(f, victim);
153 if (nsuid == -1)
154 goto out;
155
156 answer = true;
157
158out:
159 fclose(f);
160 return answer;
758ad80c
SH
161}
162
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * requested by the open(2)-style flags in @req_mode.  Callers shift
 * @fmode right to test the owner or group bits instead.
 *
 * Returns false for an access mode other than O_RDONLY, O_WRONLY or
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted;
	mode_t access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		wanted = S_IROTH;
	else if (access == O_WRONLY)
		wanted = S_IWOTH;
	else if (access == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
182
3db25a35
SH
/*
 * Return the first path component of @taskcg that follows the prefix
 * @querycg, as a new nih-allocated string.  The prefix is skipped by
 * length only; it is not compared against @taskcg.
 *
 * Returns NULL (after logging to stderr) when @taskcg is not longer
 * than @querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *component, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* for "/" skip just that one character; otherwise skip the prefix and one more */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	component = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	slash = strchr(component, '/');
	if (slash)
		*slash = '\0';
	return component;
}
201
758ad80c
SH
202/*
203 * check whether a fuse context may access a cgroup dir or file
204 *
205 * If file is not null, it is a cgroup file to check under cg.
206 * If file is null, then we are checking perms on cg itself.
207 *
208 * For files we can check the mode of the list_keys result.
209 * For cgroups, we must make assumptions based on the files under the
210 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
211 * yet.
212 */
213static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
214{
215 nih_local struct cgm_keys **list = NULL;
216 int i;
217
218 if (!file)
219 file = "tasks";
220
221 if (*file == '/')
222 file++;
223
224 if (!cgm_list_keys(contrl, cg, &list))
225 return false;
226 for (i = 0; list[i]; i++) {
227 if (strcmp(list[i]->name, file) == 0) {
228 struct cgm_keys *k = list[i];
341b21ad 229 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
230 if (perms_include(k->mode >> 6, mode))
231 return true;
232 }
233 if (fc->gid == k->gid) {
234 if (perms_include(k->mode >> 3, mode))
235 return true;
236 }
237 return perms_include(k->mode, mode);
238 }
239 }
240
241 return false;
242}
243
3db25a35
SH
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
250
251/*
252 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
253 * If caller is in /a, he may act on /a/b, but not on /b.
254 * if the answer is false and nextcg is not NULL, then *nextcg will point
255 * to a nih_alloc'd string containing the next cgroup directory under cg
256 */
257static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
258{
259 nih_local char *fnam = NULL;
260 FILE *f;
261 bool answer = false;
262 char *line = NULL;
263 size_t len = 0;
264
265 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
266 if (!(f = fopen(fnam, "r")))
267 return false;
268
269 while (getline(&line, &len, f) != -1) {
270 char *c1, *c2, *linecmp;
271 if (!line[0])
272 continue;
273 c1 = strchr(line, ':');
274 if (!c1)
275 goto out;
276 c1++;
277 c2 = strchr(c1, ':');
278 if (!c2)
279 goto out;
280 *c2 = '\0';
281 if (strcmp(c1, contrl) != 0)
282 continue;
283 c2++;
284 stripnewline(c2);
285 /*
286 * callers pass in '/' for root cgroup, otherwise they pass
287 * in a cgroup without leading '/'
288 */
289 linecmp = *cg == '/' ? c2 : c2+1;
290 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
291 if (nextcg)
292 *nextcg = get_next_cgroup_dir(linecmp, cg);
293 goto out;
294 }
295 answer = true;
296 goto out;
297 }
298
299out:
300 fclose(f);
301 free(line);
302 return answer;
303}
304
758ad80c
SH
305/*
306 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
307 * and needs to be nih_freed.
308 */
309static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
310{
311 const char *p1;
312 char *ret, *slash;
313
314 if (strlen(path) < 9)
315 return NULL;
316 p1 = path+8;
317 ret = nih_strdup(NULL, p1);
318 if (!ret)
319 return ret;
320 slash = strstr(ret, "/");
321 if (slash)
322 *slash = '\0';
323
324 /* verify that it is a subsystem */
325 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
326 int i;
327 if (!list) {
328 nih_free(ret);
329 return NULL;
330 }
331 for (i = 0; list[i]; i++) {
332 if (strcmp(list[i], ret) == 0)
333 return ret;
334 }
335 nih_free(ret);
336 return NULL;
337}
338
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * The returned pointer aims into @path, just past the '/' that follows
 * the controller name; it may include files (keynames) etc.
 *
 * Returns NULL when @path is shorter than 9 characters or has nothing
 * after the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
354
355static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
356{
357 nih_local char **list = NULL;
358 int i;
359
360 if (!f)
361 return false;
362 if (*f == '/')
363 f++;
364
365 if (!cgm_list_children(contr, dir, &list))
366 return false;
367 for (i = 0; list[i]; i++) {
368 if (strcmp(list[i], f) == 0)
369 return true;
370 }
371
372 return false;
373}
374
375static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
376{
377 nih_local struct cgm_keys **list = NULL;
378 struct cgm_keys *k;
379 int i;
380
381 if (!f)
382 return NULL;
383 if (*f == '/')
384 f++;
385 if (!cgm_list_keys(contr, dir, &list))
386 return NULL;
387 for (i = 0; list[i]; i++) {
388 if (strcmp(list[i]->name, f) == 0) {
389 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
390 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
391 k->uid = list[i]->uid;
392 k->gid = list[i]->gid;
393 k->mode = list[i]->mode;
394 return k;
395 }
396 }
397
398 return NULL;
399}
400
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of @cg cut off at the last '/'
 * (the whole string when there is none).  *file is pointed at that
 * last '/' inside @cg itself, so it keeps the leading slash and must
 * not be freed; it is NULL when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *last_slash;

	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	last_slash = strrchr(*dir, '/');
	*last_slash = '\0';
}
414
99978832
SH
415static size_t get_file_size(const char *contrl, const char *cg, const char *f)
416{
417 nih_local char *data = NULL;
418 size_t s;
419 if (!cgm_get_value(contrl, cg, f, &data))
420 return -EINVAL;
421 s = strlen(data);
422 return s;
423}
2ad6d2bd 424
758ad80c 425/*
2ad6d2bd 426 * FUSE ops for /cgroup
758ad80c 427 */
2ad6d2bd 428
758ad80c
SH
429static int cg_getattr(const char *path, struct stat *sb)
430{
431 struct timespec now;
432 struct fuse_context *fc = fuse_get_context();
433 nih_local char * cgdir = NULL;
434 char *fpath = NULL, *path1, *path2;
435 nih_local struct cgm_keys *k = NULL;
436 const char *cgroup;
437 nih_local char *controller = NULL;
438
439
440 if (!fc)
441 return -EIO;
442
443 memset(sb, 0, sizeof(struct stat));
444
445 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
446 return -EINVAL;
447
448 sb->st_uid = sb->st_gid = 0;
449 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
450 sb->st_size = 0;
451
452 if (strcmp(path, "/cgroup") == 0) {
453 sb->st_mode = S_IFDIR | 00755;
454 sb->st_nlink = 2;
455 return 0;
456 }
457
458 controller = pick_controller_from_path(fc, path);
459 if (!controller)
460 return -EIO;
758ad80c
SH
461 cgroup = find_cgroup_in_path(path);
462 if (!cgroup) {
463 /* this is just /cgroup/controller, return it as a dir */
464 sb->st_mode = S_IFDIR | 00755;
465 sb->st_nlink = 2;
466 return 0;
467 }
341b21ad 468
758ad80c
SH
469 get_cgdir_and_path(cgroup, &cgdir, &fpath);
470
471 if (!fpath) {
472 path1 = "/";
473 path2 = cgdir;
474 } else {
475 path1 = cgdir;
476 path2 = fpath;
477 }
478
758ad80c
SH
479 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
480 * Then check that caller's cgroup is under path if fpath is a child
481 * cgroup, or cgdir if fpath is a file */
482
483 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
484 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
485 /* this is just /cgroup/controller, return it as a dir */
486 sb->st_mode = S_IFDIR | 00555;
487 sb->st_nlink = 2;
488 return 0;
489 }
758ad80c 490 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 491 return -EACCES;
758ad80c 492
053a659d
SH
493 // get uid, gid, from '/tasks' file and make up a mode
494 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
495 sb->st_mode = S_IFDIR | 00755;
496 k = get_cgroup_key(controller, cgroup, "tasks");
497 if (!k) {
053a659d
SH
498 sb->st_uid = sb->st_gid = 0;
499 } else {
053a659d
SH
500 sb->st_uid = k->uid;
501 sb->st_gid = k->gid;
502 }
758ad80c
SH
503 sb->st_nlink = 2;
504 return 0;
505 }
506
507 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
508 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
509 return -ENOENT;
758ad80c 510 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 511 return -EACCES;
758ad80c 512
758ad80c 513 sb->st_mode = S_IFREG | k->mode;
053a659d 514 sb->st_nlink = 1;
758ad80c
SH
515 sb->st_uid = k->uid;
516 sb->st_gid = k->gid;
99978832 517 sb->st_size = get_file_size(controller, path1, path2);
758ad80c
SH
518 return 0;
519 }
520
ab54b798 521 return -ENOENT;
758ad80c 522}
2183082c 523
7f163b71
SH
524/*
525 * TODO - cache these results in a table for use in opendir, free
526 * in releasedir
527 */
758ad80c 528static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 529{
7f163b71
SH
530 struct fuse_context *fc = fuse_get_context();
531 nih_local struct cgm_keys **list = NULL;
532 const char *cgroup;
533 nih_local char *controller = NULL;
7f163b71
SH
534 nih_local char *nextcg = NULL;
535
536 if (!fc)
537 return -EIO;
538
539 if (strcmp(path, "/cgroup") == 0)
540 return 0;
541
542 // return list of keys for the controller, and list of child cgroups
543 controller = pick_controller_from_path(fc, path);
544 if (!controller)
545 return -EIO;
546
547 cgroup = find_cgroup_in_path(path);
548 if (!cgroup) {
549 /* this is just /cgroup/controller, return its contents */
550 cgroup = "/";
551 }
552
553 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
554 return -EACCES;
758ad80c
SH
555 return 0;
556}
557
758ad80c
SH
558static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
559 struct fuse_file_info *fi)
560{
561 struct fuse_context *fc = fuse_get_context();
562
563 if (!fc)
564 return -EIO;
565
566 if (strcmp(path, "/cgroup") == 0) {
567 // get list of controllers
568 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
569 int i;
570
571 if (!list)
572 return -EIO;
7f163b71 573
758ad80c
SH
574 for (i = 0; list[i]; i++) {
575 if (filler(buf, list[i], NULL, 0) != 0) {
576 return -EIO;
577 }
578 }
579 return 0;
580 }
581
582 // return list of keys for the controller, and list of child cgroups
583 nih_local struct cgm_keys **list = NULL;
584 const char *cgroup;
585 nih_local char *controller = NULL;
586 int i;
3db25a35 587 nih_local char *nextcg = NULL;
758ad80c
SH
588
589 controller = pick_controller_from_path(fc, path);
590 if (!controller)
591 return -EIO;
592
593 cgroup = find_cgroup_in_path(path);
594 if (!cgroup) {
595 /* this is just /cgroup/controller, return its contents */
596 cgroup = "/";
597 }
598
599 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 600 return -EACCES;
758ad80c
SH
601
602 if (!cgm_list_keys(controller, cgroup, &list))
3db25a35 603 // not a valid cgroup
758ad80c 604 return -EINVAL;
3db25a35
SH
605
606 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &nextcg)) {
607 if (nextcg) {
608 int ret;
609 ret = filler(buf, nextcg, NULL, 0);
610 if (ret != 0)
611 return -EIO;
612 }
613 return 0;
614 }
615
758ad80c 616 for (i = 0; list[i]; i++) {
758ad80c
SH
617 if (filler(buf, list[i]->name, NULL, 0) != 0) {
618 return -EIO;
619 }
620 }
621
622 // now get the list of child cgroups
623 nih_local char **clist;
624
625 if (!cgm_list_children(controller, cgroup, &clist))
626 return 0;
627 for (i = 0; clist[i]; i++) {
758ad80c
SH
628 if (filler(buf, clist[i], NULL, 0) != 0) {
629 return -EIO;
630 }
631 }
632 return 0;
633}
634
/* FUSE releasedir handler: there is nothing to release, so always succeed. */
static int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void) path;
	(void) fi;

	return 0;
}
639
26faa701
SH
640/*
641 * TODO - cache info here for read/write, release in cg_release.
642 */
99978832
SH
643static int cg_open(const char *path, struct fuse_file_info *fi)
644{
645 nih_local char *controller = NULL;
646 const char *cgroup;
647 char *fpath = NULL, *path1, *path2;
648 nih_local char * cgdir = NULL;
649 nih_local struct cgm_keys *k = NULL;
650 struct fuse_context *fc = fuse_get_context();
651
652 if (!fc)
653 return -EIO;
654
655 controller = pick_controller_from_path(fc, path);
656 if (!controller)
657 return -EIO;
658 cgroup = find_cgroup_in_path(path);
659 if (!cgroup)
660 return -EINVAL;
661
662 get_cgdir_and_path(cgroup, &cgdir, &fpath);
663 if (!fpath) {
664 path1 = "/";
665 path2 = cgdir;
666 } else {
667 path1 = cgdir;
668 path2 = fpath;
669 }
670
671 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
672 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
f9a05025
SH
673 // should never get here
674 return -EACCES;
99978832 675
99978832
SH
676 return 0;
677 }
678
679 return -EINVAL;
680}
681
a05660a6
SH
/*
 * Wait up to two seconds for @sockfd to become readable, then do a
 * non-blocking recv() of at most @len bytes into @buf.
 *
 * Returns what recv() returns, or -1 if select() fails.  After a
 * timeout the non-blocking recv() is still attempted.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) < 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
696
01e71852
SH
697#define SEND_CREDS_OK 0
698#define SEND_CREDS_NOTSK 1
699#define SEND_CREDS_FAIL 2
700static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
701{
702 struct msghdr msg = { 0 };
703 struct iovec iov;
704 struct cmsghdr *cmsg;
705 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
706 char buf[1];
707 buf[0] = 'p';
708
01e71852
SH
709 if (pingfirst) {
710 if (msgrecv(sock, buf, 1) != 1) {
711 printf("%s: Error getting reply from server over socketpair",
712 __func__);
713 return SEND_CREDS_FAIL;
714 }
a05660a6
SH
715 }
716
717 msg.msg_control = cmsgbuf;
718 msg.msg_controllen = sizeof(cmsgbuf);
719
720 cmsg = CMSG_FIRSTHDR(&msg);
721 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
722 cmsg->cmsg_level = SOL_SOCKET;
723 cmsg->cmsg_type = SCM_CREDENTIALS;
724 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
725
726 msg.msg_name = NULL;
727 msg.msg_namelen = 0;
728
729 buf[0] = v;
730 iov.iov_base = buf;
731 iov.iov_len = sizeof(buf);
732 msg.msg_iov = &iov;
733 msg.msg_iovlen = 1;
734
735 if (sendmsg(sock, &msg, 0) < 0) {
736 printf("%s: failed at sendmsg: %s", __func__,
737 strerror(errno));
738 if (errno == 3)
01e71852
SH
739 return SEND_CREDS_NOTSK;
740 return SEND_CREDS_FAIL;
a05660a6
SH
741 }
742
01e71852 743 return SEND_CREDS_OK;
a05660a6
SH
744}
745
746static bool recv_creds(int sock, struct ucred *cred, char *v)
747{
748 struct msghdr msg = { 0 };
749 struct iovec iov;
750 struct cmsghdr *cmsg;
751 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
752 char buf[1];
753 int ret;
754 int optval = 1;
755
756 *v = '1';
757
758 cred->pid = -1;
759 cred->uid = -1;
760 cred->gid = -1;
761
762 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
763 printf("Failed to set passcred: %s", strerror(errno));
764 return false;
765 }
766 buf[0] = '1';
767 if (write(sock, buf, 1) != 1) {
768 printf("Failed to start write on scm fd: %s", strerror(errno));
769 return false;
770 }
771
772 msg.msg_name = NULL;
773 msg.msg_namelen = 0;
774 msg.msg_control = cmsgbuf;
775 msg.msg_controllen = sizeof(cmsgbuf);
776
777 iov.iov_base = buf;
778 iov.iov_len = sizeof(buf);
779 msg.msg_iov = &iov;
780 msg.msg_iovlen = 1;
781
782 // retry logic is not ideal, especially as we are not
783 // threaded. Sleep at most 1 second waiting for the client
784 // to send us the scm_cred
785 ret = recvmsg(sock, &msg, 0);
786 if (ret < 0) {
787 printf("Failed to receive scm_cred: %s",
788 strerror(errno));
789 return false;
790 }
791
792 cmsg = CMSG_FIRSTHDR(&msg);
793
794 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
795 cmsg->cmsg_level == SOL_SOCKET &&
796 cmsg->cmsg_type == SCM_CREDENTIALS) {
797 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
798 }
799 *v = buf[0];
800
801 return true;
802}
803
804
805/*
4775fba1
SH
806 * pid_to_ns - reads pids from a ucred over a socket, then writes the
807 * int value back over the socket. This shifts the pid from the
808 * sender's pidns into tpid's pidns.
a05660a6 809 */
4775fba1 810static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
811{
812 char v = '0';
813 struct ucred cred;
814
815 while (recv_creds(sock, &cred, &v)) {
816 if (v == '1')
817 exit(0);
a05660a6
SH
818 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
819 exit(1);
820 }
821 exit(0);
822}
823
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * Never returns: exits with status 0 when the forked child finished
 * successfully, and with status 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int newnsfd;
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();
	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_to_ns(sock, tpid);

	/* wait_for_pid() returns 0 on success, so non-zero is the failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
854
855/*
856 * To read cgroup files with a particular pid, we will setns into the child
857 * pidns, open a pipe, fork a child - which will be the first to really be in
858 * the child ns - which does the cgm_get_value and writes the data to the pipe.
859 */
860static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
861{
862 int sock[2] = {-1, -1};
863 nih_local char *tmpdata = NULL;
864 int ret;
865 pid_t qpid, cpid = -1;
866 bool answer = false;
867 char v = '0';
868 struct ucred cred;
869 struct timeval tv;
870 fd_set s;
871
872 if (!cgm_get_value(contrl, cg, file, &tmpdata))
873 return false;
874
875 /*
876 * Now we read the pids from returned data one by one, pass
877 * them into a child in the target namespace, read back the
878 * translated pids, and put them into our to-return data
879 */
880
881 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
882 perror("socketpair");
883 exit(1);
884 }
885
886 cpid = fork();
887 if (cpid == -1)
888 goto out;
889
890 if (!cpid) // child
4775fba1 891 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
892
893 char *ptr = tmpdata;
894 cred.uid = 0;
895 cred.gid = 0;
896 while (sscanf(ptr, "%d\n", &qpid) == 1) {
897 cred.pid = qpid;
01e71852
SH
898 ret = send_creds(sock[0], &cred, v, true);
899
900 if (ret == SEND_CREDS_NOTSK)
901 goto next;
902 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
903 goto out;
904
905 // read converted results
906 FD_ZERO(&s);
907 FD_SET(sock[0], &s);
908 tv.tv_sec = 1;
909 tv.tv_usec = 0;
910 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
911 if (ret <= 0) {
912 kill(cpid, SIGTERM);
913 goto out;
914 }
915 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
916 kill(cpid, SIGTERM);
917 perror("read");
918 goto out;
919 }
a05660a6 920 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 921next:
a05660a6
SH
922 ptr = strchr(ptr, '\n');
923 if (!ptr)
924 break;
925 ptr++;
926 }
927
928 cred.pid = getpid();
929 v = '1';
01e71852 930 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6
SH
931 // failed to ask child to exit
932 kill(cpid, SIGTERM);
933 goto out;
934 }
935
936 answer = true;
937
938out:
939 if (cpid != -1)
940 wait_for_pid(cpid);
941 if (sock[0] != -1) {
942 close(sock[0]);
943 close(sock[1]);
944 }
945 return answer;
946}
947
99978832
SH
948static int cg_read(const char *path, char *buf, size_t size, off_t offset,
949 struct fuse_file_info *fi)
950{
951 nih_local char *controller = NULL;
952 const char *cgroup;
953 char *fpath = NULL, *path1, *path2;
954 struct fuse_context *fc = fuse_get_context();
955 nih_local char * cgdir = NULL;
956 nih_local struct cgm_keys *k = NULL;
957
958 if (offset)
959 return -EIO;
960
961 if (!fc)
962 return -EIO;
963
964 controller = pick_controller_from_path(fc, path);
965 if (!controller)
f9a05025 966 return -EINVAL;
99978832
SH
967 cgroup = find_cgroup_in_path(path);
968 if (!cgroup)
969 return -EINVAL;
970
971 get_cgdir_and_path(cgroup, &cgdir, &fpath);
972 if (!fpath) {
973 path1 = "/";
974 path2 = cgdir;
975 } else {
976 path1 = cgdir;
977 path2 = fpath;
978 }
979
980 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
981 nih_local char *data = NULL;
4775fba1
SH
982 int s;
983 bool r;
99978832 984
2ad6d2bd 985 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025
SH
986 // should never get here
987 return -EACCES;
99978832 988
a05660a6
SH
989 if (strcmp(path2, "tasks") == 0 ||
990 strcmp(path2, "/tasks") == 0 ||
991 strcmp(path2, "/cgroup.procs") == 0 ||
992 strcmp(path2, "cgroup.procs") == 0)
993 // special case - we have to translate the pids
4775fba1 994 r = do_read_pids(fc->pid, controller, path1, path2, &data);
a05660a6 995 else
4775fba1 996 r = cgm_get_value(controller, path1, path2, &data);
a05660a6 997
4775fba1 998 if (!r)
99978832
SH
999 return -EINVAL;
1000
4775fba1
SH
1001 if (!data)
1002 return 0;
99978832
SH
1003 s = strlen(data);
1004 if (s > size)
1005 s = size;
1006 memcpy(buf, data, s);
1007
99978832
SH
1008 return s;
1009 }
1010
1011 return -EINVAL;
1012}
1013
4775fba1
SH
1014static void pid_from_ns(int sock, pid_t tpid)
1015{
1016 pid_t vpid;
1017 struct ucred cred;
1018 char v;
1019
1020 cred.uid = 0;
1021 cred.gid = 0;
1022 while (read(sock, &vpid, sizeof(pid_t)) == sizeof(pid_t)) {
1023 if (vpid == -1) // done
01e71852 1024 break;
4775fba1
SH
1025 v = '0';
1026 cred.pid = vpid;
01e71852 1027 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1028 v = '1';
1029 cred.pid = getpid();
01e71852 1030 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1031 exit(1);
1032 }
1033 }
1034 exit(0);
1035}
1036
/*
 * Enter the pid namespace of @tpid, then fork a child - the first
 * process actually inside that namespace - to run pid_from_ns() on
 * @sock.
 *
 * Never returns: exits with status 0 when the forked child finished
 * successfully, and with status 1 on any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int newnsfd;
	pid_t cpid;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	cpid = fork();
	if (cpid < 0)
		exit(1);
	if (!cpid)
		pid_from_ns(sock, tpid);

	/* wait_for_pid() returns 0 on success, so non-zero is the failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1061
1062static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1063{
1064 int sock[2] = {-1, -1};
1065 pid_t qpid, cpid = -1;
1066 bool answer = false, fail = false;
1067
1068 /*
1069 * write the pids to a socket, have helper in writer's pidns
1070 * call movepid for us
1071 */
1072 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1073 perror("socketpair");
1074 exit(1);
1075 }
1076
1077 cpid = fork();
1078 if (cpid == -1)
1079 goto out;
1080
1081 if (!cpid) // child
1082 pid_from_ns_wrapper(sock[1], tpid);
1083
1084 const char *ptr = buf;
1085 while (sscanf(ptr, "%d", &qpid) == 1) {
1086 struct ucred cred;
1087 char v;
1088
1089 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1090 kill(cpid, SIGTERM);
1091 perror("write");
1092 goto out;
1093 }
1094
01e71852
SH
1095 if (recv_creds(sock[0], &cred, &v)) {
1096 if (v == '0') {
1097 if (!cgm_move_pid(contrl, cg, cred.pid))
1098 fail = true;
1099 }
4775fba1
SH
1100 }
1101
1102 ptr = strchr(ptr, '\n');
1103 if (!ptr)
1104 break;
1105 ptr++;
1106 }
1107
1108 /* All good, write the value */
1109 qpid = -1;
1110 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1111 printf("Warning: failed to ask child to exit\n");
1112
1113 if (!fail)
1114 answer = true;
1115
1116out:
1117 if (cpid != -1)
1118 wait_for_pid(cpid);
1119 if (sock[0] != -1) {
1120 close(sock[0]);
1121 close(sock[1]);
1122 }
1123 return answer;
1124}
1125
2ad6d2bd
SH
1126int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1127 struct fuse_file_info *fi)
1128{
1129 nih_local char *controller = NULL;
1130 const char *cgroup;
1131 char *fpath = NULL, *path1, *path2;
1132 struct fuse_context *fc = fuse_get_context();
1133 nih_local char * cgdir = NULL;
1134 nih_local struct cgm_keys *k = NULL;
1135
2ad6d2bd 1136 if (offset)
f9a05025 1137 return -EINVAL;
2ad6d2bd
SH
1138
1139 if (!fc)
1140 return -EIO;
1141
1142 controller = pick_controller_from_path(fc, path);
1143 if (!controller)
f9a05025 1144 return -EINVAL;
2ad6d2bd
SH
1145 cgroup = find_cgroup_in_path(path);
1146 if (!cgroup)
1147 return -EINVAL;
1148
1149 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1150 if (!fpath) {
1151 path1 = "/";
1152 path2 = cgdir;
1153 } else {
1154 path1 = cgdir;
1155 path2 = fpath;
1156 }
1157
1158 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
4775fba1
SH
1159 bool r;
1160
2ad6d2bd 1161 if (!fc_may_access(fc, controller, path1, path2, O_WRONLY))
f9a05025 1162 return -EACCES;
2ad6d2bd 1163
4775fba1
SH
1164 if (strcmp(path2, "tasks") == 0 ||
1165 strcmp(path2, "/tasks") == 0 ||
1166 strcmp(path2, "/cgroup.procs") == 0 ||
1167 strcmp(path2, "cgroup.procs") == 0)
1168 // special case - we have to translate the pids
1169 r = do_write_pids(fc->pid, controller, path1, path2, buf);
1170 else
1171 r = cgm_set_value(controller, path1, path2, buf);
1172
1173 if (!r)
2ad6d2bd
SH
1174 return -EINVAL;
1175
1176 return size;
1177 }
1178
1179 return -EINVAL;
1180}
1181
341b21ad
SH
1182int cg_chown(const char *path, uid_t uid, gid_t gid)
1183{
1184 struct fuse_context *fc = fuse_get_context();
1185 nih_local char * cgdir = NULL;
1186 char *fpath = NULL, *path1, *path2;
1187 nih_local struct cgm_keys *k = NULL;
1188 const char *cgroup;
1189 nih_local char *controller = NULL;
1190
1191
1192 if (!fc)
1193 return -EIO;
1194
1195 if (strcmp(path, "/cgroup") == 0)
1196 return -EINVAL;
1197
1198 controller = pick_controller_from_path(fc, path);
1199 if (!controller)
f9a05025 1200 return -EINVAL;
341b21ad
SH
1201 cgroup = find_cgroup_in_path(path);
1202 if (!cgroup)
1203 /* this is just /cgroup/controller */
1204 return -EINVAL;
1205
1206 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1207
1208 if (!fpath) {
1209 path1 = "/";
1210 path2 = cgdir;
1211 } else {
1212 path1 = cgdir;
1213 path2 = fpath;
1214 }
1215
1216 if (is_child_cgroup(controller, path1, path2)) {
1217 // get uid, gid, from '/tasks' file and make up a mode
1218 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1219 k = get_cgroup_key(controller, cgroup, "tasks");
1220
1221 } else
1222 k = get_cgroup_key(controller, path1, path2);
1223
1224 if (!k)
1225 return -EINVAL;
1226
1227 /*
1228 * This being a fuse request, the uid and gid must be valid
1229 * in the caller's namespace. So we can just check to make
1230 * sure that the caller is root in his uid, and privileged
1231 * over the file's current owner.
1232 */
1233 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1234 return -EACCES;
341b21ad
SH
1235
1236 if (!cgm_chown_file(controller, cgroup, uid, gid))
1237 return -EINVAL;
1238 return 0;
1239}
2ad6d2bd 1240
fd2e4e03
SH
1241int cg_chmod(const char *path, mode_t mode)
1242{
0a1bb5ea
SH
1243 struct fuse_context *fc = fuse_get_context();
1244 nih_local char * cgdir = NULL;
1245 char *fpath = NULL, *path1, *path2;
1246 nih_local struct cgm_keys *k = NULL;
1247 const char *cgroup;
1248 nih_local char *controller = NULL;
1249
1250 if (!fc)
1251 return -EIO;
1252
1253 if (strcmp(path, "/cgroup") == 0)
1254 return -EINVAL;
1255
1256 controller = pick_controller_from_path(fc, path);
1257 if (!controller)
f9a05025 1258 return -EINVAL;
0a1bb5ea
SH
1259 cgroup = find_cgroup_in_path(path);
1260 if (!cgroup)
1261 /* this is just /cgroup/controller */
1262 return -EINVAL;
1263
1264 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1265
1266 if (!fpath) {
1267 path1 = "/";
1268 path2 = cgdir;
1269 } else {
1270 path1 = cgdir;
1271 path2 = fpath;
1272 }
1273
1274 if (is_child_cgroup(controller, path1, path2)) {
1275 // get uid, gid, from '/tasks' file and make up a mode
1276 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1277 k = get_cgroup_key(controller, cgroup, "tasks");
1278
1279 } else
1280 k = get_cgroup_key(controller, path1, path2);
1281
1282 if (!k)
1283 return -EINVAL;
1284
1285 /*
1286 * This being a fuse request, the uid and gid must be valid
1287 * in the caller's namespace. So we can just check to make
1288 * sure that the caller is root in his uid, and privileged
1289 * over the file's current owner.
1290 */
1291 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1292 return -EPERM;
1293
1294 if (!cgm_chmod_file(controller, cgroup, mode))
1295 return -EINVAL;
1296 return 0;
fd2e4e03
SH
1297}
1298
ab54b798
SH
1299int cg_mkdir(const char *path, mode_t mode)
1300{
1301 struct fuse_context *fc = fuse_get_context();
1302 nih_local struct cgm_keys **list = NULL;
1303 char *fpath = NULL, *path1;
1304 nih_local char * cgdir = NULL;
1305 const char *cgroup;
1306 nih_local char *controller = NULL;
1307
ab54b798
SH
1308 if (!fc)
1309 return -EIO;
1310
1311
1312 controller = pick_controller_from_path(fc, path);
1313 if (!controller)
f9a05025 1314 return -EINVAL;
ab54b798
SH
1315
1316 cgroup = find_cgroup_in_path(path);
1317 if (!cgroup)
f9a05025 1318 return -EINVAL;
ab54b798
SH
1319
1320 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1321 if (!fpath)
1322 path1 = "/";
1323 else
1324 path1 = cgdir;
1325
1326 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1327 return -EACCES;
ab54b798
SH
1328
1329
1330 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1331 return -EINVAL;
1332
1333 return 0;
1334}
1335
50d8d5b5
SH
1336static int cg_rmdir(const char *path)
1337{
1338 struct fuse_context *fc = fuse_get_context();
1339 nih_local struct cgm_keys **list = NULL;
1340 char *fpath = NULL;
1341 nih_local char * cgdir = NULL;
1342 const char *cgroup;
1343 nih_local char *controller = NULL;
1344
1345 if (!fc)
1346 return -EIO;
1347
1348
1349 controller = pick_controller_from_path(fc, path);
1350 if (!controller)
f9a05025 1351 return -EINVAL;
50d8d5b5
SH
1352
1353 cgroup = find_cgroup_in_path(path);
1354 if (!cgroup)
f9a05025 1355 return -EINVAL;
50d8d5b5
SH
1356
1357 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1358 if (!fpath)
1359 return -EINVAL;
1360
1361 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1362 return -EACCES;
50d8d5b5
SH
1363
1364 if (!cgm_remove(controller, cgroup))
1365 return -EINVAL;
1366
1367 return 0;
1368}
1369
2dc17609
SH
/* Return true when the string 'line' begins with the prefix 'pref'. */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1376
/*
 * Scan the newline-separated text in 'memstat' for the first line that
 * begins with "total_cache" and store its value, divided by 1024, in *v.
 * *v is set to 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1394
1395static char *get_pid_cgroup(pid_t pid, const char *contrl)
1396{
1397 nih_local char *fnam = NULL;
1398 FILE *f;
1399 char *answer = NULL;
1400 char *line = NULL;
1401 size_t len = 0;
1402
1403 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1404 if (!(f = fopen(fnam, "r")))
1405 return false;
1406
1407 while (getline(&line, &len, f) != -1) {
1408 char *c1, *c2;
1409 if (!line[0])
1410 continue;
1411 c1 = strchr(line, ':');
1412 if (!c1)
1413 goto out;
1414 c1++;
1415 c2 = strchr(c1, ':');
1416 if (!c2)
1417 goto out;
1418 *c2 = '\0';
1419 if (strcmp(c1, contrl) != 0)
1420 continue;
1421 c2++;
1422 stripnewline(c2);
1423 answer = NIH_MUST( nih_strdup(NULL, c2) );
1424 goto out;
1425 }
1426
1427out:
1428 fclose(f);
1429 free(line);
1430 return answer;
1431}
1432
758ad80c 1433/*
2ad6d2bd 1434 * FUSE ops for /proc
758ad80c 1435 */
758ad80c 1436
23ce2127
SH
1437static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1438 struct fuse_file_info *fi)
1439{
2dc17609
SH
1440 struct fuse_context *fc = fuse_get_context();
1441 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1442 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1443 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1444 char *line = NULL;
1445 size_t linelen = 0, total_len = 0;
1446 FILE *f;
1447
1448 if (offset)
1449 return -EINVAL;
1450
1451 if (!cg)
1452 return 0;
1453
1454 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1455 return 0;
1456 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1457 return 0;
1458 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1459 return 0;
1460 memlimit = strtoul(memlimit_str, NULL, 10);
1461 memusage = strtoul(memusage_str, NULL, 10);
1462 memlimit /= 1024;
1463 memusage /= 1024;
1464 get_mem_cached(memstat_str, &cached);
1465
1466 f = fopen("/proc/meminfo", "r");
1467 if (!f)
1468 return 0;
1469
1470 while (getline(&line, &linelen, f) != -1) {
1471 size_t l;
1472 char *printme, lbuf[100];
1473
1474 memset(lbuf, 0, 100);
1475 if (startswith(line, "MemTotal:")) {
1476 sscanf(line+14, "%lu", &hosttotal);
1477 if (hosttotal < memlimit)
1478 memlimit = hosttotal;
1479 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1480 printme = lbuf;
1481 } else if (startswith(line, "MemFree:")) {
1482 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1483 printme = lbuf;
1484 } else if (startswith(line, "MemAvailable:")) {
1485 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1486 printme = lbuf;
1487 } else if (startswith(line, "Buffers:")) {
1488 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1489 printme = lbuf;
1490 } else if (startswith(line, "Cached:")) {
1491 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1492 printme = lbuf;
1493 } else if (startswith(line, "SwapCached:")) {
1494 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1495 printme = lbuf;
1496 } else
1497 printme = line;
1498 l = snprintf(buf, size, "%s", printme);
1499 buf += l;
1500 size -= l;
1501 total_len += l;
1502 }
1503
1504 return total_len;
23ce2127
SH
1505}
1506
/*
 * Fetch the cpuset.cpus value of cgroup 'cg' through cgm_get_value().
 * Returns the string handed back by cgm_get_value(), or NULL when the
 * lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1519
/*
 * Helper for walking a comma-separated cpuset string.
 *
 * Returns a pointer to the character following the next ',' found after
 * the first character of 'c', or NULL when there is no further token.
 * An empty string has no further token.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* c + 1 would point past the terminator of an empty string */
	if (!*c)
		return NULL;

	r = strchr(c + 1, ',');
	if (r)
		return r + 1;
	return NULL;
}
1530
/*
 * Parse the token at the start of 'c' as "<a>-<b>".
 * Returns sscanf's result: 2 when both ends were read, 1 when only *a was
 * read (a single cpu), and a value below 1 when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1538
/*
 * Return true when 'cpu' is a member of 'cpuset'.
 *
 * cpusets are in format "1,2-3,4", iow a comma-delimited list whose
 * entries are either a single cpu or an inclusive range. Scanning stops
 * with false at the first entry that cannot be parsed.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret == 1) {
			/* single cpu: a mismatch must not end the scan */
			if (cpu == a)
				return true;
		} else if (ret == 2) {
			/* inclusive range a-b */
			if (cpu >= a && cpu <= b)
				return true;
		} else {
			/* bad cpuset! */
			return false;
		}
	}

	return false;
}
1561
aeb56147
SH
/*
 * Given a "processor : N" line, report whether cpu N belongs to 'cpuset'.
 * Lines that do not parse as a processor line yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int id;

	return sscanf(line, "processor : %d", &id) == 1 &&
		cpu_in_cpuset(id, cpuset);
}
1570
23ce2127
SH
/*
 * Check whether 'line' is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. one from which a cpu number can be parsed.
 */
static bool is_processor_line(const char *line)
{
	int id;

	return sscanf(line, "processor : %d", &id) == 1;
}
1582
23ce2127
SH
1583static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1584 struct fuse_file_info *fi)
1585{
1586 struct fuse_context *fc = fuse_get_context();
1587 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1588 nih_local char *cpuset = NULL;
1589 char *line = NULL;
1590 size_t linelen = 0, total_len = 0;
1591 bool am_printing = false;
1592 int curcpu = -1;
1593 FILE *f;
1594
1595 if (offset)
1596 return -EINVAL;
1597
1598 if (!cg)
1599 return 0;
1600
1601 cpuset = get_cpuset(cg);
1602 if (!cpuset)
1603 return 0;
1604
1605 f = fopen("/proc/cpuinfo", "r");
1606 if (!f)
1607 return 0;
1608
1609 while (getline(&line, &linelen, f) != -1) {
1610 size_t l;
1611 if (is_processor_line(line)) {
aeb56147 1612 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1613 if (am_printing) {
1614 curcpu ++;
1615 l = snprintf(buf, size, "processor : %d\n", curcpu);
1616 buf += l;
1617 size -= l;
1618 total_len += l;
1619 }
1620 continue;
1621 }
1622 if (am_printing) {
1623 l = snprintf(buf, size, "%s", line);
1624 buf += l;
1625 size -= l;
1626 total_len += l;
1627 }
1628 }
1629
1630 return total_len;
1631}
1632
1633static int proc_stat_read(char *buf, size_t size, off_t offset,
1634 struct fuse_file_info *fi)
1635{
aeb56147
SH
1636 struct fuse_context *fc = fuse_get_context();
1637 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1638 nih_local char *cpuset = NULL;
1639 char *line = NULL;
1640 size_t linelen = 0, total_len = 0;
1641 int curcpu = 0;
1642 FILE *f;
1643
1644 if (offset)
1645 return -EINVAL;
1646
1647 if (!cg)
1648 return 0;
1649
1650 cpuset = get_cpuset(cg);
1651 if (!cpuset)
1652 return 0;
1653
1654 f = fopen("/proc/stat", "r");
1655 if (!f)
1656 return 0;
1657
1658 while (getline(&line, &linelen, f) != -1) {
1659 size_t l;
1660 int cpu;
1661 char *c;
1662
1663 if (sscanf(line, "cpu%d", &cpu) != 1) {
1664 /* not a ^cpu line, just print it */
1665 l = snprintf(buf, size, "%s", line);
1666 buf += l;
1667 size -= l;
1668 total_len += l;
1669 continue;
1670 }
1671 if (!cpu_in_cpuset(cpu, cpuset))
1672 continue;
1673 curcpu ++;
1674
1675 c = strchr(line, ' ');
1676 if (!c)
1677 continue;
1678 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1679 buf += l;
1680 size -= l;
1681 total_len += l;
1682 }
1683
1684 return total_len;
23ce2127
SH
1685}
1686
7bbf2246
SH
1687/*
1688 * How to guess what to present for uptime?
1689 * One thing we could do would be to take the date on the caller's
1690 * memory.usage_in_bytes file, which should equal the time of creation
1691 * of his cgroup. However, a task could be in a sub-cgroup of the
1692 * container. The same problem exists if we try to look at the ages
1693 * of processes in the caller's cgroup.
1694 *
1695 * So we'll fork a task that will enter the caller's pidns, mount a
1696 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1697 *
1698 * For the second uptime #, we'll do as Stéphane had done, just copy
1699 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1700 * time. Maybe someone can come up with a good algorithm and submit a
1701 * patch. Maybe something based on cpushare info?
1702 */
41bb9357
SH
1703
/*
 * Return the age of the reaper for 'pid', taken from the ctime of its
 * procdir.
 *
 * The calling process unshares its mount namespace, joins the pid
 * namespace of 'pid' and forks. The forking process only waits for its
 * child and then exits; it never returns from here. The child replaces
 * /proc with a fresh procfs mount and returns the number of seconds
 * between the ctime of /proc/1 and now. Failures yield 0.
 */
static long int get_pid1_time(pid_t pid)
{
	char nspath[100];
	struct stat st;
	pid_t child;
	int nsfd;

	if (unshare(CLONE_NEWNS) != 0)
		return 0;

	sprintf(nspath, "/proc/%d/ns/pid", pid);
	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(nsfd, 0) != 0) {
		perror("get_pid1_time setns 1");
		close(nsfd);
		return 0;
	}
	close(nsfd);

	child = fork();
	if (child < 0)
		return 0;

	if (child != 0) {
		// child will do the writing for us
		wait_for_pid(child);
		exit(0);
	}

	/* from here on we are the forked child */
	umount2("/proc", MNT_DETACH);

	if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
		perror("get_pid1_time mount");
		return 0;
	}
	if (lstat("/proc/1", &st) != 0) {
		perror("get_pid1_time lstat");
		return 0;
	}
	return time(NULL) - st.st_ctime;
}
1751
/*
 * Return the value computed by get_pid1_time() for 'qpid'.
 *
 * get_pid1_time() is run in a forked child, which writes its result into
 * a pipe. The parent waits up to one second for that result and then
 * reaps the child. Returns 0 when the pipe or the fork cannot be set up,
 * on timeout, or on a short read.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child to read from or wait for */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	/* parent: keep only the read end */
	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret == -1) {
		perror("select");
		goto out;
	}
	if (!ret) {
		printf("timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1797
/*
 * Return the second field of /proc/uptime, truncated to whole seconds.
 * Returns 0 when the file cannot be opened or parsed.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int matched;

	if (!f)
		return 0;

	/* both fields carry a fractional part, so read them as doubles */
	matched = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (matched != 2)
		return 0;
	return (long int)idle;
}
1808
1809/*
1810 * We read /proc/uptime and reuse its second field.
1811 * For the first field, we use the mtime for the reaper for
1812 * the calling pid as returned by getreaperage
1813 */
23ce2127
SH
1814static int proc_uptime_read(char *buf, size_t size, off_t offset,
1815 struct fuse_file_info *fi)
1816{
41bb9357
SH
1817 struct fuse_context *fc = fuse_get_context();
1818 long int reaperage = getreaperage(fc->pid);;
1819 long int idletime = getprocidle();
1820
1821 if (offset)
1822 return -EINVAL;
1823 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
23ce2127
SH
1824}
1825
/*
 * Return the number of bytes in the file 'which', counted by reading it
 * line by line. Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t len = 0;
	ssize_t sz, answer = 0;

	if (!f)
		return 0;

	while ((sz = getline(&line, &len, f)) != -1)
		answer += sz;
	fclose(f);
	/* getline allocated the buffer; release it */
	free(line);

	return answer;
}
1841
758ad80c
SH
/*
 * getattr handler for /proc and the files below it.
 *
 * Fills 'sb' with root ownership and the current time for all three
 * timestamps. /proc is reported as a 0555 directory; the four emulated
 * files are reported as 0444 regular files whose size comes from
 * get_procfile_size(). Returns -EINVAL when the clock cannot be read and
 * -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	struct timespec ts;
	size_t i;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = ts;
	sb->st_mtim = ts;
	sb->st_ctim = ts;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) != 0)
			continue;
		sb->st_size = get_procfile_size(path);
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
1869
1870static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1871 struct fuse_file_info *fi)
1872{
1873 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
1874 filler(buf, "meminfo", NULL, 0) != 0 ||
1875 filler(buf, "stat", NULL, 0) != 0 ||
1876 filler(buf, "uptime", NULL, 0) != 0)
758ad80c 1877 return -EINVAL;
758ad80c
SH
1878 return 0;
1879}
1880
35629743
SH
/*
 * open handler for /proc: succeeds for the four emulated files and
 * returns -ENOENT for anything else.
 */
static int proc_open(const char *path, struct fuse_file_info *fi)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
	};
	size_t i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (!strcmp(path, files[i]))
			return 0;
	}
	return -ENOENT;
}
1890
35629743
SH
/*
 * read handler for /proc: dispatch to the reader of the requested file.
 * Returns that reader's result, or -EINVAL for an unknown path.
 */
static int proc_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strcmp(path, "/proc/meminfo"))
		return proc_meminfo_read(buf, size, offset, fi);

	if (!strcmp(path, "/proc/cpuinfo"))
		return proc_cpuinfo_read(buf, size, offset, fi);

	if (!strcmp(path, "/proc/uptime"))
		return proc_uptime_read(buf, size, offset, fi);

	if (!strcmp(path, "/proc/stat"))
		return proc_stat_read(buf, size, offset, fi);

	return -EINVAL;
}
1904
2ad6d2bd
SH
1905/*
1906 * FUSE ops for /
1907 * these just delegate to the /proc and /cgroup ops as
1908 * needed
1909 */
758ad80c
SH
1910
1911static int lxcfs_getattr(const char *path, struct stat *sb)
1912{
1913 if (strcmp(path, "/") == 0) {
1914 sb->st_mode = S_IFDIR | 00755;
1915 sb->st_nlink = 2;
1916 return 0;
1917 }
1918 if (strncmp(path, "/cgroup", 7) == 0) {
1919 return cg_getattr(path, sb);
1920 }
35629743 1921 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
1922 return proc_getattr(path, sb);
1923 }
1924 return -EINVAL;
1925}
1926
/*
 * Top-level opendir: "/" and "/proc" open trivially, paths starting with
 * /cgroup are delegated to cg_opendir(). Any other path yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
1939
1940static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1941 struct fuse_file_info *fi)
1942{
1943 if (strcmp(path, "/") == 0) {
1944 if (filler(buf, "proc", NULL, 0) != 0 ||
1945 filler(buf, "cgroup", NULL, 0) != 0)
1946 return -EINVAL;
1947 return 0;
1948 }
35629743 1949 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 1950 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
1951 if (strcmp(path, "/proc") == 0)
1952 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
1953 return -EINVAL;
1954}
1955
/*
 * Top-level releasedir: nothing to do for "/" and "/proc"; paths starting
 * with /cgroup are delegated to cg_releasedir(). Any other path yields
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
1967
99978832
SH
/*
 * Top-level open: delegate to cg_open() or proc_open() based on the path
 * prefix. Any other path yields -EINVAL.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_open(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_open(path, fi);

	return -EINVAL;
}
1977
/*
 * Top-level read: delegate to cg_read() or proc_read() based on the path
 * prefix. Any other path yields -EINVAL.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_read(path, buf, size, offset, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
1988
2ad6d2bd
SH
/*
 * Top-level write: only paths starting with /cgroup are writable, and
 * those are handed to cg_write(). Any other path yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
1998
99978832
SH
/* flush handler: nothing to do, always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2003
/* release handler: nothing to do, always reports success. */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2008
/* fsync handler: nothing to do, always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2013
ab54b798
SH
/*
 * Top-level mkdir: only paths starting with /cgroup are supported, and
 * those are handed to cg_mkdir(). Any other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2021
341b21ad
SH
/*
 * Top-level chown: only paths starting with /cgroup are supported, and
 * those are handed to cg_chown(). Any other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2029
2ad6d2bd
SH
/*
 * cat first does a truncate before doing ops->write. Truncating does not
 * really make sense for cgroup files, so for paths starting with /cgroup
 * report success without doing anything. Any other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2041
50d8d5b5
SH
/*
 * Top-level rmdir: only paths starting with /cgroup are supported, and
 * those are handed to cg_rmdir(). Any other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2048
fd2e4e03
SH
/*
 * Top-level chmod: only paths starting with /cgroup are supported, and
 * those are handed to cg_chmod(). Any other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2055
758ad80c
SH
2056const struct fuse_operations lxcfs_ops = {
2057 .getattr = lxcfs_getattr,
2058 .readlink = NULL,
2059 .getdir = NULL,
2060 .mknod = NULL,
ab54b798 2061 .mkdir = lxcfs_mkdir,
758ad80c 2062 .unlink = NULL,
50d8d5b5 2063 .rmdir = lxcfs_rmdir,
758ad80c
SH
2064 .symlink = NULL,
2065 .rename = NULL,
2066 .link = NULL,
fd2e4e03 2067 .chmod = lxcfs_chmod,
341b21ad 2068 .chown = lxcfs_chown,
2ad6d2bd 2069 .truncate = lxcfs_truncate,
758ad80c 2070 .utime = NULL,
99978832
SH
2071
2072 .open = lxcfs_open,
2073 .read = lxcfs_read,
2074 .release = lxcfs_release,
2ad6d2bd 2075 .write = lxcfs_write,
99978832 2076
758ad80c 2077 .statfs = NULL,
99978832
SH
2078 .flush = lxcfs_flush,
2079 .fsync = lxcfs_fsync,
758ad80c
SH
2080
2081 .setxattr = NULL,
2082 .getxattr = NULL,
2083 .listxattr = NULL,
2084 .removexattr = NULL,
2085
2086 .opendir = lxcfs_opendir,
2087 .readdir = lxcfs_readdir,
2088 .releasedir = lxcfs_releasedir,
2089
2090 .fsyncdir = NULL,
2091 .init = NULL,
2092 .destroy = NULL,
2093 .access = NULL,
2094 .create = NULL,
2095 .ftruncate = NULL,
2096 .fgetattr = NULL,
2097};
2098
/* Print the command-line synopsis for program 'me' to stderr and exit with status 1. */
static void usage(const char *me)
{
	fprintf(stderr,
		"Usage:\n"
		"\n"
		"%s [FUSE and mount options] mountpoint\n",
		me);
	exit(1);
}
2106
/* Return true when 'w' is one of the recognized help flags. */
static bool is_help(char *w)
{
	static const char *const flags[] = { "-h", "--help", "-help", "help" };
	size_t i;

	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
		if (!strcmp(w, flags[i]))
			return true;
	}
	return false;
}
2116
2117int main(int argc, char *argv[])
2118{
2119 int ret;
2120 struct lxcfs_state *d;
2121
2122 if (argc < 2 || is_help(argv[1]))
2123 usage(argv[0]);
2124
2125 d = malloc(sizeof(*d));
2126 if (!d)
2127 return -1;
2128
2129 if (!cgm_escape_cgroup())
2130 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2131
2132 if (!cgm_get_controllers(&d->subsystems))
2133 return -1;
2134
2135 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2136
2137 return ret;
2183082c 2138}