]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
add cache types for each procfile type that we provide
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
9/*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13#define FUSE_USE_VERSION 26
14
2183082c 15#include <stdio.h>
758ad80c
SH
16#include <dirent.h>
17#include <fcntl.h>
18#include <fuse.h>
19#include <unistd.h>
20#include <errno.h>
21#include <stdbool.h>
22#include <time.h>
23#include <string.h>
24#include <stdlib.h>
25#include <libgen.h>
41bb9357
SH
26#include <sched.h>
27#include <linux/sched.h>
a05660a6 28#include <sys/socket.h>
41bb9357
SH
29#include <sys/mount.h>
30#include <wait.h>
758ad80c
SH
31
32#include <nih/alloc.h>
33#include <nih/string.h>
c688e1b3 34#include <nih/alloc.h>
758ad80c
SH
35
36#include "cgmanager.h"
37
/*
 * State attached to the FUSE context as private_data.
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array holding the names of the
	 * mounted subsystems.  It is detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state carried by the current FUSE request context. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
46
443d13f5
SH
/*
 * Kind of object a struct file_info handle describes; stored in its
 * 'type' member.
 */
enum {
	LXC_TYPE_CGDIR = 0,		/* a cgroup directory */
	LXC_TYPE_CGFILE,		/* a file inside a cgroup */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
56
c688e1b3
SH
/*
 * Per-open-handle bookkeeping.  A pointer to one of these is stored in
 * fuse_file_info->fh by the open/opendir handlers and freed again by
 * do_release_file_info().
 */
struct file_info {
	char *controller;	/* nih-allocated controller name, or NULL */
	char *cgroup;		/* nih-allocated cgroup path, or NULL */
	char *file;		/* nih-allocated file name, NULL for directories */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* not used yet */
	int buflen;
};
65
/*
 * Return a parentless nih_strdup() copy of str, made under NIH_MUST.
 * A NULL input is passed through as NULL.
 */
static char *must_copy_string(const char *str)
{
	char *copy = NULL;

	if (str)
		copy = NIH_MUST( nih_strdup(NULL, str) );
	return copy;
}
72
4775fba1
SH
/*
 * Reap child 'pid', restarting waitpid() when it is interrupted by a
 * signal.
 *
 * Returns 0 when the child exited normally with status 0, and -1 when
 * waitpid() failed or the child exited abnormally or with a non-zero
 * status.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
94
053a659d
SH
/*
 * Translate an id through an id map.
 *
 * idfile is an open FILE * on /proc/pid/{u,g}id_map and in_id is an id
 * valid in the caller's namespace.  The file is rewound and scanned line
 * by line; lines that do not parse as three unsigned numbers are skipped.
 *
 * Returns the id as seen in pid's namespace, or -1 (as unsigned int) when
 * no range covers in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base;	/* first id of the range inside the namespace */
		unsigned int host_base;	/* first id of the range in the caller's namespace */
		unsigned int range;	/* number of ids in the range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		/*
		 * A wrapping range is unexpected in a procfile, so give
		 * up instead of guessing.
		 */
		if (host_base + range < host_base || ns_base + range < ns_base) {
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither end
		 * of the range wraps, so this sum cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
138
341b21ad
SH
139/*
140 * for is_privileged_over,
141 * specify whether we require the calling uid to be root in his
142 * namespace
143 */
144#define NS_ROOT_REQD true
145#define NS_ROOT_OPT false
146
147static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 148{
053a659d
SH
149 nih_local char *fpath = NULL;
150 bool answer = false;
151 uid_t nsuid;
152
341b21ad
SH
153 if (victim == -1 || uid == -1)
154 return false;
155
156 /*
157 * If the request is one not requiring root in the namespace,
158 * then having the same uid suffices. (i.e. uid 1000 has write
159 * access to files owned by uid 1000
160 */
161 if (!req_ns_root && uid == victim)
758ad80c
SH
162 return true;
163
053a659d
SH
164 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
165 FILE *f = fopen(fpath, "r");
166 if (!f)
167 return false;
168
341b21ad 169 /* if caller's not root in his namespace, reject */
053a659d
SH
170 nsuid = convert_id_to_ns(f, uid);
171 if (nsuid)
172 goto out;
173
341b21ad
SH
174 /*
175 * If victim is not mapped into caller's ns, reject.
176 * XXX I'm not sure this check is needed given that fuse
177 * will be sending requests where the vfs has converted
178 */
053a659d
SH
179 nsuid = convert_id_to_ns(f, victim);
180 if (nsuid == -1)
181 goto out;
182
183 answer = true;
184
185out:
186 fclose(f);
187 return answer;
758ad80c
SH
188}
189
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the open(2)-style access mode in req_mode.  Callers shift
 * fmode right by 6 or 3 first to test the owner or group bits.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * and O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t accmode = req_mode & O_ACCMODE;
	mode_t needed;

	if (accmode == O_RDONLY)
		needed = S_IROTH;
	else if (accmode == O_WRONLY)
		needed = S_IWOTH;
	else if (accmode == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
209
3db25a35
SH
/*
 * Return the first path component of taskcg that lies below querycg,
 * e.g. taskcg "/a/b/c" with querycg "/a" gives "b".  A querycg of "/"
 * is treated as the root and only the leading slash of taskcg is
 * skipped.
 *
 * The result is a parentless nih-allocated string.  NULL is returned,
 * after a message on stderr, when taskcg is not longer than querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
228
758ad80c
SH
229/*
230 * check whether a fuse context may access a cgroup dir or file
231 *
232 * If file is not null, it is a cgroup file to check under cg.
233 * If file is null, then we are checking perms on cg itself.
234 *
235 * For files we can check the mode of the list_keys result.
236 * For cgroups, we must make assumptions based on the files under the
237 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
238 * yet.
239 */
240static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
241{
242 nih_local struct cgm_keys **list = NULL;
243 int i;
244
245 if (!file)
246 file = "tasks";
247
248 if (*file == '/')
249 file++;
250
251 if (!cgm_list_keys(contrl, cg, &list))
252 return false;
253 for (i = 0; list[i]; i++) {
254 if (strcmp(list[i]->name, file) == 0) {
255 struct cgm_keys *k = list[i];
341b21ad 256 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
257 if (perms_include(k->mode >> 6, mode))
258 return true;
259 }
260 if (fc->gid == k->gid) {
261 if (perms_include(k->mode >> 3, mode))
262 return true;
263 }
264 return perms_include(k->mode, mode);
265 }
266 }
267
268 return false;
269}
270
3db25a35
SH
/* Remove one trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
277
278/*
279 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
280 * If caller is in /a, he may act on /a/b, but not on /b.
281 * if the answer is false and nextcg is not NULL, then *nextcg will point
282 * to a nih_alloc'd string containing the next cgroup directory under cg
283 */
284static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
285{
286 nih_local char *fnam = NULL;
287 FILE *f;
288 bool answer = false;
289 char *line = NULL;
290 size_t len = 0;
291
292 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
293 if (!(f = fopen(fnam, "r")))
294 return false;
295
296 while (getline(&line, &len, f) != -1) {
297 char *c1, *c2, *linecmp;
298 if (!line[0])
299 continue;
300 c1 = strchr(line, ':');
301 if (!c1)
302 goto out;
303 c1++;
304 c2 = strchr(c1, ':');
305 if (!c2)
306 goto out;
307 *c2 = '\0';
308 if (strcmp(c1, contrl) != 0)
309 continue;
310 c2++;
311 stripnewline(c2);
312 /*
313 * callers pass in '/' for root cgroup, otherwise they pass
314 * in a cgroup without leading '/'
315 */
316 linecmp = *cg == '/' ? c2 : c2+1;
317 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
318 if (nextcg)
319 *nextcg = get_next_cgroup_dir(linecmp, cg);
320 goto out;
321 }
322 answer = true;
323 goto out;
324 }
325
326out:
327 fclose(f);
328 free(line);
329 return answer;
330}
331
758ad80c
SH
332/*
333 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
334 * and needs to be nih_freed.
335 */
336static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
337{
338 const char *p1;
339 char *ret, *slash;
340
341 if (strlen(path) < 9)
342 return NULL;
343 p1 = path+8;
344 ret = nih_strdup(NULL, p1);
345 if (!ret)
346 return ret;
347 slash = strstr(ret, "/");
348 if (slash)
349 *slash = '\0';
350
351 /* verify that it is a subsystem */
352 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
353 int i;
354 if (!list) {
355 nih_free(ret);
356 return NULL;
357 }
358 for (i = 0; list[i]; i++) {
359 if (strcmp(list[i], ret) == 0)
360 return ret;
361 }
362 nih_free(ret);
363 return NULL;
364}
365
/*
 * Locate the cgroup part of /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into 'path' just past the '/' that follows the
 * controller name; the remainder may include file (key) names.  Returns
 * NULL when path is shorter than 9 characters or nothing follows the
 * controller.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	/* the controller name starts after the 8 characters of "/cgroup/" */
	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
381
382static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
383{
384 nih_local char **list = NULL;
385 int i;
386
387 if (!f)
388 return false;
389 if (*f == '/')
390 f++;
391
392 if (!cgm_list_children(contr, dir, &list))
393 return false;
394 for (i = 0; list[i]; i++) {
395 if (strcmp(list[i], f) == 0)
396 return true;
397 }
398
399 return false;
400}
401
402static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
403{
404 nih_local struct cgm_keys **list = NULL;
405 struct cgm_keys *k;
406 int i;
407
408 if (!f)
409 return NULL;
410 if (*f == '/')
411 f++;
412 if (!cgm_list_keys(contr, dir, &list))
413 return NULL;
414 for (i = 0; list[i]; i++) {
415 if (strcmp(list[i]->name, f) == 0) {
416 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
417 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
418 k->uid = list[i]->uid;
419 k->gid = list[i]->gid;
420 k->mode = list[i]->mode;
421 return k;
422 }
423 }
424
425 return NULL;
426}
427
/*
 * Split cg at its last '/'.
 *
 * *dir receives a parentless nih-allocated copy of cg truncated at the
 * last '/'.  *file is set to point at that last '/' inside the original
 * cg string, so it still carries the leading slash.  When cg contains no
 * '/', *dir is a full copy of cg and *file is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *slash = strrchr(cg, '/');

	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = slash;

	/* the copy has its last '/' at the same offset as cg */
	if (slash)
		(*dir)[slash - cg] = '\0';
}
441
99978832
SH
442static size_t get_file_size(const char *contrl, const char *cg, const char *f)
443{
444 nih_local char *data = NULL;
445 size_t s;
446 if (!cgm_get_value(contrl, cg, f, &data))
447 return -EINVAL;
448 s = strlen(data);
449 return s;
450}
2ad6d2bd 451
758ad80c 452/*
2ad6d2bd 453 * FUSE ops for /cgroup
758ad80c 454 */
2ad6d2bd 455
758ad80c
SH
456static int cg_getattr(const char *path, struct stat *sb)
457{
458 struct timespec now;
459 struct fuse_context *fc = fuse_get_context();
460 nih_local char * cgdir = NULL;
461 char *fpath = NULL, *path1, *path2;
462 nih_local struct cgm_keys *k = NULL;
463 const char *cgroup;
464 nih_local char *controller = NULL;
465
466
467 if (!fc)
468 return -EIO;
469
470 memset(sb, 0, sizeof(struct stat));
471
472 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
473 return -EINVAL;
474
475 sb->st_uid = sb->st_gid = 0;
476 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
477 sb->st_size = 0;
478
479 if (strcmp(path, "/cgroup") == 0) {
480 sb->st_mode = S_IFDIR | 00755;
481 sb->st_nlink = 2;
482 return 0;
483 }
484
485 controller = pick_controller_from_path(fc, path);
486 if (!controller)
487 return -EIO;
758ad80c
SH
488 cgroup = find_cgroup_in_path(path);
489 if (!cgroup) {
490 /* this is just /cgroup/controller, return it as a dir */
491 sb->st_mode = S_IFDIR | 00755;
492 sb->st_nlink = 2;
493 return 0;
494 }
341b21ad 495
758ad80c
SH
496 get_cgdir_and_path(cgroup, &cgdir, &fpath);
497
498 if (!fpath) {
499 path1 = "/";
500 path2 = cgdir;
501 } else {
502 path1 = cgdir;
503 path2 = fpath;
504 }
505
758ad80c
SH
506 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
507 * Then check that caller's cgroup is under path if fpath is a child
508 * cgroup, or cgdir if fpath is a file */
509
510 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
511 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
512 /* this is just /cgroup/controller, return it as a dir */
513 sb->st_mode = S_IFDIR | 00555;
514 sb->st_nlink = 2;
515 return 0;
516 }
758ad80c 517 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 518 return -EACCES;
758ad80c 519
053a659d
SH
520 // get uid, gid, from '/tasks' file and make up a mode
521 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
522 sb->st_mode = S_IFDIR | 00755;
523 k = get_cgroup_key(controller, cgroup, "tasks");
524 if (!k) {
053a659d
SH
525 sb->st_uid = sb->st_gid = 0;
526 } else {
053a659d
SH
527 sb->st_uid = k->uid;
528 sb->st_gid = k->gid;
529 }
758ad80c
SH
530 sb->st_nlink = 2;
531 return 0;
532 }
533
534 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
535 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
536 return -ENOENT;
758ad80c 537 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 538 return -EACCES;
758ad80c 539
758ad80c 540 sb->st_mode = S_IFREG | k->mode;
053a659d 541 sb->st_nlink = 1;
758ad80c
SH
542 sb->st_uid = k->uid;
543 sb->st_gid = k->gid;
99978832 544 sb->st_size = get_file_size(controller, path1, path2);
758ad80c
SH
545 return 0;
546 }
547
ab54b798 548 return -ENOENT;
758ad80c 549}
2183082c 550
7f163b71
SH
551/*
552 * TODO - cache these results in a table for use in opendir, free
553 * in releasedir
554 */
758ad80c 555static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 556{
7f163b71
SH
557 struct fuse_context *fc = fuse_get_context();
558 nih_local struct cgm_keys **list = NULL;
559 const char *cgroup;
c688e1b3 560 struct file_info *dir_info;
7f163b71 561 nih_local char *controller = NULL;
7f163b71
SH
562
563 if (!fc)
564 return -EIO;
565
c688e1b3
SH
566 if (strcmp(path, "/cgroup") == 0) {
567 cgroup = NULL;
568 controller = NULL;
569 } else {
570 // return list of keys for the controller, and list of child cgroups
571 controller = pick_controller_from_path(fc, path);
572 if (!controller)
573 return -EIO;
7f163b71 574
c688e1b3
SH
575 cgroup = find_cgroup_in_path(path);
576 if (!cgroup) {
577 /* this is just /cgroup/controller, return its contents */
578 cgroup = "/";
579 }
7f163b71
SH
580 }
581
582 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
583 return -EACCES;
c688e1b3
SH
584
585 /* we'll free this at cg_releasedir */
586 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
587 dir_info->controller = must_copy_string(controller);
588 dir_info->cgroup = must_copy_string(cgroup);
443d13f5 589 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 590 dir_info->buf = NULL;
8f6e8f5e 591 dir_info->file = NULL;
c688e1b3
SH
592 dir_info->buflen = 0;
593
594 fi->fh = (unsigned long)dir_info;
758ad80c
SH
595 return 0;
596}
597
758ad80c
SH
598static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
599 struct fuse_file_info *fi)
600{
c688e1b3
SH
601 struct file_info *d = (struct file_info *)fi->fh;
602 nih_local struct cgm_keys **list = NULL;
603 int i;
604 nih_local char *nextcg = NULL;
758ad80c
SH
605 struct fuse_context *fc = fuse_get_context();
606
443d13f5 607 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
608 fprintf(stderr, "Internal error: file cache info used in readdir\n");
609 return -EIO;
610 }
c688e1b3
SH
611 if (!d->cgroup && !d->controller) {
612 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
613 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
614 int i;
615
616 if (!list)
617 return -EIO;
7f163b71 618
758ad80c
SH
619 for (i = 0; list[i]; i++) {
620 if (filler(buf, list[i], NULL, 0) != 0) {
621 return -EIO;
622 }
623 }
624 return 0;
625 }
626
c688e1b3 627 if (!cgm_list_keys(d->controller, d->cgroup, &list))
3db25a35 628 // not a valid cgroup
758ad80c 629 return -EINVAL;
3db25a35 630
c688e1b3 631 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
632 if (nextcg) {
633 int ret;
634 ret = filler(buf, nextcg, NULL, 0);
635 if (ret != 0)
636 return -EIO;
637 }
638 return 0;
639 }
640
758ad80c 641 for (i = 0; list[i]; i++) {
758ad80c
SH
642 if (filler(buf, list[i]->name, NULL, 0) != 0) {
643 return -EIO;
644 }
645 }
646
647 // now get the list of child cgroups
422aa4a5 648 nih_local char **clist = NULL;
758ad80c 649
c688e1b3 650 if (!cgm_list_children(d->controller, d->cgroup, &clist))
758ad80c
SH
651 return 0;
652 for (i = 0; clist[i]; i++) {
758ad80c
SH
653 if (filler(buf, clist[i], NULL, 0) != 0) {
654 return -EIO;
655 }
656 }
657 return 0;
658}
659
8f6e8f5e
SH
660static void do_release_file_info(struct file_info *f)
661{
662 if (f->controller)
663 nih_free(f->controller);
664 if (f->cgroup)
665 nih_free(f->cgroup);
666 if (f->file)
667 nih_free(f->file);
668 free(f->buf);
669 nih_free(f);
670}
671
758ad80c
SH
672static int cg_releasedir(const char *path, struct fuse_file_info *fi)
673{
c688e1b3
SH
674 struct file_info *d = (struct file_info *)fi->fh;
675
8f6e8f5e 676 do_release_file_info(d);
758ad80c
SH
677 return 0;
678}
679
99978832
SH
680static int cg_open(const char *path, struct fuse_file_info *fi)
681{
682 nih_local char *controller = NULL;
683 const char *cgroup;
684 char *fpath = NULL, *path1, *path2;
685 nih_local char * cgdir = NULL;
686 nih_local struct cgm_keys *k = NULL;
8f6e8f5e 687 struct file_info *file_info;
99978832
SH
688 struct fuse_context *fc = fuse_get_context();
689
690 if (!fc)
691 return -EIO;
692
693 controller = pick_controller_from_path(fc, path);
694 if (!controller)
695 return -EIO;
696 cgroup = find_cgroup_in_path(path);
697 if (!cgroup)
698 return -EINVAL;
699
700 get_cgdir_and_path(cgroup, &cgdir, &fpath);
701 if (!fpath) {
702 path1 = "/";
703 path2 = cgdir;
704 } else {
705 path1 = cgdir;
706 path2 = fpath;
707 }
708
8f6e8f5e
SH
709 k = get_cgroup_key(controller, path1, path2);
710 if (!k)
711 return -EINVAL;
99978832 712
8f6e8f5e
SH
713 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
714 // should never get here
715 return -EACCES;
99978832 716
8f6e8f5e
SH
717 /* we'll free this at cg_release */
718 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
719 file_info->controller = must_copy_string(controller);
720 file_info->cgroup = must_copy_string(path1);
721 file_info->file = must_copy_string(path2);
443d13f5 722 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
723 file_info->buf = NULL;
724 file_info->buflen = 0;
725
726 fi->fh = (unsigned long)file_info;
727 return 0;
728}
729
730static int cg_release(const char *path, struct fuse_file_info *fi)
731{
732 struct file_info *f = (struct file_info *)fi->fh;
733
734 do_release_file_info(f);
735 return 0;
99978832
SH
736}
737
a05660a6
SH
/*
 * Wait up to two seconds for sockfd to become readable, then receive up
 * to len bytes into buf without blocking.
 *
 * Returns the result of recv(), or -1 when select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
752
01e71852
SH
753#define SEND_CREDS_OK 0
754#define SEND_CREDS_NOTSK 1
755#define SEND_CREDS_FAIL 2
756static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
757{
758 struct msghdr msg = { 0 };
759 struct iovec iov;
760 struct cmsghdr *cmsg;
761 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
762 char buf[1];
763 buf[0] = 'p';
764
01e71852
SH
765 if (pingfirst) {
766 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 767 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
768 __func__);
769 return SEND_CREDS_FAIL;
770 }
a05660a6
SH
771 }
772
773 msg.msg_control = cmsgbuf;
774 msg.msg_controllen = sizeof(cmsgbuf);
775
776 cmsg = CMSG_FIRSTHDR(&msg);
777 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
778 cmsg->cmsg_level = SOL_SOCKET;
779 cmsg->cmsg_type = SCM_CREDENTIALS;
780 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
781
782 msg.msg_name = NULL;
783 msg.msg_namelen = 0;
784
785 buf[0] = v;
786 iov.iov_base = buf;
787 iov.iov_len = sizeof(buf);
788 msg.msg_iov = &iov;
789 msg.msg_iovlen = 1;
790
791 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 792 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
793 strerror(errno));
794 if (errno == 3)
01e71852
SH
795 return SEND_CREDS_NOTSK;
796 return SEND_CREDS_FAIL;
a05660a6
SH
797 }
798
01e71852 799 return SEND_CREDS_OK;
a05660a6
SH
800}
801
802static bool recv_creds(int sock, struct ucred *cred, char *v)
803{
804 struct msghdr msg = { 0 };
805 struct iovec iov;
806 struct cmsghdr *cmsg;
807 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
808 char buf[1];
809 int ret;
810 int optval = 1;
6ee867dc
SH
811 struct timeval tv;
812 fd_set rfds;
a05660a6
SH
813
814 *v = '1';
815
816 cred->pid = -1;
817 cred->uid = -1;
818 cred->gid = -1;
819
820 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 821 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
822 return false;
823 }
824 buf[0] = '1';
825 if (write(sock, buf, 1) != 1) {
1420baf8 826 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
827 return false;
828 }
829
830 msg.msg_name = NULL;
831 msg.msg_namelen = 0;
832 msg.msg_control = cmsgbuf;
833 msg.msg_controllen = sizeof(cmsgbuf);
834
835 iov.iov_base = buf;
836 iov.iov_len = sizeof(buf);
837 msg.msg_iov = &iov;
838 msg.msg_iovlen = 1;
839
6ee867dc
SH
840 FD_ZERO(&rfds);
841 FD_SET(sock, &rfds);
842 tv.tv_sec = 2;
843 tv.tv_usec = 0;
ea56f722 844 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
845 fprintf(stderr, "Failed to select for scm_cred: %s\n",
846 strerror(errno));
847 return false;
848 }
849 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 850 if (ret < 0) {
1420baf8 851 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
852 strerror(errno));
853 return false;
854 }
855
856 cmsg = CMSG_FIRSTHDR(&msg);
857
858 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
859 cmsg->cmsg_level == SOL_SOCKET &&
860 cmsg->cmsg_type == SCM_CREDENTIALS) {
861 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
862 }
863 *v = buf[0];
864
865 return true;
866}
867
868
869/*
4775fba1
SH
870 * pid_to_ns - reads pids from a ucred over a socket, then writes the
871 * int value back over the socket. This shifts the pid from the
872 * sender's pidns into tpid's pidns.
a05660a6 873 */
4775fba1 874static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
875{
876 char v = '0';
877 struct ucred cred;
878
879 while (recv_creds(sock, &cred, &v)) {
880 if (v == '1')
881 exit(0);
a05660a6
SH
882 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
883 exit(1);
884 }
885 exit(0);
886}
887
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges over a pipe that it has started.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits with 0 if the child exited cleanly, and with 1 if
 * it did not or if entering the namespace, creating the pipe or forking
 * failed.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();
	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		/* does not return */
		pid_to_ns(sock, tpid);
	}
	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	/* wait_for_pid() returns 0 on a clean exit, so non-zero is the failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
952
953/*
954 * To read cgroup files with a particular pid, we will setns into the child
955 * pidns, open a pipe, fork a child - which will be the first to really be in
956 * the child ns - which does the cgm_get_value and writes the data to the pipe.
957 */
958static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
959{
960 int sock[2] = {-1, -1};
961 nih_local char *tmpdata = NULL;
962 int ret;
963 pid_t qpid, cpid = -1;
964 bool answer = false;
965 char v = '0';
966 struct ucred cred;
967 struct timeval tv;
968 fd_set s;
969
970 if (!cgm_get_value(contrl, cg, file, &tmpdata))
971 return false;
972
973 /*
974 * Now we read the pids from returned data one by one, pass
975 * them into a child in the target namespace, read back the
976 * translated pids, and put them into our to-return data
977 */
978
979 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
980 perror("socketpair");
981 exit(1);
982 }
983
984 cpid = fork();
985 if (cpid == -1)
986 goto out;
987
988 if (!cpid) // child
4775fba1 989 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
990
991 char *ptr = tmpdata;
992 cred.uid = 0;
993 cred.gid = 0;
994 while (sscanf(ptr, "%d\n", &qpid) == 1) {
995 cred.pid = qpid;
01e71852
SH
996 ret = send_creds(sock[0], &cred, v, true);
997
998 if (ret == SEND_CREDS_NOTSK)
999 goto next;
1000 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
1001 goto out;
1002
1003 // read converted results
1004 FD_ZERO(&s);
1005 FD_SET(sock[0], &s);
6ee867dc 1006 tv.tv_sec = 2;
a05660a6
SH
1007 tv.tv_usec = 0;
1008 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1009 if (ret <= 0) {
6ee867dc
SH
1010 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1011 __func__, strerror(errno));
a05660a6
SH
1012 goto out;
1013 }
1014 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1015 fprintf(stderr, "%s: error reading pid from child: %s\n",
1016 __func__, strerror(errno));
a05660a6
SH
1017 goto out;
1018 }
a05660a6 1019 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 1020next:
a05660a6
SH
1021 ptr = strchr(ptr, '\n');
1022 if (!ptr)
1023 break;
1024 ptr++;
1025 }
1026
1027 cred.pid = getpid();
1028 v = '1';
01e71852 1029 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1030 // failed to ask child to exit
6ee867dc
SH
1031 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1032 __func__, strerror(errno));
a05660a6
SH
1033 goto out;
1034 }
1035
1036 answer = true;
1037
1038out:
1039 if (cpid != -1)
1040 wait_for_pid(cpid);
1041 if (sock[0] != -1) {
1042 close(sock[0]);
1043 close(sock[1]);
1044 }
1045 return answer;
1046}
1047
99978832
SH
1048static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1049 struct fuse_file_info *fi)
1050{
99978832 1051 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1052 struct file_info *f = (struct file_info *)fi->fh;
99978832
SH
1053 nih_local struct cgm_keys *k = NULL;
1054
443d13f5 1055 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1056 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1057 return -EIO;
1058 }
1059
99978832
SH
1060 if (offset)
1061 return -EIO;
1062
1063 if (!fc)
1064 return -EIO;
1065
8f6e8f5e 1066 if (!f->controller)
99978832
SH
1067 return -EINVAL;
1068
8f6e8f5e 1069 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
99978832 1070 nih_local char *data = NULL;
4775fba1
SH
1071 int s;
1072 bool r;
99978832 1073
8f6e8f5e 1074 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
f9a05025
SH
1075 // should never get here
1076 return -EACCES;
99978832 1077
8f6e8f5e
SH
1078 if (strcmp(f->file, "tasks") == 0 ||
1079 strcmp(f->file, "/tasks") == 0 ||
1080 strcmp(f->file, "/cgroup.procs") == 0 ||
1081 strcmp(f->file, "cgroup.procs") == 0)
a05660a6 1082 // special case - we have to translate the pids
8f6e8f5e 1083 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
a05660a6 1084 else
8f6e8f5e 1085 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
a05660a6 1086
4775fba1 1087 if (!r)
99978832
SH
1088 return -EINVAL;
1089
4775fba1
SH
1090 if (!data)
1091 return 0;
99978832
SH
1092 s = strlen(data);
1093 if (s > size)
1094 s = size;
1095 memcpy(buf, data, s);
1096
99978832
SH
1097 return s;
1098 }
1099
1100 return -EINVAL;
1101}
1102
4775fba1
SH
1103static void pid_from_ns(int sock, pid_t tpid)
1104{
1105 pid_t vpid;
1106 struct ucred cred;
1107 char v;
6ee867dc
SH
1108 struct timeval tv;
1109 fd_set s;
1110 int ret;
4775fba1
SH
1111
1112 cred.uid = 0;
1113 cred.gid = 0;
6ee867dc
SH
1114 while (1) {
1115 FD_ZERO(&s);
1116 FD_SET(sock, &s);
1117 tv.tv_sec = 2;
1118 tv.tv_usec = 0;
1119 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1120 if (ret <= 0) {
1121 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc
SH
1122 __func__, strerror(errno));
1123 exit(1);
1124 }
1125 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1126 fprintf(stderr, "%s: bad read from parent: %s\n",
1127 __func__, strerror(errno));
1128 exit(1);
1129 }
4775fba1 1130 if (vpid == -1) // done
01e71852 1131 break;
4775fba1
SH
1132 v = '0';
1133 cred.pid = vpid;
01e71852 1134 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1135 v = '1';
1136 cred.pid = getpid();
01e71852 1137 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1138 exit(1);
1139 }
1140 }
1141 exit(0);
1142}
1143
/*
 * Enter the pid namespace of 'tpid' and fork a child that runs
 * pid_from_ns() on 'sock'.  The fork is needed because setns() only
 * takes effect for children: the caller itself stays in its old pidns.
 *
 * The child acknowledges over a pipe that it has started.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits with 0 if the child exited cleanly, and with 1 if
 * it did not or if entering the namespace, creating the pipe or forking
 * failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();

	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		/* does not return */
		pid_from_ns(sock, tpid);
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	/* wait_for_pid() returns 0 on a clean exit, so non-zero is the failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
1204
1205static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1206{
1207 int sock[2] = {-1, -1};
1208 pid_t qpid, cpid = -1;
1209 bool answer = false, fail = false;
1210
1211 /*
1212 * write the pids to a socket, have helper in writer's pidns
1213 * call movepid for us
1214 */
1215 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1216 perror("socketpair");
1217 exit(1);
1218 }
1219
1220 cpid = fork();
1221 if (cpid == -1)
1222 goto out;
1223
1224 if (!cpid) // child
1225 pid_from_ns_wrapper(sock[1], tpid);
1226
1227 const char *ptr = buf;
1228 while (sscanf(ptr, "%d", &qpid) == 1) {
1229 struct ucred cred;
1230 char v;
1231
1232 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1233 fprintf(stderr, "%s: error writing pid to child: %s\n",
1234 __func__, strerror(errno));
4775fba1
SH
1235 goto out;
1236 }
1237
01e71852
SH
1238 if (recv_creds(sock[0], &cred, &v)) {
1239 if (v == '0') {
1240 if (!cgm_move_pid(contrl, cg, cred.pid))
1241 fail = true;
1242 }
4775fba1
SH
1243 }
1244
1245 ptr = strchr(ptr, '\n');
1246 if (!ptr)
1247 break;
1248 ptr++;
1249 }
1250
1251 /* All good, write the value */
1252 qpid = -1;
1253 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1254 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1255
1256 if (!fail)
1257 answer = true;
1258
1259out:
1260 if (cpid != -1)
1261 wait_for_pid(cpid);
1262 if (sock[0] != -1) {
1263 close(sock[0]);
1264 close(sock[1]);
1265 }
1266 return answer;
1267}
1268
2ad6d2bd
SH
1269int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1270 struct fuse_file_info *fi)
1271{
2ad6d2bd 1272 struct fuse_context *fc = fuse_get_context();
47cbf0e5 1273 nih_local char *localbuf = NULL;
8f6e8f5e
SH
1274 nih_local struct cgm_keys *k = NULL;
1275 struct file_info *f = (struct file_info *)fi->fh;
2ad6d2bd 1276
443d13f5 1277 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1278 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1279 return -EIO;
1280 }
1281
2ad6d2bd 1282 if (offset)
f9a05025 1283 return -EINVAL;
2ad6d2bd
SH
1284
1285 if (!fc)
1286 return -EIO;
1287
47cbf0e5
SH
1288 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1289 localbuf[size] = '\0';
1290 memcpy(localbuf, buf, size);
2ad6d2bd 1291
8f6e8f5e 1292 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
4775fba1
SH
1293 bool r;
1294
8f6e8f5e 1295 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
f9a05025 1296 return -EACCES;
2ad6d2bd 1297
8f6e8f5e
SH
1298 if (strcmp(f->file, "tasks") == 0 ||
1299 strcmp(f->file, "/tasks") == 0 ||
1300 strcmp(f->file, "/cgroup.procs") == 0 ||
1301 strcmp(f->file, "cgroup.procs") == 0)
4775fba1 1302 // special case - we have to translate the pids
8f6e8f5e 1303 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
4775fba1 1304 else
8f6e8f5e 1305 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
4775fba1
SH
1306
1307 if (!r)
2ad6d2bd
SH
1308 return -EINVAL;
1309
1310 return size;
1311 }
1312
1313 return -EINVAL;
1314}
1315
341b21ad
SH
1316int cg_chown(const char *path, uid_t uid, gid_t gid)
1317{
1318 struct fuse_context *fc = fuse_get_context();
1319 nih_local char * cgdir = NULL;
1320 char *fpath = NULL, *path1, *path2;
1321 nih_local struct cgm_keys *k = NULL;
1322 const char *cgroup;
1323 nih_local char *controller = NULL;
1324
1325
1326 if (!fc)
1327 return -EIO;
1328
1329 if (strcmp(path, "/cgroup") == 0)
1330 return -EINVAL;
1331
1332 controller = pick_controller_from_path(fc, path);
1333 if (!controller)
f9a05025 1334 return -EINVAL;
341b21ad
SH
1335 cgroup = find_cgroup_in_path(path);
1336 if (!cgroup)
1337 /* this is just /cgroup/controller */
1338 return -EINVAL;
1339
1340 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1341
1342 if (!fpath) {
1343 path1 = "/";
1344 path2 = cgdir;
1345 } else {
1346 path1 = cgdir;
1347 path2 = fpath;
1348 }
1349
1350 if (is_child_cgroup(controller, path1, path2)) {
1351 // get uid, gid, from '/tasks' file and make up a mode
1352 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1353 k = get_cgroup_key(controller, cgroup, "tasks");
1354
1355 } else
1356 k = get_cgroup_key(controller, path1, path2);
1357
1358 if (!k)
1359 return -EINVAL;
1360
1361 /*
1362 * This being a fuse request, the uid and gid must be valid
1363 * in the caller's namespace. So we can just check to make
1364 * sure that the caller is root in his uid, and privileged
1365 * over the file's current owner.
1366 */
1367 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1368 return -EACCES;
341b21ad
SH
1369
1370 if (!cgm_chown_file(controller, cgroup, uid, gid))
1371 return -EINVAL;
1372 return 0;
1373}
2ad6d2bd 1374
fd2e4e03
SH
1375int cg_chmod(const char *path, mode_t mode)
1376{
0a1bb5ea
SH
1377 struct fuse_context *fc = fuse_get_context();
1378 nih_local char * cgdir = NULL;
1379 char *fpath = NULL, *path1, *path2;
1380 nih_local struct cgm_keys *k = NULL;
1381 const char *cgroup;
1382 nih_local char *controller = NULL;
1383
1384 if (!fc)
1385 return -EIO;
1386
1387 if (strcmp(path, "/cgroup") == 0)
1388 return -EINVAL;
1389
1390 controller = pick_controller_from_path(fc, path);
1391 if (!controller)
f9a05025 1392 return -EINVAL;
0a1bb5ea
SH
1393 cgroup = find_cgroup_in_path(path);
1394 if (!cgroup)
1395 /* this is just /cgroup/controller */
1396 return -EINVAL;
1397
1398 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1399
1400 if (!fpath) {
1401 path1 = "/";
1402 path2 = cgdir;
1403 } else {
1404 path1 = cgdir;
1405 path2 = fpath;
1406 }
1407
1408 if (is_child_cgroup(controller, path1, path2)) {
1409 // get uid, gid, from '/tasks' file and make up a mode
1410 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1411 k = get_cgroup_key(controller, cgroup, "tasks");
1412
1413 } else
1414 k = get_cgroup_key(controller, path1, path2);
1415
1416 if (!k)
1417 return -EINVAL;
1418
1419 /*
1420 * This being a fuse request, the uid and gid must be valid
1421 * in the caller's namespace. So we can just check to make
1422 * sure that the caller is root in his uid, and privileged
1423 * over the file's current owner.
1424 */
1425 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1426 return -EPERM;
1427
1428 if (!cgm_chmod_file(controller, cgroup, mode))
1429 return -EINVAL;
1430 return 0;
fd2e4e03
SH
1431}
1432
ab54b798
SH
1433int cg_mkdir(const char *path, mode_t mode)
1434{
1435 struct fuse_context *fc = fuse_get_context();
1436 nih_local struct cgm_keys **list = NULL;
1437 char *fpath = NULL, *path1;
1438 nih_local char * cgdir = NULL;
1439 const char *cgroup;
1440 nih_local char *controller = NULL;
1441
ab54b798
SH
1442 if (!fc)
1443 return -EIO;
1444
1445
1446 controller = pick_controller_from_path(fc, path);
1447 if (!controller)
f9a05025 1448 return -EINVAL;
ab54b798
SH
1449
1450 cgroup = find_cgroup_in_path(path);
1451 if (!cgroup)
f9a05025 1452 return -EINVAL;
ab54b798
SH
1453
1454 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1455 if (!fpath)
1456 path1 = "/";
1457 else
1458 path1 = cgdir;
1459
1460 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1461 return -EACCES;
ab54b798
SH
1462
1463
1464 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1465 return -EINVAL;
1466
1467 return 0;
1468}
1469
50d8d5b5
SH
1470static int cg_rmdir(const char *path)
1471{
1472 struct fuse_context *fc = fuse_get_context();
1473 nih_local struct cgm_keys **list = NULL;
1474 char *fpath = NULL;
1475 nih_local char * cgdir = NULL;
1476 const char *cgroup;
1477 nih_local char *controller = NULL;
1478
1479 if (!fc)
1480 return -EIO;
1481
1482
1483 controller = pick_controller_from_path(fc, path);
1484 if (!controller)
f9a05025 1485 return -EINVAL;
50d8d5b5
SH
1486
1487 cgroup = find_cgroup_in_path(path);
1488 if (!cgroup)
f9a05025 1489 return -EINVAL;
50d8d5b5
SH
1490
1491 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1492 if (!fpath)
1493 return -EINVAL;
1494
1495 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1496 return -EACCES;
50d8d5b5
SH
1497
1498 if (!cgm_remove(controller, cgroup))
1499 return -EINVAL;
1500
1501 return 0;
1502}
1503
2dc17609
SH
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	const size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1510
/*
 * Scan the newline-separated text @memstat for the first line starting with
 * "total_cache" and store the number following it, divided by 1024, in *@v.
 * *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1528
49878439
YY
/*
 * Look up one counter in newline-separated blkio text @str.
 *
 * The line searched for starts with "<major>:<minor> <iotype>"; the number
 * following that prefix is stored in *@v.  *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *cur;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur; ) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1551
2dc17609
SH
1552static char *get_pid_cgroup(pid_t pid, const char *contrl)
1553{
1554 nih_local char *fnam = NULL;
1555 FILE *f;
1556 char *answer = NULL;
1557 char *line = NULL;
1558 size_t len = 0;
1559
1560 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1561 if (!(f = fopen(fnam, "r")))
1562 return false;
1563
1564 while (getline(&line, &len, f) != -1) {
1565 char *c1, *c2;
1566 if (!line[0])
1567 continue;
1568 c1 = strchr(line, ':');
1569 if (!c1)
1570 goto out;
1571 c1++;
1572 c2 = strchr(c1, ':');
1573 if (!c2)
1574 goto out;
1575 *c2 = '\0';
1576 if (strcmp(c1, contrl) != 0)
1577 continue;
1578 c2++;
1579 stripnewline(c2);
1580 answer = NIH_MUST( nih_strdup(NULL, c2) );
1581 goto out;
1582 }
1583
1584out:
1585 fclose(f);
1586 free(line);
1587 return answer;
1588}
1589
758ad80c 1590/*
2ad6d2bd 1591 * FUSE ops for /proc
758ad80c 1592 */
758ad80c 1593
23ce2127
SH
1594static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1595 struct fuse_file_info *fi)
1596{
2dc17609
SH
1597 struct fuse_context *fc = fuse_get_context();
1598 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1599 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1600 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1601 char *line = NULL;
1602 size_t linelen = 0, total_len = 0;
1603 FILE *f;
1604
1605 if (offset)
1606 return -EINVAL;
1607
1608 if (!cg)
1609 return 0;
1610
1611 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1612 return 0;
1613 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1614 return 0;
1615 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1616 return 0;
1617 memlimit = strtoul(memlimit_str, NULL, 10);
1618 memusage = strtoul(memusage_str, NULL, 10);
1619 memlimit /= 1024;
1620 memusage /= 1024;
1621 get_mem_cached(memstat_str, &cached);
1622
1623 f = fopen("/proc/meminfo", "r");
1624 if (!f)
1625 return 0;
1626
1627 while (getline(&line, &linelen, f) != -1) {
1628 size_t l;
1629 char *printme, lbuf[100];
1630
1631 memset(lbuf, 0, 100);
1632 if (startswith(line, "MemTotal:")) {
1633 sscanf(line+14, "%lu", &hosttotal);
1634 if (hosttotal < memlimit)
1635 memlimit = hosttotal;
1636 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1637 printme = lbuf;
1638 } else if (startswith(line, "MemFree:")) {
1639 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1640 printme = lbuf;
1641 } else if (startswith(line, "MemAvailable:")) {
1642 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1643 printme = lbuf;
1644 } else if (startswith(line, "Buffers:")) {
1645 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1646 printme = lbuf;
1647 } else if (startswith(line, "Cached:")) {
1648 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1649 printme = lbuf;
1650 } else if (startswith(line, "SwapCached:")) {
1651 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1652 printme = lbuf;
1653 } else
1654 printme = line;
1655 l = snprintf(buf, size, "%s", printme);
1656 buf += l;
1657 size -= l;
1658 total_len += l;
1659 }
1660
92c84dc4
SH
1661 fclose(f);
1662 free(line);
2dc17609 1663 return total_len;
23ce2127
SH
1664}
1665
1666/*
1667 * Read the cpuset.cpus for cg
1668 * Return the answer in a nih_alloced string
1669 */
1670static char *get_cpuset(const char *cg)
1671{
1672 char *answer;
1673
1674 if (!cgm_get_value("cpuset", cg, "cpuset.cpus", &answer))
1675 return NULL;
1676 return answer;
1677}
1678
1679/*
1680 * Helper functions for cpuset_in-set
1681 */
1682char *cpuset_nexttok(const char *c)
1683{
1684 char *r = strchr(c+1, ',');
1685 if (r)
1686 return r+1;
1687 return NULL;
1688}
1689
1690int cpuset_getrange(const char *c, int *a, int *b)
1691{
1692 int ret;
1693
1694 ret = sscanf(c, "%d-%d", a, b);
1695 return ret;
1696}
1697
1698/*
1699 * cpusets are in format "1,2-3,4"
1700 * iow, comma-delimited ranges
1701 */
aeb56147 1702static bool cpu_in_cpuset(int cpu, const char *cpuset)
23ce2127 1703{
23ce2127
SH
1704 const char *c;
1705
23ce2127
SH
1706 for (c = cpuset; c; c = cpuset_nexttok(c)) {
1707 int a, b, ret;
1708
1709 ret = cpuset_getrange(c, &a, &b);
1710 if (ret == 1 && cpu == a)
1711 return true;
1712 if (ret != 2) // bad cpuset!
1713 return false;
1714 if (cpu >= a && cpu <= b)
1715 return true;
1716 }
1717
1718 return false;
1719}
1720
aeb56147
SH
/*
 * Return true when @line is a "processor : N" line and cpu N is contained
 * in @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
1729
23ce2127
SH
1730/*
1731 * check whether this is a '^processor" line in /proc/cpuinfo
1732 */
1733static bool is_processor_line(const char *line)
1734{
1735 int cpu;
1736
1737 if (sscanf(line, "processor : %d", &cpu) == 1)
1738 return true;
1739 return false;
1740}
1741
23ce2127
SH
1742static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1743 struct fuse_file_info *fi)
1744{
1745 struct fuse_context *fc = fuse_get_context();
1746 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1747 nih_local char *cpuset = NULL;
1748 char *line = NULL;
1749 size_t linelen = 0, total_len = 0;
1750 bool am_printing = false;
1751 int curcpu = -1;
1752 FILE *f;
1753
1754 if (offset)
1755 return -EINVAL;
1756
1757 if (!cg)
1758 return 0;
1759
1760 cpuset = get_cpuset(cg);
1761 if (!cpuset)
1762 return 0;
1763
1764 f = fopen("/proc/cpuinfo", "r");
1765 if (!f)
1766 return 0;
1767
1768 while (getline(&line, &linelen, f) != -1) {
1769 size_t l;
1770 if (is_processor_line(line)) {
aeb56147 1771 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1772 if (am_printing) {
1773 curcpu ++;
1774 l = snprintf(buf, size, "processor : %d\n", curcpu);
1775 buf += l;
1776 size -= l;
1777 total_len += l;
1778 }
1779 continue;
1780 }
1781 if (am_printing) {
1782 l = snprintf(buf, size, "%s", line);
1783 buf += l;
1784 size -= l;
1785 total_len += l;
1786 }
1787 }
1788
92c84dc4
SH
1789 fclose(f);
1790 free(line);
23ce2127
SH
1791 return total_len;
1792}
1793
1794static int proc_stat_read(char *buf, size_t size, off_t offset,
1795 struct fuse_file_info *fi)
1796{
aeb56147
SH
1797 struct fuse_context *fc = fuse_get_context();
1798 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1799 nih_local char *cpuset = NULL;
1800 char *line = NULL;
1801 size_t linelen = 0, total_len = 0;
2a0fde62 1802 int curcpu = -1; /* cpu numbering starts at 0 */
aeb56147
SH
1803 FILE *f;
1804
1805 if (offset)
1806 return -EINVAL;
1807
1808 if (!cg)
1809 return 0;
1810
1811 cpuset = get_cpuset(cg);
1812 if (!cpuset)
1813 return 0;
1814
1815 f = fopen("/proc/stat", "r");
1816 if (!f)
1817 return 0;
1818
1819 while (getline(&line, &linelen, f) != -1) {
1820 size_t l;
1821 int cpu;
2a0fde62 1822 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
1823 char *c;
1824
2a0fde62
CB
1825 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1826 /* not a ^cpuN line containing a number N, just print it */
aeb56147
SH
1827 l = snprintf(buf, size, "%s", line);
1828 buf += l;
1829 size -= l;
1830 total_len += l;
1831 continue;
1832 }
2a0fde62
CB
1833
1834 if (sscanf(cpu_char, "%d", &cpu) != 1)
1835 continue;
aeb56147
SH
1836 if (!cpu_in_cpuset(cpu, cpuset))
1837 continue;
1838 curcpu ++;
1839
1840 c = strchr(line, ' ');
1841 if (!c)
1842 continue;
1843 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1844 buf += l;
1845 size -= l;
1846 total_len += l;
1847 }
1848
92c84dc4
SH
1849 fclose(f);
1850 free(line);
aeb56147 1851 return total_len;
23ce2127
SH
1852}
1853
7bbf2246
SH
1854/*
1855 * How to guess what to present for uptime?
1856 * One thing we could do would be to take the date on the caller's
1857 * memory.usage_in_bytes file, which should equal the time of creation
1858 * of his cgroup. However, a task could be in a sub-cgroup of the
1859 * container. The same problem exists if we try to look at the ages
1860 * of processes in the caller's cgroup.
1861 *
1862 * So we'll fork a task that will enter the caller's pidns, mount a
1863 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1864 *
1865 * For the second uptime #, we'll do as Stéphane had done, just copy
1866 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1867 * time. Maybe someone can come up with a good algorithm and submit a
1868 * patch. Maybe something based on cpushare info?
1869 */
41bb9357
SH
1870
1871/* return age of the reaper for $pid, taken from ctime of its procdir */
1872static long int get_pid1_time(pid_t pid)
1873{
1874 char fnam[100];
ea56f722 1875 int fd, cpipe[2], ret;
41bb9357 1876 struct stat sb;
ea56f722
SH
1877 pid_t cpid;
1878 struct timeval tv;
1879 fd_set s;
1880 char v;
41bb9357
SH
1881
1882 if (unshare(CLONE_NEWNS))
1883 return 0;
1884
5ca64c2a
SG
1885 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1886 perror("rslave mount failed");
1887 return 0;
1888 }
1889
41bb9357
SH
1890 sprintf(fnam, "/proc/%d/ns/pid", pid);
1891 fd = open(fnam, O_RDONLY);
1892 if (fd < 0) {
1893 perror("get_pid1_time open of ns/pid");
1894 return 0;
1895 }
1896 if (setns(fd, 0)) {
1897 perror("get_pid1_time setns 1");
1898 close(fd);
1899 return 0;
1900 }
1901 close(fd);
41bb9357 1902
ea56f722
SH
1903 if (pipe(cpipe) < 0)
1904 exit(1);
41bb9357 1905
ea56f722
SH
1906loop:
1907 cpid = fork();
1908 if (cpid < 0)
41bb9357 1909 return 0;
ea56f722
SH
1910
1911 if (!cpid) {
1912 char b = '1';
1913 close(cpipe[0]);
1914 if (write(cpipe[1], &b, sizeof(char)) < 0) {
1915 fprintf(stderr, "%s (child): erorr on write: %s\n",
1916 __func__, strerror(errno));
1917 }
1918 close(cpipe[1]);
1919 umount2("/proc", MNT_DETACH);
1920 if (mount("proc", "/proc", "proc", 0, NULL)) {
1921 perror("get_pid1_time mount");
1922 return 0;
1923 }
1924 ret = lstat("/proc/1", &sb);
1925 if (ret) {
1926 perror("get_pid1_time lstat");
1927 return 0;
1928 }
1929 return time(NULL) - sb.st_ctime;
41bb9357 1930 }
ea56f722
SH
1931
1932 // give the child 1 second to be done forking and
1933 // write it's ack
1934 FD_ZERO(&s);
1935 FD_SET(cpipe[0], &s);
1936 tv.tv_sec = 1;
1937 tv.tv_usec = 0;
1938 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
1939 if (ret <= 0)
1940 goto again;
1941 ret = read(cpipe[0], &v, 1);
1942 if (ret != sizeof(char) || v != '1') {
1943 goto again;
41bb9357 1944 }
ea56f722
SH
1945
1946 wait_for_pid(cpid);
1947 exit(0);
1948
1949again:
1950 kill(cpid, SIGKILL);
1951 wait_for_pid(cpid);
1952 goto loop;
41bb9357
SH
1953}
1954
/*
 * Return the value computed by get_pid1_time(@qpid), obtained from a
 * forked helper over a pipe.
 *
 * The parent waits up to one second for the result to become readable.
 * Returns 0 when the pipe or fork cannot be set up, on select() error or
 * timeout, or when the result cannot be read in full.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no helper: release the pipe instead of waiting on it */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* timeout is not an errno-style failure; report it as such */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2000
/*
 * Return the second field (idle time) of /proc/uptime, truncated to whole
 * units, or 0 when the file cannot be opened or parsed.
 *
 * Both fields are decimal fractions (e.g. "12345.67 54321.89"), so they
 * have to be read as floating point: an integer conversion stops at the
 * first '.', which makes the second conversion fail.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2014
2015/*
2016 * We read /proc/uptime and reuse its second field.
2017 * For the first field, we use the mtime for the reaper for
2018 * the calling pid as returned by getreaperage
2019 */
23ce2127
SH
2020static int proc_uptime_read(char *buf, size_t size, off_t offset,
2021 struct fuse_file_info *fi)
2022{
41bb9357
SH
2023 struct fuse_context *fc = fuse_get_context();
2024 long int reaperage = getreaperage(fc->pid);;
2025 long int idletime = getprocidle();
2026
2027 if (offset)
2028 return -EINVAL;
2029 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
23ce2127
SH
2030}
2031
49878439
YY
2032static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2033 struct fuse_file_info *fi)
2034{
2035 char dev_name[72];
2036 struct fuse_context *fc = fuse_get_context();
2037 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2038 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2039 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2040 unsigned long read = 0, write = 0;
2041 unsigned long read_merged = 0, write_merged = 0;
2042 unsigned long read_sectors = 0, write_sectors = 0;
2043 unsigned long read_ticks = 0, write_ticks = 0;
2044 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2045 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2046 char *line = NULL;
2047 size_t linelen = 0, total_len = 0;
2048 unsigned int major = 0, minor = 0;
2049 int i = 0;
2050 FILE *f;
2051
2052 if (offset)
2053 return -EINVAL;
2054
2055 if (!cg)
2056 return 0;
2057
2058 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2059 return 0;
2060 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2061 return 0;
2062 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2063 return 0;
2064 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2065 return 0;
2066 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2067 return 0;
2068
2069
2070 f = fopen("/proc/diskstats", "r");
2071 if (!f)
2072 return 0;
2073
2074 while (getline(&line, &linelen, f) != -1) {
2075 size_t l;
2076 char *printme, lbuf[256];
2077
2078 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2079 if(i == 3){
2080 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2081 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2082 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2083 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2084 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2085 read_sectors = read_sectors/512;
2086 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2087 write_sectors = write_sectors/512;
2088
2089 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2090 rd_svctm = rd_svctm/1000000;
2091 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2092 rd_wait = rd_wait/1000000;
2093 read_ticks = rd_svctm + rd_wait;
2094
2095 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2096 wr_svctm = wr_svctm/1000000;
2097 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2098 wr_wait = wr_wait/1000000;
2099 write_ticks = wr_svctm + wr_wait;
2100
2101 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2102 tot_ticks = tot_ticks/1000000;
2103 }else{
2104 continue;
2105 }
2106
2107 memset(lbuf, 0, 256);
2108 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2109 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2110 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2111 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2112 printme = lbuf;
2113 } else
2114 continue;
2115
2116 l = snprintf(buf, size, "%s", printme);
2117 buf += l;
2118 size -= l;
2119 total_len += l;
2120 }
2121
2122 fclose(f);
2123 free(line);
2124 return total_len;
2125}
2126
23ce2127
SH
/*
 * Return the number of bytes in file @which, counted by reading it line by
 * line, or 0 when it cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;

	if (!fp)
		return 0;

	for (;;) {
		ssize_t n = getline(&linebuf, &cap, fp);

		if (n == -1)
			break;
		total += n;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2143
758ad80c
SH
2144static int proc_getattr(const char *path, struct stat *sb)
2145{
35629743
SH
2146 struct timespec now;
2147
2148 memset(sb, 0, sizeof(struct stat));
2149 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2150 return -EINVAL;
2151 sb->st_uid = sb->st_gid = 0;
2152 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2153 if (strcmp(path, "/proc") == 0) {
2154 sb->st_mode = S_IFDIR | 00555;
2155 sb->st_nlink = 2;
2156 return 0;
2157 }
2158 if (strcmp(path, "/proc/meminfo") == 0 ||
2159 strcmp(path, "/proc/cpuinfo") == 0 ||
2160 strcmp(path, "/proc/uptime") == 0 ||
49878439
YY
2161 strcmp(path, "/proc/stat") == 0 ||
2162 strcmp(path, "/proc/diskstats") == 0) {
23ce2127 2163 sb->st_size = get_procfile_size(path);
35629743
SH
2164 sb->st_mode = S_IFREG | 00444;
2165 sb->st_nlink = 1;
2166 return 0;
2167 }
2168
2169 return -ENOENT;
2170}
2171
2172static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2173 struct fuse_file_info *fi)
2174{
2175 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2176 filler(buf, "meminfo", NULL, 0) != 0 ||
2177 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2178 filler(buf, "uptime", NULL, 0) != 0 ||
2179 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2180 return -EINVAL;
758ad80c
SH
2181 return 0;
2182}
2183
35629743
SH
/*
 * FUSE open for files below /proc.  No per-handle state is set up.
 * Returns 0 for a served file, -ENOENT for anything else.
 */
static int proc_open(const char *path, struct fuse_file_info *fi)
{
	static const char *const served[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	size_t i;

	for (i = 0; i < sizeof(served) / sizeof(served[0]); i++) {
		if (strcmp(path, served[i]) == 0)
			return 0;
	}

	return -ENOENT;
}
2194
35629743
SH
2195static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2196 struct fuse_file_info *fi)
2197{
2198 if (strcmp(path, "/proc/meminfo") == 0)
23ce2127 2199 return proc_meminfo_read(buf, size, offset, fi);
35629743 2200 if (strcmp(path, "/proc/cpuinfo") == 0)
23ce2127 2201 return proc_cpuinfo_read(buf, size, offset, fi);
35629743 2202 if (strcmp(path, "/proc/uptime") == 0)
23ce2127 2203 return proc_uptime_read(buf, size, offset, fi);
35629743 2204 if (strcmp(path, "/proc/stat") == 0)
23ce2127 2205 return proc_stat_read(buf, size, offset, fi);
49878439
YY
2206 if (strcmp(path, "/proc/diskstats") == 0)
2207 return proc_diskstats_read(buf, size, offset, fi);
35629743
SH
2208 return -EINVAL;
2209}
2210
2ad6d2bd
SH
2211/*
2212 * FUSE ops for /
2213 * these just delegate to the /proc and /cgroup ops as
2214 * needed
2215 */
758ad80c
SH
2216
2217static int lxcfs_getattr(const char *path, struct stat *sb)
2218{
2219 if (strcmp(path, "/") == 0) {
2220 sb->st_mode = S_IFDIR | 00755;
2221 sb->st_nlink = 2;
2222 return 0;
2223 }
2224 if (strncmp(path, "/cgroup", 7) == 0) {
2225 return cg_getattr(path, sb);
2226 }
35629743 2227 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2228 return proc_getattr(path, sb);
2229 }
2230 return -EINVAL;
2231}
2232
/*
 * FUSE opendir entry point.  "/" and "/proc" need no setup; paths starting
 * with "/cgroup" are delegated to cg_opendir().  Any other path yields
 * -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
2245
2246static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2247 struct fuse_file_info *fi)
2248{
2249 if (strcmp(path, "/") == 0) {
2250 if (filler(buf, "proc", NULL, 0) != 0 ||
2251 filler(buf, "cgroup", NULL, 0) != 0)
2252 return -EINVAL;
2253 return 0;
2254 }
35629743 2255 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2256 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2257 if (strcmp(path, "/proc") == 0)
2258 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2259 return -EINVAL;
2260}
2261
/*
 * FUSE releasedir entry point.  "/" and "/proc" hold nothing to release;
 * paths starting with "/cgroup" are delegated to cg_releasedir().  Any
 * other path yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
2273
99978832
SH
/*
 * FUSE open entry point: delegate to cg_open() or proc_open() according to
 * the path prefix; -EINVAL for any other path.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_open(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_open(path, fi);

	return -EINVAL;
}
2283
/*
 * FUSE read entry point: delegate to cg_read() or proc_read() according to
 * the path prefix; -EINVAL for any other path.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_read(path, buf, size, offset, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2294
2ad6d2bd
SH
/*
 * FUSE write entry point.  Only paths starting with "/cgroup" are writable
 * (via cg_write()); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2304
99978832
SH
/* FUSE flush entry point: nothing is buffered, so always report success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2309
/*
 * FUSE release entry point.
 *
 * Handles on paths starting with "/cgroup" are released through
 * cg_release().  proc_open() attaches nothing to the handle, so releasing a
 * "/proc" path succeeds without further work.  Any other path yields
 * -EINVAL.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_release(path, fi);

	if (strncmp(path, "/proc", 5) == 0)
		return 0;

	return -EINVAL;
}
2321
/* FUSE fsync entry point: there is nothing to sync, always report success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2326
ab54b798
SH
/*
 * FUSE mkdir entry point.  Directories can only be created below "/cgroup"
 * (via cg_mkdir()); everything else yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2334
341b21ad
SH
/*
 * FUSE chown entry point.  Ownership can only be changed below "/cgroup"
 * (via cg_chown()); everything else yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2342
2ad6d2bd
SH
2343/*
2344 * cat first does a truncate before doing ops->write. This doesn't
2345 * really make sense for cgroups. So just return 0 always but do
2346 * nothing.
2347 */
2348int lxcfs_truncate(const char *path, off_t newsize)
2349{
2350 if (strncmp(path, "/cgroup", 7) == 0)
2351 return 0;
2352 return -EINVAL;
2353}
2354
50d8d5b5
SH
/*
 * FUSE rmdir handler.
 *
 * Directory removal is only handled for paths beginning with "/cgroup",
 * which are passed to cg_rmdir().  Everything else returns -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2361
fd2e4e03
SH
/*
 * FUSE chmod handler.
 *
 * Mode changes are only handled for paths beginning with "/cgroup", which
 * are passed to cg_chmod().  Everything else returns -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2368
758ad80c
SH
2369const struct fuse_operations lxcfs_ops = {
2370 .getattr = lxcfs_getattr,
2371 .readlink = NULL,
2372 .getdir = NULL,
2373 .mknod = NULL,
ab54b798 2374 .mkdir = lxcfs_mkdir,
758ad80c 2375 .unlink = NULL,
50d8d5b5 2376 .rmdir = lxcfs_rmdir,
758ad80c
SH
2377 .symlink = NULL,
2378 .rename = NULL,
2379 .link = NULL,
fd2e4e03 2380 .chmod = lxcfs_chmod,
341b21ad 2381 .chown = lxcfs_chown,
2ad6d2bd 2382 .truncate = lxcfs_truncate,
758ad80c 2383 .utime = NULL,
99978832
SH
2384
2385 .open = lxcfs_open,
2386 .read = lxcfs_read,
2387 .release = lxcfs_release,
2ad6d2bd 2388 .write = lxcfs_write,
99978832 2389
758ad80c 2390 .statfs = NULL,
99978832
SH
2391 .flush = lxcfs_flush,
2392 .fsync = lxcfs_fsync,
758ad80c
SH
2393
2394 .setxattr = NULL,
2395 .getxattr = NULL,
2396 .listxattr = NULL,
2397 .removexattr = NULL,
2398
2399 .opendir = lxcfs_opendir,
2400 .readdir = lxcfs_readdir,
2401 .releasedir = lxcfs_releasedir,
2402
2403 .fsyncdir = NULL,
2404 .init = NULL,
2405 .destroy = NULL,
2406 .access = NULL,
2407 .create = NULL,
2408 .ftruncate = NULL,
2409 .fgetattr = NULL,
2410};
2411
/*
 * Print a short usage message to stderr and terminate the process with
 * exit status 1.  Does not return.
 *
 * @me: program name shown in the usage line.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);

	exit(1);
}
2419
/*
 * Report whether a command-line word is a request for help.
 *
 * @w: the word to examine; must not be NULL.
 *
 * Returns true when @w is exactly one of "-h", "--help", "-help" or "help",
 * false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(help_words) / sizeof(help_words[0]); i++) {
		if (strcmp(w, help_words[i]) == 0)
			return true;
	}

	return false;
}
2429
2430int main(int argc, char *argv[])
2431{
2432 int ret;
2433 struct lxcfs_state *d;
2434
2435 if (argc < 2 || is_help(argv[1]))
2436 usage(argv[0]);
2437
2438 d = malloc(sizeof(*d));
2439 if (!d)
2440 return -1;
2441
2442 if (!cgm_escape_cgroup())
2443 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2444
2445 if (!cgm_get_controllers(&d->subsystems))
2446 return -1;
2447
2448 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2449
2450 return ret;
2183082c 2451}