]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
cache: use file_dir object as parent for alloc'ed objects
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
9/*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13#define FUSE_USE_VERSION 26
14
2183082c 15#include <stdio.h>
758ad80c
SH
16#include <dirent.h>
17#include <fcntl.h>
18#include <fuse.h>
19#include <unistd.h>
20#include <errno.h>
21#include <stdbool.h>
22#include <time.h>
23#include <string.h>
24#include <stdlib.h>
25#include <libgen.h>
41bb9357
SH
26#include <sched.h>
27#include <linux/sched.h>
a05660a6 28#include <sys/socket.h>
41bb9357
SH
29#include <sys/mount.h>
30#include <wait.h>
758ad80c
SH
31
32#include <nih/alloc.h>
33#include <nih/string.h>
34
35#include "cgmanager.h"
36
/*
 * Per-mount state; LXCFS_DATA reads it from the FUSE context's
 * private_data pointer.
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array holding the name of each
	 * mounted subsystem.  Detected once at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
45
443d13f5
SH
/* Kind of object a struct file_info describes (stored in its type field). */
enum {
	LXC_TYPE_CGDIR = 0,		/* cgroup directory handle */
	LXC_TYPE_CGFILE,		/* cgroup file handle */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
55
c688e1b3
SH
/*
 * Per-open-handle bookkeeping.  A pointer to one of these is stored in
 * fuse_file_info->fh by the open/opendir handlers and released again by
 * the release/releasedir handlers.
 */
struct file_info {
	char *controller;	/* controller name, NULL for the /cgroup root */
	char *cgroup;		/* cgroup path the handle refers to */
	char *file;		/* key (file) name; NULL for directories */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		// unused as of yet
	int buflen;
};
64
/*
 * Duplicate str with nih_strdup(), using parent as the nih parent of the
 * copy.  A NULL str is passed through as NULL; the allocation itself is
 * wrapped in NIH_MUST.
 */
static char *must_copy_string(void *parent, const char *str)
{
	return str ? NIH_MUST( nih_strdup(parent, str) ) : NULL;
}
71
4775fba1
SH
/*
 * Reap child @pid, retrying whenever waitpid() is interrupted by a signal
 * or reports some other pid.
 *
 * Returns 0 only when the child exited normally with status 0, and -1 in
 * every other case (waitpid failure, death by signal, non-zero status).
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		int reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) ? 0 : -1;
}
93
053a659d
SH
/*
 * Map @in_id, an id valid in the caller's namespace, into the namespace
 * described by @idfile, an open FILE * on /proc/pid/{u,g}id_map.
 *
 * The file is rewound and scanned line by line; lines that do not parse
 * as three unsigned numbers are skipped.
 *
 * Returns the mapped id, or -1 (as unsigned int) when no range contains
 * @in_id or when a range in the file wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		unsigned int ns_base,	// first id of the range in idfile's namespace
			     host_base,	// first id of the range in the caller's namespace
			     range;	// number of ids in the range

		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around - unexpected for a proc
			 * file, so give up.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wrapped, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return ns_base + (in_id - host_base);
	}

	// no range matched
	return -1;
}
137
341b21ad
SH
138/*
139 * for is_privileged_over,
140 * specify whether we require the calling uid to be root in his
141 * namespace
142 */
143#define NS_ROOT_REQD true
144#define NS_ROOT_OPT false
145
146static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 147{
053a659d
SH
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
341b21ad
SH
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
758ad80c
SH
161 return true;
162
053a659d
SH
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
341b21ad 168 /* if caller's not root in his namespace, reject */
053a659d
SH
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
341b21ad
SH
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
053a659d
SH
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184out:
185 fclose(f);
186 return answer;
758ad80c
SH
187}
188
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode requested in @req_mode (an open(2)-style O_RDONLY / O_WRONLY /
 * O_RDWR value).  Callers shift owner or group bits down into the "other"
 * position before calling.
 *
 * Returns false for an unrecognised access mode.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	mode_t access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
208
3db25a35
SH
/*
 * Return the path component of @taskcg that directly follows @querycg, as
 * a new nih-allocated string (no nih parent).
 *
 * @taskcg must be strictly longer than @querycg; otherwise a complaint is
 * printed to stderr and NULL is returned.  @querycg of "/" skips just the
 * leading slash, anything else skips strlen(querycg) + 1 characters.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = strcmp(querycg, "/") == 0 ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
227
758ad80c
SH
228/*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240{
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
341b21ad 255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268}
269
3db25a35
SH
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
276
277/*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
283static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284{
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325out:
326 fclose(f);
327 free(line);
328 return answer;
329}
330
758ad80c
SH
331/*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336{
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
342 p1 = path+8;
343 ret = nih_strdup(NULL, p1);
344 if (!ret)
345 return ret;
346 slash = strstr(ret, "/");
347 if (slash)
348 *slash = '\0';
349
350 /* verify that it is a subsystem */
351 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
352 int i;
353 if (!list) {
354 nih_free(ret);
355 return NULL;
356 }
357 for (i = 0; list[i]; i++) {
358 if (strcmp(list[i], ret) == 0)
359 return ret;
360 }
361 nih_free(ret);
362 return NULL;
363}
364
/*
 * Locate the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * The returned pointer aims into @path and may include trailing file
 * (key) names.  Returns NULL when @path is shorter than 9 characters or
 * has no '/' after the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
380
381static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
382{
383 nih_local char **list = NULL;
384 int i;
385
386 if (!f)
387 return false;
388 if (*f == '/')
389 f++;
390
391 if (!cgm_list_children(contr, dir, &list))
392 return false;
393 for (i = 0; list[i]; i++) {
394 if (strcmp(list[i], f) == 0)
395 return true;
396 }
397
398 return false;
399}
400
401static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
402{
403 nih_local struct cgm_keys **list = NULL;
404 struct cgm_keys *k;
405 int i;
406
407 if (!f)
408 return NULL;
409 if (*f == '/')
410 f++;
411 if (!cgm_list_keys(contr, dir, &list))
412 return NULL;
413 for (i = 0; list[i]; i++) {
414 if (strcmp(list[i]->name, f) == 0) {
415 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
416 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
417 k->uid = list[i]->uid;
418 k->gid = list[i]->gid;
419 k->mode = list[i]->mode;
420 return k;
421 }
422 }
423
424 return NULL;
425}
426
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of @cg truncated at the last '/'
 * (or the whole of @cg when it contains none).  *file is set to point at
 * that last '/' inside @cg itself - not inside the copy - or to NULL when
 * @cg has no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = strrchr(cg, '/');
	if (*file)
		*strrchr(*dir, '/') = '\0';
}
99978832
SH
441static size_t get_file_size(const char *contrl, const char *cg, const char *f)
442{
443 nih_local char *data = NULL;
444 size_t s;
445 if (!cgm_get_value(contrl, cg, f, &data))
446 return -EINVAL;
447 s = strlen(data);
448 return s;
449}
2ad6d2bd 450
758ad80c 451/*
2ad6d2bd 452 * FUSE ops for /cgroup
758ad80c 453 */
2ad6d2bd 454
758ad80c
SH
455static int cg_getattr(const char *path, struct stat *sb)
456{
457 struct timespec now;
458 struct fuse_context *fc = fuse_get_context();
459 nih_local char * cgdir = NULL;
460 char *fpath = NULL, *path1, *path2;
461 nih_local struct cgm_keys *k = NULL;
462 const char *cgroup;
463 nih_local char *controller = NULL;
464
465
466 if (!fc)
467 return -EIO;
468
469 memset(sb, 0, sizeof(struct stat));
470
471 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
472 return -EINVAL;
473
474 sb->st_uid = sb->st_gid = 0;
475 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
476 sb->st_size = 0;
477
478 if (strcmp(path, "/cgroup") == 0) {
479 sb->st_mode = S_IFDIR | 00755;
480 sb->st_nlink = 2;
481 return 0;
482 }
483
484 controller = pick_controller_from_path(fc, path);
485 if (!controller)
486 return -EIO;
758ad80c
SH
487 cgroup = find_cgroup_in_path(path);
488 if (!cgroup) {
489 /* this is just /cgroup/controller, return it as a dir */
490 sb->st_mode = S_IFDIR | 00755;
491 sb->st_nlink = 2;
492 return 0;
493 }
341b21ad 494
758ad80c
SH
495 get_cgdir_and_path(cgroup, &cgdir, &fpath);
496
497 if (!fpath) {
498 path1 = "/";
499 path2 = cgdir;
500 } else {
501 path1 = cgdir;
502 path2 = fpath;
503 }
504
758ad80c
SH
505 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
506 * Then check that caller's cgroup is under path if fpath is a child
507 * cgroup, or cgdir if fpath is a file */
508
509 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
510 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
511 /* this is just /cgroup/controller, return it as a dir */
512 sb->st_mode = S_IFDIR | 00555;
513 sb->st_nlink = 2;
514 return 0;
515 }
758ad80c 516 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 517 return -EACCES;
758ad80c 518
053a659d
SH
519 // get uid, gid, from '/tasks' file and make up a mode
520 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
521 sb->st_mode = S_IFDIR | 00755;
522 k = get_cgroup_key(controller, cgroup, "tasks");
523 if (!k) {
053a659d
SH
524 sb->st_uid = sb->st_gid = 0;
525 } else {
053a659d
SH
526 sb->st_uid = k->uid;
527 sb->st_gid = k->gid;
528 }
758ad80c
SH
529 sb->st_nlink = 2;
530 return 0;
531 }
532
533 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
534 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
535 return -ENOENT;
758ad80c 536 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 537 return -EACCES;
758ad80c 538
758ad80c 539 sb->st_mode = S_IFREG | k->mode;
053a659d 540 sb->st_nlink = 1;
758ad80c
SH
541 sb->st_uid = k->uid;
542 sb->st_gid = k->gid;
99978832 543 sb->st_size = get_file_size(controller, path1, path2);
758ad80c
SH
544 return 0;
545 }
546
ab54b798 547 return -ENOENT;
758ad80c 548}
2183082c 549
7f163b71
SH
550/*
551 * TODO - cache these results in a table for use in opendir, free
552 * in releasedir
553 */
758ad80c 554static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 555{
7f163b71
SH
556 struct fuse_context *fc = fuse_get_context();
557 nih_local struct cgm_keys **list = NULL;
558 const char *cgroup;
c688e1b3 559 struct file_info *dir_info;
7f163b71 560 nih_local char *controller = NULL;
7f163b71
SH
561
562 if (!fc)
563 return -EIO;
564
c688e1b3
SH
565 if (strcmp(path, "/cgroup") == 0) {
566 cgroup = NULL;
567 controller = NULL;
568 } else {
569 // return list of keys for the controller, and list of child cgroups
570 controller = pick_controller_from_path(fc, path);
571 if (!controller)
572 return -EIO;
7f163b71 573
c688e1b3
SH
574 cgroup = find_cgroup_in_path(path);
575 if (!cgroup) {
576 /* this is just /cgroup/controller, return its contents */
577 cgroup = "/";
578 }
7f163b71
SH
579 }
580
3a6e1a76 581 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
7f163b71 582 return -EACCES;
c688e1b3
SH
583
584 /* we'll free this at cg_releasedir */
585 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
bae07053
SH
586 dir_info->controller = must_copy_string(dir_info, controller);
587 dir_info->cgroup = must_copy_string(dir_info, cgroup);
443d13f5 588 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 589 dir_info->buf = NULL;
8f6e8f5e 590 dir_info->file = NULL;
c688e1b3
SH
591 dir_info->buflen = 0;
592
593 fi->fh = (unsigned long)dir_info;
758ad80c
SH
594 return 0;
595}
596
758ad80c
SH
597static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
598 struct fuse_file_info *fi)
599{
c688e1b3
SH
600 struct file_info *d = (struct file_info *)fi->fh;
601 nih_local struct cgm_keys **list = NULL;
602 int i;
603 nih_local char *nextcg = NULL;
758ad80c
SH
604 struct fuse_context *fc = fuse_get_context();
605
443d13f5 606 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
607 fprintf(stderr, "Internal error: file cache info used in readdir\n");
608 return -EIO;
609 }
c688e1b3
SH
610 if (!d->cgroup && !d->controller) {
611 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
612 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
613 int i;
614
615 if (!list)
616 return -EIO;
7f163b71 617
758ad80c
SH
618 for (i = 0; list[i]; i++) {
619 if (filler(buf, list[i], NULL, 0) != 0) {
620 return -EIO;
621 }
622 }
623 return 0;
624 }
625
c688e1b3 626 if (!cgm_list_keys(d->controller, d->cgroup, &list))
3db25a35 627 // not a valid cgroup
758ad80c 628 return -EINVAL;
3db25a35 629
c688e1b3 630 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
631 if (nextcg) {
632 int ret;
633 ret = filler(buf, nextcg, NULL, 0);
634 if (ret != 0)
635 return -EIO;
636 }
637 return 0;
638 }
639
758ad80c 640 for (i = 0; list[i]; i++) {
758ad80c
SH
641 if (filler(buf, list[i]->name, NULL, 0) != 0) {
642 return -EIO;
643 }
644 }
645
646 // now get the list of child cgroups
422aa4a5 647 nih_local char **clist = NULL;
758ad80c 648
c688e1b3 649 if (!cgm_list_children(d->controller, d->cgroup, &clist))
758ad80c
SH
650 return 0;
651 for (i = 0; clist[i]; i++) {
758ad80c
SH
652 if (filler(buf, clist[i], NULL, 0) != 0) {
653 return -EIO;
654 }
655 }
656 return 0;
657}
658
8f6e8f5e
SH
/*
 * Free a file_info created by cg_open()/cg_opendir().
 */
static void do_release_file_info(struct file_info *f)
{
	/*
	 * Fields that were nih_alloc()d with f as their parent are
	 * released automatically together with f.
	 */
	nih_free(f);
}
667
758ad80c
SH
668static int cg_releasedir(const char *path, struct fuse_file_info *fi)
669{
c688e1b3
SH
670 struct file_info *d = (struct file_info *)fi->fh;
671
8f6e8f5e 672 do_release_file_info(d);
758ad80c
SH
673 return 0;
674}
675
99978832
SH
676static int cg_open(const char *path, struct fuse_file_info *fi)
677{
678 nih_local char *controller = NULL;
679 const char *cgroup;
680 char *fpath = NULL, *path1, *path2;
681 nih_local char * cgdir = NULL;
682 nih_local struct cgm_keys *k = NULL;
8f6e8f5e 683 struct file_info *file_info;
99978832
SH
684 struct fuse_context *fc = fuse_get_context();
685
686 if (!fc)
687 return -EIO;
688
689 controller = pick_controller_from_path(fc, path);
690 if (!controller)
691 return -EIO;
692 cgroup = find_cgroup_in_path(path);
693 if (!cgroup)
694 return -EINVAL;
695
696 get_cgdir_and_path(cgroup, &cgdir, &fpath);
697 if (!fpath) {
698 path1 = "/";
699 path2 = cgdir;
700 } else {
701 path1 = cgdir;
702 path2 = fpath;
703 }
704
8f6e8f5e
SH
705 k = get_cgroup_key(controller, path1, path2);
706 if (!k)
707 return -EINVAL;
99978832 708
8f6e8f5e
SH
709 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
710 // should never get here
711 return -EACCES;
99978832 712
8f6e8f5e
SH
713 /* we'll free this at cg_release */
714 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
bae07053
SH
715 file_info->controller = must_copy_string(file_info, controller);
716 file_info->cgroup = must_copy_string(file_info, path1);
717 file_info->file = must_copy_string(file_info, path2);
443d13f5 718 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
719 file_info->buf = NULL;
720 file_info->buflen = 0;
721
722 fi->fh = (unsigned long)file_info;
723 return 0;
724}
725
726static int cg_release(const char *path, struct fuse_file_info *fi)
727{
728 struct file_info *f = (struct file_info *)fi->fh;
729
730 do_release_file_info(f);
731 return 0;
99978832
SH
732}
733
a05660a6
SH
/*
 * Wait up to two seconds for @sockfd to become readable, then receive up
 * to @len bytes into @buf without blocking.
 *
 * Returns the recv() result, or -1 when select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
748
01e71852
SH
749#define SEND_CREDS_OK 0
750#define SEND_CREDS_NOTSK 1
751#define SEND_CREDS_FAIL 2
752static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
753{
754 struct msghdr msg = { 0 };
755 struct iovec iov;
756 struct cmsghdr *cmsg;
757 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
758 char buf[1];
759 buf[0] = 'p';
760
01e71852
SH
761 if (pingfirst) {
762 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 763 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
764 __func__);
765 return SEND_CREDS_FAIL;
766 }
a05660a6
SH
767 }
768
769 msg.msg_control = cmsgbuf;
770 msg.msg_controllen = sizeof(cmsgbuf);
771
772 cmsg = CMSG_FIRSTHDR(&msg);
773 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
774 cmsg->cmsg_level = SOL_SOCKET;
775 cmsg->cmsg_type = SCM_CREDENTIALS;
776 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
777
778 msg.msg_name = NULL;
779 msg.msg_namelen = 0;
780
781 buf[0] = v;
782 iov.iov_base = buf;
783 iov.iov_len = sizeof(buf);
784 msg.msg_iov = &iov;
785 msg.msg_iovlen = 1;
786
787 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 788 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
789 strerror(errno));
790 if (errno == 3)
01e71852
SH
791 return SEND_CREDS_NOTSK;
792 return SEND_CREDS_FAIL;
a05660a6
SH
793 }
794
01e71852 795 return SEND_CREDS_OK;
a05660a6
SH
796}
797
798static bool recv_creds(int sock, struct ucred *cred, char *v)
799{
800 struct msghdr msg = { 0 };
801 struct iovec iov;
802 struct cmsghdr *cmsg;
803 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
804 char buf[1];
805 int ret;
806 int optval = 1;
6ee867dc
SH
807 struct timeval tv;
808 fd_set rfds;
a05660a6
SH
809
810 *v = '1';
811
812 cred->pid = -1;
813 cred->uid = -1;
814 cred->gid = -1;
815
816 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 817 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
818 return false;
819 }
820 buf[0] = '1';
821 if (write(sock, buf, 1) != 1) {
1420baf8 822 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
823 return false;
824 }
825
826 msg.msg_name = NULL;
827 msg.msg_namelen = 0;
828 msg.msg_control = cmsgbuf;
829 msg.msg_controllen = sizeof(cmsgbuf);
830
831 iov.iov_base = buf;
832 iov.iov_len = sizeof(buf);
833 msg.msg_iov = &iov;
834 msg.msg_iovlen = 1;
835
6ee867dc
SH
836 FD_ZERO(&rfds);
837 FD_SET(sock, &rfds);
838 tv.tv_sec = 2;
839 tv.tv_usec = 0;
ea56f722 840 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
841 fprintf(stderr, "Failed to select for scm_cred: %s\n",
842 strerror(errno));
843 return false;
844 }
845 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 846 if (ret < 0) {
1420baf8 847 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
848 strerror(errno));
849 return false;
850 }
851
852 cmsg = CMSG_FIRSTHDR(&msg);
853
854 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
855 cmsg->cmsg_level == SOL_SOCKET &&
856 cmsg->cmsg_type == SCM_CREDENTIALS) {
857 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
858 }
859 *v = buf[0];
860
861 return true;
862}
863
864
865/*
4775fba1
SH
866 * pid_to_ns - reads pids from a ucred over a socket, then writes the
867 * int value back over the socket. This shifts the pid from the
868 * sender's pidns into tpid's pidns.
a05660a6 869 */
4775fba1 870static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
871{
872 char v = '0';
873 struct ucred cred;
874
875 while (recv_creds(sock, &cred, &v)) {
876 if (v == '1')
877 exit(0);
a05660a6
SH
878 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
879 exit(1);
880 }
881 exit(0);
882}
883
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe; if the ack
 * does not arrive within one second the child is killed and a new one is
 * forked.
 *
 * Never returns: exits 0 when the worker child exited successfully and 1
 * on any setup failure or worker failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no (valid) ack: discard this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, -1 on failure */
	if (wait_for_pid(cpid) < 0)
		exit(1);
	exit(0);
}
948
949/*
950 * To read cgroup files with a particular pid, we will setns into the child
951 * pidns, open a pipe, fork a child - which will be the first to really be in
952 * the child ns - which does the cgm_get_value and writes the data to the pipe.
953 */
954static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
955{
956 int sock[2] = {-1, -1};
957 nih_local char *tmpdata = NULL;
958 int ret;
959 pid_t qpid, cpid = -1;
960 bool answer = false;
961 char v = '0';
962 struct ucred cred;
963 struct timeval tv;
964 fd_set s;
965
966 if (!cgm_get_value(contrl, cg, file, &tmpdata))
967 return false;
968
969 /*
970 * Now we read the pids from returned data one by one, pass
971 * them into a child in the target namespace, read back the
972 * translated pids, and put them into our to-return data
973 */
974
975 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
976 perror("socketpair");
977 exit(1);
978 }
979
980 cpid = fork();
981 if (cpid == -1)
982 goto out;
983
984 if (!cpid) // child
4775fba1 985 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
986
987 char *ptr = tmpdata;
988 cred.uid = 0;
989 cred.gid = 0;
990 while (sscanf(ptr, "%d\n", &qpid) == 1) {
991 cred.pid = qpid;
01e71852
SH
992 ret = send_creds(sock[0], &cred, v, true);
993
994 if (ret == SEND_CREDS_NOTSK)
995 goto next;
996 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
997 goto out;
998
999 // read converted results
1000 FD_ZERO(&s);
1001 FD_SET(sock[0], &s);
6ee867dc 1002 tv.tv_sec = 2;
a05660a6
SH
1003 tv.tv_usec = 0;
1004 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1005 if (ret <= 0) {
6ee867dc
SH
1006 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1007 __func__, strerror(errno));
a05660a6
SH
1008 goto out;
1009 }
1010 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1011 fprintf(stderr, "%s: error reading pid from child: %s\n",
1012 __func__, strerror(errno));
a05660a6
SH
1013 goto out;
1014 }
a05660a6 1015 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 1016next:
a05660a6
SH
1017 ptr = strchr(ptr, '\n');
1018 if (!ptr)
1019 break;
1020 ptr++;
1021 }
1022
1023 cred.pid = getpid();
1024 v = '1';
01e71852 1025 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1026 // failed to ask child to exit
6ee867dc
SH
1027 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1028 __func__, strerror(errno));
a05660a6
SH
1029 goto out;
1030 }
1031
1032 answer = true;
1033
1034out:
1035 if (cpid != -1)
1036 wait_for_pid(cpid);
1037 if (sock[0] != -1) {
1038 close(sock[0]);
1039 close(sock[1]);
1040 }
1041 return answer;
1042}
1043
99978832
SH
1044static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1045 struct fuse_file_info *fi)
1046{
99978832 1047 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1048 struct file_info *f = (struct file_info *)fi->fh;
99978832
SH
1049 nih_local struct cgm_keys *k = NULL;
1050
443d13f5 1051 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1052 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1053 return -EIO;
1054 }
1055
99978832
SH
1056 if (offset)
1057 return -EIO;
1058
1059 if (!fc)
1060 return -EIO;
1061
8f6e8f5e 1062 if (!f->controller)
99978832
SH
1063 return -EINVAL;
1064
8f6e8f5e 1065 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
99978832 1066 nih_local char *data = NULL;
4775fba1
SH
1067 int s;
1068 bool r;
99978832 1069
8f6e8f5e 1070 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
f9a05025
SH
1071 // should never get here
1072 return -EACCES;
99978832 1073
8f6e8f5e
SH
1074 if (strcmp(f->file, "tasks") == 0 ||
1075 strcmp(f->file, "/tasks") == 0 ||
1076 strcmp(f->file, "/cgroup.procs") == 0 ||
1077 strcmp(f->file, "cgroup.procs") == 0)
a05660a6 1078 // special case - we have to translate the pids
8f6e8f5e 1079 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
a05660a6 1080 else
8f6e8f5e 1081 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
a05660a6 1082
4775fba1 1083 if (!r)
99978832
SH
1084 return -EINVAL;
1085
4775fba1
SH
1086 if (!data)
1087 return 0;
99978832
SH
1088 s = strlen(data);
1089 if (s > size)
1090 s = size;
1091 memcpy(buf, data, s);
1092
99978832
SH
1093 return s;
1094 }
1095
1096 return -EINVAL;
1097}
1098
4775fba1
SH
1099static void pid_from_ns(int sock, pid_t tpid)
1100{
1101 pid_t vpid;
1102 struct ucred cred;
1103 char v;
6ee867dc
SH
1104 struct timeval tv;
1105 fd_set s;
1106 int ret;
4775fba1
SH
1107
1108 cred.uid = 0;
1109 cred.gid = 0;
6ee867dc
SH
1110 while (1) {
1111 FD_ZERO(&s);
1112 FD_SET(sock, &s);
1113 tv.tv_sec = 2;
1114 tv.tv_usec = 0;
1115 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1116 if (ret <= 0) {
1117 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc
SH
1118 __func__, strerror(errno));
1119 exit(1);
1120 }
1121 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1122 fprintf(stderr, "%s: bad read from parent: %s\n",
1123 __func__, strerror(errno));
1124 exit(1);
1125 }
4775fba1 1126 if (vpid == -1) // done
01e71852 1127 break;
4775fba1
SH
1128 v = '0';
1129 cred.pid = vpid;
01e71852 1130 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1131 v = '1';
1132 cred.pid = getpid();
01e71852 1133 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1134 exit(1);
1135 }
1136 }
1137 exit(0);
1138}
1139
/*
 * Enter the pid namespace of @tpid with setns() and fork a child that
 * runs pid_from_ns() on @sock.
 *
 * The child acknowledges its start by writing '1' to a pipe; if the ack
 * does not arrive within one second the child is killed and a new one is
 * forked.
 *
 * Never returns: exits 0 when the worker child exited successfully and 1
 * on any setup failure or worker failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no (valid) ack: discard this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, -1 on failure */
	if (wait_for_pid(cpid) < 0)
		exit(1);
	exit(0);
}
1200
1201static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1202{
1203 int sock[2] = {-1, -1};
1204 pid_t qpid, cpid = -1;
1205 bool answer = false, fail = false;
1206
1207 /*
1208 * write the pids to a socket, have helper in writer's pidns
1209 * call movepid for us
1210 */
1211 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1212 perror("socketpair");
1213 exit(1);
1214 }
1215
1216 cpid = fork();
1217 if (cpid == -1)
1218 goto out;
1219
1220 if (!cpid) // child
1221 pid_from_ns_wrapper(sock[1], tpid);
1222
1223 const char *ptr = buf;
1224 while (sscanf(ptr, "%d", &qpid) == 1) {
1225 struct ucred cred;
1226 char v;
1227
1228 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1229 fprintf(stderr, "%s: error writing pid to child: %s\n",
1230 __func__, strerror(errno));
4775fba1
SH
1231 goto out;
1232 }
1233
01e71852
SH
1234 if (recv_creds(sock[0], &cred, &v)) {
1235 if (v == '0') {
1236 if (!cgm_move_pid(contrl, cg, cred.pid))
1237 fail = true;
1238 }
4775fba1
SH
1239 }
1240
1241 ptr = strchr(ptr, '\n');
1242 if (!ptr)
1243 break;
1244 ptr++;
1245 }
1246
1247 /* All good, write the value */
1248 qpid = -1;
1249 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1250 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1251
1252 if (!fail)
1253 answer = true;
1254
1255out:
1256 if (cpid != -1)
1257 wait_for_pid(cpid);
1258 if (sock[0] != -1) {
1259 close(sock[0]);
1260 close(sock[1]);
1261 }
1262 return answer;
1263}
1264
2ad6d2bd
SH
1265int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1266 struct fuse_file_info *fi)
1267{
2ad6d2bd 1268 struct fuse_context *fc = fuse_get_context();
47cbf0e5 1269 nih_local char *localbuf = NULL;
8f6e8f5e
SH
1270 nih_local struct cgm_keys *k = NULL;
1271 struct file_info *f = (struct file_info *)fi->fh;
2ad6d2bd 1272
443d13f5 1273 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1274 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1275 return -EIO;
1276 }
1277
2ad6d2bd 1278 if (offset)
f9a05025 1279 return -EINVAL;
2ad6d2bd
SH
1280
1281 if (!fc)
1282 return -EIO;
1283
47cbf0e5
SH
1284 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1285 localbuf[size] = '\0';
1286 memcpy(localbuf, buf, size);
2ad6d2bd 1287
8f6e8f5e 1288 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
4775fba1
SH
1289 bool r;
1290
8f6e8f5e 1291 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
f9a05025 1292 return -EACCES;
2ad6d2bd 1293
8f6e8f5e
SH
1294 if (strcmp(f->file, "tasks") == 0 ||
1295 strcmp(f->file, "/tasks") == 0 ||
1296 strcmp(f->file, "/cgroup.procs") == 0 ||
1297 strcmp(f->file, "cgroup.procs") == 0)
4775fba1 1298 // special case - we have to translate the pids
8f6e8f5e 1299 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
4775fba1 1300 else
8f6e8f5e 1301 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
4775fba1
SH
1302
1303 if (!r)
2ad6d2bd
SH
1304 return -EINVAL;
1305
1306 return size;
1307 }
1308
1309 return -EINVAL;
1310}
1311
341b21ad
SH
1312int cg_chown(const char *path, uid_t uid, gid_t gid)
1313{
1314 struct fuse_context *fc = fuse_get_context();
1315 nih_local char * cgdir = NULL;
1316 char *fpath = NULL, *path1, *path2;
1317 nih_local struct cgm_keys *k = NULL;
1318 const char *cgroup;
1319 nih_local char *controller = NULL;
1320
1321
1322 if (!fc)
1323 return -EIO;
1324
1325 if (strcmp(path, "/cgroup") == 0)
1326 return -EINVAL;
1327
1328 controller = pick_controller_from_path(fc, path);
1329 if (!controller)
f9a05025 1330 return -EINVAL;
341b21ad
SH
1331 cgroup = find_cgroup_in_path(path);
1332 if (!cgroup)
1333 /* this is just /cgroup/controller */
1334 return -EINVAL;
1335
1336 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1337
1338 if (!fpath) {
1339 path1 = "/";
1340 path2 = cgdir;
1341 } else {
1342 path1 = cgdir;
1343 path2 = fpath;
1344 }
1345
1346 if (is_child_cgroup(controller, path1, path2)) {
1347 // get uid, gid, from '/tasks' file and make up a mode
1348 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1349 k = get_cgroup_key(controller, cgroup, "tasks");
1350
1351 } else
1352 k = get_cgroup_key(controller, path1, path2);
1353
1354 if (!k)
1355 return -EINVAL;
1356
1357 /*
1358 * This being a fuse request, the uid and gid must be valid
1359 * in the caller's namespace. So we can just check to make
1360 * sure that the caller is root in his uid, and privileged
1361 * over the file's current owner.
1362 */
1363 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1364 return -EACCES;
341b21ad
SH
1365
1366 if (!cgm_chown_file(controller, cgroup, uid, gid))
1367 return -EINVAL;
1368 return 0;
1369}
2ad6d2bd 1370
fd2e4e03
SH
1371int cg_chmod(const char *path, mode_t mode)
1372{
0a1bb5ea
SH
1373 struct fuse_context *fc = fuse_get_context();
1374 nih_local char * cgdir = NULL;
1375 char *fpath = NULL, *path1, *path2;
1376 nih_local struct cgm_keys *k = NULL;
1377 const char *cgroup;
1378 nih_local char *controller = NULL;
1379
1380 if (!fc)
1381 return -EIO;
1382
1383 if (strcmp(path, "/cgroup") == 0)
1384 return -EINVAL;
1385
1386 controller = pick_controller_from_path(fc, path);
1387 if (!controller)
f9a05025 1388 return -EINVAL;
0a1bb5ea
SH
1389 cgroup = find_cgroup_in_path(path);
1390 if (!cgroup)
1391 /* this is just /cgroup/controller */
1392 return -EINVAL;
1393
1394 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1395
1396 if (!fpath) {
1397 path1 = "/";
1398 path2 = cgdir;
1399 } else {
1400 path1 = cgdir;
1401 path2 = fpath;
1402 }
1403
1404 if (is_child_cgroup(controller, path1, path2)) {
1405 // get uid, gid, from '/tasks' file and make up a mode
1406 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1407 k = get_cgroup_key(controller, cgroup, "tasks");
1408
1409 } else
1410 k = get_cgroup_key(controller, path1, path2);
1411
1412 if (!k)
1413 return -EINVAL;
1414
1415 /*
1416 * This being a fuse request, the uid and gid must be valid
1417 * in the caller's namespace. So we can just check to make
1418 * sure that the caller is root in his uid, and privileged
1419 * over the file's current owner.
1420 */
1421 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1422 return -EPERM;
1423
1424 if (!cgm_chmod_file(controller, cgroup, mode))
1425 return -EINVAL;
1426 return 0;
fd2e4e03
SH
1427}
1428
ab54b798
SH
1429int cg_mkdir(const char *path, mode_t mode)
1430{
1431 struct fuse_context *fc = fuse_get_context();
1432 nih_local struct cgm_keys **list = NULL;
1433 char *fpath = NULL, *path1;
1434 nih_local char * cgdir = NULL;
1435 const char *cgroup;
1436 nih_local char *controller = NULL;
1437
ab54b798
SH
1438 if (!fc)
1439 return -EIO;
1440
1441
1442 controller = pick_controller_from_path(fc, path);
1443 if (!controller)
f9a05025 1444 return -EINVAL;
ab54b798
SH
1445
1446 cgroup = find_cgroup_in_path(path);
1447 if (!cgroup)
f9a05025 1448 return -EINVAL;
ab54b798
SH
1449
1450 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1451 if (!fpath)
1452 path1 = "/";
1453 else
1454 path1 = cgdir;
1455
1456 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1457 return -EACCES;
ab54b798
SH
1458
1459
1460 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1461 return -EINVAL;
1462
1463 return 0;
1464}
1465
50d8d5b5
SH
1466static int cg_rmdir(const char *path)
1467{
1468 struct fuse_context *fc = fuse_get_context();
1469 nih_local struct cgm_keys **list = NULL;
1470 char *fpath = NULL;
1471 nih_local char * cgdir = NULL;
1472 const char *cgroup;
1473 nih_local char *controller = NULL;
1474
1475 if (!fc)
1476 return -EIO;
1477
1478
1479 controller = pick_controller_from_path(fc, path);
1480 if (!controller)
f9a05025 1481 return -EINVAL;
50d8d5b5
SH
1482
1483 cgroup = find_cgroup_in_path(path);
1484 if (!cgroup)
f9a05025 1485 return -EINVAL;
50d8d5b5
SH
1486
1487 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1488 if (!fpath)
1489 return -EINVAL;
1490
1491 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1492 return -EACCES;
50d8d5b5
SH
1493
1494 if (!cgm_remove(controller, cgroup))
1495 return -EINVAL;
1496
1497 return 0;
1498}
1499
2dc17609
SH
/* Return true if line begins with pref (an empty pref always matches). */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1506
/*
 * Scan the newline-separated text in memstat for the first line starting
 * with "total_cache", parse the number following it and store that value
 * divided by 1024 in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur;
	char *nl;

	*v = 0;
	for (cur = memstat; *cur != '\0'; cur = nl + 1) {
		if (startswith(cur, "total_cache")) {
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1524
49878439
YY
/*
 * Look up one counter in blkio cgroup file contents.
 *
 * str holds newline-separated lines; the first line starting with
 * "<major>:<minor> <iotype>" is located and the number following that
 * prefix is stored in *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *cur;
	char *nl;

	memset(key, 0, sizeof(key));
	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur != '\0'; cur = nl + 1) {
		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1547
2dc17609
SH
1548static char *get_pid_cgroup(pid_t pid, const char *contrl)
1549{
1550 nih_local char *fnam = NULL;
1551 FILE *f;
1552 char *answer = NULL;
1553 char *line = NULL;
1554 size_t len = 0;
1555
1556 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1557 if (!(f = fopen(fnam, "r")))
1558 return false;
1559
1560 while (getline(&line, &len, f) != -1) {
1561 char *c1, *c2;
1562 if (!line[0])
1563 continue;
1564 c1 = strchr(line, ':');
1565 if (!c1)
1566 goto out;
1567 c1++;
1568 c2 = strchr(c1, ':');
1569 if (!c2)
1570 goto out;
1571 *c2 = '\0';
1572 if (strcmp(c1, contrl) != 0)
1573 continue;
1574 c2++;
1575 stripnewline(c2);
1576 answer = NIH_MUST( nih_strdup(NULL, c2) );
1577 goto out;
1578 }
1579
1580out:
1581 fclose(f);
1582 free(line);
1583 return answer;
1584}
1585
758ad80c 1586/*
2ad6d2bd 1587 * FUSE ops for /proc
758ad80c 1588 */
758ad80c 1589
23ce2127
SH
1590static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1591 struct fuse_file_info *fi)
1592{
2dc17609
SH
1593 struct fuse_context *fc = fuse_get_context();
1594 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1595 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1596 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1597 char *line = NULL;
1598 size_t linelen = 0, total_len = 0;
1599 FILE *f;
1600
1601 if (offset)
1602 return -EINVAL;
1603
1604 if (!cg)
1605 return 0;
1606
1607 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1608 return 0;
1609 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1610 return 0;
1611 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1612 return 0;
1613 memlimit = strtoul(memlimit_str, NULL, 10);
1614 memusage = strtoul(memusage_str, NULL, 10);
1615 memlimit /= 1024;
1616 memusage /= 1024;
1617 get_mem_cached(memstat_str, &cached);
1618
1619 f = fopen("/proc/meminfo", "r");
1620 if (!f)
1621 return 0;
1622
1623 while (getline(&line, &linelen, f) != -1) {
1624 size_t l;
1625 char *printme, lbuf[100];
1626
1627 memset(lbuf, 0, 100);
1628 if (startswith(line, "MemTotal:")) {
1629 sscanf(line+14, "%lu", &hosttotal);
1630 if (hosttotal < memlimit)
1631 memlimit = hosttotal;
1632 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1633 printme = lbuf;
1634 } else if (startswith(line, "MemFree:")) {
1635 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1636 printme = lbuf;
1637 } else if (startswith(line, "MemAvailable:")) {
1638 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1639 printme = lbuf;
1640 } else if (startswith(line, "Buffers:")) {
1641 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1642 printme = lbuf;
1643 } else if (startswith(line, "Cached:")) {
1644 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1645 printme = lbuf;
1646 } else if (startswith(line, "SwapCached:")) {
1647 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1648 printme = lbuf;
1649 } else
1650 printme = line;
1651 l = snprintf(buf, size, "%s", printme);
1652 buf += l;
1653 size -= l;
1654 total_len += l;
1655 }
1656
92c84dc4
SH
1657 fclose(f);
1658 free(line);
2dc17609 1659 return total_len;
23ce2127
SH
1660}
1661
/*
 * Fetch cpuset.cpus for cgroup cg through cgm_get_value().
 * Returns the string it produced (nih-allocated), or NULL on failure.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1674
/*
 * Helper functions for cpu_in_cpuset
 */

/*
 * Return a pointer to the token following the next ',' in c, or NULL if
 * there is none.  The first character of c is never treated as a
 * separator.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* c + 1 would point past the terminator of an empty token */
	if (*c == '\0')
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}

/*
 * Parse the token at c as "A" or "A-B".
 * Returns 2 when both *a and *b were set, 1 when only *a was set, and a
 * value <= 0 when no number could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	int ret;

	ret = sscanf(c, "%d-%d", a, b);
	return ret;
}

/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited ranges
 *
 * Return true if cpu is named by any token of cpuset.  Tokens that do
 * not match (or cannot be parsed) are skipped, so a single-cpu token in
 * front of a range does not hide the range.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret == 1 && cpu == a) // single cpu, e.g. "1" or "1,6"
			return true;
		if (ret == 2 && cpu >= a && cpu <= b) // range, e.g. "2-3"
			return true;
	}

	return false;
}
1716
aeb56147
SH
/*
 * Given a "processor : N" line from /proc/cpuinfo, report whether cpu N
 * is part of cpuset.  Lines that do not parse yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpunum;
	bool parsed = sscanf(line, "processor : %d", &cpunum) == 1;

	return parsed ? cpu_in_cpuset(cpunum, cpuset) : false;
}
1725
23ce2127
SH
/*
 * Check whether line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1737
23ce2127
SH
1738static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1739 struct fuse_file_info *fi)
1740{
1741 struct fuse_context *fc = fuse_get_context();
1742 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1743 nih_local char *cpuset = NULL;
1744 char *line = NULL;
1745 size_t linelen = 0, total_len = 0;
1746 bool am_printing = false;
1747 int curcpu = -1;
1748 FILE *f;
1749
1750 if (offset)
1751 return -EINVAL;
1752
1753 if (!cg)
1754 return 0;
1755
1756 cpuset = get_cpuset(cg);
1757 if (!cpuset)
1758 return 0;
1759
1760 f = fopen("/proc/cpuinfo", "r");
1761 if (!f)
1762 return 0;
1763
1764 while (getline(&line, &linelen, f) != -1) {
1765 size_t l;
1766 if (is_processor_line(line)) {
aeb56147 1767 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1768 if (am_printing) {
1769 curcpu ++;
1770 l = snprintf(buf, size, "processor : %d\n", curcpu);
1771 buf += l;
1772 size -= l;
1773 total_len += l;
1774 }
1775 continue;
1776 }
1777 if (am_printing) {
1778 l = snprintf(buf, size, "%s", line);
1779 buf += l;
1780 size -= l;
1781 total_len += l;
1782 }
1783 }
1784
92c84dc4
SH
1785 fclose(f);
1786 free(line);
23ce2127
SH
1787 return total_len;
1788}
1789
1790static int proc_stat_read(char *buf, size_t size, off_t offset,
1791 struct fuse_file_info *fi)
1792{
aeb56147
SH
1793 struct fuse_context *fc = fuse_get_context();
1794 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1795 nih_local char *cpuset = NULL;
1796 char *line = NULL;
1797 size_t linelen = 0, total_len = 0;
2a0fde62 1798 int curcpu = -1; /* cpu numbering starts at 0 */
aeb56147
SH
1799 FILE *f;
1800
1801 if (offset)
1802 return -EINVAL;
1803
1804 if (!cg)
1805 return 0;
1806
1807 cpuset = get_cpuset(cg);
1808 if (!cpuset)
1809 return 0;
1810
1811 f = fopen("/proc/stat", "r");
1812 if (!f)
1813 return 0;
1814
1815 while (getline(&line, &linelen, f) != -1) {
1816 size_t l;
1817 int cpu;
2a0fde62 1818 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
1819 char *c;
1820
2a0fde62
CB
1821 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1822 /* not a ^cpuN line containing a number N, just print it */
aeb56147
SH
1823 l = snprintf(buf, size, "%s", line);
1824 buf += l;
1825 size -= l;
1826 total_len += l;
1827 continue;
1828 }
2a0fde62
CB
1829
1830 if (sscanf(cpu_char, "%d", &cpu) != 1)
1831 continue;
aeb56147
SH
1832 if (!cpu_in_cpuset(cpu, cpuset))
1833 continue;
1834 curcpu ++;
1835
1836 c = strchr(line, ' ');
1837 if (!c)
1838 continue;
1839 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1840 buf += l;
1841 size -= l;
1842 total_len += l;
1843 }
1844
92c84dc4
SH
1845 fclose(f);
1846 free(line);
aeb56147 1847 return total_len;
23ce2127
SH
1848}
1849
7bbf2246
SH
1850/*
1851 * How to guess what to present for uptime?
1852 * One thing we could do would be to take the date on the caller's
1853 * memory.usage_in_bytes file, which should equal the time of creation
1854 * of his cgroup. However, a task could be in a sub-cgroup of the
1855 * container. The same problem exists if we try to look at the ages
1856 * of processes in the caller's cgroup.
1857 *
1858 * So we'll fork a task that will enter the caller's pidns, mount a
1859 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1860 *
1861 * For the second uptime #, we'll do as Stéphane had done, just copy
1862 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1863 * time. Maybe someone can come up with a good algorithm and submit a
1864 * patch. Maybe something based on cpushare info?
1865 */
41bb9357
SH
1866
1867/* return age of the reaper for $pid, taken from ctime of its procdir */
1868static long int get_pid1_time(pid_t pid)
1869{
1870 char fnam[100];
ea56f722 1871 int fd, cpipe[2], ret;
41bb9357 1872 struct stat sb;
ea56f722
SH
1873 pid_t cpid;
1874 struct timeval tv;
1875 fd_set s;
1876 char v;
41bb9357
SH
1877
1878 if (unshare(CLONE_NEWNS))
1879 return 0;
1880
5ca64c2a
SG
1881 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1882 perror("rslave mount failed");
1883 return 0;
1884 }
1885
41bb9357
SH
1886 sprintf(fnam, "/proc/%d/ns/pid", pid);
1887 fd = open(fnam, O_RDONLY);
1888 if (fd < 0) {
1889 perror("get_pid1_time open of ns/pid");
1890 return 0;
1891 }
1892 if (setns(fd, 0)) {
1893 perror("get_pid1_time setns 1");
1894 close(fd);
1895 return 0;
1896 }
1897 close(fd);
41bb9357 1898
ea56f722
SH
1899 if (pipe(cpipe) < 0)
1900 exit(1);
41bb9357 1901
ea56f722
SH
1902loop:
1903 cpid = fork();
1904 if (cpid < 0)
41bb9357 1905 return 0;
ea56f722
SH
1906
1907 if (!cpid) {
1908 char b = '1';
1909 close(cpipe[0]);
1910 if (write(cpipe[1], &b, sizeof(char)) < 0) {
1911 fprintf(stderr, "%s (child): erorr on write: %s\n",
1912 __func__, strerror(errno));
1913 }
1914 close(cpipe[1]);
1915 umount2("/proc", MNT_DETACH);
1916 if (mount("proc", "/proc", "proc", 0, NULL)) {
1917 perror("get_pid1_time mount");
1918 return 0;
1919 }
1920 ret = lstat("/proc/1", &sb);
1921 if (ret) {
1922 perror("get_pid1_time lstat");
1923 return 0;
1924 }
1925 return time(NULL) - sb.st_ctime;
41bb9357 1926 }
ea56f722
SH
1927
1928 // give the child 1 second to be done forking and
1929 // write it's ack
1930 FD_ZERO(&s);
1931 FD_SET(cpipe[0], &s);
1932 tv.tv_sec = 1;
1933 tv.tv_usec = 0;
1934 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
1935 if (ret <= 0)
1936 goto again;
1937 ret = read(cpipe[0], &v, 1);
1938 if (ret != sizeof(char) || v != '1') {
1939 goto again;
41bb9357 1940 }
ea56f722
SH
1941
1942 wait_for_pid(cpid);
1943 exit(0);
1944
1945again:
1946 kill(cpid, SIGKILL);
1947 wait_for_pid(cpid);
1948 goto loop;
41bb9357
SH
1949}
1950
/*
 * Return the value computed by get_pid1_time(qpid), obtained through a
 * forked helper because get_pid1_time() changes namespaces and may exit.
 *
 * The helper reports its result over a pipe; we wait at most one second
 * for it.  Returns 0 when the pipe or fork cannot be set up, when the
 * helper does not answer in time, or when the answer cannot be read.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no helper to wait for: just release the pipe */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1996
/*
 * Return the second field of /proc/uptime, truncated to whole seconds,
 * or 0 if the file cannot be opened or parsed.
 *
 * Both fields are read as floating point: an integer conversion would
 * stop at the '.' of the first field and make the second one unparseable.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2010
2011/*
2012 * We read /proc/uptime and reuse its second field.
2013 * For the first field, we use the mtime for the reaper for
2014 * the calling pid as returned by getreaperage
2015 */
23ce2127
SH
2016static int proc_uptime_read(char *buf, size_t size, off_t offset,
2017 struct fuse_file_info *fi)
2018{
41bb9357
SH
2019 struct fuse_context *fc = fuse_get_context();
2020 long int reaperage = getreaperage(fc->pid);;
2021 long int idletime = getprocidle();
2022
2023 if (offset)
2024 return -EINVAL;
2025 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
23ce2127
SH
2026}
2027
49878439
YY
2028static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2029 struct fuse_file_info *fi)
2030{
2031 char dev_name[72];
2032 struct fuse_context *fc = fuse_get_context();
2033 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2034 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2035 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2036 unsigned long read = 0, write = 0;
2037 unsigned long read_merged = 0, write_merged = 0;
2038 unsigned long read_sectors = 0, write_sectors = 0;
2039 unsigned long read_ticks = 0, write_ticks = 0;
2040 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2041 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2042 char *line = NULL;
2043 size_t linelen = 0, total_len = 0;
2044 unsigned int major = 0, minor = 0;
2045 int i = 0;
2046 FILE *f;
2047
2048 if (offset)
2049 return -EINVAL;
2050
2051 if (!cg)
2052 return 0;
2053
2054 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2055 return 0;
2056 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2057 return 0;
2058 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2059 return 0;
2060 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2061 return 0;
2062 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2063 return 0;
2064
2065
2066 f = fopen("/proc/diskstats", "r");
2067 if (!f)
2068 return 0;
2069
2070 while (getline(&line, &linelen, f) != -1) {
2071 size_t l;
2072 char *printme, lbuf[256];
2073
2074 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2075 if(i == 3){
2076 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2077 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2078 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2079 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2080 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2081 read_sectors = read_sectors/512;
2082 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2083 write_sectors = write_sectors/512;
2084
2085 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2086 rd_svctm = rd_svctm/1000000;
2087 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2088 rd_wait = rd_wait/1000000;
2089 read_ticks = rd_svctm + rd_wait;
2090
2091 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2092 wr_svctm = wr_svctm/1000000;
2093 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2094 wr_wait = wr_wait/1000000;
2095 write_ticks = wr_svctm + wr_wait;
2096
2097 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2098 tot_ticks = tot_ticks/1000000;
2099 }else{
2100 continue;
2101 }
2102
2103 memset(lbuf, 0, 256);
2104 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2105 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2106 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2107 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2108 printme = lbuf;
2109 } else
2110 continue;
2111
2112 l = snprintf(buf, size, "%s", printme);
2113 buf += l;
2114 size -= l;
2115 total_len += l;
2116 }
2117
2118 fclose(f);
2119 free(line);
2120 return total_len;
2121}
2122
23ce2127
SH
/*
 * Return the total number of bytes in the file named which, measured by
 * reading it line by line, or 0 if it cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t got, total = 0;

	if (fp == NULL)
		return 0;

	for (;;) {
		got = getline(&linebuf, &cap, fp);
		if (got == -1)
			break;
		total += got;
	}

	fclose (fp);
	free(linebuf);

	return total;
}
2139
758ad80c
SH
2140static int proc_getattr(const char *path, struct stat *sb)
2141{
35629743
SH
2142 struct timespec now;
2143
2144 memset(sb, 0, sizeof(struct stat));
2145 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2146 return -EINVAL;
2147 sb->st_uid = sb->st_gid = 0;
2148 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2149 if (strcmp(path, "/proc") == 0) {
2150 sb->st_mode = S_IFDIR | 00555;
2151 sb->st_nlink = 2;
2152 return 0;
2153 }
2154 if (strcmp(path, "/proc/meminfo") == 0 ||
2155 strcmp(path, "/proc/cpuinfo") == 0 ||
2156 strcmp(path, "/proc/uptime") == 0 ||
49878439
YY
2157 strcmp(path, "/proc/stat") == 0 ||
2158 strcmp(path, "/proc/diskstats") == 0) {
23ce2127 2159 sb->st_size = get_procfile_size(path);
35629743
SH
2160 sb->st_mode = S_IFREG | 00444;
2161 sb->st_nlink = 1;
2162 return 0;
2163 }
2164
2165 return -ENOENT;
2166}
2167
2168static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2169 struct fuse_file_info *fi)
2170{
2171 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2172 filler(buf, "meminfo", NULL, 0) != 0 ||
2173 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2174 filler(buf, "uptime", NULL, 0) != 0 ||
2175 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2176 return -EINVAL;
758ad80c
SH
2177 return 0;
2178}
2179
35629743
SH
2180static int proc_open(const char *path, struct fuse_file_info *fi)
2181{
96fc5ee6
SH
2182 int type = -1;
2183 struct file_info *info;
2184
2185 if (strcmp(path, "/proc/meminfo") == 0)
2186 type = LXC_TYPE_PROC_MEMINFO;
2187 else if (strcmp(path, "/proc/cpuinfo") == 0)
2188 type = LXC_TYPE_PROC_CPUINFO;
2189 else if (strcmp(path, "/proc/uptime") == 0)
2190 type = LXC_TYPE_PROC_UPTIME;
2191 else if (strcmp(path, "/proc/stat") == 0)
2192 type = LXC_TYPE_PROC_STAT;
2193 else if (strcmp(path, "/proc/diskstats") == 0)
2194 type = LXC_TYPE_PROC_DISKSTATS;
2195 if (type == -1)
2196 return -ENOENT;
2197
2198 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2199 memset(info, 0, sizeof(*info));
2200 info->type = type;
2201
2202 fi->fh = (unsigned long)info;
2203 return 0;
2204}
2205
2206static int proc_release(const char *path, struct fuse_file_info *fi)
2207{
2208 struct file_info *f = (struct file_info *)fi->fh;
2209
2210 do_release_file_info(f);
2211 return 0;
35629743
SH
2212}
2213
35629743
SH
2214static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2215 struct fuse_file_info *fi)
2216{
96fc5ee6
SH
2217 struct file_info *f = (struct file_info *) fi->fh;
2218
2219 switch (f->type) {
2220 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2221 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2222 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2223 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2224 case LXC_TYPE_PROC_UPTIME:
23ce2127 2225 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2226 case LXC_TYPE_PROC_STAT:
23ce2127 2227 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2228 case LXC_TYPE_PROC_DISKSTATS:
49878439 2229 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2230 default:
2231 return -EINVAL;
2232 }
35629743
SH
2233}
2234
2ad6d2bd
SH
2235/*
2236 * FUSE ops for /
2237 * these just delegate to the /proc and /cgroup ops as
2238 * needed
2239 */
758ad80c
SH
2240
2241static int lxcfs_getattr(const char *path, struct stat *sb)
2242{
2243 if (strcmp(path, "/") == 0) {
2244 sb->st_mode = S_IFDIR | 00755;
2245 sb->st_nlink = 2;
2246 return 0;
2247 }
2248 if (strncmp(path, "/cgroup", 7) == 0) {
2249 return cg_getattr(path, sb);
2250 }
35629743 2251 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2252 return proc_getattr(path, sb);
2253 }
2254 return -EINVAL;
2255}
2256
/*
 * FUSE opendir: "/" and "/proc" need no state and succeed immediately;
 * paths starting with /cgroup are delegated to cg_opendir().  Anything
 * else yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) != 0)
		return -ENOENT;

	return cg_opendir(path, fi);
}
2269
2270static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2271 struct fuse_file_info *fi)
2272{
2273 if (strcmp(path, "/") == 0) {
2274 if (filler(buf, "proc", NULL, 0) != 0 ||
2275 filler(buf, "cgroup", NULL, 0) != 0)
2276 return -EINVAL;
2277 return 0;
2278 }
35629743 2279 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2280 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2281 if (strcmp(path, "/proc") == 0)
2282 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2283 return -EINVAL;
2284}
2285
/*
 * FUSE releasedir: nothing to release for "/" and "/proc"; paths
 * starting with /cgroup are delegated to cg_releasedir().  Anything else
 * yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_releasedir(path, fi);
}
2297
99978832
SH
/*
 * FUSE open: delegate to cg_open() or proc_open() according to the path
 * prefix; -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_open(path, fi);

	return strncmp(path, "/proc", 5) == 0 ? proc_open(path, fi) : -EINVAL;
}
2307
/*
 * FUSE read: delegate to cg_read() or proc_read() according to the path
 * prefix; -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_read(path, buf, size, offset, fi);

	return strncmp(path, "/proc", 5) == 0
		? proc_read(path, buf, size, offset, fi)
		: -EINVAL;
}
2318
2ad6d2bd
SH
/*
 * FUSE write: only paths starting with /cgroup are writable (through
 * cg_write()); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2328
99978832
SH
/* FUSE flush: there is nothing to flush, so this always succeeds. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2333
/*
 * FUSE release: delegate to cg_release() or proc_release() according to
 * the path prefix; -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_release(path, fi);

	return strncmp(path, "/proc", 5) == 0 ? proc_release(path, fi) : -EINVAL;
}
2343
/*
 * FUSE fsync handler.  Performs no work and always reports success.
 */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	/* None of the arguments is consulted. */
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2348
ab54b798
SH
/*
 * FUSE mkdir handler.  Paths beginning with "/cgroup" are forwarded to
 * cg_mkdir(); every other path is rejected with -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2356
341b21ad
SH
/*
 * FUSE chown handler.  Paths beginning with "/cgroup" are forwarded to
 * cg_chown(); every other path is rejected with -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	return strncmp(path, "/cgroup", 7) == 0
		? cg_chown(path, uid, gid)
		: -EINVAL;
}
2364
2ad6d2bd
SH
/*
 * FUSE truncate handler.
 *
 * cat issues a truncate before it calls ops->write, which has no sensible
 * meaning for cgroup files.  For any path beginning with "/cgroup" we
 * therefore report success without doing anything; newsize is ignored.
 * Every other path is rejected with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2376
50d8d5b5
SH
/*
 * FUSE rmdir handler.  Paths beginning with "/cgroup" are forwarded to
 * cg_rmdir(); every other path is rejected with -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2383
fd2e4e03
SH
/*
 * FUSE chmod handler.  Paths beginning with "/cgroup" are forwarded to
 * cg_chmod(); every other path is rejected with -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	return strncmp(path, "/cgroup", 7) == 0
		? cg_chmod(path, mode)
		: -EINVAL;
}
2390
758ad80c
SH
2391const struct fuse_operations lxcfs_ops = {
2392 .getattr = lxcfs_getattr,
2393 .readlink = NULL,
2394 .getdir = NULL,
2395 .mknod = NULL,
ab54b798 2396 .mkdir = lxcfs_mkdir,
758ad80c 2397 .unlink = NULL,
50d8d5b5 2398 .rmdir = lxcfs_rmdir,
758ad80c
SH
2399 .symlink = NULL,
2400 .rename = NULL,
2401 .link = NULL,
fd2e4e03 2402 .chmod = lxcfs_chmod,
341b21ad 2403 .chown = lxcfs_chown,
2ad6d2bd 2404 .truncate = lxcfs_truncate,
758ad80c 2405 .utime = NULL,
99978832
SH
2406
2407 .open = lxcfs_open,
2408 .read = lxcfs_read,
2409 .release = lxcfs_release,
2ad6d2bd 2410 .write = lxcfs_write,
99978832 2411
758ad80c 2412 .statfs = NULL,
99978832
SH
2413 .flush = lxcfs_flush,
2414 .fsync = lxcfs_fsync,
758ad80c
SH
2415
2416 .setxattr = NULL,
2417 .getxattr = NULL,
2418 .listxattr = NULL,
2419 .removexattr = NULL,
2420
2421 .opendir = lxcfs_opendir,
2422 .readdir = lxcfs_readdir,
2423 .releasedir = lxcfs_releasedir,
2424
2425 .fsyncdir = NULL,
2426 .init = NULL,
2427 .destroy = NULL,
2428 .access = NULL,
2429 .create = NULL,
2430 .ftruncate = NULL,
2431 .fgetattr = NULL,
2432};
2433
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1.  @me is the program name shown in the synopsis.
 * This function does not return.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);
	exit(1);
}
2441
/*
 * Return true when @w is exactly one of the recognised help requests:
 * "-h", "--help", "-help" or "help".  The comparison is case-sensitive.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h", "--help", "-help", "help",
	};
	size_t i;

	for (i = 0; i < sizeof(help_words) / sizeof(help_words[0]); i++) {
		if (strcmp(w, help_words[i]) == 0)
			return true;
	}

	return false;
}
2451
2452int main(int argc, char *argv[])
2453{
2454 int ret;
2455 struct lxcfs_state *d;
2456
2457 if (argc < 2 || is_help(argv[1]))
2458 usage(argv[0]);
2459
2460 d = malloc(sizeof(*d));
2461 if (!d)
2462 return -1;
2463
2464 if (!cgm_escape_cgroup())
2465 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2466
2467 if (!cgm_get_controllers(&d->subsystems))
2468 return -1;
2469
2470 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2471
2472 return ret;
2183082c 2473}