]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
Merge pull request #49 from hallyn/offset1
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
2c51f8dd 3 * Copyright © 2014,2015 Canonical, Inc
758ad80c
SH
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357
SH
25#include <sys/mount.h>
26#include <wait.h>
758ad80c 27
758ad80c 28#include "cgmanager.h"
2e9c0b32 29#include "config.h" // for VERSION
758ad80c
SH
30
/* Per-mount lxcfs state, reachable through the FUSE context's private_data. */
struct lxcfs_state {
	/*
	 * NULL-terminated array of the mounted cgroup subsystem names.
	 * It is detected once, at startup.
	 */
	char **subsystems;
};

/* The lxcfs state attached to the FUSE request currently being served. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
39
443d13f5
SH
/*
 * Values stored in file_info.type to tell what an open handle refers to.
 * LXC_TYPE_CGDIR and LXC_TYPE_CGFILE are set by cg_opendir() and cg_open();
 * the LXC_TYPE_PROC_* values presumably tag the emulated /proc files.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
49
c688e1b3
SH
/*
 * State attached to an open FUSE handle (stored in fuse_file_info.fh).
 * The string members are heap-allocated and released by
 * do_release_file_info().
 */
struct file_info {
	char *controller;	/* cgroup controller name, may be NULL */
	char *cgroup;		/* cgroup path, may be NULL */
	char *file;		/* key (file) name inside the cgroup, may be NULL */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		// unused as of yet
	int buflen;
	int size;		// actual data size
	int cached;
};
60
97f1f27b
YY
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing the buffer when needed.
 * Allocation is retried until it succeeds, so this never fails.
 *
 * src: pointer to the char* to append to; *src may be NULL, in which case
 *      a fresh buffer is allocated (and *sz is expected to be 0).
 * sz:  number of characters written so far, not counting the trailing \0.
 * asz: number of bytes currently allocated for *src.
 * pid: the pid to append.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];
	char *d = *src;
	size_t tmplen;

	snprintf(tmp, sizeof(tmp), "%d\n", (int)pid);
	tmplen = strlen(tmp);

	/*
	 * Compare the *values* of sz and asz (the original compared the
	 * pointers themselves, so the buffer might never grow).
	 */
	if (!d || tmplen + *sz + 1 >= *asz) {
		size_t newsize = (d ? *asz : 0) + BUF_RESERVE_SIZE;
		char *grown;

		/*
		 * Keep the old pointer until realloc succeeds: a failed
		 * realloc leaves the original block (and its data) intact.
		 */
		do {
			grown = realloc(d, newsize);
		} while (!grown);
		d = grown;
		*src = d;
		*asz = newsize;
	}

	memcpy(d + *sz, tmp, tmplen);
	*sz += tmplen;
	d[*sz] = '\0';
}
95
/*
 * Return a heap copy of str, or NULL when str is NULL.
 * Allocation is retried until it succeeds.  parent is not used.
 * The caller owns the returned string.
 */
static char *must_copy_string(void *parent, const char *str)
{
	size_t bytes;
	char *copy;

	(void)parent;

	if (str == NULL)
		return NULL;

	bytes = strlen(str) + 1;
	for (;;) {
		copy = malloc(bytes);
		if (copy != NULL)
			break;
	}
	memcpy(copy, str, bytes);
	return copy;
}
107
a05660a6
SH
/*
 * Reap child pid, restarting waitpid() when it is interrupted by a signal.
 * Returns 0 only if the child exited normally with status 0, -1 otherwise
 * (including when waitpid() itself fails).
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		int reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) ? 0 : -1;
}
125
053a659d
SH
/*
 * Translate in_id, an id valid in the caller's namespace, into the namespace
 * described by idfile, an open FILE * on /proc/pid/{u,g}id_map.
 *
 * The file is rewound and scanned line by line; lines that do not hold three
 * unsigned numbers are skipped.  Returns the mapped id, or -1 (as unsigned)
 * when no range contains in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];
	unsigned int ns_base,	// first id of the range in idfile's namespace
		     host_base,	// first id of the range in the caller's namespace
		     range_len;	// number of ids in the range

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &range_len) != 3)
			continue;

		/* a wrapping range is unexpected in a proc file, so bail */
		if (host_base + range_len < host_base || ns_base + range_len < ns_base) {
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range_len, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range_len)
			continue;

		/*
		 * host_base <= in_id < host_base + range_len and neither
		 * range wraps, so this sum cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no answer found
	return -1;
}
169
341b21ad
SH
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* size of the buffers used to build /proc/<pid>/... paths */
#define PROCLEN 100

/*
 * Decide whether uid, as seen in the user namespace of process pid, has
 * privilege over victim.
 *
 * With req_ns_root == NS_ROOT_OPT an identical uid is sufficient.  Otherwise
 * uid must map to root (0) in pid's user namespace and victim must be mapped
 * into that namespace at all.  Any failure to read the uid_map yields false.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mapfile[PROCLEN];
	bool privileged;
	FILE *map;
	int n;

	if (victim == -1 || uid == -1)
		return false;

	/* same uid is enough when namespace root is not required */
	if (!req_ns_root && uid == victim)
		return true;

	n = snprintf(mapfile, PROCLEN, "/proc/%d/uid_map", pid);
	if (n < 0 || n >= PROCLEN)
		return false;

	map = fopen(mapfile, "r");
	if (map == NULL)
		return false;

	/*
	 * Caller must be root in his namespace, and the victim must be
	 * mapped there.  The second lookup only runs if the first passes.
	 */
	privileged = convert_id_to_ns(map, uid) == 0 &&
		     convert_id_to_ns(map, victim) != -1;

	fclose(map);
	return privileged;
}
225
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the open(2)-style flags in req_mode.  Callers shift the
 * mode right so that the bits of interest land in the "other" position.
 * An unrecognized access mode is never granted.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
245
3db25a35
SH
/*
 * Return a heap copy of the first path component of taskcg that follows
 * querycg (querycg "/" means: the first component of taskcg).
 * Returns NULL if taskcg is not longer than querycg or on allocation
 * failure.  The caller frees the result.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	const char *rest;
	size_t skip, complen;
	char *component;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* step over querycg and the '/' separating it from what follows */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	rest = taskcg + skip;

	complen = strcspn(rest, "/");
	component = malloc(complen + 1);
	if (component == NULL)
		return NULL;
	memcpy(component, rest, complen);
	component[complen] = '\0';
	return component;
}
266
2c51f8dd
SH
/* Remove a single trailing '\n' from x, in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
273
274static char *get_pid_cgroup(pid_t pid, const char *contrl)
275{
276 char fnam[PROCLEN];
277 FILE *f;
278 char *answer = NULL;
279 char *line = NULL;
280 size_t len = 0;
281 int ret;
282
283 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
284 if (ret < 0 || ret >= PROCLEN)
285 return NULL;
286 if (!(f = fopen(fnam, "r")))
287 return NULL;
288
289 while (getline(&line, &len, f) != -1) {
290 char *c1, *c2;
291 if (!line[0])
292 continue;
293 c1 = strchr(line, ':');
294 if (!c1)
295 goto out;
296 c1++;
297 c2 = strchr(c1, ':');
298 if (!c2)
299 goto out;
300 *c2 = '\0';
301 if (strcmp(c1, contrl) != 0)
302 continue;
303 c2++;
304 stripnewline(c2);
305 do {
306 answer = strdup(c2);
307 } while (!answer);
308 break;
309 }
310
311out:
312 fclose(f);
313 free(line);
314 return answer;
315}
316
758ad80c
SH
317/*
318 * check whether a fuse context may access a cgroup dir or file
319 *
320 * If file is not null, it is a cgroup file to check under cg.
321 * If file is null, then we are checking perms on cg itself.
322 *
323 * For files we can check the mode of the list_keys result.
324 * For cgroups, we must make assumptions based on the files under the
325 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
326 * yet.
327 */
328static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
329{
2c51f8dd
SH
330 struct cgm_keys **list = NULL;
331 bool ret = false;
758ad80c
SH
332 int i;
333
334 if (!file)
335 file = "tasks";
336
337 if (*file == '/')
338 file++;
339
340 if (!cgm_list_keys(contrl, cg, &list))
341 return false;
342 for (i = 0; list[i]; i++) {
343 if (strcmp(list[i]->name, file) == 0) {
344 struct cgm_keys *k = list[i];
341b21ad 345 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2c51f8dd
SH
346 if (perms_include(k->mode >> 6, mode)) {
347 ret = true;
348 goto out;
349 }
758ad80c
SH
350 }
351 if (fc->gid == k->gid) {
2c51f8dd
SH
352 if (perms_include(k->mode >> 3, mode)) {
353 ret = true;
354 goto out;
355 }
758ad80c 356 }
2c51f8dd
SH
357 ret = perms_include(k->mode, mode);
358 goto out;
758ad80c
SH
359 }
360 }
361
2c51f8dd
SH
362out:
363 free_keys(list);
364 return ret;
3db25a35
SH
365}
366
04b5cbdc
SH
#define INITSCOPE "/init.scope"
/*
 * If the cgroup path cg ends in "/init.scope", cut that suffix off in place.
 * A path that consists only of "/init.scope" becomes "/".  Paths without
 * the suffix, including ones shorter than it, are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths before doing pointer arithmetic: forming a pointer
	 * before the start of cg would be undefined behaviour.
	 */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		point[1] = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
381
3db25a35
SH
/*
 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If caller is in /a, he may act on /a/b, but not on /b.
 *
 * pid:    the calling process, whose cgroup for contrl is looked up with
 *         get_pid_cgroup() (a trailing "/init.scope" is pruned).
 * cg:     the cgroup being queried; callers pass "/" for the root cgroup,
 *         otherwise a cgroup without leading '/'.
 * nextcg: if the answer is false and nextcg is not NULL, *nextcg is set to
 *         the result of get_next_cgroup_dir() - the next cgroup directory
 *         under cg on the way to the caller's cgroup, or NULL - which must
 *         be freed by the caller.  It is not written in any other case.
 *
 * Returns false as well when the caller's cgroup cannot be determined.
 *
 * NOTE(review): the comparison is a plain prefix match with no check that
 * the match ends on a '/' boundary - confirm whether a caller in "lxc/c1"
 * is meant to pass for a query on "lxc/c10".
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2, *linecmp;

	/* shared parser for /proc/<pid>/cgroup; result is ours to free */
	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* skip the leading '/' of the caller's cgroup unless cg has one too */
	linecmp = *cg == '/' ? c2 : c2 + 1;
	if (strncmp(linecmp, cg, strlen(linecmp)) == 0)
		answer = true;
	else if (nextcg)
		*nextcg = get_next_cgroup_dir(linecmp, cg);

	free(c2);
	return answer;
}
440
758ad80c 441/*
2c51f8dd
SH
442 * given /cgroup/freezer/a/b, return "freezer".
443 * the returned char* should NOT be freed.
758ad80c
SH
444 */
445static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
446{
447 const char *p1;
2c51f8dd 448 char *contr, *slash;
758ad80c
SH
449
450 if (strlen(path) < 9)
451 return NULL;
ac5d9d48
SH
452 if (*(path+7) != '/')
453 return NULL;
758ad80c 454 p1 = path+8;
2c51f8dd
SH
455 contr = strdupa(p1);
456 if (!contr)
457 return NULL;
458 slash = strstr(contr, "/");
758ad80c
SH
459 if (slash)
460 *slash = '\0';
461
462 /* verify that it is a subsystem */
463 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
464 int i;
2c51f8dd 465 if (!list)
758ad80c 466 return NULL;
758ad80c 467 for (i = 0; list[i]; i++) {
2c51f8dd
SH
468 if (strcmp(list[i], contr) == 0)
469 return list[i];
758ad80c 470 }
758ad80c
SH
471 return NULL;
472}
473
/*
 * Locate the cgroup part of /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into path just past the '/' that ends the controller
 * name, or NULL when the path is shorter than 9 characters or has nothing
 * after the controller.  The result may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
489
/*
 * Tell whether f (a leading '/' is ignored) is the name of a child cgroup
 * of dir under controller contr, according to cgm_list_children().
 * Returns false when f is NULL or the children cannot be listed.
 */
static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
{
	char **children;
	bool found = false;
	int n;

	if (f == NULL)
		return false;
	if (*f == '/')
		f++;

	if (!cgm_list_children(contr, dir, &children))
		return false;

	/* search and release the list in a single pass */
	for (n = 0; children[n]; n++) {
		if (!found && strcmp(children[n], f) == 0)
			found = true;
		free(children[n]);
	}
	free(children);

	return found;
}
516
517static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
518{
2c51f8dd
SH
519 struct cgm_keys **list = NULL;
520 struct cgm_keys *k = NULL;
758ad80c
SH
521 int i;
522
523 if (!f)
524 return NULL;
525 if (*f == '/')
526 f++;
527 if (!cgm_list_keys(contr, dir, &list))
528 return NULL;
529 for (i = 0; list[i]; i++) {
530 if (strcmp(list[i]->name, f) == 0) {
2c51f8dd
SH
531 int j;
532 // free all the keys we are not returning
533 k = list[i];
534 for (j = 0; list[j]; j++) {
535 if (i != j)
103f104c 536 free_key(list[j]);
2c51f8dd
SH
537 }
538 free(list);
758ad80c
SH
539 return k;
540 }
541 }
542
2c51f8dd 543 free_keys(list);
758ad80c
SH
544 return NULL;
545}
546
2c51f8dd
SH
/*
 * Split cg at its last '/'.
 *
 * *dir receives a heap copy of cg truncated at the last '/' (the whole
 * string if there is none); allocation is retried until it succeeds.
 * *file receives a pointer into cg at that last '/', or NULL if cg contains
 * no '/'.
 *
 * dir should be freed, file not
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	size_t bytes = strlen(cg) + 1;
	char *copy;

	for (;;) {
		copy = malloc(bytes);
		if (copy != NULL)
			break;
	}
	memcpy(copy, cg, bytes);
	*dir = copy;

	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	/* cut the copy at the same position as the last '/' in cg */
	copy[*file - cg] = '\0';
}
565
566/*
2ad6d2bd 567 * FUSE ops for /cgroup
758ad80c 568 */
2ad6d2bd 569
758ad80c
SH
570static int cg_getattr(const char *path, struct stat *sb)
571{
572 struct timespec now;
573 struct fuse_context *fc = fuse_get_context();
2c51f8dd 574 char * cgdir = NULL;
758ad80c 575 char *fpath = NULL, *path1, *path2;
2c51f8dd 576 struct cgm_keys *k = NULL;
758ad80c 577 const char *cgroup;
2c51f8dd
SH
578 const char *controller = NULL;
579 int ret = -ENOENT;
758ad80c
SH
580
581
582 if (!fc)
583 return -EIO;
584
585 memset(sb, 0, sizeof(struct stat));
586
587 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
588 return -EINVAL;
589
590 sb->st_uid = sb->st_gid = 0;
591 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
592 sb->st_size = 0;
593
594 if (strcmp(path, "/cgroup") == 0) {
595 sb->st_mode = S_IFDIR | 00755;
596 sb->st_nlink = 2;
597 return 0;
598 }
599
600 controller = pick_controller_from_path(fc, path);
601 if (!controller)
602 return -EIO;
758ad80c
SH
603 cgroup = find_cgroup_in_path(path);
604 if (!cgroup) {
605 /* this is just /cgroup/controller, return it as a dir */
606 sb->st_mode = S_IFDIR | 00755;
607 sb->st_nlink = 2;
608 return 0;
609 }
341b21ad 610
758ad80c
SH
611 get_cgdir_and_path(cgroup, &cgdir, &fpath);
612
613 if (!fpath) {
614 path1 = "/";
615 path2 = cgdir;
616 } else {
617 path1 = cgdir;
618 path2 = fpath;
619 }
620
758ad80c
SH
621 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
622 * Then check that caller's cgroup is under path if fpath is a child
623 * cgroup, or cgdir if fpath is a file */
624
625 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
626 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
627 /* this is just /cgroup/controller, return it as a dir */
628 sb->st_mode = S_IFDIR | 00555;
629 sb->st_nlink = 2;
2c51f8dd
SH
630 ret = 0;
631 goto out;
632 }
633 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
634 ret = -EACCES;
635 goto out;
f9a05025 636 }
758ad80c 637
053a659d
SH
638 // get uid, gid, from '/tasks' file and make up a mode
639 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
640 sb->st_mode = S_IFDIR | 00755;
641 k = get_cgroup_key(controller, cgroup, "tasks");
642 if (!k) {
053a659d
SH
643 sb->st_uid = sb->st_gid = 0;
644 } else {
053a659d
SH
645 sb->st_uid = k->uid;
646 sb->st_gid = k->gid;
647 }
2c51f8dd 648 free_key(k);
758ad80c 649 sb->st_nlink = 2;
2c51f8dd
SH
650 ret = 0;
651 goto out;
758ad80c
SH
652 }
653
654 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
758ad80c 655 sb->st_mode = S_IFREG | k->mode;
053a659d 656 sb->st_nlink = 1;
758ad80c
SH
657 sb->st_uid = k->uid;
658 sb->st_gid = k->gid;
7253e0a4 659 sb->st_size = 0;
2c51f8dd 660 free_key(k);
adc3867b
SH
661 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
662 ret = -ENOENT;
663 goto out;
664 }
665 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
666 ret = -EACCES;
667 goto out;
668 }
2c51f8dd
SH
669
670 ret = 0;
758ad80c
SH
671 }
672
2c51f8dd
SH
673out:
674 free(cgdir);
675 return ret;
758ad80c 676}
2183082c 677
758ad80c 678static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 679{
7f163b71 680 struct fuse_context *fc = fuse_get_context();
7f163b71 681 const char *cgroup;
c688e1b3 682 struct file_info *dir_info;
2c51f8dd 683 char *controller = NULL;
7f163b71
SH
684
685 if (!fc)
686 return -EIO;
687
c688e1b3
SH
688 if (strcmp(path, "/cgroup") == 0) {
689 cgroup = NULL;
690 controller = NULL;
691 } else {
692 // return list of keys for the controller, and list of child cgroups
693 controller = pick_controller_from_path(fc, path);
694 if (!controller)
695 return -EIO;
7f163b71 696
c688e1b3
SH
697 cgroup = find_cgroup_in_path(path);
698 if (!cgroup) {
699 /* this is just /cgroup/controller, return its contents */
700 cgroup = "/";
701 }
7f163b71
SH
702 }
703
2c51f8dd 704 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
7f163b71 705 return -EACCES;
2c51f8dd 706 }
c688e1b3
SH
707
708 /* we'll free this at cg_releasedir */
2c51f8dd
SH
709 dir_info = malloc(sizeof(*dir_info));
710 if (!dir_info)
711 return -ENOMEM;
bae07053
SH
712 dir_info->controller = must_copy_string(dir_info, controller);
713 dir_info->cgroup = must_copy_string(dir_info, cgroup);
443d13f5 714 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 715 dir_info->buf = NULL;
8f6e8f5e 716 dir_info->file = NULL;
c688e1b3
SH
717 dir_info->buflen = 0;
718
719 fi->fh = (unsigned long)dir_info;
758ad80c
SH
720 return 0;
721}
722
758ad80c
SH
723static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
724 struct fuse_file_info *fi)
725{
c688e1b3 726 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
727 struct cgm_keys **list = NULL;
728 int i, ret;
729 char *nextcg = NULL;
758ad80c 730 struct fuse_context *fc = fuse_get_context();
2c51f8dd 731 char **clist = NULL;
758ad80c 732
443d13f5 733 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
734 fprintf(stderr, "Internal error: file cache info used in readdir\n");
735 return -EIO;
736 }
c688e1b3
SH
737 if (!d->cgroup && !d->controller) {
738 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
739 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
740 int i;
741
742 if (!list)
743 return -EIO;
7f163b71 744
758ad80c
SH
745 for (i = 0; list[i]; i++) {
746 if (filler(buf, list[i], NULL, 0) != 0) {
747 return -EIO;
748 }
749 }
750 return 0;
751 }
752
2c51f8dd 753 if (!cgm_list_keys(d->controller, d->cgroup, &list)) {
3db25a35 754 // not a valid cgroup
2c51f8dd
SH
755 ret = -EINVAL;
756 goto out;
757 }
3db25a35 758
c688e1b3 759 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
760 if (nextcg) {
761 int ret;
762 ret = filler(buf, nextcg, NULL, 0);
2c51f8dd
SH
763 free(nextcg);
764 if (ret != 0) {
765 ret = -EIO;
766 goto out;
767 }
3db25a35 768 }
2c51f8dd
SH
769 ret = 0;
770 goto out;
3db25a35
SH
771 }
772
758ad80c 773 for (i = 0; list[i]; i++) {
758ad80c 774 if (filler(buf, list[i]->name, NULL, 0) != 0) {
2c51f8dd
SH
775 ret = -EIO;
776 goto out;
758ad80c
SH
777 }
778 }
779
780 // now get the list of child cgroups
758ad80c 781
2c51f8dd
SH
782 if (!cgm_list_children(d->controller, d->cgroup, &clist)) {
783 ret = 0;
784 goto out;
785 }
758ad80c 786 for (i = 0; clist[i]; i++) {
758ad80c 787 if (filler(buf, clist[i], NULL, 0) != 0) {
2c51f8dd
SH
788 ret = -EIO;
789 goto out;
758ad80c
SH
790 }
791 }
2c51f8dd
SH
792 ret = 0;
793
794out:
795 free_keys(list);
796 if (clist) {
797 for (i = 0; clist[i]; i++)
798 free(clist[i]);
799 free(clist);
800 }
801 return ret;
758ad80c
SH
802}
803
8f6e8f5e
SH
804static void do_release_file_info(struct file_info *f)
805{
2c51f8dd
SH
806 if (!f)
807 return;
808 free(f->controller);
809 free(f->cgroup);
810 free(f->file);
811 free(f->buf);
812 free(f);
8f6e8f5e
SH
813}
814
758ad80c
SH
815static int cg_releasedir(const char *path, struct fuse_file_info *fi)
816{
c688e1b3
SH
817 struct file_info *d = (struct file_info *)fi->fh;
818
8f6e8f5e 819 do_release_file_info(d);
758ad80c
SH
820 return 0;
821}
822
99978832
SH
823static int cg_open(const char *path, struct fuse_file_info *fi)
824{
99978832 825 const char *cgroup;
2c51f8dd
SH
826 char *fpath = NULL, *path1, *path2, * cgdir = NULL, *controller;
827 struct cgm_keys *k = NULL;
8f6e8f5e 828 struct file_info *file_info;
99978832 829 struct fuse_context *fc = fuse_get_context();
2c51f8dd 830 int ret;
99978832
SH
831
832 if (!fc)
833 return -EIO;
834
835 controller = pick_controller_from_path(fc, path);
836 if (!controller)
837 return -EIO;
838 cgroup = find_cgroup_in_path(path);
839 if (!cgroup)
840 return -EINVAL;
841
842 get_cgdir_and_path(cgroup, &cgdir, &fpath);
843 if (!fpath) {
844 path1 = "/";
845 path2 = cgdir;
846 } else {
847 path1 = cgdir;
848 path2 = fpath;
849 }
850
8f6e8f5e 851 k = get_cgroup_key(controller, path1, path2);
2c51f8dd
SH
852 if (!k) {
853 ret = -EINVAL;
854 goto out;
855 }
856 free_key(k);
99978832 857
2c51f8dd 858 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
8f6e8f5e 859 // should never get here
2c51f8dd
SH
860 ret = -EACCES;
861 goto out;
862 }
99978832 863
8f6e8f5e 864 /* we'll free this at cg_release */
2c51f8dd
SH
865 file_info = malloc(sizeof(*file_info));
866 if (!file_info) {
867 ret = -ENOMEM;
868 goto out;
869 }
bae07053
SH
870 file_info->controller = must_copy_string(file_info, controller);
871 file_info->cgroup = must_copy_string(file_info, path1);
872 file_info->file = must_copy_string(file_info, path2);
443d13f5 873 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
874 file_info->buf = NULL;
875 file_info->buflen = 0;
876
877 fi->fh = (unsigned long)file_info;
2c51f8dd
SH
878 ret = 0;
879
880out:
881 free(cgdir);
882 return ret;
8f6e8f5e
SH
883}
884
885static int cg_release(const char *path, struct fuse_file_info *fi)
886{
887 struct file_info *f = (struct file_info *)fi->fh;
888
889 do_release_file_info(f);
890 return 0;
99978832
SH
891}
892
a05660a6
SH
/*
 * Receive up to len bytes from sockfd into buf, waiting at most 2 seconds
 * for data to arrive.
 * Returns -1 if select() fails or times out, otherwise whatever the
 * non-blocking recv() returns.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
	if (ready <= 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
907
01e71852
SH
908#define SEND_CREDS_OK 0
909#define SEND_CREDS_NOTSK 1
910#define SEND_CREDS_FAIL 2
911static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
912{
913 struct msghdr msg = { 0 };
914 struct iovec iov;
915 struct cmsghdr *cmsg;
916 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
917 char buf[1];
918 buf[0] = 'p';
919
01e71852
SH
920 if (pingfirst) {
921 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 922 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
923 __func__);
924 return SEND_CREDS_FAIL;
925 }
a05660a6
SH
926 }
927
928 msg.msg_control = cmsgbuf;
929 msg.msg_controllen = sizeof(cmsgbuf);
930
931 cmsg = CMSG_FIRSTHDR(&msg);
932 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
933 cmsg->cmsg_level = SOL_SOCKET;
934 cmsg->cmsg_type = SCM_CREDENTIALS;
935 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
936
937 msg.msg_name = NULL;
938 msg.msg_namelen = 0;
939
940 buf[0] = v;
941 iov.iov_base = buf;
942 iov.iov_len = sizeof(buf);
943 msg.msg_iov = &iov;
944 msg.msg_iovlen = 1;
945
946 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 947 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
948 strerror(errno));
949 if (errno == 3)
01e71852
SH
950 return SEND_CREDS_NOTSK;
951 return SEND_CREDS_FAIL;
a05660a6
SH
952 }
953
01e71852 954 return SEND_CREDS_OK;
a05660a6
SH
955}
956
957static bool recv_creds(int sock, struct ucred *cred, char *v)
958{
959 struct msghdr msg = { 0 };
960 struct iovec iov;
961 struct cmsghdr *cmsg;
962 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
963 char buf[1];
964 int ret;
965 int optval = 1;
6ee867dc
SH
966 struct timeval tv;
967 fd_set rfds;
a05660a6
SH
968
969 *v = '1';
970
971 cred->pid = -1;
972 cred->uid = -1;
973 cred->gid = -1;
974
975 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 976 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
977 return false;
978 }
979 buf[0] = '1';
980 if (write(sock, buf, 1) != 1) {
1420baf8 981 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
982 return false;
983 }
984
985 msg.msg_name = NULL;
986 msg.msg_namelen = 0;
987 msg.msg_control = cmsgbuf;
988 msg.msg_controllen = sizeof(cmsgbuf);
989
990 iov.iov_base = buf;
991 iov.iov_len = sizeof(buf);
992 msg.msg_iov = &iov;
993 msg.msg_iovlen = 1;
994
6ee867dc
SH
995 FD_ZERO(&rfds);
996 FD_SET(sock, &rfds);
997 tv.tv_sec = 2;
998 tv.tv_usec = 0;
ea56f722 999 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
1000 fprintf(stderr, "Failed to select for scm_cred: %s\n",
1001 strerror(errno));
1002 return false;
1003 }
1004 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 1005 if (ret < 0) {
1420baf8 1006 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
1007 strerror(errno));
1008 return false;
1009 }
1010
1011 cmsg = CMSG_FIRSTHDR(&msg);
1012
1013 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
1014 cmsg->cmsg_level == SOL_SOCKET &&
1015 cmsg->cmsg_type == SCM_CREDENTIALS) {
1016 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
1017 }
1018 *v = buf[0];
1019
1020 return true;
1021}
1022
1023
1024/*
4775fba1
SH
1025 * pid_to_ns - reads pids from a ucred over a socket, then writes the
1026 * int value back over the socket. This shifts the pid from the
1027 * sender's pidns into tpid's pidns.
a05660a6 1028 */
4775fba1 1029static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
1030{
1031 char v = '0';
1032 struct ucred cred;
1033
1034 while (recv_creds(sock, &cred, &v)) {
1035 if (v == '1')
67bd113f 1036 _exit(0);
a05660a6 1037 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
67bd113f 1038 _exit(1);
a05660a6 1039 }
67bd113f 1040 _exit(0);
a05660a6
SH
1041}
1042
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The forked child acknowledges its start with a '1' byte on a pipe; if no
 * ack arrives within one second the child is killed and a new one forked.
 *
 * Never returns.  Exits 1 on setup failure; otherwise exits 0 if the
 * converting child terminated successfully and 1 if it did not.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			/* does not return */
			pid_to_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no (valid) ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1109
1110/*
1111 * To read cgroup files with a particular pid, we will setns into the child
1112 * pidns, open a pipe, fork a child - which will be the first to really be in
1113 * the child ns - which does the cgm_get_value and writes the data to the pipe.
1114 */
1115static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1116{
1117 int sock[2] = {-1, -1};
2c51f8dd 1118 char *tmpdata = NULL;
a05660a6
SH
1119 int ret;
1120 pid_t qpid, cpid = -1;
1121 bool answer = false;
1122 char v = '0';
1123 struct ucred cred;
1124 struct timeval tv;
2c51f8dd 1125 size_t sz = 0, asz = 0;
a05660a6
SH
1126 fd_set s;
1127
1128 if (!cgm_get_value(contrl, cg, file, &tmpdata))
1129 return false;
1130
1131 /*
1132 * Now we read the pids from returned data one by one, pass
1133 * them into a child in the target namespace, read back the
1134 * translated pids, and put them into our to-return data
1135 */
1136
1137 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1138 perror("socketpair");
2c51f8dd
SH
1139 free(tmpdata);
1140 return false;
a05660a6
SH
1141 }
1142
1143 cpid = fork();
1144 if (cpid == -1)
1145 goto out;
1146
1147 if (!cpid) // child
4775fba1 1148 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
1149
1150 char *ptr = tmpdata;
1151 cred.uid = 0;
1152 cred.gid = 0;
1153 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1154 cred.pid = qpid;
01e71852
SH
1155 ret = send_creds(sock[0], &cred, v, true);
1156
1157 if (ret == SEND_CREDS_NOTSK)
1158 goto next;
1159 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
1160 goto out;
1161
1162 // read converted results
1163 FD_ZERO(&s);
1164 FD_SET(sock[0], &s);
6ee867dc 1165 tv.tv_sec = 2;
a05660a6
SH
1166 tv.tv_usec = 0;
1167 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1168 if (ret <= 0) {
6ee867dc
SH
1169 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1170 __func__, strerror(errno));
a05660a6
SH
1171 goto out;
1172 }
1173 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1174 fprintf(stderr, "%s: error reading pid from child: %s\n",
1175 __func__, strerror(errno));
a05660a6
SH
1176 goto out;
1177 }
2c51f8dd 1178 must_strcat_pid(d, &sz, &asz, qpid);
01e71852 1179next:
a05660a6
SH
1180 ptr = strchr(ptr, '\n');
1181 if (!ptr)
1182 break;
1183 ptr++;
1184 }
1185
1186 cred.pid = getpid();
1187 v = '1';
01e71852 1188 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1189 // failed to ask child to exit
6ee867dc
SH
1190 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1191 __func__, strerror(errno));
a05660a6
SH
1192 goto out;
1193 }
1194
1195 answer = true;
1196
1197out:
2c51f8dd 1198 free(tmpdata);
a05660a6
SH
1199 if (cpid != -1)
1200 wait_for_pid(cpid);
1201 if (sock[0] != -1) {
1202 close(sock[0]);
1203 close(sock[1]);
1204 }
1205 return answer;
1206}
1207
99978832
SH
1208static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1209 struct fuse_file_info *fi)
1210{
99978832 1211 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1212 struct file_info *f = (struct file_info *)fi->fh;
2c51f8dd
SH
1213 struct cgm_keys *k = NULL;
1214 char *data = NULL;
1215 int ret, s;
1216 bool r;
99978832 1217
443d13f5 1218 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1219 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1220 return -EIO;
1221 }
1222
99978832 1223 if (offset)
7253e0a4 1224 return 0;
99978832
SH
1225
1226 if (!fc)
1227 return -EIO;
1228
8f6e8f5e 1229 if (!f->controller)
99978832
SH
1230 return -EINVAL;
1231
2c51f8dd
SH
1232 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1233 return -EINVAL;
1234 }
1235 free_key(k);
99978832 1236
99978832 1237
2c51f8dd
SH
1238 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1239 ret = -EACCES;
1240 goto out;
1241 }
a05660a6 1242
2c51f8dd
SH
1243 if (strcmp(f->file, "tasks") == 0 ||
1244 strcmp(f->file, "/tasks") == 0 ||
1245 strcmp(f->file, "/cgroup.procs") == 0 ||
1246 strcmp(f->file, "cgroup.procs") == 0)
1247 // special case - we have to translate the pids
1248 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1249 else
1250 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
99978832 1251
2c51f8dd
SH
1252 if (!r) {
1253 ret = -EINVAL;
1254 goto out;
1255 }
99978832 1256
2c51f8dd
SH
1257 if (!data) {
1258 ret = 0;
1259 goto out;
99978832 1260 }
2c51f8dd
SH
1261 s = strlen(data);
1262 if (s > size)
1263 s = size;
1264 memcpy(buf, data, s);
1265 if (s > 0 && s < size && data[s-1] != '\n')
1266 buf[s++] = '\n';
99978832 1267
2c51f8dd
SH
1268 ret = s;
1269
1270out:
1271 free(data);
1272 return ret;
99978832
SH
1273}
1274
4775fba1
SH
1275static void pid_from_ns(int sock, pid_t tpid)
1276{
1277 pid_t vpid;
1278 struct ucred cred;
1279 char v;
6ee867dc
SH
1280 struct timeval tv;
1281 fd_set s;
1282 int ret;
4775fba1
SH
1283
1284 cred.uid = 0;
1285 cred.gid = 0;
6ee867dc
SH
1286 while (1) {
1287 FD_ZERO(&s);
1288 FD_SET(sock, &s);
1289 tv.tv_sec = 2;
1290 tv.tv_usec = 0;
1291 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1292 if (ret <= 0) {
1293 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc 1294 __func__, strerror(errno));
67bd113f 1295 _exit(1);
6ee867dc
SH
1296 }
1297 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1298 fprintf(stderr, "%s: bad read from parent: %s\n",
1299 __func__, strerror(errno));
67bd113f 1300 _exit(1);
6ee867dc 1301 }
4775fba1 1302 if (vpid == -1) // done
01e71852 1303 break;
4775fba1
SH
1304 v = '0';
1305 cred.pid = vpid;
01e71852 1306 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1307 v = '1';
1308 cred.pid = getpid();
01e71852 1309 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
67bd113f 1310 _exit(1);
4775fba1
SH
1311 }
1312 }
67bd113f 1313 _exit(0);
4775fba1
SH
1314}
1315
/*
 * Enter the pid namespace of tpid and run pid_from_ns() in a forked child.
 *
 * After setns() a child is forked; it acknowledges its start by writing '1'
 * to a pipe and then runs pid_from_ns().  If the acknowledgement does not
 * arrive within one second the child is killed and a new one is forked.
 * This function never returns: it exits with 0 when the child finished
 * successfully and with 1 on any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int nsfd, rc, ackpipe[2];

	rc = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (rc < 0 || rc >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		struct timeval timeout;
		fd_set rfds;
		char ack;
		pid_t child = fork();

		if (child < 0)
			_exit(1);

		if (child == 0) {
			char token = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &token, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			pid_from_ns(sock, tpid);
		}

		/* give the child 1 second to be done forking and write its ack */
		FD_ZERO(&rfds);
		FD_SET(ackpipe[0], &rfds);
		timeout.tv_sec = 1;
		timeout.tv_usec = 0;

		if (select(ackpipe[0] + 1, &rfds, NULL, NULL, &timeout) > 0 &&
		    read(ackpipe[0], &ack, 1) == sizeof(char) && ack == '1') {
			if (!wait_for_pid(child))
				_exit(1);
			_exit(0);
		}

		/* no valid ack: get rid of this child and start over */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}
}
1378
/*
 * Move the pids listed in buf into cgroup cg of controller contrl.
 *
 * buf holds decimal pids, one per line, as seen by the writer tpid.  Each
 * pid is written to a helper (pid_from_ns_wrapper) over a socketpair; the
 * helper answers with a credentials message, and the pid carried in that
 * message is handed to cgm_move_pid() when the answer is tagged '0'.
 *
 * Returns true if no cgm_move_pid() call failed, false otherwise or when
 * the helper could not be set up.  A socketpair() failure is reported to
 * the caller as false instead of terminating the whole process, matching
 * how do_read_pids() treats the same error.
 */
static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	bool answer = false, fail = false;
	const char *ptr;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child
		pid_from_ns_wrapper(sock[1], tpid);

	ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		/* only an answer tagged '0' carries a pid to move */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!cgm_move_pid(contrl, cg, cred.pid))
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value (-1 asks the helper to exit) */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
1442
2ad6d2bd
SH
1443int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1444 struct fuse_file_info *fi)
1445{
2ad6d2bd 1446 struct fuse_context *fc = fuse_get_context();
2c51f8dd
SH
1447 char *localbuf = NULL;
1448 struct cgm_keys *k = NULL;
8f6e8f5e 1449 struct file_info *f = (struct file_info *)fi->fh;
2c51f8dd 1450 bool r;
2ad6d2bd 1451
443d13f5 1452 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1453 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1454 return -EIO;
1455 }
1456
2ad6d2bd 1457 if (offset)
7253e0a4 1458 return 0;
2ad6d2bd
SH
1459
1460 if (!fc)
1461 return -EIO;
1462
2c51f8dd 1463 localbuf = alloca(size+1);
47cbf0e5
SH
1464 localbuf[size] = '\0';
1465 memcpy(localbuf, buf, size);
2ad6d2bd 1466
2c51f8dd
SH
1467 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1468 size = -EINVAL;
1469 goto out;
1470 }
2ad6d2bd 1471
2c51f8dd
SH
1472 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1473 size = -EACCES;
1474 goto out;
1475 }
4775fba1 1476
2c51f8dd
SH
1477 if (strcmp(f->file, "tasks") == 0 ||
1478 strcmp(f->file, "/tasks") == 0 ||
1479 strcmp(f->file, "/cgroup.procs") == 0 ||
1480 strcmp(f->file, "cgroup.procs") == 0)
1481 // special case - we have to translate the pids
1482 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1483 else
1484 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
2ad6d2bd 1485
2c51f8dd
SH
1486 if (!r)
1487 size = -EINVAL;
2ad6d2bd 1488
2c51f8dd
SH
1489out:
1490 free_key(k);
1491 return size;
2ad6d2bd
SH
1492}
1493
341b21ad
SH
1494int cg_chown(const char *path, uid_t uid, gid_t gid)
1495{
1496 struct fuse_context *fc = fuse_get_context();
2c51f8dd
SH
1497 char *cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1498 struct cgm_keys *k = NULL;
341b21ad 1499 const char *cgroup;
2c51f8dd 1500 int ret;
341b21ad
SH
1501
1502 if (!fc)
1503 return -EIO;
1504
1505 if (strcmp(path, "/cgroup") == 0)
1506 return -EINVAL;
1507
1508 controller = pick_controller_from_path(fc, path);
1509 if (!controller)
f9a05025 1510 return -EINVAL;
341b21ad
SH
1511 cgroup = find_cgroup_in_path(path);
1512 if (!cgroup)
1513 /* this is just /cgroup/controller */
1514 return -EINVAL;
1515
1516 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1517
1518 if (!fpath) {
1519 path1 = "/";
1520 path2 = cgdir;
1521 } else {
1522 path1 = cgdir;
1523 path2 = fpath;
1524 }
1525
1526 if (is_child_cgroup(controller, path1, path2)) {
1527 // get uid, gid, from '/tasks' file and make up a mode
1528 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1529 k = get_cgroup_key(controller, cgroup, "tasks");
1530
1531 } else
1532 k = get_cgroup_key(controller, path1, path2);
1533
2c51f8dd
SH
1534 if (!k) {
1535 ret = -EINVAL;
1536 goto out;
1537 }
341b21ad
SH
1538
1539 /*
1540 * This being a fuse request, the uid and gid must be valid
1541 * in the caller's namespace. So we can just check to make
1542 * sure that the caller is root in his uid, and privileged
1543 * over the file's current owner.
1544 */
2c51f8dd
SH
1545 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1546 ret = -EACCES;
1547 goto out;
1548 }
341b21ad 1549
2c51f8dd
SH
1550 if (!cgm_chown_file(controller, cgroup, uid, gid)) {
1551 ret = -EINVAL;
1552 goto out;
1553 }
1554
1555 ret = 0;
1556
1557out:
1558 free_key(k);
1559 free(cgdir);
1560
1561 return ret;
341b21ad 1562}
2ad6d2bd 1563
fd2e4e03
SH
1564int cg_chmod(const char *path, mode_t mode)
1565{
0a1bb5ea 1566 struct fuse_context *fc = fuse_get_context();
2c51f8dd
SH
1567 char * cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1568 struct cgm_keys *k = NULL;
0a1bb5ea 1569 const char *cgroup;
2c51f8dd 1570 int ret;
0a1bb5ea
SH
1571
1572 if (!fc)
1573 return -EIO;
1574
1575 if (strcmp(path, "/cgroup") == 0)
1576 return -EINVAL;
1577
1578 controller = pick_controller_from_path(fc, path);
1579 if (!controller)
f9a05025 1580 return -EINVAL;
0a1bb5ea
SH
1581 cgroup = find_cgroup_in_path(path);
1582 if (!cgroup)
1583 /* this is just /cgroup/controller */
1584 return -EINVAL;
1585
1586 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1587
1588 if (!fpath) {
1589 path1 = "/";
1590 path2 = cgdir;
1591 } else {
1592 path1 = cgdir;
1593 path2 = fpath;
1594 }
1595
1596 if (is_child_cgroup(controller, path1, path2)) {
1597 // get uid, gid, from '/tasks' file and make up a mode
1598 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1599 k = get_cgroup_key(controller, cgroup, "tasks");
1600
1601 } else
1602 k = get_cgroup_key(controller, path1, path2);
1603
2c51f8dd
SH
1604 if (!k) {
1605 ret = -EINVAL;
1606 goto out;
1607 }
0a1bb5ea
SH
1608
1609 /*
1610 * This being a fuse request, the uid and gid must be valid
1611 * in the caller's namespace. So we can just check to make
1612 * sure that the caller is root in his uid, and privileged
1613 * over the file's current owner.
1614 */
2c51f8dd
SH
1615 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1616 ret = -EPERM;
1617 goto out;
1618 }
0a1bb5ea 1619
2c51f8dd
SH
1620 if (!cgm_chmod_file(controller, cgroup, mode)) {
1621 ret = -EINVAL;
1622 goto out;
1623 }
1624
1625 ret = 0;
1626out:
1627 free_key(k);
1628 free(cgdir);
1629 return ret;
fd2e4e03
SH
1630}
1631
3e13a059
SH
1632#define LXCFS_MKDIR_PATH LIBEXECDIR "/lxcfs/lxcfs_mkdir"
1633
ab54b798
SH
1634int cg_mkdir(const char *path, mode_t mode)
1635{
1636 struct fuse_context *fc = fuse_get_context();
2c51f8dd 1637 char *fpath = NULL, *path1, *cgdir = NULL, *controller;
ab54b798 1638 const char *cgroup;
2c51f8dd 1639 int ret;
ab54b798 1640
ab54b798
SH
1641 if (!fc)
1642 return -EIO;
1643
1644
1645 controller = pick_controller_from_path(fc, path);
1646 if (!controller)
f9a05025 1647 return -EINVAL;
ab54b798
SH
1648
1649 cgroup = find_cgroup_in_path(path);
1650 if (!cgroup)
f9a05025 1651 return -EINVAL;
ab54b798
SH
1652
1653 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1654 if (!fpath)
1655 path1 = "/";
1656 else
1657 path1 = cgdir;
1658
2c51f8dd
SH
1659 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1660 ret = -EACCES;
1661 goto out;
1662 }
1663 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
1664 ret = -EACCES;
1665 goto out;
1666 }
ab54b798 1667
2c51f8dd
SH
1668 if (fc->uid == 0 && fc->gid == 0) {
1669 if (!cgm_create(controller, cgroup)) {
1670 ret = -EINVAL;
1671 goto out;
1672 }
1673 } else {
1674 /*
40b8c791 1675 * exec a helper so as to get a clean dbus connection
2c51f8dd
SH
1676 * 17 for lxcfs_mkdir, and spaces and newline and \0. 50 for two ints.
1677 * 50 for two ints
1678 */
1679 size_t len = strlen(cgroup) + strlen(controller) + 17 + 50;
1680 char *cmd = alloca(len);
3e13a059 1681 ret = snprintf(cmd, len, "%s %d %d %s %s\n", LXCFS_MKDIR_PATH,
2c51f8dd
SH
1682 fc->uid, fc->gid, controller, cgroup);
1683 if (ret < 0 || ret >= len) {
1684 ret = -EINVAL;
1685 goto out;
1686 }
1687 ret = system(cmd);
1688 if (ret != 0)
1689 goto out;
1690 }
ab54b798 1691
2c51f8dd 1692 ret = 0;
ab54b798 1693
2c51f8dd
SH
1694out:
1695 free(cgdir);
1696 return ret;
ab54b798
SH
1697}
1698
50d8d5b5
SH
1699static int cg_rmdir(const char *path)
1700{
1701 struct fuse_context *fc = fuse_get_context();
2c51f8dd 1702 char *fpath = NULL, *cgdir = NULL, *controller;
50d8d5b5 1703 const char *cgroup;
2c51f8dd 1704 int ret;
50d8d5b5
SH
1705
1706 if (!fc)
1707 return -EIO;
1708
50d8d5b5
SH
1709 controller = pick_controller_from_path(fc, path);
1710 if (!controller)
f9a05025 1711 return -EINVAL;
50d8d5b5
SH
1712
1713 cgroup = find_cgroup_in_path(path);
1714 if (!cgroup)
f9a05025 1715 return -EINVAL;
50d8d5b5
SH
1716
1717 get_cgdir_and_path(cgroup, &cgdir, &fpath);
2c51f8dd
SH
1718 if (!fpath) {
1719 ret = -EINVAL;
1720 goto out;
1721 }
50d8d5b5 1722
2c51f8dd
SH
1723 fprintf(stderr, "rmdir: verifying access to %s:%s (req path %s)\n",
1724 controller, cgdir, path);
1725 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1726 ret = -EACCES;
1727 goto out;
1728 }
1729 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
1730 ret = -EACCES;
1731 goto out;
1732 }
50d8d5b5 1733
2c51f8dd
SH
1734 if (!cgm_remove(controller, cgroup)) {
1735 ret = -EINVAL;
1736 goto out;
1737 }
50d8d5b5 1738
2c51f8dd
SH
1739 ret = 0;
1740
1741out:
1742 free(cgdir);
1743 return ret;
50d8d5b5
SH
1744}
1745
2dc17609
SH
/* Return true if the string line begins with the string pref. */
static bool startswith(const char *line, const char *pref)
{
	while (*pref) {
		/* also stops at the end of line: '\0' never equals *pref here */
		if (*line++ != *pref++)
			return false;
	}
	return true;
}
1752
/*
 * Scan the newline-separated text memstat for the first line starting with
 * "total_cache", parse the unsigned number following that key and store it,
 * divided by 1024, in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (cur && *cur) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}
		/* advance to the start of the next line, if there is one */
		cur = strchr(cur, '\n');
		if (cur)
			cur++;
	}
}
1770
/*
 * Scan the newline-separated text str for the first line starting with
 * "<major>:<minor> <iotype>" and store the unsigned number that follows
 * in *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	for (;;) {
		char *nl;

		if (!*cur)
			return;
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1793
53b43826
SH
1794static int read_file(const char *path, char *buf, size_t size,
1795 struct file_info *d)
1796{
1797 size_t linelen = 0, total_len = 0, rv = 0;
1798 char *line = NULL;
1799 char *cache = d->buf;
1800 size_t cache_size = d->buflen;
1801 FILE *f = fopen(path, "r");
1802 if (!f)
1803 return 0;
1804
1805 while (getline(&line, &linelen, f) != -1) {
1806 size_t l = snprintf(cache, cache_size, "%s", line);
1807 if (l < 0) {
1808 perror("Error writing to cache");
1809 rv = 0;
1810 goto err;
1811 }
1812 if (l >= cache_size) {
1813 fprintf(stderr, "Internal error: truncated write to cache\n");
1814 rv = 0;
1815 goto err;
1816 }
1817 if (l < cache_size) {
1818 cache += l;
1819 cache_size -= l;
1820 total_len += l;
1821 } else {
1822 cache += cache_size;
1823 total_len += cache_size;
1824 cache_size = 0;
1825 break;
1826 }
1827 }
1828
1829 d->size = total_len;
1830 if (total_len > size ) total_len = size;
1831
1832 /* read from off 0 */
1833 memcpy(buf, d->buf, total_len);
1834 rv = total_len;
1835 err:
1836 fclose(f);
1837 free(line);
1838 return rv;
1839}
1840
758ad80c 1841/*
2ad6d2bd 1842 * FUSE ops for /proc
758ad80c 1843 */
758ad80c 1844
23ce2127
SH
1845static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1846 struct fuse_file_info *fi)
1847{
2dc17609 1848 struct fuse_context *fc = fuse_get_context();
97f1f27b 1849 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
1850 char *cg;
1851 char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
2dc17609
SH
1852 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1853 char *line = NULL;
e1068397 1854 size_t linelen = 0, total_len = 0, rv = 0;
97f1f27b
YY
1855 char *cache = d->buf;
1856 size_t cache_size = d->buflen;
2c51f8dd 1857 FILE *f = NULL;
2dc17609 1858
97f1f27b
YY
1859 if (offset){
1860 if (offset > d->size)
1861 return -EINVAL;
b5ad2d21
SH
1862 if (!d->cached)
1863 return 0;
97f1f27b
YY
1864 int left = d->size - offset;
1865 total_len = left > size ? size: left;
1866 memcpy(buf, cache + offset, total_len);
1867 return total_len;
1868 }
2dc17609 1869
2c51f8dd 1870 cg = get_pid_cgroup(fc->pid, "memory");
2dc17609 1871 if (!cg)
53b43826 1872 return read_file("/proc/meminfo", buf, size, d);
2dc17609
SH
1873
1874 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
2c51f8dd 1875 goto err;
2dc17609 1876 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2c51f8dd 1877 goto err;
2dc17609 1878 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
2c51f8dd 1879 goto err;
2dc17609
SH
1880 memlimit = strtoul(memlimit_str, NULL, 10);
1881 memusage = strtoul(memusage_str, NULL, 10);
1882 memlimit /= 1024;
1883 memusage /= 1024;
1884 get_mem_cached(memstat_str, &cached);
1885
1886 f = fopen("/proc/meminfo", "r");
1887 if (!f)
2c51f8dd 1888 goto err;
2dc17609
SH
1889
1890 while (getline(&line, &linelen, f) != -1) {
1891 size_t l;
1892 char *printme, lbuf[100];
1893
1894 memset(lbuf, 0, 100);
1895 if (startswith(line, "MemTotal:")) {
1896 sscanf(line+14, "%lu", &hosttotal);
1897 if (hosttotal < memlimit)
1898 memlimit = hosttotal;
1899 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1900 printme = lbuf;
1901 } else if (startswith(line, "MemFree:")) {
1902 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1903 printme = lbuf;
1904 } else if (startswith(line, "MemAvailable:")) {
1905 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1906 printme = lbuf;
1907 } else if (startswith(line, "Buffers:")) {
1908 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1909 printme = lbuf;
1910 } else if (startswith(line, "Cached:")) {
1911 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1912 printme = lbuf;
1913 } else if (startswith(line, "SwapCached:")) {
1914 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1915 printme = lbuf;
1916 } else
1917 printme = line;
97f1f27b
YY
1918
1919 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
1920 if (l < 0) {
1921 perror("Error writing to cache");
1922 rv = 0;
1923 goto err;
1924
1925 }
1926 if (l >= cache_size) {
1927 fprintf(stderr, "Internal error: truncated write to cache\n");
1928 rv = 0;
1929 goto err;
1930 }
1931
97f1f27b
YY
1932 cache += l;
1933 cache_size -= l;
2f919d9d 1934 total_len += l;
2dc17609
SH
1935 }
1936
b5ad2d21 1937 d->cached = 1;
97f1f27b
YY
1938 d->size = total_len;
1939 if (total_len > size ) total_len = size;
1940 memcpy(buf, d->buf, total_len);
1941
e1068397 1942 rv = total_len;
2c51f8dd
SH
1943err:
1944 if (f)
1945 fclose(f);
92c84dc4 1946 free(line);
2c51f8dd
SH
1947 free(cg);
1948 free(memlimit_str);
1949 free(memusage_str);
1950 free(memstat_str);
e1068397 1951 return rv;
23ce2127
SH
1952}
1953
/*
 * Look up cpuset.cpus for the cgroup cg.
 * The result is a newly allocated string which the caller must free;
 * NULL is returned when the lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1966
fa47bb52 1967bool cpu_in_cpuset(int cpu, const char *cpuset);
23ce2127 1968
aeb56147
SH
/*
 * Return true if line is a "processor : N" line and cpu N is part of
 * cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
1977
23ce2127
SH
/*
 * Check whether line is a "processor : N" line as found in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1989
23ce2127
SH
1990static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1991 struct fuse_file_info *fi)
1992{
1993 struct fuse_context *fc = fuse_get_context();
97f1f27b 1994 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
1995 char *cg;
1996 char *cpuset = NULL;
23ce2127 1997 char *line = NULL;
e1068397 1998 size_t linelen = 0, total_len = 0, rv = 0;
23ce2127
SH
1999 bool am_printing = false;
2000 int curcpu = -1;
97f1f27b
YY
2001 char *cache = d->buf;
2002 size_t cache_size = d->buflen;
2c51f8dd 2003 FILE *f = NULL;
23ce2127 2004
97f1f27b
YY
2005 if (offset){
2006 if (offset > d->size)
2007 return -EINVAL;
b5ad2d21
SH
2008 if (!d->cached)
2009 return 0;
97f1f27b
YY
2010 int left = d->size - offset;
2011 total_len = left > size ? size: left;
2012 memcpy(buf, cache + offset, total_len);
2f919d9d 2013 return total_len;
97f1f27b 2014 }
23ce2127 2015
2c51f8dd 2016 cg = get_pid_cgroup(fc->pid, "cpuset");
23ce2127 2017 if (!cg)
53b43826 2018 return read_file("proc/cpuinfo", buf, size, d);
23ce2127
SH
2019
2020 cpuset = get_cpuset(cg);
2021 if (!cpuset)
2c51f8dd 2022 goto err;
23ce2127
SH
2023
2024 f = fopen("/proc/cpuinfo", "r");
2025 if (!f)
2c51f8dd 2026 goto err;
23ce2127
SH
2027
2028 while (getline(&line, &linelen, f) != -1) {
2029 size_t l;
2030 if (is_processor_line(line)) {
aeb56147 2031 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
2032 if (am_printing) {
2033 curcpu ++;
97f1f27b 2034 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
e1068397
MM
2035 if (l < 0) {
2036 perror("Error writing to cache");
2037 rv = 0;
2038 goto err;
2039 }
2040 if (l >= cache_size) {
2041 fprintf(stderr, "Internal error: truncated write to cache\n");
2042 rv = 0;
2043 goto err;
2044 }
97f1f27b
YY
2045 if (l < cache_size){
2046 cache += l;
2047 cache_size -= l;
2048 total_len += l;
2049 }else{
2050 cache += cache_size;
2051 total_len += cache_size;
2052 cache_size = 0;
2053 break;
2054 }
23ce2127
SH
2055 }
2056 continue;
2057 }
2058 if (am_printing) {
97f1f27b 2059 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2060 if (l < 0) {
2061 perror("Error writing to cache");
2062 rv = 0;
2063 goto err;
2064 }
2065 if (l >= cache_size) {
2066 fprintf(stderr, "Internal error: truncated write to cache\n");
2067 rv = 0;
2068 goto err;
2069 }
97f1f27b
YY
2070 if (l < cache_size) {
2071 cache += l;
2072 cache_size -= l;
2073 total_len += l;
2074 } else {
2075 cache += cache_size;
2076 total_len += cache_size;
2077 cache_size = 0;
2078 break;
2079 }
23ce2127
SH
2080 }
2081 }
2082
b5ad2d21 2083 d->cached = 1;
97f1f27b
YY
2084 d->size = total_len;
2085 if (total_len > size ) total_len = size;
2086
2087 /* read from off 0 */
2088 memcpy(buf, d->buf, total_len);
e1068397 2089 rv = total_len;
2c51f8dd
SH
2090err:
2091 if (f)
2092 fclose(f);
92c84dc4 2093 free(line);
2c51f8dd
SH
2094 free(cpuset);
2095 free(cg);
e1068397 2096 return rv;
23ce2127
SH
2097}
2098
2099static int proc_stat_read(char *buf, size_t size, off_t offset,
2100 struct fuse_file_info *fi)
2101{
aeb56147 2102 struct fuse_context *fc = fuse_get_context();
97f1f27b 2103 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2104 char *cg;
2105 char *cpuset = NULL;
aeb56147 2106 char *line = NULL;
e1068397 2107 size_t linelen = 0, total_len = 0, rv = 0;
2a0fde62 2108 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
2109 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2110 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2111 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2112#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2113 char cpuall[CPUALL_MAX_SIZE];
2114 /* reserve for cpu all */
2115 char *cache = d->buf + CPUALL_MAX_SIZE;
2116 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2c51f8dd 2117 FILE *f = NULL;
aeb56147 2118
97f1f27b
YY
2119 if (offset){
2120 if (offset > d->size)
2121 return -EINVAL;
b5ad2d21
SH
2122 if (!d->cached)
2123 return 0;
97f1f27b
YY
2124 int left = d->size - offset;
2125 total_len = left > size ? size: left;
2126 memcpy(buf, d->buf + offset, total_len);
2f919d9d 2127 return total_len;
97f1f27b 2128 }
aeb56147 2129
2c51f8dd 2130 cg = get_pid_cgroup(fc->pid, "cpuset");
aeb56147 2131 if (!cg)
53b43826 2132 return read_file("/proc/stat", buf, size, d);
aeb56147
SH
2133
2134 cpuset = get_cpuset(cg);
2135 if (!cpuset)
2c51f8dd 2136 goto err;
aeb56147
SH
2137
2138 f = fopen("/proc/stat", "r");
2139 if (!f)
2c51f8dd 2140 goto err;
aeb56147 2141
97f1f27b
YY
2142 //skip first line
2143 if (getline(&line, &linelen, f) < 0) {
2144 fprintf(stderr, "proc_stat_read read first line failed\n");
2c51f8dd 2145 goto err;
97f1f27b
YY
2146 }
2147
aeb56147
SH
2148 while (getline(&line, &linelen, f) != -1) {
2149 size_t l;
2150 int cpu;
2a0fde62 2151 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
2152 char *c;
2153
2a0fde62
CB
2154 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2155 /* not a ^cpuN line containing a number N, just print it */
97f1f27b 2156 l = snprintf(cache, cache_size, "%s", line);
e1068397
MM
2157 if (l < 0) {
2158 perror("Error writing to cache");
2159 rv = 0;
2160 goto err;
2161 }
2162 if (l >= cache_size) {
2163 fprintf(stderr, "Internal error: truncated write to cache\n");
2164 rv = 0;
2165 goto err;
2166 }
2167 if (l < cache_size) {
97f1f27b
YY
2168 cache += l;
2169 cache_size -= l;
2170 total_len += l;
2171 continue;
e1068397 2172 } else {
97f1f27b
YY
2173 //no more space, break it
2174 cache += cache_size;
2175 total_len += cache_size;
2176 cache_size = 0;
2177 break;
2178 }
aeb56147 2179 }
2a0fde62
CB
2180
2181 if (sscanf(cpu_char, "%d", &cpu) != 1)
2182 continue;
aeb56147
SH
2183 if (!cpu_in_cpuset(cpu, cpuset))
2184 continue;
2185 curcpu ++;
2186
2187 c = strchr(line, ' ');
2188 if (!c)
2189 continue;
25c5e8fb 2190 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
e1068397
MM
2191 if (l < 0) {
2192 perror("Error writing to cache");
2193 rv = 0;
2194 goto err;
2195
2196 }
2197 if (l >= cache_size) {
2198 fprintf(stderr, "Internal error: truncated write to cache\n");
2199 rv = 0;
2200 goto err;
2201 }
2202
97f1f27b
YY
2203 cache += l;
2204 cache_size -= l;
aeb56147 2205 total_len += l;
2f919d9d 2206
97f1f27b
YY
2207 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2208 &softirq, &steal, &guest) != 9)
2209 continue;
2210 user_sum += user;
2211 nice_sum += nice;
2212 system_sum += system;
2213 idle_sum += idle;
2214 iowait_sum += iowait;
2215 irq_sum += irq;
2216 softirq_sum += softirq;
2217 steal_sum += steal;
2f919d9d 2218 guest_sum += guest;
97f1f27b
YY
2219 }
2220
2221 cache = d->buf;
2222
2f919d9d 2223 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
2224 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2225 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2226 memcpy(cache, cpuall, cpuall_len);
2f919d9d 2227 cache += cpuall_len;
2c51f8dd 2228 } else{
97f1f27b
YY
2229 /* shouldn't happen */
2230 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2231 cpuall_len = 0;
2232 }
2233
2234 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2235 total_len += cpuall_len;
b5ad2d21 2236 d->cached = 1;
97f1f27b
YY
2237 d->size = total_len;
2238 if (total_len > size ) total_len = size;
2239
2240 memcpy(buf, d->buf, total_len);
e1068397 2241 rv = total_len;
2c51f8dd
SH
2242
2243err:
2244 if (f)
2245 fclose(f);
92c84dc4 2246 free(line);
2c51f8dd
SH
2247 free(cpuset);
2248 free(cg);
e1068397 2249 return rv;
23ce2127
SH
2250}
2251
7bbf2246
SH
2252/*
2253 * How to guess what to present for uptime?
2254 * One thing we could do would be to take the date on the caller's
2255 * memory.usage_in_bytes file, which should equal the time of creation
2256 * of his cgroup. However, a task could be in a sub-cgroup of the
2257 * container. The same problem exists if we try to look at the ages
2258 * of processes in the caller's cgroup.
2259 *
2260 * So we'll fork a task that will enter the caller's pidns, mount a
2261 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
2262 *
2263 * For the second uptime #, we'll do as Stéphane had done, just copy
2264 * the number from /proc/uptime. Not sure how to best emulate 'idle'
2265 * time. Maybe someone can come up with a good algorithm and submit a
2266 * patch. Maybe something based on cpushare info?
2267 */
41bb9357
SH
2268
/*
 * Return the age of the reaper for pid, taken from the ctime of its procdir.
 *
 * The calling process unshares its mount namespace and joins the pid
 * namespace of pid, so this is meant for a throw-away process
 * (getreaperage() calls it from a forked child).  A further child is forked
 * which mounts a fresh procfs and returns "now - ctime of /proc/1" to its
 * caller.  The forking process itself never returns after a successful
 * fork: it waits for the child and terminates with _exit().
 *
 * Returns 0 on any failure.  All process termination here uses _exit(), so
 * a forked process does not run exit-time handlers or flush stdio buffers
 * inherited from its parent.
 */
static long int get_pid1_time(pid_t pid)
{
	char fnam[100];
	int fd, cpipe[2], ret;
	struct stat sb;
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	if (unshare(CLONE_NEWNS))
		return 0;

	if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		perror("rslave mount failed");
		return 0;
	}

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
	if (ret < 0 || ret >= sizeof(fnam))
		return 0;

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(fd, 0)) {
		perror("get_pid1_time setns 1");
		close(fd);
		return 0;
	}
	close(fd);

	if (pipe(cpipe) < 0)
		_exit(1);

loop:
	cpid = fork();
	if (cpid < 0)
		return 0;

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		/* acknowledge that the fork went through */
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		umount2("/proc", MNT_DETACH);
		if (mount("proc", "/proc", "proc", 0, NULL)) {
			perror("get_pid1_time mount");
			return 0;
		}
		ret = lstat("/proc/1", &sb);
		if (ret) {
			perror("get_pid1_time lstat");
			return 0;
		}
		return time(NULL) - sb.st_ctime;
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	wait_for_pid(cpid);
	_exit(0);

again:
	/* no valid ack: get rid of this child and fork a new one */
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
2355
/*
 * Return the value computed by get_pid1_time(qpid), obtained from a forked
 * child over a pipe.
 *
 * The child gets one second to deliver its result.  0 is returned when the
 * pipe or the fork cannot be created, when the wait fails or times out, or
 * when the result cannot be read completely.
 *
 * A failed fork is reported right away, releasing both pipe ends, so that
 * wait_for_pid() is never called with -1.  A timeout is reported as
 * "timed out" rather than through perror().
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		_exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2401
/*
 * Return the integer part of the second field of /proc/uptime (the idle
 * time), or 0 when the file cannot be opened or does not parse.
 */
static unsigned long int getprocidle(void)
{
	unsigned long int up_sec, up_frac, idle_sec, idle_frac;
	int nfields;
	FILE *uptime = fopen("/proc/uptime", "r");

	if (uptime == NULL)
		return 0;

	/* only idle_sec is used; the other fields just validate the format */
	nfields = fscanf(uptime, "%lu.%02lu %lu.%02lu",
			 &up_sec, &up_frac, &idle_sec, &idle_frac);
	fclose(uptime);

	return nfields == 4 ? idle_sec : 0;
}
2417
2418/*
2419 * We read /proc/uptime and reuse its second field.
2420 * For the first field, we use the mtime for the reaper for
2421 * the calling pid as returned by getreaperage
2422 */
23ce2127
SH
2423static int proc_uptime_read(char *buf, size_t size, off_t offset,
2424 struct fuse_file_info *fi)
2425{
41bb9357 2426 struct fuse_context *fc = fuse_get_context();
97f1f27b 2427 struct file_info *d = (struct file_info *)fi->fh;
41bb9357 2428 long int reaperage = getreaperage(fc->pid);;
38056ebc 2429 unsigned long int idletime = getprocidle();
b5ad2d21 2430 char *cache = d->buf;
97f1f27b 2431 size_t total_len = 0;
41bb9357 2432
97f1f27b
YY
2433 if (offset){
2434 if (offset > d->size)
2435 return -EINVAL;
b5ad2d21
SH
2436 if (!d->cached)
2437 return 0;
2438 int left = d->size - offset;
2439 total_len = left > size ? size: left;
2440 memcpy(buf, cache + offset, total_len);
2441 return total_len;
97f1f27b
YY
2442 }
2443
b5ad2d21 2444 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
e1068397
MM
2445 if (total_len < 0){
2446 perror("Error writing to cache");
2447 return 0;
2448 }
cdcdb29b 2449
b5ad2d21
SH
2450 d->size = (int)total_len;
2451 d->cached = 1;
2452
2453 if (total_len > size) total_len = size;
2454
2455 memcpy(buf, d->buf, total_len);
97f1f27b 2456 return total_len;
23ce2127
SH
2457}
2458
49878439
YY
2459static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2460 struct fuse_file_info *fi)
2461{
2462 char dev_name[72];
2463 struct fuse_context *fc = fuse_get_context();
97f1f27b 2464 struct file_info *d = (struct file_info *)fi->fh;
2c51f8dd
SH
2465 char *cg;
2466 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
49878439
YY
2467 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2468 unsigned long read = 0, write = 0;
2469 unsigned long read_merged = 0, write_merged = 0;
2470 unsigned long read_sectors = 0, write_sectors = 0;
2471 unsigned long read_ticks = 0, write_ticks = 0;
2472 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2473 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
b5ad2d21
SH
2474 char *cache = d->buf;
2475 size_t cache_size = d->buflen;
49878439 2476 char *line = NULL;
e1068397 2477 size_t linelen = 0, total_len = 0, rv = 0;
49878439
YY
2478 unsigned int major = 0, minor = 0;
2479 int i = 0;
2c51f8dd 2480 FILE *f = NULL;
49878439 2481
97f1f27b
YY
2482 if (offset){
2483 if (offset > d->size)
2484 return -EINVAL;
b5ad2d21
SH
2485 if (!d->cached)
2486 return 0;
2487 int left = d->size - offset;
2488 total_len = left > size ? size: left;
2489 memcpy(buf, cache + offset, total_len);
2490 return total_len;
97f1f27b 2491 }
49878439 2492
2c51f8dd 2493 cg = get_pid_cgroup(fc->pid, "blkio");
49878439 2494 if (!cg)
53b43826 2495 return read_file("/proc/diskstats", buf, size, d);
49878439
YY
2496
2497 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2c51f8dd 2498 goto err;
49878439 2499 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2c51f8dd 2500 goto err;
49878439 2501 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2c51f8dd 2502 goto err;
49878439 2503 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2c51f8dd 2504 goto err;
49878439 2505 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2c51f8dd 2506 goto err;
49878439
YY
2507
2508
2509 f = fopen("/proc/diskstats", "r");
2510 if (!f)
2c51f8dd 2511 goto err;
49878439
YY
2512
2513 while (getline(&line, &linelen, f) != -1) {
2514 size_t l;
2515 char *printme, lbuf[256];
2516
c0adec85 2517 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
49878439
YY
2518 if(i == 3){
2519 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2520 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2521 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2522 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2523 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2524 read_sectors = read_sectors/512;
2525 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2526 write_sectors = write_sectors/512;
2f919d9d 2527
49878439
YY
2528 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2529 rd_svctm = rd_svctm/1000000;
2530 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2531 rd_wait = rd_wait/1000000;
2532 read_ticks = rd_svctm + rd_wait;
2533
2534 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2535 wr_svctm = wr_svctm/1000000;
2536 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2537 wr_wait = wr_wait/1000000;
2538 write_ticks = wr_svctm + wr_wait;
2539
2540 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2541 tot_ticks = tot_ticks/1000000;
2542 }else{
2543 continue;
2544 }
2545
2546 memset(lbuf, 0, 256);
2547 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2548 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2549 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2550 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2551 printme = lbuf;
2552 } else
2553 continue;
2554
b5ad2d21 2555 l = snprintf(cache, cache_size, "%s", printme);
e1068397
MM
2556 if (l < 0) {
2557 perror("Error writing to fuse buf");
2558 rv = 0;
2559 goto err;
2560 }
b5ad2d21 2561 if (l >= cache_size) {
e1068397
MM
2562 fprintf(stderr, "Internal error: truncated write to cache\n");
2563 rv = 0;
2564 goto err;
2565 }
b5ad2d21
SH
2566 cache += l;
2567 cache_size -= l;
49878439
YY
2568 total_len += l;
2569 }
2570
b5ad2d21 2571 d->cached = 1;
97f1f27b 2572 d->size = total_len;
b5ad2d21
SH
2573 if (total_len > size ) total_len = size;
2574 memcpy(buf, d->buf, total_len);
2575
e1068397 2576 rv = total_len;
2c51f8dd
SH
2577err:
2578 free(cg);
2579 if (f)
2580 fclose(f);
49878439 2581 free(line);
2c51f8dd
SH
2582 free(io_serviced_str);
2583 free(io_merged_str);
2584 free(io_service_bytes_str);
2585 free(io_wait_time_str);
2586 free(io_service_time_str);
e1068397 2587 return rv;
49878439
YY
2588}
2589
23ce2127
SH
/*
 * Return the number of bytes obtained by reading the file `which` line by
 * line until getline() stops, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;
	FILE *fp = fopen(which, "r");

	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t got = getline(&linebuf, &cap, fp);

		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);
	return total;
}
2606
758ad80c
SH
/*
 * getattr for the /proc subtree.
 *
 * "/proc" is reported as a 0555 directory and each of the five emulated
 * files as a 0444 regular file of size 0, all owned by uid/gid 0 and
 * stamped with the current time.  Returns 0 on success, -EINVAL if the
 * clock cannot be read, and -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec stamp;
	size_t idx;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = stamp;
	sb->st_mtim = stamp;
	sb->st_ctim = stamp;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(emulated) / sizeof(emulated[0]); idx++) {
		if (strcmp(path, emulated[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2634
2635static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2636 struct fuse_file_info *fi)
2637{
2638 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2639 filler(buf, "meminfo", NULL, 0) != 0 ||
2640 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2641 filler(buf, "uptime", NULL, 0) != 0 ||
2642 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2643 return -EINVAL;
758ad80c
SH
2644 return 0;
2645}
2646
35629743
SH
2647static int proc_open(const char *path, struct fuse_file_info *fi)
2648{
96fc5ee6
SH
2649 int type = -1;
2650 struct file_info *info;
2651
2652 if (strcmp(path, "/proc/meminfo") == 0)
2653 type = LXC_TYPE_PROC_MEMINFO;
2654 else if (strcmp(path, "/proc/cpuinfo") == 0)
2655 type = LXC_TYPE_PROC_CPUINFO;
2656 else if (strcmp(path, "/proc/uptime") == 0)
2657 type = LXC_TYPE_PROC_UPTIME;
2658 else if (strcmp(path, "/proc/stat") == 0)
2659 type = LXC_TYPE_PROC_STAT;
2660 else if (strcmp(path, "/proc/diskstats") == 0)
2661 type = LXC_TYPE_PROC_DISKSTATS;
2662 if (type == -1)
2663 return -ENOENT;
2664
2c51f8dd
SH
2665 info = malloc(sizeof(*info));
2666 if (!info)
2667 return -ENOMEM;
2668
96fc5ee6
SH
2669 memset(info, 0, sizeof(*info));
2670 info->type = type;
2671
97f1f27b 2672 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2c51f8dd
SH
2673 do {
2674 info->buf = malloc(info->buflen);
2675 } while (!info->buf);
97f1f27b
YY
2676 memset(info->buf, 0, info->buflen);
2677 /* set actual size to buffer size */
2f919d9d 2678 info->size = info->buflen;
97f1f27b 2679
96fc5ee6
SH
2680 fi->fh = (unsigned long)info;
2681 return 0;
2682}
2683
2684static int proc_release(const char *path, struct fuse_file_info *fi)
2685{
2686 struct file_info *f = (struct file_info *)fi->fh;
2687
2688 do_release_file_info(f);
2689 return 0;
35629743
SH
2690}
2691
35629743
SH
2692static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2693 struct fuse_file_info *fi)
2694{
96fc5ee6
SH
2695 struct file_info *f = (struct file_info *) fi->fh;
2696
2697 switch (f->type) {
2f919d9d 2698 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2699 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2700 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2701 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2702 case LXC_TYPE_PROC_UPTIME:
23ce2127 2703 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2704 case LXC_TYPE_PROC_STAT:
23ce2127 2705 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2706 case LXC_TYPE_PROC_DISKSTATS:
49878439 2707 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2708 default:
2709 return -EINVAL;
2710 }
35629743
SH
2711}
2712
2ad6d2bd
SH
2713/*
2714 * FUSE ops for /
2715 * these just delegate to the /proc and /cgroup ops as
2716 * needed
2717 */
758ad80c
SH
2718
2719static int lxcfs_getattr(const char *path, struct stat *sb)
2720{
2721 if (strcmp(path, "/") == 0) {
2722 sb->st_mode = S_IFDIR | 00755;
2723 sb->st_nlink = 2;
2724 return 0;
2725 }
2726 if (strncmp(path, "/cgroup", 7) == 0) {
2727 return cg_getattr(path, sb);
2728 }
35629743 2729 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2730 return proc_getattr(path, sb);
2731 }
2732 return -EINVAL;
2733}
2734
/*
 * opendir: "/cgroup..." is handled by cg_opendir(); "/" and "/proc" open
 * successfully without further work.  Anything else is -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	return -ENOENT;
}
2747
2748static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2749 struct fuse_file_info *fi)
2750{
2751 if (strcmp(path, "/") == 0) {
2752 if (filler(buf, "proc", NULL, 0) != 0 ||
2753 filler(buf, "cgroup", NULL, 0) != 0)
2754 return -EINVAL;
2755 return 0;
2756 }
35629743 2757 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2758 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2759 if (strcmp(path, "/proc") == 0)
2760 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2761 return -EINVAL;
2762}
2763
/*
 * releasedir: "/cgroup..." is handled by cg_releasedir(); "/" and "/proc"
 * have nothing to release.  Anything else is -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	return -EINVAL;
}
2775
99978832
SH
/*
 * open: route by path prefix to cg_open() or proc_open(); -EINVAL for
 * paths under neither "/cgroup" nor "/proc".
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const int in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_open(path, fi);

	return strncmp(path, "/proc", 5) == 0 ? proc_open(path, fi) : -EINVAL;
}
2785
/*
 * read: route by path prefix to cg_read() or proc_read(); -EINVAL for
 * paths under neither "/cgroup" nor "/proc".
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const int in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_read(path, buf, size, offset, fi);

	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_read(path, buf, size, offset, fi);
}
2796
2ad6d2bd
SH
/*
 * write: only paths under "/cgroup" are writable (via cg_write());
 * everything else is rejected with -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2806
99978832
SH
/* flush: nothing to do, always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2811
/*
 * release: route by path prefix to cg_release() or proc_release();
 * -EINVAL for paths under neither "/cgroup" nor "/proc".
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const int in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (in_cgroup)
		return cg_release(path, fi);

	return strncmp(path, "/proc", 5) == 0 ? proc_release(path, fi) : -EINVAL;
}
2821
/* fsync: nothing to do, always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2826
ab54b798
SH
/*
 * mkdir: only supported under "/cgroup" (via cg_mkdir()); -EINVAL
 * elsewhere.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2834
341b21ad
SH
/*
 * chown: only supported under "/cgroup" (via cg_chown()); -EINVAL
 * elsewhere.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2842
2ad6d2bd
SH
/*
 * truncate: cat first does a truncate before doing ops->write, which
 * doesn't really make sense for cgroups.  So for anything under
 * "/cgroup" report success without doing anything; every other path is
 * rejected with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	const int in_cgroup = strncmp(path, "/cgroup", 7) == 0;

	(void)newsize;

	return in_cgroup ? 0 : -EINVAL;
}
2854
50d8d5b5
SH
/*
 * rmdir: only supported under "/cgroup" (via cg_rmdir()); -EINVAL
 * elsewhere.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2861
fd2e4e03
SH
/*
 * chmod: only supported under "/cgroup" (via cg_chmod()); -EINVAL
 * elsewhere.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2868
758ad80c
SH
2869const struct fuse_operations lxcfs_ops = {
2870 .getattr = lxcfs_getattr,
2871 .readlink = NULL,
2872 .getdir = NULL,
2873 .mknod = NULL,
ab54b798 2874 .mkdir = lxcfs_mkdir,
758ad80c 2875 .unlink = NULL,
50d8d5b5 2876 .rmdir = lxcfs_rmdir,
758ad80c
SH
2877 .symlink = NULL,
2878 .rename = NULL,
2879 .link = NULL,
fd2e4e03 2880 .chmod = lxcfs_chmod,
341b21ad 2881 .chown = lxcfs_chown,
2ad6d2bd 2882 .truncate = lxcfs_truncate,
758ad80c 2883 .utime = NULL,
99978832
SH
2884
2885 .open = lxcfs_open,
2886 .read = lxcfs_read,
2887 .release = lxcfs_release,
2ad6d2bd 2888 .write = lxcfs_write,
99978832 2889
758ad80c 2890 .statfs = NULL,
99978832
SH
2891 .flush = lxcfs_flush,
2892 .fsync = lxcfs_fsync,
758ad80c
SH
2893
2894 .setxattr = NULL,
2895 .getxattr = NULL,
2896 .listxattr = NULL,
2897 .removexattr = NULL,
2898
2899 .opendir = lxcfs_opendir,
2900 .readdir = lxcfs_readdir,
2901 .releasedir = lxcfs_releasedir,
2902
2903 .fsyncdir = NULL,
2904 .init = NULL,
2905 .destroy = NULL,
2906 .access = NULL,
2907 .create = NULL,
2908 .ftruncate = NULL,
2909 .fgetattr = NULL,
2910};
2911
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1.  `me` is the program name shown in the synopsis.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
2920
/*
 * Return true if w is one of the accepted spellings of the help flag:
 * "-h", "--help", "-help" or "help".
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h", "--help", "-help", "help",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(spellings) / sizeof(spellings[0]); idx++) {
		if (!strcmp(w, spellings[idx]))
			return true;
	}
	return false;
}
2930
0b0f73db
SH
/*
 * Remove the first argument equal to `which` from argv (argv[0] is never
 * considered), shifting the following entries and the NULL terminator
 * down by one and decrementing *argcp.  Does nothing if `which` is not
 * present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* find the first match, or stop at the terminator */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;

	if (!*slot)
		return;

	/* close the gap; the last copy moves the NULL terminator */
	for (; *slot; slot++)
		*slot = slot[1];

	(*argcp)--;
}
2945
/*
 * Remove the first "opt v" pair from argv (argv[0] is never considered),
 * shifting the following entries and the NULL terminator down by two and
 * decreasing *argcp by two.
 *
 * If opt is found but followed by a value other than v, the offending
 * value is reported on stderr and the process exits with status 1.
 * Does nothing if opt is absent or is the last argument.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* name the value actually given, not the one we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+1] != NULL guarantees argv[i+2] is within the array */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2966
758ad80c
SH
2967int main(int argc, char *argv[])
2968{
c0adec85 2969 int ret = -1;
e5d26e0b 2970 struct lxcfs_state *d = NULL;
0b0f73db
SH
2971 /*
2972 * what we pass to fuse_main is:
2973 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2974 */
2c51f8dd
SH
2975 int nargs = 5, cnt = 0;
2976 char *newargv[6];
758ad80c 2977
0b0f73db
SH
2978 /* accomodate older init scripts */
2979 swallow_arg(&argc, argv, "-s");
2980 swallow_arg(&argc, argv, "-f");
2981 swallow_option(&argc, argv, "-o", "allow_other");
2982
2e9c0b32
SH
2983 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
2984 fprintf(stderr, "%s\n", VERSION);
2985 exit(0);
2986 }
0b0f73db 2987 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
2988 usage(argv[0]);
2989
2c51f8dd
SH
2990 do {
2991 d = malloc(sizeof(*d));
2992 } while (!d);
0b0f73db 2993
38a76a91 2994 newargv[cnt++] = argv[0];
38a76a91
SH
2995 newargv[cnt++] = "-f";
2996 newargv[cnt++] = "-o";
2997 newargv[cnt++] = "allow_other,direct_io";
2998 newargv[cnt++] = argv[1];
2999 newargv[cnt++] = NULL;
758ad80c
SH
3000
3001 if (!cgm_escape_cgroup())
3002 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
3003
3004 if (!cgm_get_controllers(&d->subsystems))
c0adec85 3005 goto out;
758ad80c 3006
38a76a91 3007 ret = fuse_main(nargs, newargv, &lxcfs_ops, d);
2c51f8dd 3008 cgm_dbus_disconnect();
758ad80c 3009
c0adec85 3010out:
e5d26e0b 3011 free(d);
758ad80c 3012 return ret;
2183082c 3013}