]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
Add some more sanity checks
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357
SH
25#include <sys/mount.h>
26#include <wait.h>
758ad80c
SH
27
28#include <nih/alloc.h>
29#include <nih/string.h>
30
31#include "cgmanager.h"
32
/*
 * State reached through the private_data pointer of the FUSE context.
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted
	 * subsystems.  It is detected once, at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
41
443d13f5
SH
/* Values stored in the 'type' member of struct file_info. */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
51
c688e1b3
SH
/*
 * Bookkeeping for one open directory or file.  A pointer to it is kept
 * in fuse_file_info->fh between open and release.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* values */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
};

/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256
64
/*
 * Duplicate str with parent as its nih parent.
 * A NULL str is passed through as NULL; any other string is copied with
 * NIH_MUST(nih_strdup()).
 */
static char *must_copy_string(void *parent, const char *str)
{
	return str ? NIH_MUST( nih_strdup(parent, str) ) : NULL;
}
71
4775fba1
SH
/*
 * Reap child pid, retrying when waitpid() is interrupted by a signal.
 *
 * Returns 0 when the child exited normally with status 0, and -1 when
 * waitpid() failed or the child did not exit cleanly.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
93
053a659d
SH
/*
 * Translate an id through an id map.
 *
 * idfile is an open FILE * for /proc/pid/{u,g}id_map and in_id is an id
 * valid in the caller's namespace.  The file is rewound and scanned for
 * a "<ns base> <host base> <count>" entry whose host range contains
 * in_id; lines that do not parse as three unsigned numbers are skipped.
 *
 * Returns the id as seen in pid's namespace, or -1 (as unsigned) when no
 * entry matches or an entry's range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base,	// base id of the range in the idfile's namespace
			     host_base,	// base id of the range in the caller's namespace
			     range;	// number of ids in the range

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base+range, and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no answer found
	return -1;
}
137
341b21ad
SH
138/*
139 * for is_privileged_over,
140 * specify whether we require the calling uid to be root in his
141 * namespace
142 */
143#define NS_ROOT_REQD true
144#define NS_ROOT_OPT false
145
146static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 147{
053a659d
SH
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
341b21ad
SH
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
758ad80c
SH
161 return true;
162
053a659d
SH
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
341b21ad 168 /* if caller's not root in his namespace, reject */
053a659d
SH
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
341b21ad
SH
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
053a659d
SH
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184out:
185 fclose(f);
186 return answer;
758ad80c
SH
187}
188
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the open(2)-style flags in req_mode.  Callers shift the
 * owner or group bits down into the "other" position before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * and O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	int access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
208
3db25a35
SH
/*
 * Return a nih-allocated copy of the first path component of taskcg
 * that follows querycg.
 *
 * For the root cgroup "/" only the leading slash of taskcg is skipped;
 * for any other querycg, strlen(querycg) + 1 characters are skipped.
 * Returns NULL (after logging) when taskcg is not longer than querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first remaining component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
227
758ad80c
SH
228/*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240{
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
341b21ad 255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268}
269
3db25a35
SH
/* Remove one trailing '\n' from x, in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
276
277/*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
283static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284{
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325out:
326 fclose(f);
327 free(line);
328 return answer;
329}
330
758ad80c
SH
331/*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336{
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
ac5d9d48
SH
342 if (*(path+7) != '/')
343 return NULL;
758ad80c
SH
344 p1 = path+8;
345 ret = nih_strdup(NULL, p1);
346 if (!ret)
347 return ret;
348 slash = strstr(ret, "/");
349 if (slash)
350 *slash = '\0';
351
352 /* verify that it is a subsystem */
353 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
354 int i;
355 if (!list) {
356 nih_free(ret);
357 return NULL;
358 }
359 for (i = 0; list[i]; i++) {
360 if (strcmp(list[i], ret) == 0)
361 return ret;
362 }
363 nih_free(ret);
364 return NULL;
365}
366
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * The result points into path, just past the first '/' found at or
 * after offset 8.  Returns NULL when path is shorter than 9 characters
 * or has no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
382
383static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
384{
385 nih_local char **list = NULL;
386 int i;
387
388 if (!f)
389 return false;
390 if (*f == '/')
391 f++;
392
393 if (!cgm_list_children(contr, dir, &list))
394 return false;
395 for (i = 0; list[i]; i++) {
396 if (strcmp(list[i], f) == 0)
397 return true;
398 }
399
400 return false;
401}
402
403static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
404{
405 nih_local struct cgm_keys **list = NULL;
406 struct cgm_keys *k;
407 int i;
408
409 if (!f)
410 return NULL;
411 if (*f == '/')
412 f++;
413 if (!cgm_list_keys(contr, dir, &list))
414 return NULL;
415 for (i = 0; list[i]; i++) {
416 if (strcmp(list[i]->name, f) == 0) {
417 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
418 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
419 k->uid = list[i]->uid;
420 k->gid = list[i]->gid;
421 k->mode = list[i]->mode;
422 return k;
423 }
424 }
425
426 return NULL;
427}
428
/*
 * Split cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of cg, truncated at the last '/'
 * when there is one.  *file receives a pointer INTO cg (not into *dir)
 * at that last '/', or NULL when cg contains no '/'; in that case *dir
 * is an untruncated copy of cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy = NIH_MUST( nih_strdup(NULL, cg) );
	char *last = strrchr(copy, '/');

	*dir = copy;
	*file = strrchr(cg, '/');
	if (last)
		*last = '\0';
}
442
443/*
2ad6d2bd 444 * FUSE ops for /cgroup
758ad80c 445 */
2ad6d2bd 446
758ad80c
SH
447static int cg_getattr(const char *path, struct stat *sb)
448{
449 struct timespec now;
450 struct fuse_context *fc = fuse_get_context();
451 nih_local char * cgdir = NULL;
452 char *fpath = NULL, *path1, *path2;
453 nih_local struct cgm_keys *k = NULL;
454 const char *cgroup;
455 nih_local char *controller = NULL;
456
457
458 if (!fc)
459 return -EIO;
460
461 memset(sb, 0, sizeof(struct stat));
462
463 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
464 return -EINVAL;
465
466 sb->st_uid = sb->st_gid = 0;
467 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
468 sb->st_size = 0;
469
470 if (strcmp(path, "/cgroup") == 0) {
471 sb->st_mode = S_IFDIR | 00755;
472 sb->st_nlink = 2;
473 return 0;
474 }
475
476 controller = pick_controller_from_path(fc, path);
477 if (!controller)
478 return -EIO;
758ad80c
SH
479 cgroup = find_cgroup_in_path(path);
480 if (!cgroup) {
481 /* this is just /cgroup/controller, return it as a dir */
482 sb->st_mode = S_IFDIR | 00755;
483 sb->st_nlink = 2;
484 return 0;
485 }
341b21ad 486
758ad80c
SH
487 get_cgdir_and_path(cgroup, &cgdir, &fpath);
488
489 if (!fpath) {
490 path1 = "/";
491 path2 = cgdir;
492 } else {
493 path1 = cgdir;
494 path2 = fpath;
495 }
496
758ad80c
SH
497 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
498 * Then check that caller's cgroup is under path if fpath is a child
499 * cgroup, or cgdir if fpath is a file */
500
501 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
502 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
503 /* this is just /cgroup/controller, return it as a dir */
504 sb->st_mode = S_IFDIR | 00555;
505 sb->st_nlink = 2;
506 return 0;
507 }
758ad80c 508 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 509 return -EACCES;
758ad80c 510
053a659d
SH
511 // get uid, gid, from '/tasks' file and make up a mode
512 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
513 sb->st_mode = S_IFDIR | 00755;
514 k = get_cgroup_key(controller, cgroup, "tasks");
515 if (!k) {
053a659d
SH
516 sb->st_uid = sb->st_gid = 0;
517 } else {
053a659d
SH
518 sb->st_uid = k->uid;
519 sb->st_gid = k->gid;
520 }
758ad80c
SH
521 sb->st_nlink = 2;
522 return 0;
523 }
524
525 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
526 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
527 return -ENOENT;
758ad80c 528 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 529 return -EACCES;
758ad80c 530
758ad80c 531 sb->st_mode = S_IFREG | k->mode;
053a659d 532 sb->st_nlink = 1;
758ad80c
SH
533 sb->st_uid = k->uid;
534 sb->st_gid = k->gid;
7253e0a4 535 sb->st_size = 0;
758ad80c
SH
536 return 0;
537 }
538
ab54b798 539 return -ENOENT;
758ad80c 540}
2183082c 541
7f163b71
SH
542/*
543 * TODO - cache these results in a table for use in opendir, free
544 * in releasedir
545 */
758ad80c 546static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 547{
7f163b71
SH
548 struct fuse_context *fc = fuse_get_context();
549 nih_local struct cgm_keys **list = NULL;
550 const char *cgroup;
c688e1b3 551 struct file_info *dir_info;
7f163b71 552 nih_local char *controller = NULL;
7f163b71
SH
553
554 if (!fc)
555 return -EIO;
556
c688e1b3
SH
557 if (strcmp(path, "/cgroup") == 0) {
558 cgroup = NULL;
559 controller = NULL;
560 } else {
561 // return list of keys for the controller, and list of child cgroups
562 controller = pick_controller_from_path(fc, path);
563 if (!controller)
564 return -EIO;
7f163b71 565
c688e1b3
SH
566 cgroup = find_cgroup_in_path(path);
567 if (!cgroup) {
568 /* this is just /cgroup/controller, return its contents */
569 cgroup = "/";
570 }
7f163b71
SH
571 }
572
3a6e1a76 573 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
7f163b71 574 return -EACCES;
c688e1b3
SH
575
576 /* we'll free this at cg_releasedir */
577 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
bae07053
SH
578 dir_info->controller = must_copy_string(dir_info, controller);
579 dir_info->cgroup = must_copy_string(dir_info, cgroup);
443d13f5 580 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 581 dir_info->buf = NULL;
8f6e8f5e 582 dir_info->file = NULL;
c688e1b3
SH
583 dir_info->buflen = 0;
584
585 fi->fh = (unsigned long)dir_info;
758ad80c
SH
586 return 0;
587}
588
758ad80c
SH
589static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
590 struct fuse_file_info *fi)
591{
c688e1b3
SH
592 struct file_info *d = (struct file_info *)fi->fh;
593 nih_local struct cgm_keys **list = NULL;
594 int i;
595 nih_local char *nextcg = NULL;
758ad80c
SH
596 struct fuse_context *fc = fuse_get_context();
597
443d13f5 598 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
599 fprintf(stderr, "Internal error: file cache info used in readdir\n");
600 return -EIO;
601 }
c688e1b3
SH
602 if (!d->cgroup && !d->controller) {
603 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
604 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
605 int i;
606
607 if (!list)
608 return -EIO;
7f163b71 609
758ad80c
SH
610 for (i = 0; list[i]; i++) {
611 if (filler(buf, list[i], NULL, 0) != 0) {
612 return -EIO;
613 }
614 }
615 return 0;
616 }
617
c688e1b3 618 if (!cgm_list_keys(d->controller, d->cgroup, &list))
3db25a35 619 // not a valid cgroup
758ad80c 620 return -EINVAL;
3db25a35 621
c688e1b3 622 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
623 if (nextcg) {
624 int ret;
625 ret = filler(buf, nextcg, NULL, 0);
626 if (ret != 0)
627 return -EIO;
628 }
629 return 0;
630 }
631
758ad80c 632 for (i = 0; list[i]; i++) {
758ad80c
SH
633 if (filler(buf, list[i]->name, NULL, 0) != 0) {
634 return -EIO;
635 }
636 }
637
638 // now get the list of child cgroups
422aa4a5 639 nih_local char **clist = NULL;
758ad80c 640
c688e1b3 641 if (!cgm_list_children(d->controller, d->cgroup, &clist))
758ad80c
SH
642 return 0;
643 for (i = 0; clist[i]; i++) {
758ad80c
SH
644 if (filler(buf, clist[i], NULL, 0) != 0) {
645 return -EIO;
646 }
647 }
648 return 0;
649}
650
8f6e8f5e
SH
/*
 * Free a file_info created by cg_opendir or cg_open.
 */
static void do_release_file_info(struct file_info *f)
{
	/*
	 * nih_free(f) also releases every field that was nih_alloc()d
	 * with f as its parent, so no per-field cleanup is needed.
	 */
	nih_free(f);
}
659
758ad80c
SH
660static int cg_releasedir(const char *path, struct fuse_file_info *fi)
661{
c688e1b3
SH
662 struct file_info *d = (struct file_info *)fi->fh;
663
8f6e8f5e 664 do_release_file_info(d);
758ad80c
SH
665 return 0;
666}
667
99978832
SH
668static int cg_open(const char *path, struct fuse_file_info *fi)
669{
670 nih_local char *controller = NULL;
671 const char *cgroup;
672 char *fpath = NULL, *path1, *path2;
673 nih_local char * cgdir = NULL;
674 nih_local struct cgm_keys *k = NULL;
8f6e8f5e 675 struct file_info *file_info;
99978832
SH
676 struct fuse_context *fc = fuse_get_context();
677
678 if (!fc)
679 return -EIO;
680
681 controller = pick_controller_from_path(fc, path);
682 if (!controller)
683 return -EIO;
684 cgroup = find_cgroup_in_path(path);
685 if (!cgroup)
686 return -EINVAL;
687
688 get_cgdir_and_path(cgroup, &cgdir, &fpath);
689 if (!fpath) {
690 path1 = "/";
691 path2 = cgdir;
692 } else {
693 path1 = cgdir;
694 path2 = fpath;
695 }
696
8f6e8f5e
SH
697 k = get_cgroup_key(controller, path1, path2);
698 if (!k)
699 return -EINVAL;
99978832 700
8f6e8f5e
SH
701 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
702 // should never get here
703 return -EACCES;
99978832 704
8f6e8f5e
SH
705 /* we'll free this at cg_release */
706 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
bae07053
SH
707 file_info->controller = must_copy_string(file_info, controller);
708 file_info->cgroup = must_copy_string(file_info, path1);
709 file_info->file = must_copy_string(file_info, path2);
443d13f5 710 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
711 file_info->buf = NULL;
712 file_info->buflen = 0;
713
714 fi->fh = (unsigned long)file_info;
715 return 0;
716}
717
718static int cg_release(const char *path, struct fuse_file_info *fi)
719{
720 struct file_info *f = (struct file_info *)fi->fh;
721
722 do_release_file_info(f);
723 return 0;
99978832
SH
724}
725
a05660a6
SH
/*
 * Wait up to two seconds for sockfd to become readable, then do one
 * non-blocking recv() of at most len bytes into buf.
 *
 * Returns what recv() returns, or -1 when select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
740
01e71852
SH
741#define SEND_CREDS_OK 0
742#define SEND_CREDS_NOTSK 1
743#define SEND_CREDS_FAIL 2
744static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
745{
746 struct msghdr msg = { 0 };
747 struct iovec iov;
748 struct cmsghdr *cmsg;
749 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
750 char buf[1];
751 buf[0] = 'p';
752
01e71852
SH
753 if (pingfirst) {
754 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 755 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
756 __func__);
757 return SEND_CREDS_FAIL;
758 }
a05660a6
SH
759 }
760
761 msg.msg_control = cmsgbuf;
762 msg.msg_controllen = sizeof(cmsgbuf);
763
764 cmsg = CMSG_FIRSTHDR(&msg);
765 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
766 cmsg->cmsg_level = SOL_SOCKET;
767 cmsg->cmsg_type = SCM_CREDENTIALS;
768 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
769
770 msg.msg_name = NULL;
771 msg.msg_namelen = 0;
772
773 buf[0] = v;
774 iov.iov_base = buf;
775 iov.iov_len = sizeof(buf);
776 msg.msg_iov = &iov;
777 msg.msg_iovlen = 1;
778
779 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 780 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
781 strerror(errno));
782 if (errno == 3)
01e71852
SH
783 return SEND_CREDS_NOTSK;
784 return SEND_CREDS_FAIL;
a05660a6
SH
785 }
786
01e71852 787 return SEND_CREDS_OK;
a05660a6
SH
788}
789
790static bool recv_creds(int sock, struct ucred *cred, char *v)
791{
792 struct msghdr msg = { 0 };
793 struct iovec iov;
794 struct cmsghdr *cmsg;
795 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
796 char buf[1];
797 int ret;
798 int optval = 1;
6ee867dc
SH
799 struct timeval tv;
800 fd_set rfds;
a05660a6
SH
801
802 *v = '1';
803
804 cred->pid = -1;
805 cred->uid = -1;
806 cred->gid = -1;
807
808 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 809 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
810 return false;
811 }
812 buf[0] = '1';
813 if (write(sock, buf, 1) != 1) {
1420baf8 814 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
815 return false;
816 }
817
818 msg.msg_name = NULL;
819 msg.msg_namelen = 0;
820 msg.msg_control = cmsgbuf;
821 msg.msg_controllen = sizeof(cmsgbuf);
822
823 iov.iov_base = buf;
824 iov.iov_len = sizeof(buf);
825 msg.msg_iov = &iov;
826 msg.msg_iovlen = 1;
827
6ee867dc
SH
828 FD_ZERO(&rfds);
829 FD_SET(sock, &rfds);
830 tv.tv_sec = 2;
831 tv.tv_usec = 0;
ea56f722 832 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
833 fprintf(stderr, "Failed to select for scm_cred: %s\n",
834 strerror(errno));
835 return false;
836 }
837 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 838 if (ret < 0) {
1420baf8 839 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
840 strerror(errno));
841 return false;
842 }
843
844 cmsg = CMSG_FIRSTHDR(&msg);
845
846 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
847 cmsg->cmsg_level == SOL_SOCKET &&
848 cmsg->cmsg_type == SCM_CREDENTIALS) {
849 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
850 }
851 *v = buf[0];
852
853 return true;
854}
855
856
857/*
4775fba1
SH
858 * pid_to_ns - reads pids from a ucred over a socket, then writes the
859 * int value back over the socket. This shifts the pid from the
860 * sender's pidns into tpid's pidns.
a05660a6 861 */
4775fba1 862static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
863{
864 char v = '0';
865 struct ucred cred;
866
867 while (recv_creds(sock, &cred, &v)) {
868 if (v == '1')
869 exit(0);
a05660a6
SH
870 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
871 exit(1);
872 }
873 exit(0);
874}
875
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no
 * acknowledgement arrives within one second the child is killed, reaped
 * and forked again.
 *
 * Never returns.  Exits 1 when entering the namespace, creating the pipe
 * or forking fails, or when the child does not exit cleanly; exits 0
 * when the child exited with status 0.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			/* does not return */
			pid_to_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == 1 && v == '1')
			break;

		/* no (valid) ack: get rid of this child and retry */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
942
943/*
944 * To read cgroup files with a particular pid, we will setns into the child
945 * pidns, open a pipe, fork a child - which will be the first to really be in
946 * the child ns - which does the cgm_get_value and writes the data to the pipe.
947 */
948static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
949{
950 int sock[2] = {-1, -1};
951 nih_local char *tmpdata = NULL;
952 int ret;
953 pid_t qpid, cpid = -1;
954 bool answer = false;
955 char v = '0';
956 struct ucred cred;
957 struct timeval tv;
958 fd_set s;
959
960 if (!cgm_get_value(contrl, cg, file, &tmpdata))
961 return false;
962
963 /*
964 * Now we read the pids from returned data one by one, pass
965 * them into a child in the target namespace, read back the
966 * translated pids, and put them into our to-return data
967 */
968
969 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
970 perror("socketpair");
971 exit(1);
972 }
973
974 cpid = fork();
975 if (cpid == -1)
976 goto out;
977
978 if (!cpid) // child
4775fba1 979 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
980
981 char *ptr = tmpdata;
982 cred.uid = 0;
983 cred.gid = 0;
984 while (sscanf(ptr, "%d\n", &qpid) == 1) {
985 cred.pid = qpid;
01e71852
SH
986 ret = send_creds(sock[0], &cred, v, true);
987
988 if (ret == SEND_CREDS_NOTSK)
989 goto next;
990 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
991 goto out;
992
993 // read converted results
994 FD_ZERO(&s);
995 FD_SET(sock[0], &s);
6ee867dc 996 tv.tv_sec = 2;
a05660a6
SH
997 tv.tv_usec = 0;
998 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
999 if (ret <= 0) {
6ee867dc
SH
1000 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1001 __func__, strerror(errno));
a05660a6
SH
1002 goto out;
1003 }
1004 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1005 fprintf(stderr, "%s: error reading pid from child: %s\n",
1006 __func__, strerror(errno));
a05660a6
SH
1007 goto out;
1008 }
a05660a6 1009 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 1010next:
a05660a6
SH
1011 ptr = strchr(ptr, '\n');
1012 if (!ptr)
1013 break;
1014 ptr++;
1015 }
1016
1017 cred.pid = getpid();
1018 v = '1';
01e71852 1019 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1020 // failed to ask child to exit
6ee867dc
SH
1021 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1022 __func__, strerror(errno));
a05660a6
SH
1023 goto out;
1024 }
1025
1026 answer = true;
1027
1028out:
1029 if (cpid != -1)
1030 wait_for_pid(cpid);
1031 if (sock[0] != -1) {
1032 close(sock[0]);
1033 close(sock[1]);
1034 }
1035 return answer;
1036}
1037
99978832
SH
1038static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1039 struct fuse_file_info *fi)
1040{
99978832 1041 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1042 struct file_info *f = (struct file_info *)fi->fh;
99978832
SH
1043 nih_local struct cgm_keys *k = NULL;
1044
443d13f5 1045 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1046 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1047 return -EIO;
1048 }
1049
99978832 1050 if (offset)
7253e0a4 1051 return 0;
99978832
SH
1052
1053 if (!fc)
1054 return -EIO;
1055
8f6e8f5e 1056 if (!f->controller)
99978832
SH
1057 return -EINVAL;
1058
8f6e8f5e 1059 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
99978832 1060 nih_local char *data = NULL;
4775fba1
SH
1061 int s;
1062 bool r;
99978832 1063
8f6e8f5e 1064 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
f9a05025
SH
1065 // should never get here
1066 return -EACCES;
99978832 1067
8f6e8f5e
SH
1068 if (strcmp(f->file, "tasks") == 0 ||
1069 strcmp(f->file, "/tasks") == 0 ||
1070 strcmp(f->file, "/cgroup.procs") == 0 ||
1071 strcmp(f->file, "cgroup.procs") == 0)
a05660a6 1072 // special case - we have to translate the pids
8f6e8f5e 1073 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
a05660a6 1074 else
8f6e8f5e 1075 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
a05660a6 1076
4775fba1 1077 if (!r)
99978832
SH
1078 return -EINVAL;
1079
4775fba1
SH
1080 if (!data)
1081 return 0;
99978832
SH
1082 s = strlen(data);
1083 if (s > size)
1084 s = size;
1085 memcpy(buf, data, s);
5ea0727e
SH
1086 if (s > 0 && s < size && data[s-1] != '\n')
1087 buf[s++] = '\n';
99978832 1088
99978832
SH
1089 return s;
1090 }
1091
1092 return -EINVAL;
1093}
1094
4775fba1
SH
1095static void pid_from_ns(int sock, pid_t tpid)
1096{
1097 pid_t vpid;
1098 struct ucred cred;
1099 char v;
6ee867dc
SH
1100 struct timeval tv;
1101 fd_set s;
1102 int ret;
4775fba1
SH
1103
1104 cred.uid = 0;
1105 cred.gid = 0;
6ee867dc
SH
1106 while (1) {
1107 FD_ZERO(&s);
1108 FD_SET(sock, &s);
1109 tv.tv_sec = 2;
1110 tv.tv_usec = 0;
1111 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1112 if (ret <= 0) {
1113 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc
SH
1114 __func__, strerror(errno));
1115 exit(1);
1116 }
1117 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1118 fprintf(stderr, "%s: bad read from parent: %s\n",
1119 __func__, strerror(errno));
1120 exit(1);
1121 }
4775fba1 1122 if (vpid == -1) // done
01e71852 1123 break;
4775fba1
SH
1124 v = '0';
1125 cred.pid = vpid;
01e71852 1126 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1127 v = '1';
1128 cred.pid = getpid();
01e71852 1129 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1130 exit(1);
1131 }
1132 }
1133 exit(0);
1134}
1135
/*
 * Enter tpid's pid namespace with setns() and fork a child which runs
 * pid_from_ns() on sock.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no
 * acknowledgement arrives within one second the child is killed, reaped
 * and forked again.
 *
 * Never returns.  Exits 1 when entering the namespace, creating the pipe
 * or forking fails, or when the child does not exit cleanly; exits 0
 * when the child exited with status 0.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			/* does not return */
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == 1 && v == '1')
			break;

		/* no (valid) ack: get rid of this child and retry */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1198
/*
 * Move the pids listed in @buf (one decimal pid per line, numbered as seen
 * in the pid namespace of @tpid) into cgroup @cg of controller @contrl.
 *
 * A helper is forked which enters @tpid's pid namespace
 * (pid_from_ns_wrapper).  Each pid is written to it over a socketpair and
 * the reply is read back with recv_creds(); when the reply carries '0',
 * cred.pid is handed to cgm_move_pid().
 *
 * @file is currently unused.
 *
 * Returns true if the list was processed and no cgm_move_pid() call
 * failed, false otherwise.
 */
static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	bool answer = false, fail = false;
	const char *ptr = buf;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		/*
		 * This runs inside a FUSE write handler: fail this one
		 * request instead of taking the whole process down.
		 */
		perror("socketpair");
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child
		pid_from_ns_wrapper(sock[1], tpid);

	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!cgm_move_pid(contrl, cg, cred.pid))
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
1262
2ad6d2bd
SH
1263int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1264 struct fuse_file_info *fi)
1265{
2ad6d2bd 1266 struct fuse_context *fc = fuse_get_context();
47cbf0e5 1267 nih_local char *localbuf = NULL;
8f6e8f5e
SH
1268 nih_local struct cgm_keys *k = NULL;
1269 struct file_info *f = (struct file_info *)fi->fh;
2ad6d2bd 1270
443d13f5 1271 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1272 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1273 return -EIO;
1274 }
1275
2ad6d2bd 1276 if (offset)
7253e0a4 1277 return 0;
2ad6d2bd
SH
1278
1279 if (!fc)
1280 return -EIO;
1281
47cbf0e5
SH
1282 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1283 localbuf[size] = '\0';
1284 memcpy(localbuf, buf, size);
2ad6d2bd 1285
8f6e8f5e 1286 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
4775fba1
SH
1287 bool r;
1288
8f6e8f5e 1289 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
f9a05025 1290 return -EACCES;
2ad6d2bd 1291
8f6e8f5e
SH
1292 if (strcmp(f->file, "tasks") == 0 ||
1293 strcmp(f->file, "/tasks") == 0 ||
1294 strcmp(f->file, "/cgroup.procs") == 0 ||
1295 strcmp(f->file, "cgroup.procs") == 0)
4775fba1 1296 // special case - we have to translate the pids
8f6e8f5e 1297 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
4775fba1 1298 else
8f6e8f5e 1299 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
4775fba1
SH
1300
1301 if (!r)
2ad6d2bd
SH
1302 return -EINVAL;
1303
1304 return size;
1305 }
1306
1307 return -EINVAL;
1308}
1309
341b21ad
SH
1310int cg_chown(const char *path, uid_t uid, gid_t gid)
1311{
1312 struct fuse_context *fc = fuse_get_context();
1313 nih_local char * cgdir = NULL;
1314 char *fpath = NULL, *path1, *path2;
1315 nih_local struct cgm_keys *k = NULL;
1316 const char *cgroup;
1317 nih_local char *controller = NULL;
1318
1319
1320 if (!fc)
1321 return -EIO;
1322
1323 if (strcmp(path, "/cgroup") == 0)
1324 return -EINVAL;
1325
1326 controller = pick_controller_from_path(fc, path);
1327 if (!controller)
f9a05025 1328 return -EINVAL;
341b21ad
SH
1329 cgroup = find_cgroup_in_path(path);
1330 if (!cgroup)
1331 /* this is just /cgroup/controller */
1332 return -EINVAL;
1333
1334 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1335
1336 if (!fpath) {
1337 path1 = "/";
1338 path2 = cgdir;
1339 } else {
1340 path1 = cgdir;
1341 path2 = fpath;
1342 }
1343
1344 if (is_child_cgroup(controller, path1, path2)) {
1345 // get uid, gid, from '/tasks' file and make up a mode
1346 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1347 k = get_cgroup_key(controller, cgroup, "tasks");
1348
1349 } else
1350 k = get_cgroup_key(controller, path1, path2);
1351
1352 if (!k)
1353 return -EINVAL;
1354
1355 /*
1356 * This being a fuse request, the uid and gid must be valid
1357 * in the caller's namespace. So we can just check to make
1358 * sure that the caller is root in his uid, and privileged
1359 * over the file's current owner.
1360 */
1361 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1362 return -EACCES;
341b21ad
SH
1363
1364 if (!cgm_chown_file(controller, cgroup, uid, gid))
1365 return -EINVAL;
1366 return 0;
1367}
2ad6d2bd 1368
fd2e4e03
SH
1369int cg_chmod(const char *path, mode_t mode)
1370{
0a1bb5ea
SH
1371 struct fuse_context *fc = fuse_get_context();
1372 nih_local char * cgdir = NULL;
1373 char *fpath = NULL, *path1, *path2;
1374 nih_local struct cgm_keys *k = NULL;
1375 const char *cgroup;
1376 nih_local char *controller = NULL;
1377
1378 if (!fc)
1379 return -EIO;
1380
1381 if (strcmp(path, "/cgroup") == 0)
1382 return -EINVAL;
1383
1384 controller = pick_controller_from_path(fc, path);
1385 if (!controller)
f9a05025 1386 return -EINVAL;
0a1bb5ea
SH
1387 cgroup = find_cgroup_in_path(path);
1388 if (!cgroup)
1389 /* this is just /cgroup/controller */
1390 return -EINVAL;
1391
1392 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1393
1394 if (!fpath) {
1395 path1 = "/";
1396 path2 = cgdir;
1397 } else {
1398 path1 = cgdir;
1399 path2 = fpath;
1400 }
1401
1402 if (is_child_cgroup(controller, path1, path2)) {
1403 // get uid, gid, from '/tasks' file and make up a mode
1404 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1405 k = get_cgroup_key(controller, cgroup, "tasks");
1406
1407 } else
1408 k = get_cgroup_key(controller, path1, path2);
1409
1410 if (!k)
1411 return -EINVAL;
1412
1413 /*
1414 * This being a fuse request, the uid and gid must be valid
1415 * in the caller's namespace. So we can just check to make
1416 * sure that the caller is root in his uid, and privileged
1417 * over the file's current owner.
1418 */
1419 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1420 return -EPERM;
1421
1422 if (!cgm_chmod_file(controller, cgroup, mode))
1423 return -EINVAL;
1424 return 0;
fd2e4e03
SH
1425}
1426
ab54b798
SH
1427int cg_mkdir(const char *path, mode_t mode)
1428{
1429 struct fuse_context *fc = fuse_get_context();
1430 nih_local struct cgm_keys **list = NULL;
1431 char *fpath = NULL, *path1;
1432 nih_local char * cgdir = NULL;
1433 const char *cgroup;
1434 nih_local char *controller = NULL;
1435
ab54b798
SH
1436 if (!fc)
1437 return -EIO;
1438
1439
1440 controller = pick_controller_from_path(fc, path);
1441 if (!controller)
f9a05025 1442 return -EINVAL;
ab54b798
SH
1443
1444 cgroup = find_cgroup_in_path(path);
1445 if (!cgroup)
f9a05025 1446 return -EINVAL;
ab54b798
SH
1447
1448 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1449 if (!fpath)
1450 path1 = "/";
1451 else
1452 path1 = cgdir;
1453
1454 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1455 return -EACCES;
ab54b798
SH
1456
1457
1458 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1459 return -EINVAL;
1460
1461 return 0;
1462}
1463
50d8d5b5
SH
1464static int cg_rmdir(const char *path)
1465{
1466 struct fuse_context *fc = fuse_get_context();
1467 nih_local struct cgm_keys **list = NULL;
1468 char *fpath = NULL;
1469 nih_local char * cgdir = NULL;
1470 const char *cgroup;
1471 nih_local char *controller = NULL;
1472
1473 if (!fc)
1474 return -EIO;
1475
1476
1477 controller = pick_controller_from_path(fc, path);
1478 if (!controller)
f9a05025 1479 return -EINVAL;
50d8d5b5
SH
1480
1481 cgroup = find_cgroup_in_path(path);
1482 if (!cgroup)
f9a05025 1483 return -EINVAL;
50d8d5b5
SH
1484
1485 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1486 if (!fpath)
1487 return -EINVAL;
1488
1489 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1490 return -EACCES;
50d8d5b5
SH
1491
1492 if (!cgm_remove(controller, cgroup))
1493 return -EINVAL;
1494
1495 return 0;
1496}
1497
2dc17609
SH
/* Return true iff @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1504
/*
 * Scan the newline-separated text @memstat for the first line starting
 * with "total_cache" and store its value divided by 1024 in *@v.
 * *@v is set to 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1522
/*
 * Scan the newline-separated text @str for the first line starting with
 * "<major>:<minor> <iotype>" and store the number following that prefix
 * in *@v.  *@v is set to 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur != '\0'; ) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		/* advance to the start of the next line, if there is one */
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1545
2dc17609
SH
1546static char *get_pid_cgroup(pid_t pid, const char *contrl)
1547{
1548 nih_local char *fnam = NULL;
1549 FILE *f;
1550 char *answer = NULL;
1551 char *line = NULL;
1552 size_t len = 0;
1553
1554 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1555 if (!(f = fopen(fnam, "r")))
1556 return false;
1557
1558 while (getline(&line, &len, f) != -1) {
1559 char *c1, *c2;
1560 if (!line[0])
1561 continue;
1562 c1 = strchr(line, ':');
1563 if (!c1)
1564 goto out;
1565 c1++;
1566 c2 = strchr(c1, ':');
1567 if (!c2)
1568 goto out;
1569 *c2 = '\0';
1570 if (strcmp(c1, contrl) != 0)
1571 continue;
1572 c2++;
1573 stripnewline(c2);
1574 answer = NIH_MUST( nih_strdup(NULL, c2) );
1575 goto out;
1576 }
1577
1578out:
1579 fclose(f);
1580 free(line);
1581 return answer;
1582}
1583
758ad80c 1584/*
2ad6d2bd 1585 * FUSE ops for /proc
758ad80c 1586 */
758ad80c 1587
23ce2127
SH
1588static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1589 struct fuse_file_info *fi)
1590{
2dc17609 1591 struct fuse_context *fc = fuse_get_context();
97f1f27b 1592 struct file_info *d = (struct file_info *)fi->fh;
2dc17609
SH
1593 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1594 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1595 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1596 char *line = NULL;
1597 size_t linelen = 0, total_len = 0;
97f1f27b
YY
1598 char *cache = d->buf;
1599 size_t cache_size = d->buflen;
2dc17609
SH
1600 FILE *f;
1601
97f1f27b
YY
1602 if (offset){
1603 if (offset > d->size)
1604 return -EINVAL;
1605 int left = d->size - offset;
1606 total_len = left > size ? size: left;
1607 memcpy(buf, cache + offset, total_len);
1608 return total_len;
1609 }
2dc17609
SH
1610
1611 if (!cg)
1612 return 0;
1613
1614 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1615 return 0;
1616 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1617 return 0;
1618 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1619 return 0;
1620 memlimit = strtoul(memlimit_str, NULL, 10);
1621 memusage = strtoul(memusage_str, NULL, 10);
1622 memlimit /= 1024;
1623 memusage /= 1024;
1624 get_mem_cached(memstat_str, &cached);
1625
1626 f = fopen("/proc/meminfo", "r");
1627 if (!f)
1628 return 0;
1629
1630 while (getline(&line, &linelen, f) != -1) {
1631 size_t l;
1632 char *printme, lbuf[100];
1633
1634 memset(lbuf, 0, 100);
1635 if (startswith(line, "MemTotal:")) {
1636 sscanf(line+14, "%lu", &hosttotal);
1637 if (hosttotal < memlimit)
1638 memlimit = hosttotal;
1639 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1640 printme = lbuf;
1641 } else if (startswith(line, "MemFree:")) {
1642 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1643 printme = lbuf;
1644 } else if (startswith(line, "MemAvailable:")) {
1645 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1646 printme = lbuf;
1647 } else if (startswith(line, "Buffers:")) {
1648 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1649 printme = lbuf;
1650 } else if (startswith(line, "Cached:")) {
1651 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1652 printme = lbuf;
1653 } else if (startswith(line, "SwapCached:")) {
1654 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1655 printme = lbuf;
1656 } else
1657 printme = line;
97f1f27b
YY
1658
1659 l = snprintf(cache, cache_size, "%s", printme);
1660 cache += l;
1661 cache_size -= l;
2f919d9d 1662 total_len += l;
2dc17609
SH
1663 }
1664
97f1f27b
YY
1665 d->size = total_len;
1666 if (total_len > size ) total_len = size;
1667 memcpy(buf, d->buf, total_len);
1668
92c84dc4
SH
1669 fclose(f);
1670 free(line);
2dc17609 1671 return total_len;
23ce2127
SH
1672}
1673
/*
 * Fetch the cpuset.cpus value of cgroup @cg.
 * Returns the answer as a nih-allocated string, or NULL if
 * cgm_get_value() fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1686
/*
 * Helper functions for cpu_in_cpuset()
 */

/*
 * Return a pointer to the token following the next ',' in @c, or NULL if
 * there is no further token.  The search starts at c+1, so a ',' in the
 * very first position is skipped.  NULL and the empty string yield NULL.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* c+1 would point past the terminator of an empty string */
	if (!c || !*c)
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}

/*
 * Parse the token at @c as "a-b" or as a single number "a".
 * Returns sscanf()'s result: 2 if both *@a and *@b were set, 1 if only
 * *@a was set, and 0 or EOF if nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	int ret;

	ret = sscanf(c, "%d-%d", a, b);
	return ret;
}

/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited ranges
 *
 * Return true iff @cpu is one of the single cpus or lies within one of the
 * ranges listed in @cpuset.  Tokens that do not match (or cannot be
 * parsed) are skipped and the search continues with the next token.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret == 1 && cpu == a) // single cpu, e.g. "1" or "1,6"
			return true;
		if (ret == 2 && cpu >= a && cpu <= b) // range, e.g. "2-3"
			return true;
	}

	return false;
}
1728
aeb56147
SH
/*
 * Return true iff @line is a "processor : N" line and cpu N is part of
 * @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	bool parsed = (sscanf(line, "processor : %d", &cpu) == 1);

	return parsed ? cpu_in_cpuset(cpu, cpuset) : false;
}
1737
23ce2127
SH
/*
 * Check whether this is a "processor : N" line, i.e. the line that starts
 * a per-cpu section in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1749
23ce2127
SH
1750static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1751 struct fuse_file_info *fi)
1752{
1753 struct fuse_context *fc = fuse_get_context();
97f1f27b 1754 struct file_info *d = (struct file_info *)fi->fh;
23ce2127
SH
1755 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1756 nih_local char *cpuset = NULL;
1757 char *line = NULL;
1758 size_t linelen = 0, total_len = 0;
1759 bool am_printing = false;
1760 int curcpu = -1;
97f1f27b
YY
1761 char *cache = d->buf;
1762 size_t cache_size = d->buflen;
23ce2127
SH
1763 FILE *f;
1764
97f1f27b
YY
1765 if (offset){
1766 if (offset > d->size)
1767 return -EINVAL;
1768 int left = d->size - offset;
1769 total_len = left > size ? size: left;
1770 memcpy(buf, cache + offset, total_len);
2f919d9d 1771 return total_len;
97f1f27b 1772 }
23ce2127
SH
1773
1774 if (!cg)
1775 return 0;
1776
1777 cpuset = get_cpuset(cg);
1778 if (!cpuset)
1779 return 0;
1780
1781 f = fopen("/proc/cpuinfo", "r");
1782 if (!f)
1783 return 0;
1784
1785 while (getline(&line, &linelen, f) != -1) {
1786 size_t l;
1787 if (is_processor_line(line)) {
aeb56147 1788 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1789 if (am_printing) {
1790 curcpu ++;
97f1f27b
YY
1791 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1792 if (l < cache_size){
1793 cache += l;
1794 cache_size -= l;
1795 total_len += l;
1796 }else{
1797 cache += cache_size;
1798 total_len += cache_size;
1799 cache_size = 0;
1800 break;
1801 }
23ce2127
SH
1802 }
1803 continue;
1804 }
1805 if (am_printing) {
97f1f27b
YY
1806 l = snprintf(cache, cache_size, "%s", line);
1807 if (l < cache_size) {
1808 cache += l;
1809 cache_size -= l;
1810 total_len += l;
1811 } else {
1812 cache += cache_size;
1813 total_len += cache_size;
1814 cache_size = 0;
1815 break;
1816 }
23ce2127
SH
1817 }
1818 }
1819
97f1f27b
YY
1820 d->size = total_len;
1821 if (total_len > size ) total_len = size;
1822
1823 /* read from off 0 */
1824 memcpy(buf, d->buf, total_len);
1825
92c84dc4
SH
1826 fclose(f);
1827 free(line);
23ce2127
SH
1828 return total_len;
1829}
1830
1831static int proc_stat_read(char *buf, size_t size, off_t offset,
1832 struct fuse_file_info *fi)
1833{
aeb56147 1834 struct fuse_context *fc = fuse_get_context();
97f1f27b 1835 struct file_info *d = (struct file_info *)fi->fh;
aeb56147
SH
1836 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1837 nih_local char *cpuset = NULL;
1838 char *line = NULL;
1839 size_t linelen = 0, total_len = 0;
2a0fde62 1840 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
1841 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1842 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1843 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1844#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1845 char cpuall[CPUALL_MAX_SIZE];
1846 /* reserve for cpu all */
1847 char *cache = d->buf + CPUALL_MAX_SIZE;
1848 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
aeb56147
SH
1849 FILE *f;
1850
97f1f27b
YY
1851 if (offset){
1852 if (offset > d->size)
1853 return -EINVAL;
1854 int left = d->size - offset;
1855 total_len = left > size ? size: left;
1856 memcpy(buf, d->buf + offset, total_len);
2f919d9d 1857 return total_len;
97f1f27b 1858 }
aeb56147
SH
1859
1860 if (!cg)
1861 return 0;
1862
1863 cpuset = get_cpuset(cg);
1864 if (!cpuset)
1865 return 0;
1866
1867 f = fopen("/proc/stat", "r");
1868 if (!f)
1869 return 0;
1870
97f1f27b
YY
1871 //skip first line
1872 if (getline(&line, &linelen, f) < 0) {
1873 fprintf(stderr, "proc_stat_read read first line failed\n");
1874 goto out;
1875 }
1876
aeb56147
SH
1877 while (getline(&line, &linelen, f) != -1) {
1878 size_t l;
1879 int cpu;
2a0fde62 1880 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
1881 char *c;
1882
2a0fde62
CB
1883 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1884 /* not a ^cpuN line containing a number N, just print it */
97f1f27b
YY
1885 l = snprintf(cache, cache_size, "%s", line);
1886 if (l < cache_size){
1887 cache += l;
1888 cache_size -= l;
1889 total_len += l;
1890 continue;
1891 }else{
1892 //no more space, break it
1893 cache += cache_size;
1894 total_len += cache_size;
1895 cache_size = 0;
1896 break;
1897 }
aeb56147 1898 }
2a0fde62
CB
1899
1900 if (sscanf(cpu_char, "%d", &cpu) != 1)
1901 continue;
aeb56147
SH
1902 if (!cpu_in_cpuset(cpu, cpuset))
1903 continue;
1904 curcpu ++;
1905
1906 c = strchr(line, ' ');
1907 if (!c)
1908 continue;
25c5e8fb 1909 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
97f1f27b
YY
1910 cache += l;
1911 cache_size -= l;
aeb56147 1912 total_len += l;
2f919d9d 1913
97f1f27b
YY
1914 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1915 &softirq, &steal, &guest) != 9)
1916 continue;
1917 user_sum += user;
1918 nice_sum += nice;
1919 system_sum += system;
1920 idle_sum += idle;
1921 iowait_sum += iowait;
1922 irq_sum += irq;
1923 softirq_sum += softirq;
1924 steal_sum += steal;
2f919d9d 1925 guest_sum += guest;
97f1f27b
YY
1926 }
1927
1928 cache = d->buf;
1929
2f919d9d 1930 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
1931 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1932 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1933 memcpy(cache, cpuall, cpuall_len);
2f919d9d 1934 cache += cpuall_len;
97f1f27b
YY
1935 }else{
1936 /* shouldn't happen */
1937 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1938 cpuall_len = 0;
1939 }
1940
1941 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1942 total_len += cpuall_len;
1943 d->size = total_len;
1944 if (total_len > size ) total_len = size;
1945
1946 memcpy(buf, d->buf, total_len);
97f1f27b 1947out:
92c84dc4
SH
1948 fclose(f);
1949 free(line);
aeb56147 1950 return total_len;
23ce2127
SH
1951}
1952
7bbf2246
SH
1953/*
1954 * How to guess what to present for uptime?
1955 * One thing we could do would be to take the date on the caller's
1956 * memory.usage_in_bytes file, which should equal the time of creation
1957 * of his cgroup. However, a task could be in a sub-cgroup of the
1958 * container. The same problem exists if we try to look at the ages
1959 * of processes in the caller's cgroup.
1960 *
1961 * So we'll fork a task that will enter the caller's pidns, mount a
1962 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1963 *
1964 * For the second uptime #, we'll do as Stéphane had done, just copy
1965 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1966 * time. Maybe someone can come up with a good algorithm and submit a
1967 * patch. Maybe something based on cpushare info?
1968 */
41bb9357
SH
1969
/*
 * Return the age of the reaper for $pid, taken from the ctime of its
 * procdir.
 *
 * The calling process unshares its mount namespace, joins the pid
 * namespace of @pid and forks.  The forked child mounts a fresh procfs,
 * stats /proc/1 and *returns* now - ctime to its caller.  The calling
 * process itself only returns (with 0) when setup fails; otherwise it
 * waits for the child and exits.
 *
 * The child acks its start over a pipe; if no ack arrives within one
 * second it is killed and forked again.
 */
static long int get_pid1_time(pid_t pid)
{
	char fnam[100];
	int fd, cpipe[2], ret;
	struct stat sb;
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	if (unshare(CLONE_NEWNS))
		return 0;

	if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		perror("rslave mount failed");
		return 0;
	}

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		return 0;

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(fd, 0)) {
		perror("get_pid1_time setns 1");
		close(fd);
		return 0;
	}
	close(fd);

	/* report failure like every other setup step does */
	if (pipe(cpipe) < 0)
		return 0;

loop:
	cpid = fork();
	if (cpid < 0)
		return 0;

	if (!cpid) {
		char b = '1';

		/* child: ack to the parent, then look at a fresh /proc */
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): error on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		umount2("/proc", MNT_DETACH);
		if (mount("proc", "/proc", "proc", 0, NULL)) {
			perror("get_pid1_time mount");
			return 0;
		}
		ret = lstat("/proc/1", &sb);
		if (ret) {
			perror("get_pid1_time lstat");
			return 0;
		}
		return time(NULL) - sb.st_ctime;
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	wait_for_pid(cpid);
	exit(0);

again:
	/* no (valid) ack: get rid of this child and try with a new one */
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
2056
/*
 * Return the value get_pid1_time() computes for @qpid, or 0 on failure.
 *
 * get_pid1_time() changes namespaces, so it is run in a forked child which
 * sends the result back over a pipe.  The parent waits at most one second
 * for the answer.
 */
static long int getreaperage(pid_t qpid)
{
	int mypipe[2], ret;
	pid_t pid;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* nothing to wait for: just release the pipe */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2102
/*
 * Return the second field of /proc/uptime (idle time), truncated to whole
 * seconds, or 0 if the file cannot be opened or parsed.
 *
 * Both fields are decimal fractions ("12345.67 8901.23"), so they are
 * parsed as doubles: an integer conversion would stop at the first '.'
 * and the second field would never be read.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2116
2117/*
2118 * We read /proc/uptime and reuse its second field.
2119 * For the first field, we use the mtime for the reaper for
2120 * the calling pid as returned by getreaperage
2121 */
23ce2127
SH
2122static int proc_uptime_read(char *buf, size_t size, off_t offset,
2123 struct fuse_file_info *fi)
2124{
41bb9357 2125 struct fuse_context *fc = fuse_get_context();
97f1f27b 2126 struct file_info *d = (struct file_info *)fi->fh;
41bb9357
SH
2127 long int reaperage = getreaperage(fc->pid);;
2128 long int idletime = getprocidle();
97f1f27b 2129 size_t total_len = 0;
41bb9357 2130
97f1f27b
YY
2131 if (offset){
2132 if (offset > d->size)
2133 return -EINVAL;
2134 return 0;
2135 }
2136
2137 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2138 d->size = total_len;
2139 return total_len;
23ce2127
SH
2140}
2141
49878439
YY
2142static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2143 struct fuse_file_info *fi)
2144{
2145 char dev_name[72];
2146 struct fuse_context *fc = fuse_get_context();
97f1f27b 2147 struct file_info *d = (struct file_info *)fi->fh;
49878439
YY
2148 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2149 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2150 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2151 unsigned long read = 0, write = 0;
2152 unsigned long read_merged = 0, write_merged = 0;
2153 unsigned long read_sectors = 0, write_sectors = 0;
2154 unsigned long read_ticks = 0, write_ticks = 0;
2155 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2156 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2157 char *line = NULL;
2158 size_t linelen = 0, total_len = 0;
2159 unsigned int major = 0, minor = 0;
2160 int i = 0;
2161 FILE *f;
2162
97f1f27b
YY
2163 if (offset){
2164 if (offset > d->size)
2165 return -EINVAL;
2166 return 0;
2167 }
49878439
YY
2168
2169 if (!cg)
2170 return 0;
2171
2172 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2173 return 0;
2174 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2175 return 0;
2176 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2177 return 0;
2178 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2179 return 0;
2180 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2181 return 0;
2182
2183
2184 f = fopen("/proc/diskstats", "r");
2185 if (!f)
2186 return 0;
2187
2188 while (getline(&line, &linelen, f) != -1) {
2189 size_t l;
2190 char *printme, lbuf[256];
2191
c0adec85 2192 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
49878439
YY
2193 if(i == 3){
2194 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2195 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2196 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2197 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2198 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2199 read_sectors = read_sectors/512;
2200 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2201 write_sectors = write_sectors/512;
2f919d9d 2202
49878439
YY
2203 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2204 rd_svctm = rd_svctm/1000000;
2205 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2206 rd_wait = rd_wait/1000000;
2207 read_ticks = rd_svctm + rd_wait;
2208
2209 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2210 wr_svctm = wr_svctm/1000000;
2211 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2212 wr_wait = wr_wait/1000000;
2213 write_ticks = wr_svctm + wr_wait;
2214
2215 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2216 tot_ticks = tot_ticks/1000000;
2217 }else{
2218 continue;
2219 }
2220
2221 memset(lbuf, 0, 256);
2222 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2223 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2224 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2225 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2226 printme = lbuf;
2227 } else
2228 continue;
2229
2230 l = snprintf(buf, size, "%s", printme);
2231 buf += l;
2232 size -= l;
2233 total_len += l;
2234 }
2235
97f1f27b
YY
2236 d->size = total_len;
2237
49878439
YY
2238 fclose(f);
2239 free(line);
2240 return total_len;
2241}
2242
23ce2127
SH
/*
 * Return the number of bytes that can be read from the file @which, or 0
 * if it cannot be opened.  The file is read to the end and the bytes are
 * counted.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	ssize_t answer = 0;
	size_t got;
	FILE *f;

	f = fopen(which, "r");
	if (f == NULL)
		return 0;

	for (;;) {
		got = fread(chunk, 1, sizeof(chunk), f);
		if (got == 0)
			break;
		answer += got;
	}

	fclose(f);
	return answer;
}
2259
758ad80c
SH
/*
 * getattr handler for /proc and the files emulated below it.
 *
 * "/proc" is reported as a directory (mode 0555, 2 links); the emulated
 * files are reported as regular files (mode 0444, 1 link, size 0).  Owner
 * and group are 0 and all timestamps are the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT
 * for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated) / sizeof(emulated[0]); i++) {
		if (strcmp(path, emulated[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2287
2288static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2289 struct fuse_file_info *fi)
2290{
2291 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2292 filler(buf, "meminfo", NULL, 0) != 0 ||
2293 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2294 filler(buf, "uptime", NULL, 0) != 0 ||
2295 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2296 return -EINVAL;
758ad80c
SH
2297 return 0;
2298}
2299
35629743
SH
2300static int proc_open(const char *path, struct fuse_file_info *fi)
2301{
96fc5ee6
SH
2302 int type = -1;
2303 struct file_info *info;
2304
2305 if (strcmp(path, "/proc/meminfo") == 0)
2306 type = LXC_TYPE_PROC_MEMINFO;
2307 else if (strcmp(path, "/proc/cpuinfo") == 0)
2308 type = LXC_TYPE_PROC_CPUINFO;
2309 else if (strcmp(path, "/proc/uptime") == 0)
2310 type = LXC_TYPE_PROC_UPTIME;
2311 else if (strcmp(path, "/proc/stat") == 0)
2312 type = LXC_TYPE_PROC_STAT;
2313 else if (strcmp(path, "/proc/diskstats") == 0)
2314 type = LXC_TYPE_PROC_DISKSTATS;
2315 if (type == -1)
2316 return -ENOENT;
2317
2318 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2319 memset(info, 0, sizeof(*info));
2320 info->type = type;
2321
97f1f27b 2322 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
25c5e8fb 2323 info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
97f1f27b
YY
2324 memset(info->buf, 0, info->buflen);
2325 /* set actual size to buffer size */
2f919d9d 2326 info->size = info->buflen;
97f1f27b 2327
96fc5ee6
SH
2328 fi->fh = (unsigned long)info;
2329 return 0;
2330}
2331
2332static int proc_release(const char *path, struct fuse_file_info *fi)
2333{
2334 struct file_info *f = (struct file_info *)fi->fh;
2335
2336 do_release_file_info(f);
2337 return 0;
35629743
SH
2338}
2339
35629743
SH
2340static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2341 struct fuse_file_info *fi)
2342{
96fc5ee6
SH
2343 struct file_info *f = (struct file_info *) fi->fh;
2344
2345 switch (f->type) {
2f919d9d 2346 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2347 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2348 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2349 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2350 case LXC_TYPE_PROC_UPTIME:
23ce2127 2351 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2352 case LXC_TYPE_PROC_STAT:
23ce2127 2353 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2354 case LXC_TYPE_PROC_DISKSTATS:
49878439 2355 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2356 default:
2357 return -EINVAL;
2358 }
35629743
SH
2359}
2360
2ad6d2bd
SH
2361/*
2362 * FUSE ops for /
2363 * these just delegate to the /proc and /cgroup ops as
2364 * needed
2365 */
758ad80c
SH
2366
2367static int lxcfs_getattr(const char *path, struct stat *sb)
2368{
2369 if (strcmp(path, "/") == 0) {
2370 sb->st_mode = S_IFDIR | 00755;
2371 sb->st_nlink = 2;
2372 return 0;
2373 }
2374 if (strncmp(path, "/cgroup", 7) == 0) {
2375 return cg_getattr(path, sb);
2376 }
35629743 2377 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2378 return proc_getattr(path, sb);
2379 }
2380 return -EINVAL;
2381}
2382
/*
 * Top-level opendir handler.
 *
 * "/" and "/proc" need no per-directory state and succeed immediately;
 * paths beginning with "/cgroup" are delegated to cg_opendir().
 * Anything else yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	bool is_root = (strcmp(path, "/") == 0);
	bool is_proc = (strcmp(path, "/proc") == 0);

	if (is_root || is_proc)
		return 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	return -ENOENT;
}
2395
2396static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2397 struct fuse_file_info *fi)
2398{
2399 if (strcmp(path, "/") == 0) {
2400 if (filler(buf, "proc", NULL, 0) != 0 ||
2401 filler(buf, "cgroup", NULL, 0) != 0)
2402 return -EINVAL;
2403 return 0;
2404 }
35629743 2405 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2406 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2407 if (strcmp(path, "/proc") == 0)
2408 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2409 return -EINVAL;
2410}
2411
/*
 * Top-level releasedir handler.
 *
 * Nothing to release for "/" and "/proc"; paths beginning with "/cgroup"
 * are delegated to cg_releasedir().  Any other path yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	bool is_root = (strcmp(path, "/") == 0);
	bool is_proc = (strcmp(path, "/proc") == 0);

	if (is_root || is_proc)
		return 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	return -EINVAL;
}
2423
99978832
SH
/*
 * Top-level open handler: route by path prefix to cg_open() ("/cgroup")
 * or proc_open() ("/proc"); -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_open(path, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_open(path, fi);
	}

	return -EINVAL;
}
2433
/*
 * Top-level read handler: route by path prefix to cg_read() ("/cgroup")
 * or proc_read() ("/proc"); -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_read(path, buf, size, offset, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_read(path, buf, size, offset, fi);
	}

	return -EINVAL;
}
2444
2ad6d2bd
SH
/*
 * Top-level write handler: only paths beginning with "/cgroup" are
 * handled here (delegated to cg_write()); every other path yields
 * -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2454
99978832
SH
/* flush handler: nothing to do, always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2459
/*
 * Top-level release handler: route by path prefix to cg_release()
 * ("/cgroup") or proc_release() ("/proc"); -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_release(path, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_release(path, fi);
	}

	return -EINVAL;
}
2469
/* fsync handler: nothing to do, always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2474
ab54b798
SH
/*
 * Top-level mkdir handler: paths beginning with "/cgroup" are delegated
 * to cg_mkdir(); every other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2482
341b21ad
SH
/*
 * Top-level chown handler: paths beginning with "/cgroup" are delegated
 * to cg_chown(); every other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2490
2ad6d2bd
SH
/*
 * Top-level truncate handler.
 *
 * cat issues a truncate before it calls ops->write, which has no real
 * meaning for cgroup files.  So for paths beginning with "/cgroup" this
 * reports success without doing anything; every other path yields
 * -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2502
50d8d5b5
SH
/*
 * Top-level rmdir handler: paths beginning with "/cgroup" are delegated
 * to cg_rmdir(); every other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2509
fd2e4e03
SH
/*
 * Top-level chmod handler: paths beginning with "/cgroup" are delegated
 * to cg_chmod(); every other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2516
758ad80c
SH
2517const struct fuse_operations lxcfs_ops = {
2518 .getattr = lxcfs_getattr,
2519 .readlink = NULL,
2520 .getdir = NULL,
2521 .mknod = NULL,
ab54b798 2522 .mkdir = lxcfs_mkdir,
758ad80c 2523 .unlink = NULL,
50d8d5b5 2524 .rmdir = lxcfs_rmdir,
758ad80c
SH
2525 .symlink = NULL,
2526 .rename = NULL,
2527 .link = NULL,
fd2e4e03 2528 .chmod = lxcfs_chmod,
341b21ad 2529 .chown = lxcfs_chown,
2ad6d2bd 2530 .truncate = lxcfs_truncate,
758ad80c 2531 .utime = NULL,
99978832
SH
2532
2533 .open = lxcfs_open,
2534 .read = lxcfs_read,
2535 .release = lxcfs_release,
2ad6d2bd 2536 .write = lxcfs_write,
99978832 2537
758ad80c 2538 .statfs = NULL,
99978832
SH
2539 .flush = lxcfs_flush,
2540 .fsync = lxcfs_fsync,
758ad80c
SH
2541
2542 .setxattr = NULL,
2543 .getxattr = NULL,
2544 .listxattr = NULL,
2545 .removexattr = NULL,
2546
2547 .opendir = lxcfs_opendir,
2548 .readdir = lxcfs_readdir,
2549 .releasedir = lxcfs_releasedir,
2550
2551 .fsyncdir = NULL,
2552 .init = NULL,
2553 .destroy = NULL,
2554 .access = NULL,
2555 .create = NULL,
2556 .ftruncate = NULL,
2557 .fgetattr = NULL,
2558};
2559
/*
 * Print the command-line synopsis to stderr, using @me as the program
 * name, then terminate the process with exit status 1.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);

	exit(1);
}
2568
/*
 * Return true if @w is one of the accepted spellings of the help flag
 * ("-h", "--help", "-help" or "help"), false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}

	return false;
}
2578
0b0f73db
SH
/*
 * Remove the first argument equal to @which from the NULL-terminated
 * @argv (argv[0] is never considered), shifting the following entries
 * and the terminating NULL down by one slot and decrementing *@argcp.
 * Nothing is changed when @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	int pos = 1;

	/* find the first match, or stop on the NULL terminator */
	while (argv[pos] && strcmp(argv[pos], which) != 0)
		pos++;
	if (!argv[pos])
		return;

	/* close the gap; the final copy moves the NULL terminator down */
	do {
		argv[pos] = argv[pos + 1];
		pos++;
	} while (argv[pos]);

	--*argcp;
}
2593
/*
 * Remove the first "@opt @v" pair from the NULL-terminated @argv
 * (argv[0] is never considered), shifting the following entries and the
 * terminating NULL down by two slots and subtracting 2 from *@argcp.
 *
 * If @opt is found but is followed by a value other than @v, the
 * offending value is reported on stderr and the process exits with
 * status 1.  An @opt that is the last argument (no value after it) is
 * left alone.  Nothing is changed when @opt is not present.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		/* an option needs a value after it; the last argument cannot match */
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report the value actually given, not the one we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+1] non-NULL guarantees argv[i+2] is at most the terminator */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2614
758ad80c
SH
2615int main(int argc, char *argv[])
2616{
c0adec85 2617 int ret = -1;
e5d26e0b 2618 struct lxcfs_state *d = NULL;
0b0f73db
SH
2619 /*
2620 * what we pass to fuse_main is:
2621 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2622 */
2623#define NARGS 7
2624 char *newargv[7];
758ad80c 2625
0b0f73db
SH
2626 /* accomodate older init scripts */
2627 swallow_arg(&argc, argv, "-s");
2628 swallow_arg(&argc, argv, "-f");
2629 swallow_option(&argc, argv, "-o", "allow_other");
2630
2631 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
2632 usage(argv[0]);
2633
0b0f73db
SH
2634 d = NIH_MUST( malloc(sizeof(*d)) );
2635
2636 newargv[0] = argv[0];
2637 newargv[1] = "-s";
2638 newargv[2] = "-f";
2639 newargv[3] = "-o";
7253e0a4 2640 newargv[4] = "allow_other,direct_io";
0b0f73db
SH
2641 newargv[5] = argv[1];
2642 newargv[6] = NULL;
758ad80c
SH
2643
2644 if (!cgm_escape_cgroup())
2645 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2646
2647 if (!cgm_get_controllers(&d->subsystems))
c0adec85 2648 goto out;
758ad80c 2649
0b0f73db 2650 ret = fuse_main(NARGS - 1, newargv, &lxcfs_ops, d);
758ad80c 2651
c0adec85 2652out:
e5d26e0b 2653 free(d);
758ad80c 2654 return ret;
2183082c 2655}