]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
Provide the fuse options we want ourselves
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357
SH
25#include <sys/mount.h>
26#include <wait.h>
758ad80c
SH
27
28#include <nih/alloc.h>
29#include <nih/string.h>
30
31#include "cgmanager.h"
32
/*
 * Process-wide lxcfs state.  FUSE callbacks reach it through the
 * private_data pointer of the current fuse context (see LXCFS_DATA).
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array holding the names of the
	 * mounted subsystems.  It is detected at startup.
	 */
	char **subsystems;
};
/* The lxcfs_state stored as private_data in the current fuse context. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
41
443d13f5
SH
/*
 * Tag stored in file_info.type.  cg_opendir sets LXC_TYPE_CGDIR and
 * cg_open sets LXC_TYPE_CGFILE; cg_readdir and cg_read reject handles
 * carrying any other tag.  The LXC_TYPE_PROC_* values are not referenced
 * in this part of the file (presumably they tag emulated /proc files).
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
51
c688e1b3
SH
/*
 * Per-open-handle bookkeeping.  A pointer to one of these is stashed in
 * fuse_file_info.fh by the open/opendir callbacks and released again by
 * the matching release callback.
 */
struct file_info {
	char *controller;	/* controller name, NULL for the top-level /cgroup dir */
	char *cgroup;		/* cgroup path the handle refers to */
	char *file;		/* key (file) name; NULL for directory handles */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;  // unused as of yet
	int buflen;
	int size; //actual data size
};

/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256
64
/*
 * Duplicate str as an nih-allocated child of parent.  A NULL str is
 * passed through as NULL; the duplication itself is wrapped in NIH_MUST.
 */
static char *must_copy_string(void *parent, const char *str)
{
	return str ? NIH_MUST( nih_strdup(parent, str) ) : NULL;
}
71
4775fba1
SH
/*
 * Reap child pid, retrying when waitpid() is interrupted or reports a
 * different pid.
 *
 * Returns 0 if the child exited normally with status 0, and -1 if
 * waitpid() failed or the child terminated in any other way.
 *
 * TODO - callers should use this result to report errors, esp. for
 * read/write of tasks and cgroup.procs.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
93
053a659d
SH
/*
 * Given an open FILE * to /proc/pid/{u,g}id_map, and an id valid in the
 * caller's namespace, return the id mapped into pid's namespace.
 *
 * Each parsable line is read as "<ns base> <host base> <count>"; lines
 * that do not parse are skipped.  The stream is rewound first so the same
 * FILE * can be used for several lookups.
 *
 * Returns the mapped id, or -1 (i.e. UINT_MAX, as the return type is
 * unsigned) if the stream cannot be rewound, a range wraps around, or no
 * range contains in_id.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];

	/*
	 * Without a successful rewind we would scan from wherever the
	 * previous lookup stopped and could miss the matching entry.
	 */
	if (fseek(idfile, 0L, SEEK_SET) != 0)
		return -1;

	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count) != 3)
			continue;
		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			fprintf(stderr, "id wraparound at entry %u %u %u in %s\n",
				nsuid, hostuid, count, line);
			return -1;
		}
		if (hostuid <= in_id && hostuid + count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid), which must be
			 * less than nsuid+count, does not wrap around either
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}
137
341b21ad
SH
138/*
139 * for is_privileged_over,
140 * specify whether we require the calling uid to be root in his
141 * namespace
142 */
143#define NS_ROOT_REQD true
144#define NS_ROOT_OPT false
145
146static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 147{
053a659d
SH
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
341b21ad
SH
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
758ad80c
SH
161 return true;
162
053a659d
SH
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
341b21ad 168 /* if caller's not root in his namespace, reject */
053a659d
SH
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
341b21ad
SH
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
053a659d
SH
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184out:
185 fclose(f);
186 return answer;
758ad80c
SH
187}
188
/*
 * Check whether the low three permission bits of fmode (the "other"
 * position) grant the access requested by the O_ACCMODE part of req_mode.
 * Callers shift the owner or group bits down into that position first.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY,
 * O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
208
3db25a35
SH
/*
 * Return the path component of taskcg that follows querycg, as a fresh
 * nih-allocated string (no parent).  For querycg "/" one leading
 * character of taskcg is skipped, otherwise strlen(querycg)+1 characters
 * are; the result is cut at the next '/'.
 *
 * Returns NULL (after logging) when taskcg is not longer than querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t qlen = strlen(querycg);
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= qlen) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : qlen + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
227
758ad80c
SH
228/*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240{
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
341b21ad 255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268}
269
3db25a35
SH
/* Remove a single trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);
	char *last;

	if (len == 0)
		return;

	last = x + len - 1;
	if (*last == '\n')
		*last = '\0';
}
276
277/*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
283static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284{
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325out:
326 fclose(f);
327 free(line);
328 return answer;
329}
330
758ad80c
SH
331/*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336{
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
342 p1 = path+8;
343 ret = nih_strdup(NULL, p1);
344 if (!ret)
345 return ret;
346 slash = strstr(ret, "/");
347 if (slash)
348 *slash = '\0';
349
350 /* verify that it is a subsystem */
351 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
352 int i;
353 if (!list) {
354 nih_free(ret);
355 return NULL;
356 }
357 for (i = 0; list[i]; i++) {
358 if (strcmp(list[i], ret) == 0)
359 return ret;
360 }
361 nih_free(ret);
362 return NULL;
363}
364
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * The result points into path, just past the first '/' found at or after
 * offset 8.  Returns NULL if path is shorter than 9 characters or has no
 * such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
380
381static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
382{
383 nih_local char **list = NULL;
384 int i;
385
386 if (!f)
387 return false;
388 if (*f == '/')
389 f++;
390
391 if (!cgm_list_children(contr, dir, &list))
392 return false;
393 for (i = 0; list[i]; i++) {
394 if (strcmp(list[i], f) == 0)
395 return true;
396 }
397
398 return false;
399}
400
401static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
402{
403 nih_local struct cgm_keys **list = NULL;
404 struct cgm_keys *k;
405 int i;
406
407 if (!f)
408 return NULL;
409 if (*f == '/')
410 f++;
411 if (!cgm_list_keys(contr, dir, &list))
412 return NULL;
413 for (i = 0; list[i]; i++) {
414 if (strcmp(list[i]->name, f) == 0) {
415 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
416 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
417 k->uid = list[i]->uid;
418 k->gid = list[i]->gid;
419 k->mode = list[i]->mode;
420 return k;
421 }
422 }
423
424 return NULL;
425}
426
/*
 * Split cg at its last '/'.
 *
 * *dir receives an nih-allocated copy of cg (no parent), truncated at the
 * last '/' when there is one.  *file is set to point at that last '/'
 * inside cg itself (not inside the copy), or to NULL if cg contains no
 * '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *last_slash = strrchr(cg, '/');

	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = last_slash;

	/* cut the copy at the same offset as the last '/' in cg */
	if (last_slash)
		(*dir)[last_slash - cg] = '\0';
}
440
99978832
SH
441static size_t get_file_size(const char *contrl, const char *cg, const char *f)
442{
443 nih_local char *data = NULL;
444 size_t s;
445 if (!cgm_get_value(contrl, cg, f, &data))
446 return -EINVAL;
447 s = strlen(data);
448 return s;
449}
2ad6d2bd 450
758ad80c 451/*
2ad6d2bd 452 * FUSE ops for /cgroup
758ad80c 453 */
2ad6d2bd 454
758ad80c
SH
455static int cg_getattr(const char *path, struct stat *sb)
456{
457 struct timespec now;
458 struct fuse_context *fc = fuse_get_context();
459 nih_local char * cgdir = NULL;
460 char *fpath = NULL, *path1, *path2;
461 nih_local struct cgm_keys *k = NULL;
462 const char *cgroup;
463 nih_local char *controller = NULL;
464
465
466 if (!fc)
467 return -EIO;
468
469 memset(sb, 0, sizeof(struct stat));
470
471 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
472 return -EINVAL;
473
474 sb->st_uid = sb->st_gid = 0;
475 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
476 sb->st_size = 0;
477
478 if (strcmp(path, "/cgroup") == 0) {
479 sb->st_mode = S_IFDIR | 00755;
480 sb->st_nlink = 2;
481 return 0;
482 }
483
484 controller = pick_controller_from_path(fc, path);
485 if (!controller)
486 return -EIO;
758ad80c
SH
487 cgroup = find_cgroup_in_path(path);
488 if (!cgroup) {
489 /* this is just /cgroup/controller, return it as a dir */
490 sb->st_mode = S_IFDIR | 00755;
491 sb->st_nlink = 2;
492 return 0;
493 }
341b21ad 494
758ad80c
SH
495 get_cgdir_and_path(cgroup, &cgdir, &fpath);
496
497 if (!fpath) {
498 path1 = "/";
499 path2 = cgdir;
500 } else {
501 path1 = cgdir;
502 path2 = fpath;
503 }
504
758ad80c
SH
505 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
506 * Then check that caller's cgroup is under path if fpath is a child
507 * cgroup, or cgdir if fpath is a file */
508
509 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
510 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
511 /* this is just /cgroup/controller, return it as a dir */
512 sb->st_mode = S_IFDIR | 00555;
513 sb->st_nlink = 2;
514 return 0;
515 }
758ad80c 516 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 517 return -EACCES;
758ad80c 518
053a659d
SH
519 // get uid, gid, from '/tasks' file and make up a mode
520 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
521 sb->st_mode = S_IFDIR | 00755;
522 k = get_cgroup_key(controller, cgroup, "tasks");
523 if (!k) {
053a659d
SH
524 sb->st_uid = sb->st_gid = 0;
525 } else {
053a659d
SH
526 sb->st_uid = k->uid;
527 sb->st_gid = k->gid;
528 }
758ad80c
SH
529 sb->st_nlink = 2;
530 return 0;
531 }
532
533 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
534 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
535 return -ENOENT;
758ad80c 536 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 537 return -EACCES;
758ad80c 538
758ad80c 539 sb->st_mode = S_IFREG | k->mode;
053a659d 540 sb->st_nlink = 1;
758ad80c
SH
541 sb->st_uid = k->uid;
542 sb->st_gid = k->gid;
99978832 543 sb->st_size = get_file_size(controller, path1, path2);
758ad80c
SH
544 return 0;
545 }
546
ab54b798 547 return -ENOENT;
758ad80c 548}
2183082c 549
7f163b71
SH
550/*
551 * TODO - cache these results in a table for use in opendir, free
552 * in releasedir
553 */
758ad80c 554static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 555{
7f163b71
SH
556 struct fuse_context *fc = fuse_get_context();
557 nih_local struct cgm_keys **list = NULL;
558 const char *cgroup;
c688e1b3 559 struct file_info *dir_info;
7f163b71 560 nih_local char *controller = NULL;
7f163b71
SH
561
562 if (!fc)
563 return -EIO;
564
c688e1b3
SH
565 if (strcmp(path, "/cgroup") == 0) {
566 cgroup = NULL;
567 controller = NULL;
568 } else {
569 // return list of keys for the controller, and list of child cgroups
570 controller = pick_controller_from_path(fc, path);
571 if (!controller)
572 return -EIO;
7f163b71 573
c688e1b3
SH
574 cgroup = find_cgroup_in_path(path);
575 if (!cgroup) {
576 /* this is just /cgroup/controller, return its contents */
577 cgroup = "/";
578 }
7f163b71
SH
579 }
580
3a6e1a76 581 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
7f163b71 582 return -EACCES;
c688e1b3
SH
583
584 /* we'll free this at cg_releasedir */
585 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
bae07053
SH
586 dir_info->controller = must_copy_string(dir_info, controller);
587 dir_info->cgroup = must_copy_string(dir_info, cgroup);
443d13f5 588 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 589 dir_info->buf = NULL;
8f6e8f5e 590 dir_info->file = NULL;
c688e1b3
SH
591 dir_info->buflen = 0;
592
593 fi->fh = (unsigned long)dir_info;
758ad80c
SH
594 return 0;
595}
596
758ad80c
SH
597static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
598 struct fuse_file_info *fi)
599{
c688e1b3
SH
600 struct file_info *d = (struct file_info *)fi->fh;
601 nih_local struct cgm_keys **list = NULL;
602 int i;
603 nih_local char *nextcg = NULL;
758ad80c
SH
604 struct fuse_context *fc = fuse_get_context();
605
443d13f5 606 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
607 fprintf(stderr, "Internal error: file cache info used in readdir\n");
608 return -EIO;
609 }
c688e1b3
SH
610 if (!d->cgroup && !d->controller) {
611 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
612 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
613 int i;
614
615 if (!list)
616 return -EIO;
7f163b71 617
758ad80c
SH
618 for (i = 0; list[i]; i++) {
619 if (filler(buf, list[i], NULL, 0) != 0) {
620 return -EIO;
621 }
622 }
623 return 0;
624 }
625
c688e1b3 626 if (!cgm_list_keys(d->controller, d->cgroup, &list))
3db25a35 627 // not a valid cgroup
758ad80c 628 return -EINVAL;
3db25a35 629
c688e1b3 630 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
631 if (nextcg) {
632 int ret;
633 ret = filler(buf, nextcg, NULL, 0);
634 if (ret != 0)
635 return -EIO;
636 }
637 return 0;
638 }
639
758ad80c 640 for (i = 0; list[i]; i++) {
758ad80c
SH
641 if (filler(buf, list[i]->name, NULL, 0) != 0) {
642 return -EIO;
643 }
644 }
645
646 // now get the list of child cgroups
422aa4a5 647 nih_local char **clist = NULL;
758ad80c 648
c688e1b3 649 if (!cgm_list_children(d->controller, d->cgroup, &clist))
758ad80c
SH
650 return 0;
651 for (i = 0; clist[i]; i++) {
758ad80c
SH
652 if (filler(buf, clist[i], NULL, 0) != 0) {
653 return -EIO;
654 }
655 }
656 return 0;
657}
658
8f6e8f5e
SH
/*
 * Free a file_info created by cg_opendir or cg_open.
 *
 * All file_info fields which are nih_alloc()d with f as parent are freed
 * automatically along with it.  A NULL f is ignored, so a handle that was
 * never populated does not reach nih_free.
 */
static void do_release_file_info(struct file_info *f)
{
	if (!f)
		return;
	nih_free(f);
}
667
758ad80c
SH
668static int cg_releasedir(const char *path, struct fuse_file_info *fi)
669{
c688e1b3
SH
670 struct file_info *d = (struct file_info *)fi->fh;
671
8f6e8f5e 672 do_release_file_info(d);
758ad80c
SH
673 return 0;
674}
675
99978832
SH
676static int cg_open(const char *path, struct fuse_file_info *fi)
677{
678 nih_local char *controller = NULL;
679 const char *cgroup;
680 char *fpath = NULL, *path1, *path2;
681 nih_local char * cgdir = NULL;
682 nih_local struct cgm_keys *k = NULL;
8f6e8f5e 683 struct file_info *file_info;
99978832
SH
684 struct fuse_context *fc = fuse_get_context();
685
686 if (!fc)
687 return -EIO;
688
689 controller = pick_controller_from_path(fc, path);
690 if (!controller)
691 return -EIO;
692 cgroup = find_cgroup_in_path(path);
693 if (!cgroup)
694 return -EINVAL;
695
696 get_cgdir_and_path(cgroup, &cgdir, &fpath);
697 if (!fpath) {
698 path1 = "/";
699 path2 = cgdir;
700 } else {
701 path1 = cgdir;
702 path2 = fpath;
703 }
704
8f6e8f5e
SH
705 k = get_cgroup_key(controller, path1, path2);
706 if (!k)
707 return -EINVAL;
99978832 708
8f6e8f5e
SH
709 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
710 // should never get here
711 return -EACCES;
99978832 712
8f6e8f5e
SH
713 /* we'll free this at cg_release */
714 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
bae07053
SH
715 file_info->controller = must_copy_string(file_info, controller);
716 file_info->cgroup = must_copy_string(file_info, path1);
717 file_info->file = must_copy_string(file_info, path2);
443d13f5 718 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
719 file_info->buf = NULL;
720 file_info->buflen = 0;
721
722 fi->fh = (unsigned long)file_info;
723 return 0;
724}
725
726static int cg_release(const char *path, struct fuse_file_info *fi)
727{
728 struct file_info *f = (struct file_info *)fi->fh;
729
730 do_release_file_info(f);
731 return 0;
99978832
SH
732}
733
a05660a6
SH
/*
 * Wait up to two seconds for sockfd to become readable, then do one
 * non-blocking recv() of at most len bytes into buf.
 *
 * Returns what recv() returned, or -1 if select() timed out or failed.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd+1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
748
01e71852
SH
749#define SEND_CREDS_OK 0
750#define SEND_CREDS_NOTSK 1
751#define SEND_CREDS_FAIL 2
752static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
753{
754 struct msghdr msg = { 0 };
755 struct iovec iov;
756 struct cmsghdr *cmsg;
757 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
758 char buf[1];
759 buf[0] = 'p';
760
01e71852
SH
761 if (pingfirst) {
762 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 763 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
764 __func__);
765 return SEND_CREDS_FAIL;
766 }
a05660a6
SH
767 }
768
769 msg.msg_control = cmsgbuf;
770 msg.msg_controllen = sizeof(cmsgbuf);
771
772 cmsg = CMSG_FIRSTHDR(&msg);
773 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
774 cmsg->cmsg_level = SOL_SOCKET;
775 cmsg->cmsg_type = SCM_CREDENTIALS;
776 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
777
778 msg.msg_name = NULL;
779 msg.msg_namelen = 0;
780
781 buf[0] = v;
782 iov.iov_base = buf;
783 iov.iov_len = sizeof(buf);
784 msg.msg_iov = &iov;
785 msg.msg_iovlen = 1;
786
787 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 788 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
789 strerror(errno));
790 if (errno == 3)
01e71852
SH
791 return SEND_CREDS_NOTSK;
792 return SEND_CREDS_FAIL;
a05660a6
SH
793 }
794
01e71852 795 return SEND_CREDS_OK;
a05660a6
SH
796}
797
798static bool recv_creds(int sock, struct ucred *cred, char *v)
799{
800 struct msghdr msg = { 0 };
801 struct iovec iov;
802 struct cmsghdr *cmsg;
803 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
804 char buf[1];
805 int ret;
806 int optval = 1;
6ee867dc
SH
807 struct timeval tv;
808 fd_set rfds;
a05660a6
SH
809
810 *v = '1';
811
812 cred->pid = -1;
813 cred->uid = -1;
814 cred->gid = -1;
815
816 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 817 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
818 return false;
819 }
820 buf[0] = '1';
821 if (write(sock, buf, 1) != 1) {
1420baf8 822 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
823 return false;
824 }
825
826 msg.msg_name = NULL;
827 msg.msg_namelen = 0;
828 msg.msg_control = cmsgbuf;
829 msg.msg_controllen = sizeof(cmsgbuf);
830
831 iov.iov_base = buf;
832 iov.iov_len = sizeof(buf);
833 msg.msg_iov = &iov;
834 msg.msg_iovlen = 1;
835
6ee867dc
SH
836 FD_ZERO(&rfds);
837 FD_SET(sock, &rfds);
838 tv.tv_sec = 2;
839 tv.tv_usec = 0;
ea56f722 840 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
841 fprintf(stderr, "Failed to select for scm_cred: %s\n",
842 strerror(errno));
843 return false;
844 }
845 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 846 if (ret < 0) {
1420baf8 847 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
848 strerror(errno));
849 return false;
850 }
851
852 cmsg = CMSG_FIRSTHDR(&msg);
853
854 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
855 cmsg->cmsg_level == SOL_SOCKET &&
856 cmsg->cmsg_type == SCM_CREDENTIALS) {
857 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
858 }
859 *v = buf[0];
860
861 return true;
862}
863
864
865/*
4775fba1
SH
866 * pid_to_ns - reads pids from a ucred over a socket, then writes the
867 * int value back over the socket. This shifts the pid from the
868 * sender's pidns into tpid's pidns.
a05660a6 869 */
4775fba1 870static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
871{
872 char v = '0';
873 struct ucred cred;
874
875 while (recv_creds(sock, &cred, &v)) {
876 if (v == '1')
877 exit(0);
a05660a6
SH
878 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
879 exit(1);
880 }
881 exit(0);
882}
883
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges over a pipe that it is running; if no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits 0 if the child that did the work exited cleanly,
 * and 1 otherwise or on any setup failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0 && read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid returns 0 on success; pass that on as our exit status */
	exit(wait_for_pid(cpid) == 0 ? 0 : 1);
}
948
949/*
950 * To read cgroup files with a particular pid, we will setns into the child
951 * pidns, open a pipe, fork a child - which will be the first to really be in
952 * the child ns - which does the cgm_get_value and writes the data to the pipe.
953 */
954static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
955{
956 int sock[2] = {-1, -1};
957 nih_local char *tmpdata = NULL;
958 int ret;
959 pid_t qpid, cpid = -1;
960 bool answer = false;
961 char v = '0';
962 struct ucred cred;
963 struct timeval tv;
964 fd_set s;
965
966 if (!cgm_get_value(contrl, cg, file, &tmpdata))
967 return false;
968
969 /*
970 * Now we read the pids from returned data one by one, pass
971 * them into a child in the target namespace, read back the
972 * translated pids, and put them into our to-return data
973 */
974
975 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
976 perror("socketpair");
977 exit(1);
978 }
979
980 cpid = fork();
981 if (cpid == -1)
982 goto out;
983
984 if (!cpid) // child
4775fba1 985 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
986
987 char *ptr = tmpdata;
988 cred.uid = 0;
989 cred.gid = 0;
990 while (sscanf(ptr, "%d\n", &qpid) == 1) {
991 cred.pid = qpid;
01e71852
SH
992 ret = send_creds(sock[0], &cred, v, true);
993
994 if (ret == SEND_CREDS_NOTSK)
995 goto next;
996 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
997 goto out;
998
999 // read converted results
1000 FD_ZERO(&s);
1001 FD_SET(sock[0], &s);
6ee867dc 1002 tv.tv_sec = 2;
a05660a6
SH
1003 tv.tv_usec = 0;
1004 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1005 if (ret <= 0) {
6ee867dc
SH
1006 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1007 __func__, strerror(errno));
a05660a6
SH
1008 goto out;
1009 }
1010 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1011 fprintf(stderr, "%s: error reading pid from child: %s\n",
1012 __func__, strerror(errno));
a05660a6
SH
1013 goto out;
1014 }
a05660a6 1015 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 1016next:
a05660a6
SH
1017 ptr = strchr(ptr, '\n');
1018 if (!ptr)
1019 break;
1020 ptr++;
1021 }
1022
1023 cred.pid = getpid();
1024 v = '1';
01e71852 1025 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1026 // failed to ask child to exit
6ee867dc
SH
1027 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1028 __func__, strerror(errno));
a05660a6
SH
1029 goto out;
1030 }
1031
1032 answer = true;
1033
1034out:
1035 if (cpid != -1)
1036 wait_for_pid(cpid);
1037 if (sock[0] != -1) {
1038 close(sock[0]);
1039 close(sock[1]);
1040 }
1041 return answer;
1042}
1043
99978832
SH
1044static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1045 struct fuse_file_info *fi)
1046{
99978832 1047 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1048 struct file_info *f = (struct file_info *)fi->fh;
99978832
SH
1049 nih_local struct cgm_keys *k = NULL;
1050
443d13f5 1051 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1052 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1053 return -EIO;
1054 }
1055
99978832
SH
1056 if (offset)
1057 return -EIO;
1058
1059 if (!fc)
1060 return -EIO;
1061
8f6e8f5e 1062 if (!f->controller)
99978832
SH
1063 return -EINVAL;
1064
8f6e8f5e 1065 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
99978832 1066 nih_local char *data = NULL;
4775fba1
SH
1067 int s;
1068 bool r;
99978832 1069
8f6e8f5e 1070 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
f9a05025
SH
1071 // should never get here
1072 return -EACCES;
99978832 1073
8f6e8f5e
SH
1074 if (strcmp(f->file, "tasks") == 0 ||
1075 strcmp(f->file, "/tasks") == 0 ||
1076 strcmp(f->file, "/cgroup.procs") == 0 ||
1077 strcmp(f->file, "cgroup.procs") == 0)
a05660a6 1078 // special case - we have to translate the pids
8f6e8f5e 1079 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
a05660a6 1080 else
8f6e8f5e 1081 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
a05660a6 1082
4775fba1 1083 if (!r)
99978832
SH
1084 return -EINVAL;
1085
4775fba1
SH
1086 if (!data)
1087 return 0;
99978832
SH
1088 s = strlen(data);
1089 if (s > size)
1090 s = size;
1091 memcpy(buf, data, s);
1092
99978832
SH
1093 return s;
1094 }
1095
1096 return -EINVAL;
1097}
1098
4775fba1
SH
1099static void pid_from_ns(int sock, pid_t tpid)
1100{
1101 pid_t vpid;
1102 struct ucred cred;
1103 char v;
6ee867dc
SH
1104 struct timeval tv;
1105 fd_set s;
1106 int ret;
4775fba1
SH
1107
1108 cred.uid = 0;
1109 cred.gid = 0;
6ee867dc
SH
1110 while (1) {
1111 FD_ZERO(&s);
1112 FD_SET(sock, &s);
1113 tv.tv_sec = 2;
1114 tv.tv_usec = 0;
1115 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1116 if (ret <= 0) {
1117 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc
SH
1118 __func__, strerror(errno));
1119 exit(1);
1120 }
1121 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1122 fprintf(stderr, "%s: bad read from parent: %s\n",
1123 __func__, strerror(errno));
1124 exit(1);
1125 }
4775fba1 1126 if (vpid == -1) // done
01e71852 1127 break;
4775fba1
SH
1128 v = '0';
1129 cred.pid = vpid;
01e71852 1130 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1131 v = '1';
1132 cred.pid = getpid();
01e71852 1133 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1134 exit(1);
1135 }
1136 }
1137 exit(0);
1138}
1139
/*
 * pid_from_ns_wrapper: setns into tpid's pid namespace, then fork a child
 * (the first process that is really inside that pidns) to run
 * pid_from_ns.
 *
 * The child acknowledges over a pipe that it is running; if no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits 0 if the child that did the work exited cleanly,
 * and 1 otherwise or on any setup failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0 && read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid returns 0 on success; pass that on as our exit status */
	exit(wait_for_pid(cpid) == 0 ? 0 : 1);
}
1200
1201static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1202{
1203 int sock[2] = {-1, -1};
1204 pid_t qpid, cpid = -1;
1205 bool answer = false, fail = false;
1206
1207 /*
1208 * write the pids to a socket, have helper in writer's pidns
1209 * call movepid for us
1210 */
1211 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1212 perror("socketpair");
1213 exit(1);
1214 }
1215
1216 cpid = fork();
1217 if (cpid == -1)
1218 goto out;
1219
1220 if (!cpid) // child
1221 pid_from_ns_wrapper(sock[1], tpid);
1222
1223 const char *ptr = buf;
1224 while (sscanf(ptr, "%d", &qpid) == 1) {
1225 struct ucred cred;
1226 char v;
1227
1228 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1229 fprintf(stderr, "%s: error writing pid to child: %s\n",
1230 __func__, strerror(errno));
4775fba1
SH
1231 goto out;
1232 }
1233
01e71852
SH
1234 if (recv_creds(sock[0], &cred, &v)) {
1235 if (v == '0') {
1236 if (!cgm_move_pid(contrl, cg, cred.pid))
1237 fail = true;
1238 }
4775fba1
SH
1239 }
1240
1241 ptr = strchr(ptr, '\n');
1242 if (!ptr)
1243 break;
1244 ptr++;
1245 }
1246
1247 /* All good, write the value */
1248 qpid = -1;
1249 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1250 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1251
1252 if (!fail)
1253 answer = true;
1254
1255out:
1256 if (cpid != -1)
1257 wait_for_pid(cpid);
1258 if (sock[0] != -1) {
1259 close(sock[0]);
1260 close(sock[1]);
1261 }
1262 return answer;
1263}
1264
2ad6d2bd
SH
1265int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1266 struct fuse_file_info *fi)
1267{
2ad6d2bd 1268 struct fuse_context *fc = fuse_get_context();
47cbf0e5 1269 nih_local char *localbuf = NULL;
8f6e8f5e
SH
1270 nih_local struct cgm_keys *k = NULL;
1271 struct file_info *f = (struct file_info *)fi->fh;
2ad6d2bd 1272
443d13f5 1273 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1274 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1275 return -EIO;
1276 }
1277
2ad6d2bd 1278 if (offset)
f9a05025 1279 return -EINVAL;
2ad6d2bd
SH
1280
1281 if (!fc)
1282 return -EIO;
1283
47cbf0e5
SH
1284 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1285 localbuf[size] = '\0';
1286 memcpy(localbuf, buf, size);
2ad6d2bd 1287
8f6e8f5e 1288 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
4775fba1
SH
1289 bool r;
1290
8f6e8f5e 1291 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
f9a05025 1292 return -EACCES;
2ad6d2bd 1293
8f6e8f5e
SH
1294 if (strcmp(f->file, "tasks") == 0 ||
1295 strcmp(f->file, "/tasks") == 0 ||
1296 strcmp(f->file, "/cgroup.procs") == 0 ||
1297 strcmp(f->file, "cgroup.procs") == 0)
4775fba1 1298 // special case - we have to translate the pids
8f6e8f5e 1299 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
4775fba1 1300 else
8f6e8f5e 1301 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
4775fba1
SH
1302
1303 if (!r)
2ad6d2bd
SH
1304 return -EINVAL;
1305
1306 return size;
1307 }
1308
1309 return -EINVAL;
1310}
1311
341b21ad
SH
1312int cg_chown(const char *path, uid_t uid, gid_t gid)
1313{
1314 struct fuse_context *fc = fuse_get_context();
1315 nih_local char * cgdir = NULL;
1316 char *fpath = NULL, *path1, *path2;
1317 nih_local struct cgm_keys *k = NULL;
1318 const char *cgroup;
1319 nih_local char *controller = NULL;
1320
1321
1322 if (!fc)
1323 return -EIO;
1324
1325 if (strcmp(path, "/cgroup") == 0)
1326 return -EINVAL;
1327
1328 controller = pick_controller_from_path(fc, path);
1329 if (!controller)
f9a05025 1330 return -EINVAL;
341b21ad
SH
1331 cgroup = find_cgroup_in_path(path);
1332 if (!cgroup)
1333 /* this is just /cgroup/controller */
1334 return -EINVAL;
1335
1336 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1337
1338 if (!fpath) {
1339 path1 = "/";
1340 path2 = cgdir;
1341 } else {
1342 path1 = cgdir;
1343 path2 = fpath;
1344 }
1345
1346 if (is_child_cgroup(controller, path1, path2)) {
1347 // get uid, gid, from '/tasks' file and make up a mode
1348 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1349 k = get_cgroup_key(controller, cgroup, "tasks");
1350
1351 } else
1352 k = get_cgroup_key(controller, path1, path2);
1353
1354 if (!k)
1355 return -EINVAL;
1356
1357 /*
1358 * This being a fuse request, the uid and gid must be valid
1359 * in the caller's namespace. So we can just check to make
1360 * sure that the caller is root in his uid, and privileged
1361 * over the file's current owner.
1362 */
1363 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1364 return -EACCES;
341b21ad
SH
1365
1366 if (!cgm_chown_file(controller, cgroup, uid, gid))
1367 return -EINVAL;
1368 return 0;
1369}
2ad6d2bd 1370
fd2e4e03
SH
1371int cg_chmod(const char *path, mode_t mode)
1372{
0a1bb5ea
SH
1373 struct fuse_context *fc = fuse_get_context();
1374 nih_local char * cgdir = NULL;
1375 char *fpath = NULL, *path1, *path2;
1376 nih_local struct cgm_keys *k = NULL;
1377 const char *cgroup;
1378 nih_local char *controller = NULL;
1379
1380 if (!fc)
1381 return -EIO;
1382
1383 if (strcmp(path, "/cgroup") == 0)
1384 return -EINVAL;
1385
1386 controller = pick_controller_from_path(fc, path);
1387 if (!controller)
f9a05025 1388 return -EINVAL;
0a1bb5ea
SH
1389 cgroup = find_cgroup_in_path(path);
1390 if (!cgroup)
1391 /* this is just /cgroup/controller */
1392 return -EINVAL;
1393
1394 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1395
1396 if (!fpath) {
1397 path1 = "/";
1398 path2 = cgdir;
1399 } else {
1400 path1 = cgdir;
1401 path2 = fpath;
1402 }
1403
1404 if (is_child_cgroup(controller, path1, path2)) {
1405 // get uid, gid, from '/tasks' file and make up a mode
1406 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1407 k = get_cgroup_key(controller, cgroup, "tasks");
1408
1409 } else
1410 k = get_cgroup_key(controller, path1, path2);
1411
1412 if (!k)
1413 return -EINVAL;
1414
1415 /*
1416 * This being a fuse request, the uid and gid must be valid
1417 * in the caller's namespace. So we can just check to make
1418 * sure that the caller is root in his uid, and privileged
1419 * over the file's current owner.
1420 */
1421 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1422 return -EPERM;
1423
1424 if (!cgm_chmod_file(controller, cgroup, mode))
1425 return -EINVAL;
1426 return 0;
fd2e4e03
SH
1427}
1428
ab54b798
SH
1429int cg_mkdir(const char *path, mode_t mode)
1430{
1431 struct fuse_context *fc = fuse_get_context();
1432 nih_local struct cgm_keys **list = NULL;
1433 char *fpath = NULL, *path1;
1434 nih_local char * cgdir = NULL;
1435 const char *cgroup;
1436 nih_local char *controller = NULL;
1437
ab54b798
SH
1438 if (!fc)
1439 return -EIO;
1440
1441
1442 controller = pick_controller_from_path(fc, path);
1443 if (!controller)
f9a05025 1444 return -EINVAL;
ab54b798
SH
1445
1446 cgroup = find_cgroup_in_path(path);
1447 if (!cgroup)
f9a05025 1448 return -EINVAL;
ab54b798
SH
1449
1450 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1451 if (!fpath)
1452 path1 = "/";
1453 else
1454 path1 = cgdir;
1455
1456 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1457 return -EACCES;
ab54b798
SH
1458
1459
1460 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1461 return -EINVAL;
1462
1463 return 0;
1464}
1465
50d8d5b5
SH
1466static int cg_rmdir(const char *path)
1467{
1468 struct fuse_context *fc = fuse_get_context();
1469 nih_local struct cgm_keys **list = NULL;
1470 char *fpath = NULL;
1471 nih_local char * cgdir = NULL;
1472 const char *cgroup;
1473 nih_local char *controller = NULL;
1474
1475 if (!fc)
1476 return -EIO;
1477
1478
1479 controller = pick_controller_from_path(fc, path);
1480 if (!controller)
f9a05025 1481 return -EINVAL;
50d8d5b5
SH
1482
1483 cgroup = find_cgroup_in_path(path);
1484 if (!cgroup)
f9a05025 1485 return -EINVAL;
50d8d5b5
SH
1486
1487 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1488 if (!fpath)
1489 return -EINVAL;
1490
1491 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1492 return -EACCES;
50d8d5b5
SH
1493
1494 if (!cgm_remove(controller, cgroup))
1495 return -EINVAL;
1496
1497 return 0;
1498}
1499
2dc17609
SH
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1506
/*
 * Scan the newline-separated text @memstat for the first line starting
 * with "total_cache", parse the unsigned number that follows and store it,
 * divided by 1024, in *@v.  *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}
		/* skip to the start of the next line */
		cur = strchr(cur, '\n');
		if (cur == NULL)
			return;
		cur++;
	}
}
1524
/*
 * Look up a per-device counter in blkio-style text.
 *
 * @str is newline-separated; the first line starting with
 * "<major>:<minor> <iotype>" is located and the unsigned number following
 * that prefix is stored in *@v.  *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);
	*v = 0;

	/* the loop increment steps over the '\n' found at the end of a line */
	for (cur = str; *cur != '\0'; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		cur = strchr(cur, '\n');
		if (cur == NULL)
			return;
	}
}
1547
2dc17609
SH
1548static char *get_pid_cgroup(pid_t pid, const char *contrl)
1549{
1550 nih_local char *fnam = NULL;
1551 FILE *f;
1552 char *answer = NULL;
1553 char *line = NULL;
1554 size_t len = 0;
1555
1556 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1557 if (!(f = fopen(fnam, "r")))
1558 return false;
1559
1560 while (getline(&line, &len, f) != -1) {
1561 char *c1, *c2;
1562 if (!line[0])
1563 continue;
1564 c1 = strchr(line, ':');
1565 if (!c1)
1566 goto out;
1567 c1++;
1568 c2 = strchr(c1, ':');
1569 if (!c2)
1570 goto out;
1571 *c2 = '\0';
1572 if (strcmp(c1, contrl) != 0)
1573 continue;
1574 c2++;
1575 stripnewline(c2);
1576 answer = NIH_MUST( nih_strdup(NULL, c2) );
1577 goto out;
1578 }
1579
1580out:
1581 fclose(f);
1582 free(line);
1583 return answer;
1584}
1585
758ad80c 1586/*
2ad6d2bd 1587 * FUSE ops for /proc
758ad80c 1588 */
758ad80c 1589
23ce2127
SH
1590static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1591 struct fuse_file_info *fi)
1592{
2dc17609 1593 struct fuse_context *fc = fuse_get_context();
97f1f27b 1594 struct file_info *d = (struct file_info *)fi->fh;
2dc17609
SH
1595 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1596 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1597 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1598 char *line = NULL;
1599 size_t linelen = 0, total_len = 0;
97f1f27b
YY
1600 char *cache = d->buf;
1601 size_t cache_size = d->buflen;
2dc17609
SH
1602 FILE *f;
1603
97f1f27b
YY
1604 if (offset){
1605 if (offset > d->size)
1606 return -EINVAL;
1607 int left = d->size - offset;
1608 total_len = left > size ? size: left;
1609 memcpy(buf, cache + offset, total_len);
1610 return total_len;
1611 }
2dc17609
SH
1612
1613 if (!cg)
1614 return 0;
1615
1616 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1617 return 0;
1618 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1619 return 0;
1620 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1621 return 0;
1622 memlimit = strtoul(memlimit_str, NULL, 10);
1623 memusage = strtoul(memusage_str, NULL, 10);
1624 memlimit /= 1024;
1625 memusage /= 1024;
1626 get_mem_cached(memstat_str, &cached);
1627
1628 f = fopen("/proc/meminfo", "r");
1629 if (!f)
1630 return 0;
1631
1632 while (getline(&line, &linelen, f) != -1) {
1633 size_t l;
1634 char *printme, lbuf[100];
1635
1636 memset(lbuf, 0, 100);
1637 if (startswith(line, "MemTotal:")) {
1638 sscanf(line+14, "%lu", &hosttotal);
1639 if (hosttotal < memlimit)
1640 memlimit = hosttotal;
1641 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1642 printme = lbuf;
1643 } else if (startswith(line, "MemFree:")) {
1644 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1645 printme = lbuf;
1646 } else if (startswith(line, "MemAvailable:")) {
1647 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1648 printme = lbuf;
1649 } else if (startswith(line, "Buffers:")) {
1650 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1651 printme = lbuf;
1652 } else if (startswith(line, "Cached:")) {
1653 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1654 printme = lbuf;
1655 } else if (startswith(line, "SwapCached:")) {
1656 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1657 printme = lbuf;
1658 } else
1659 printme = line;
97f1f27b
YY
1660
1661 l = snprintf(cache, cache_size, "%s", printme);
1662 cache += l;
1663 cache_size -= l;
2f919d9d 1664 total_len += l;
2dc17609
SH
1665 }
1666
97f1f27b
YY
1667 d->size = total_len;
1668 if (total_len > size ) total_len = size;
1669 memcpy(buf, d->buf, total_len);
1670
92c84dc4
SH
1671 fclose(f);
1672 free(line);
2dc17609 1673 return total_len;
23ce2127
SH
1674}
1675
1676/*
1677 * Read the cpuset.cpus for cg
1678 * Return the answer in a nih_alloced string
1679 */
/*
 * Fetch cpuset.cpus of cgroup @cg through cgm_get_value().
 * Returns the string it produced, or NULL if the lookup failed.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1688
1689/*
1690 * Helper functions for cpuset_in-set
1691 */
/*
 * Return a pointer to the token following the next ',' in @c, or NULL if
 * there is none.  The search starts at the second character of @c; an
 * empty string has no next token.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	/* c+1 would point past the terminator of an empty string */
	if (*c == '\0')
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}

/*
 * Parse the token at the start of @c as "a" or "a-b".
 * Returns the sscanf() result: 2 for a range, 1 for a single value
 * (only *@a is set), and 0 or EOF if nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}

/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited ranges
 *
 * Return true if @cpu is listed in @cpuset, either as a single value or
 * inside a range.  Every token is examined; a token that does not parse is
 * skipped rather than ending the search, so a non-matching single value no
 * longer hides the tokens that follow it.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret == 1 && cpu == a)		/* "1" or "1,6" */
			return true;
		if (ret == 2 && cpu >= a && cpu <= b)	/* "2-5" */
			return true;
	}

	return false;
}
1730
aeb56147
SH
/*
 * Return true if @line is a "processor : N" line and cpu N is a member of
 * @cpuset.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
1739
23ce2127
SH
1740/*
1741 * check whether this is a '^processor" line in /proc/cpuinfo
1742 */
/* Return true if @line has the form "processor : N" with N an integer. */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
1751
23ce2127
SH
1752static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1753 struct fuse_file_info *fi)
1754{
1755 struct fuse_context *fc = fuse_get_context();
97f1f27b 1756 struct file_info *d = (struct file_info *)fi->fh;
23ce2127
SH
1757 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1758 nih_local char *cpuset = NULL;
1759 char *line = NULL;
1760 size_t linelen = 0, total_len = 0;
1761 bool am_printing = false;
1762 int curcpu = -1;
97f1f27b
YY
1763 char *cache = d->buf;
1764 size_t cache_size = d->buflen;
23ce2127
SH
1765 FILE *f;
1766
97f1f27b
YY
1767 if (offset){
1768 if (offset > d->size)
1769 return -EINVAL;
1770 int left = d->size - offset;
1771 total_len = left > size ? size: left;
1772 memcpy(buf, cache + offset, total_len);
2f919d9d 1773 return total_len;
97f1f27b 1774 }
23ce2127
SH
1775
1776 if (!cg)
1777 return 0;
1778
1779 cpuset = get_cpuset(cg);
1780 if (!cpuset)
1781 return 0;
1782
1783 f = fopen("/proc/cpuinfo", "r");
1784 if (!f)
1785 return 0;
1786
1787 while (getline(&line, &linelen, f) != -1) {
1788 size_t l;
1789 if (is_processor_line(line)) {
aeb56147 1790 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1791 if (am_printing) {
1792 curcpu ++;
97f1f27b
YY
1793 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1794 if (l < cache_size){
1795 cache += l;
1796 cache_size -= l;
1797 total_len += l;
1798 }else{
1799 cache += cache_size;
1800 total_len += cache_size;
1801 cache_size = 0;
1802 break;
1803 }
23ce2127
SH
1804 }
1805 continue;
1806 }
1807 if (am_printing) {
97f1f27b
YY
1808 l = snprintf(cache, cache_size, "%s", line);
1809 if (l < cache_size) {
1810 cache += l;
1811 cache_size -= l;
1812 total_len += l;
1813 } else {
1814 cache += cache_size;
1815 total_len += cache_size;
1816 cache_size = 0;
1817 break;
1818 }
23ce2127
SH
1819 }
1820 }
1821
97f1f27b
YY
1822 d->size = total_len;
1823 if (total_len > size ) total_len = size;
1824
1825 /* read from off 0 */
1826 memcpy(buf, d->buf, total_len);
1827
92c84dc4
SH
1828 fclose(f);
1829 free(line);
23ce2127
SH
1830 return total_len;
1831}
1832
1833static int proc_stat_read(char *buf, size_t size, off_t offset,
1834 struct fuse_file_info *fi)
1835{
aeb56147 1836 struct fuse_context *fc = fuse_get_context();
97f1f27b 1837 struct file_info *d = (struct file_info *)fi->fh;
aeb56147
SH
1838 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1839 nih_local char *cpuset = NULL;
1840 char *line = NULL;
1841 size_t linelen = 0, total_len = 0;
2a0fde62 1842 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
1843 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1844 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1845 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1846#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1847 char cpuall[CPUALL_MAX_SIZE];
1848 /* reserve for cpu all */
1849 char *cache = d->buf + CPUALL_MAX_SIZE;
1850 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
aeb56147
SH
1851 FILE *f;
1852
97f1f27b
YY
1853 if (offset){
1854 if (offset > d->size)
1855 return -EINVAL;
1856 int left = d->size - offset;
1857 total_len = left > size ? size: left;
1858 memcpy(buf, d->buf + offset, total_len);
2f919d9d 1859 return total_len;
97f1f27b 1860 }
aeb56147
SH
1861
1862 if (!cg)
1863 return 0;
1864
1865 cpuset = get_cpuset(cg);
1866 if (!cpuset)
1867 return 0;
1868
1869 f = fopen("/proc/stat", "r");
1870 if (!f)
1871 return 0;
1872
97f1f27b
YY
1873 //skip first line
1874 if (getline(&line, &linelen, f) < 0) {
1875 fprintf(stderr, "proc_stat_read read first line failed\n");
1876 goto out;
1877 }
1878
aeb56147
SH
1879 while (getline(&line, &linelen, f) != -1) {
1880 size_t l;
1881 int cpu;
2a0fde62 1882 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
1883 char *c;
1884
2a0fde62
CB
1885 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1886 /* not a ^cpuN line containing a number N, just print it */
97f1f27b
YY
1887 l = snprintf(cache, cache_size, "%s", line);
1888 if (l < cache_size){
1889 cache += l;
1890 cache_size -= l;
1891 total_len += l;
1892 continue;
1893 }else{
1894 //no more space, break it
1895 cache += cache_size;
1896 total_len += cache_size;
1897 cache_size = 0;
1898 break;
1899 }
aeb56147 1900 }
2a0fde62
CB
1901
1902 if (sscanf(cpu_char, "%d", &cpu) != 1)
1903 continue;
aeb56147
SH
1904 if (!cpu_in_cpuset(cpu, cpuset))
1905 continue;
1906 curcpu ++;
1907
1908 c = strchr(line, ' ');
1909 if (!c)
1910 continue;
25c5e8fb 1911 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
97f1f27b
YY
1912 cache += l;
1913 cache_size -= l;
aeb56147 1914 total_len += l;
2f919d9d 1915
97f1f27b
YY
1916 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1917 &softirq, &steal, &guest) != 9)
1918 continue;
1919 user_sum += user;
1920 nice_sum += nice;
1921 system_sum += system;
1922 idle_sum += idle;
1923 iowait_sum += iowait;
1924 irq_sum += irq;
1925 softirq_sum += softirq;
1926 steal_sum += steal;
2f919d9d 1927 guest_sum += guest;
97f1f27b
YY
1928 }
1929
1930 cache = d->buf;
1931
2f919d9d 1932 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
1933 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1934 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1935 memcpy(cache, cpuall, cpuall_len);
2f919d9d 1936 cache += cpuall_len;
97f1f27b
YY
1937 }else{
1938 /* shouldn't happen */
1939 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1940 cpuall_len = 0;
1941 }
1942
1943 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1944 total_len += cpuall_len;
1945 d->size = total_len;
1946 if (total_len > size ) total_len = size;
1947
1948 memcpy(buf, d->buf, total_len);
97f1f27b 1949out:
92c84dc4
SH
1950 fclose(f);
1951 free(line);
aeb56147 1952 return total_len;
23ce2127
SH
1953}
1954
7bbf2246
SH
1955/*
1956 * How to guess what to present for uptime?
1957 * One thing we could do would be to take the date on the caller's
1958 * memory.usage_in_bytes file, which should equal the time of creation
1959 * of his cgroup. However, a task could be in a sub-cgroup of the
1960 * container. The same problem exists if we try to look at the ages
1961 * of processes in the caller's cgroup.
1962 *
1963 * So we'll fork a task that will enter the caller's pidns, mount a
1964 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1965 *
1966 * For the second uptime #, we'll do as Stéphane had done, just copy
1967 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1968 * time. Maybe someone can come up with a good algorithm and submit a
1969 * patch. Maybe something based on cpushare info?
1970 */
41bb9357
SH
1971
1972/* return age of the reaper for $pid, taken from ctime of its procdir */
/*
 * Compute the age, in seconds, of pid 1 in the pid namespace of @pid,
 * using the ctime of /proc/1 on a freshly mounted procfs.
 *
 * The calling process unshares its mount namespace, makes "/" a recursive
 * slave mount, joins @pid's pid namespace and then forks.  The forked
 * child acknowledges startup over a pipe, remounts /proc, and RETURNS the
 * age to its caller (0 on mount/lstat failure).  The process that forked
 * never returns on success: it waits for the child and exits 0.  If the
 * child's ack does not arrive within one second the child is killed and a
 * new one is forked.
 *
 * Returns 0 if any of the setup steps before the fork fails.
 */
static long int get_pid1_time(pid_t pid)
{
	char nspath[100];
	int nsfd, ackpipe[2], rc;
	struct stat st;
	pid_t child;
	struct timeval timeout;
	fd_set rfds;
	char ack;

	if (unshare(CLONE_NEWNS))
		return 0;

	if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
		perror("rslave mount failed");
		return 0;
	}

	sprintf(nspath, "/proc/%d/ns/pid", pid);
	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(nsfd, 0)) {
		perror("get_pid1_time setns 1");
		close(nsfd);
		return 0;
	}
	close(nsfd);

	if (pipe(ackpipe) < 0)
		exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			return 0;

		if (child == 0) {
			char b = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			/* replace /proc with one for the namespace we are now in */
			umount2("/proc", MNT_DETACH);
			if (mount("proc", "/proc", "proc", 0, NULL)) {
				perror("get_pid1_time mount");
				return 0;
			}
			rc = lstat("/proc/1", &st);
			if (rc) {
				perror("get_pid1_time lstat");
				return 0;
			}
			return time(NULL) - st.st_ctime;
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&rfds);
		FD_SET(ackpipe[0], &rfds);
		timeout.tv_sec = 1;
		timeout.tv_usec = 0;
		rc = select(ackpipe[0] + 1, &rfds, NULL, NULL, &timeout);
		if (rc > 0) {
			rc = read(ackpipe[0], &ack, 1);
			if (rc == sizeof(char) && ack == '1') {
				wait_for_pid(child);
				exit(0);
			}
		}

		/* no (valid) ack: discard this child and try again */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}
}
2055
/*
 * Return the value get_pid1_time(@qpid) computes, obtained from a forked
 * helper so that the namespace changes made by get_pid1_time() do not
 * affect this process.  The helper sends the value back over a pipe; this
 * side waits at most one second for it.
 *
 * Returns 0 if the pipe or fork fails, on timeout, or on a short read.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no helper: nothing to wait for, just release the pipe */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* a timeout is not an errno-carrying failure */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2101
/*
 * Return the second field of /proc/uptime, truncated to whole seconds, or
 * 0 if the file cannot be opened or parsed.
 *
 * Both fields are parsed as floating point: an integer conversion stops
 * at the first '.', which makes the second field unparsable.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2115
2116/*
2117 * We read /proc/uptime and reuse its second field.
2118 * For the first field, we use the mtime for the reaper for
2119 * the calling pid as returned by getreaperage
2120 */
23ce2127
SH
2121static int proc_uptime_read(char *buf, size_t size, off_t offset,
2122 struct fuse_file_info *fi)
2123{
41bb9357 2124 struct fuse_context *fc = fuse_get_context();
97f1f27b 2125 struct file_info *d = (struct file_info *)fi->fh;
41bb9357
SH
2126 long int reaperage = getreaperage(fc->pid);;
2127 long int idletime = getprocidle();
97f1f27b 2128 size_t total_len = 0;
41bb9357 2129
97f1f27b
YY
2130 if (offset){
2131 if (offset > d->size)
2132 return -EINVAL;
2133 return 0;
2134 }
2135
2136 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2137 d->size = total_len;
2138 return total_len;
23ce2127
SH
2139}
2140
49878439
YY
2141static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2142 struct fuse_file_info *fi)
2143{
2144 char dev_name[72];
2145 struct fuse_context *fc = fuse_get_context();
97f1f27b 2146 struct file_info *d = (struct file_info *)fi->fh;
49878439
YY
2147 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2148 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2149 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2150 unsigned long read = 0, write = 0;
2151 unsigned long read_merged = 0, write_merged = 0;
2152 unsigned long read_sectors = 0, write_sectors = 0;
2153 unsigned long read_ticks = 0, write_ticks = 0;
2154 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2155 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2156 char *line = NULL;
2157 size_t linelen = 0, total_len = 0;
2158 unsigned int major = 0, minor = 0;
2159 int i = 0;
2160 FILE *f;
2161
97f1f27b
YY
2162 if (offset){
2163 if (offset > d->size)
2164 return -EINVAL;
2165 return 0;
2166 }
49878439
YY
2167
2168 if (!cg)
2169 return 0;
2170
2171 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2172 return 0;
2173 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2174 return 0;
2175 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2176 return 0;
2177 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2178 return 0;
2179 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2180 return 0;
2181
2182
2183 f = fopen("/proc/diskstats", "r");
2184 if (!f)
2185 return 0;
2186
2187 while (getline(&line, &linelen, f) != -1) {
2188 size_t l;
2189 char *printme, lbuf[256];
2190
2191 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2192 if(i == 3){
2193 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2194 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2195 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2196 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2197 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2198 read_sectors = read_sectors/512;
2199 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2200 write_sectors = write_sectors/512;
2f919d9d 2201
49878439
YY
2202 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2203 rd_svctm = rd_svctm/1000000;
2204 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2205 rd_wait = rd_wait/1000000;
2206 read_ticks = rd_svctm + rd_wait;
2207
2208 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2209 wr_svctm = wr_svctm/1000000;
2210 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2211 wr_wait = wr_wait/1000000;
2212 write_ticks = wr_svctm + wr_wait;
2213
2214 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2215 tot_ticks = tot_ticks/1000000;
2216 }else{
2217 continue;
2218 }
2219
2220 memset(lbuf, 0, 256);
2221 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2222 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2223 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2224 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2225 printme = lbuf;
2226 } else
2227 continue;
2228
2229 l = snprintf(buf, size, "%s", printme);
2230 buf += l;
2231 size -= l;
2232 total_len += l;
2233 }
2234
97f1f27b
YY
2235 d->size = total_len;
2236
49878439
YY
2237 fclose(f);
2238 free(line);
2239 return total_len;
2240}
2241
23ce2127
SH
/*
 * Return the number of bytes in file @which, obtained by reading it line
 * by line and summing the line lengths.  Returns 0 if the file cannot be
 * opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;

	if (!f)
		return 0;

	for (;;) {
		n = getline(&line, &cap, f);
		if (n == -1)
			break;
		total += n;
	}

	fclose (f);
	free(line);

	return total;
}
2258
758ad80c
SH
2259static int proc_getattr(const char *path, struct stat *sb)
2260{
35629743
SH
2261 struct timespec now;
2262
2263 memset(sb, 0, sizeof(struct stat));
2264 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2265 return -EINVAL;
2266 sb->st_uid = sb->st_gid = 0;
2267 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2268 if (strcmp(path, "/proc") == 0) {
2269 sb->st_mode = S_IFDIR | 00555;
2270 sb->st_nlink = 2;
2271 return 0;
2272 }
2273 if (strcmp(path, "/proc/meminfo") == 0 ||
2274 strcmp(path, "/proc/cpuinfo") == 0 ||
2275 strcmp(path, "/proc/uptime") == 0 ||
49878439
YY
2276 strcmp(path, "/proc/stat") == 0 ||
2277 strcmp(path, "/proc/diskstats") == 0) {
23ce2127 2278 sb->st_size = get_procfile_size(path);
35629743
SH
2279 sb->st_mode = S_IFREG | 00444;
2280 sb->st_nlink = 1;
2281 return 0;
2282 }
2283
2284 return -ENOENT;
2285}
2286
2287static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2288 struct fuse_file_info *fi)
2289{
2290 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2291 filler(buf, "meminfo", NULL, 0) != 0 ||
2292 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2293 filler(buf, "uptime", NULL, 0) != 0 ||
2294 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2295 return -EINVAL;
758ad80c
SH
2296 return 0;
2297}
2298
35629743
SH
2299static int proc_open(const char *path, struct fuse_file_info *fi)
2300{
96fc5ee6
SH
2301 int type = -1;
2302 struct file_info *info;
2303
2304 if (strcmp(path, "/proc/meminfo") == 0)
2305 type = LXC_TYPE_PROC_MEMINFO;
2306 else if (strcmp(path, "/proc/cpuinfo") == 0)
2307 type = LXC_TYPE_PROC_CPUINFO;
2308 else if (strcmp(path, "/proc/uptime") == 0)
2309 type = LXC_TYPE_PROC_UPTIME;
2310 else if (strcmp(path, "/proc/stat") == 0)
2311 type = LXC_TYPE_PROC_STAT;
2312 else if (strcmp(path, "/proc/diskstats") == 0)
2313 type = LXC_TYPE_PROC_DISKSTATS;
2314 if (type == -1)
2315 return -ENOENT;
2316
2317 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2318 memset(info, 0, sizeof(*info));
2319 info->type = type;
2320
97f1f27b 2321 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
25c5e8fb 2322 info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
97f1f27b
YY
2323 memset(info->buf, 0, info->buflen);
2324 /* set actual size to buffer size */
2f919d9d 2325 info->size = info->buflen;
97f1f27b 2326
96fc5ee6
SH
2327 fi->fh = (unsigned long)info;
2328 return 0;
2329}
2330
2331static int proc_release(const char *path, struct fuse_file_info *fi)
2332{
2333 struct file_info *f = (struct file_info *)fi->fh;
2334
2335 do_release_file_info(f);
2336 return 0;
35629743
SH
2337}
2338
35629743
SH
2339static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2340 struct fuse_file_info *fi)
2341{
96fc5ee6
SH
2342 struct file_info *f = (struct file_info *) fi->fh;
2343
2344 switch (f->type) {
2f919d9d 2345 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2346 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2347 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2348 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2349 case LXC_TYPE_PROC_UPTIME:
23ce2127 2350 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2351 case LXC_TYPE_PROC_STAT:
23ce2127 2352 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2353 case LXC_TYPE_PROC_DISKSTATS:
49878439 2354 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2355 default:
2356 return -EINVAL;
2357 }
35629743
SH
2358}
2359
2ad6d2bd
SH
2360/*
2361 * FUSE ops for /
2362 * these just delegate to the /proc and /cgroup ops as
2363 * needed
2364 */
758ad80c
SH
2365
2366static int lxcfs_getattr(const char *path, struct stat *sb)
2367{
2368 if (strcmp(path, "/") == 0) {
2369 sb->st_mode = S_IFDIR | 00755;
2370 sb->st_nlink = 2;
2371 return 0;
2372 }
2373 if (strncmp(path, "/cgroup", 7) == 0) {
2374 return cg_getattr(path, sb);
2375 }
35629743 2376 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2377 return proc_getattr(path, sb);
2378 }
2379 return -EINVAL;
2380}
2381
/*
 * Top-level opendir handler.
 *
 * Paths starting with "/cgroup" are delegated to cg_opendir(); "/" and
 * "/proc" need no per-open state and succeed immediately.  Any other
 * path yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	return -ENOENT;
}
2394
2395static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2396 struct fuse_file_info *fi)
2397{
2398 if (strcmp(path, "/") == 0) {
2399 if (filler(buf, "proc", NULL, 0) != 0 ||
2400 filler(buf, "cgroup", NULL, 0) != 0)
2401 return -EINVAL;
2402 return 0;
2403 }
35629743 2404 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2405 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2406 if (strcmp(path, "/proc") == 0)
2407 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2408 return -EINVAL;
2409}
2410
/*
 * Top-level releasedir handler.
 *
 * Paths starting with "/cgroup" are delegated to cg_releasedir(); "/"
 * and "/proc" have nothing to release and return 0.  Any other path
 * yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	return -EINVAL;
}
2422
99978832
SH
/*
 * Top-level open handler: forwards to cg_open() for "/cgroup..." paths
 * and to proc_open() for "/proc..." paths.  Returns -EINVAL for
 * anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_open(path, fi);
	}
	if (!strncmp(path, "/proc", 5)) {
		return proc_open(path, fi);
	}

	return -EINVAL;
}
2432
/*
 * Top-level read handler: forwards to cg_read() for "/cgroup..." paths
 * and to proc_read() for "/proc..." paths.  Returns -EINVAL for
 * anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_read(path, buf, size, offset, fi);
	}
	if (!strncmp(path, "/proc", 5)) {
		return proc_read(path, buf, size, offset, fi);
	}

	return -EINVAL;
}
2443
2ad6d2bd
SH
/*
 * Top-level write handler.  Only paths starting with "/cgroup" are
 * handled (via cg_write()); every other path gets -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2453
99978832
SH
/* flush handler: nothing to do, always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2458
/*
 * Top-level release handler: forwards to cg_release() for "/cgroup..."
 * paths and to proc_release() for "/proc..." paths.  Returns -EINVAL
 * for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_release(path, fi);
	}
	if (!strncmp(path, "/proc", 5)) {
		return proc_release(path, fi);
	}

	return -EINVAL;
}
2468
/* fsync handler: nothing to do, always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2473
ab54b798
SH
/*
 * Top-level mkdir handler.  Directories can only be created below
 * "/cgroup" (via cg_mkdir()); every other path gets -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2481
341b21ad
SH
/*
 * Top-level chown handler.  Only paths starting with "/cgroup" are
 * handled (via cg_chown()); every other path gets -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2489
2ad6d2bd
SH
/*
 * Top-level truncate handler.
 *
 * cat issues a truncate before it reaches ops->write.  Truncating does
 * not really make sense for cgroup files, so for "/cgroup..." paths we
 * report success without doing anything.  Every other path gets
 * -EINVAL.  @newsize is ignored.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return 0;
}
2501
50d8d5b5
SH
/*
 * Top-level rmdir handler.  Only paths starting with "/cgroup" are
 * handled (via cg_rmdir()); every other path gets -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2508
fd2e4e03
SH
/*
 * Top-level chmod handler.  Only paths starting with "/cgroup" are
 * handled (via cg_chmod()); every other path gets -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2515
758ad80c
SH
2516const struct fuse_operations lxcfs_ops = {
2517 .getattr = lxcfs_getattr,
2518 .readlink = NULL,
2519 .getdir = NULL,
2520 .mknod = NULL,
ab54b798 2521 .mkdir = lxcfs_mkdir,
758ad80c 2522 .unlink = NULL,
50d8d5b5 2523 .rmdir = lxcfs_rmdir,
758ad80c
SH
2524 .symlink = NULL,
2525 .rename = NULL,
2526 .link = NULL,
fd2e4e03 2527 .chmod = lxcfs_chmod,
341b21ad 2528 .chown = lxcfs_chown,
2ad6d2bd 2529 .truncate = lxcfs_truncate,
758ad80c 2530 .utime = NULL,
99978832
SH
2531
2532 .open = lxcfs_open,
2533 .read = lxcfs_read,
2534 .release = lxcfs_release,
2ad6d2bd 2535 .write = lxcfs_write,
99978832 2536
758ad80c 2537 .statfs = NULL,
99978832
SH
2538 .flush = lxcfs_flush,
2539 .fsync = lxcfs_fsync,
758ad80c
SH
2540
2541 .setxattr = NULL,
2542 .getxattr = NULL,
2543 .listxattr = NULL,
2544 .removexattr = NULL,
2545
2546 .opendir = lxcfs_opendir,
2547 .readdir = lxcfs_readdir,
2548 .releasedir = lxcfs_releasedir,
2549
2550 .fsyncdir = NULL,
2551 .init = NULL,
2552 .destroy = NULL,
2553 .access = NULL,
2554 .create = NULL,
2555 .ftruncate = NULL,
2556 .fgetattr = NULL,
2557};
2558
/*
 * Print the command-line synopsis to stderr, using @me as the program
 * name, and terminate the process with exit status 1.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);

	exit(1);
}
2567
/*
 * Return true when @w is one of the accepted spellings of the help
 * request: "-h", "--help", "-help" or "help".  The comparison is exact
 * and case-sensitive.
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}

	return false;
}
2577
0b0f73db
SH
/*
 * Remove the first occurrence of @which from the NULL-terminated
 * argument vector @argv (argv[0] is never considered).
 *
 * The remaining entries, including the terminating NULL, are shifted
 * down by one and *@argcp is decremented.  Nothing changes when @which
 * is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot;

	/* locate the first match after argv[0] */
	for (slot = argv + 1; *slot; slot++) {
		if (strcmp(*slot, which) == 0)
			break;
	}
	if (!*slot)
		return;

	/* close the gap, carrying the terminating NULL along */
	while (*slot) {
		*slot = *(slot + 1);
		slot++;
	}

	(*argcp)--;
}
2592
/*
 * Remove the first "@opt <value>" pair from the NULL-terminated
 * argument vector @argv (argv[0] is never considered).
 *
 * The value following @opt must equal @v.  If it does, both entries are
 * dropped, the rest of the vector (including the terminating NULL) is
 * shifted down by two and *@argcp is reduced by 2.  If @opt is followed
 * by a different value, the offending value is reported on stderr and
 * the process exits with status 1.  Nothing changes when @opt is absent
 * or is the last argument.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		/* an option without a following value cannot match */
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* name what the user actually passed, not what we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+2] is only read while argv[i+1] is non-NULL, so it stays in bounds */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2613
758ad80c
SH
2614int main(int argc, char *argv[])
2615{
2616 int ret;
2617 struct lxcfs_state *d;
0b0f73db
SH
2618 /*
2619 * what we pass to fuse_main is:
2620 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2621 */
2622#define NARGS 7
2623 char *newargv[7];
758ad80c 2624
0b0f73db
SH
2625 /* accomodate older init scripts */
2626 swallow_arg(&argc, argv, "-s");
2627 swallow_arg(&argc, argv, "-f");
2628 swallow_option(&argc, argv, "-o", "allow_other");
2629
2630 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
2631 usage(argv[0]);
2632
0b0f73db
SH
2633 d = NIH_MUST( malloc(sizeof(*d)) );
2634
2635 newargv[0] = argv[0];
2636 newargv[1] = "-s";
2637 newargv[2] = "-f";
2638 newargv[3] = "-o";
2639 newargv[4] = "allow_other";
2640 newargv[5] = argv[1];
2641 newargv[6] = NULL;
758ad80c
SH
2642
2643 if (!cgm_escape_cgroup())
2644 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2645
2646 if (!cgm_get_controllers(&d->subsystems))
2647 return -1;
2648
0b0f73db 2649 ret = fuse_main(NARGS - 1, newargv, &lxcfs_ops, d);
758ad80c
SH
2650
2651 return ret;
2183082c 2652}