]> git.proxmox.com Git - mirror_lxcfs.git/blame - lxcfs.c
implement --version
[mirror_lxcfs.git] / lxcfs.c
CommitLineData
758ad80c
SH
1/* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
f2799430 6 * See COPYING file for details.
758ad80c
SH
7 */
8
758ad80c
SH
9#define FUSE_USE_VERSION 26
10
2183082c 11#include <stdio.h>
758ad80c
SH
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
41bb9357
SH
22#include <sched.h>
23#include <linux/sched.h>
a05660a6 24#include <sys/socket.h>
41bb9357
SH
25#include <sys/mount.h>
26#include <wait.h>
758ad80c
SH
27
28#include <nih/alloc.h>
29#include <nih/string.h>
30
31#include "cgmanager.h"
2e9c0b32 32#include "config.h" // for VERSION
758ad80c
SH
33
/*
 * Daemon-wide state.  It is reached through the private_data pointer of
 * the current FUSE context (see LXCFS_DATA below).
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array holding the names of the
	 * mounted subsystems.  Detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
42
443d13f5
SH
/* Kind of object a struct file_info describes (value of its type field). */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
52
c688e1b3
SH
/*
 * Bookkeeping for one open directory or file handle.  A pointer to it is
 * stored in fuse_file_info->fh by the open/opendir handlers.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* constants */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
};
62
97f1f27b
YY
63/* reserve buffer size, for cpuall in /proc/stat */
64#define BUF_RESERVE_SIZE 256
65
/*
 * Duplicate str with nih_strdup() under the nih parent 'parent', wrapped
 * in NIH_MUST.  A NULL str is passed through as NULL.
 */
static char *must_copy_string(void *parent, const char *str)
{
	return str ? NIH_MUST( nih_strdup(parent, str) ) : NULL;
}
72
4775fba1
SH
/*
 * Reap child 'pid', restarting waitpid() whenever it is interrupted by a
 * signal or reports a different pid.
 *
 * Returns 0 only when the child exited normally with status 0, and -1 in
 * every other case (waitpid error, abnormal termination, non-zero exit).
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}
		if (reaped == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
94
053a659d
SH
/*
 * Translate an id through an id map.
 *
 * idfile is an open FILE * on /proc/pid/{u,g}id_map and in_id is an id
 * valid in the caller's namespace.  The file is rewound and scanned line
 * by line; lines that do not parse as three unsigned numbers are skipped.
 *
 * Returns the id as seen in pid's namespace, or -1 (i.e. UINT_MAX) when
 * no range covers in_id or when a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(entry, 400, idfile)) {
		unsigned int ns_base;	/* first id of the range in the idfile's namespace */
		unsigned int host_base;	/* first id of the range in the caller's namespace */
		unsigned int range;	/* number of ids in the range */

		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around.  Unexpected for a proc
			 * file, so give up rather than guess.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
138
341b21ad
SH
139/*
140 * for is_privileged_over,
141 * specify whether we require the calling uid to be root in his
142 * namespace
143 */
144#define NS_ROOT_REQD true
145#define NS_ROOT_OPT false
146
147static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
758ad80c 148{
053a659d
SH
149 nih_local char *fpath = NULL;
150 bool answer = false;
151 uid_t nsuid;
152
341b21ad
SH
153 if (victim == -1 || uid == -1)
154 return false;
155
156 /*
157 * If the request is one not requiring root in the namespace,
158 * then having the same uid suffices. (i.e. uid 1000 has write
159 * access to files owned by uid 1000
160 */
161 if (!req_ns_root && uid == victim)
758ad80c
SH
162 return true;
163
053a659d
SH
164 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
165 FILE *f = fopen(fpath, "r");
166 if (!f)
167 return false;
168
341b21ad 169 /* if caller's not root in his namespace, reject */
053a659d
SH
170 nsuid = convert_id_to_ns(f, uid);
171 if (nsuid)
172 goto out;
173
341b21ad
SH
174 /*
175 * If victim is not mapped into caller's ns, reject.
176 * XXX I'm not sure this check is needed given that fuse
177 * will be sending requests where the vfs has converted
178 */
053a659d
SH
179 nsuid = convert_id_to_ns(f, victim);
180 if (nsuid == -1)
181 goto out;
182
183 answer = true;
184
185out:
186 fclose(f);
187 return answer;
758ad80c
SH
188}
189
/*
 * Check whether the "other" permission bits of fmode allow the access
 * requested by req_mode (an open(2)-style flag word; only the O_ACCMODE
 * part is looked at).  Callers shift fmode right to test the owner or
 * group bits instead.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t acc = req_mode & O_ACCMODE;
	mode_t needed;

	if (acc == O_RDONLY)
		needed = S_IROTH;
	else if (acc == O_WRONLY)
		needed = S_IWOTH;
	else if (acc == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
209
3db25a35
SH
/*
 * Return the first path component of taskcg that lies below querycg, as
 * a fresh nih-allocated string.  For querycg "/" one leading character
 * of taskcg is skipped; otherwise strlen(querycg) + 1 characters are.
 *
 * Returns NULL (after logging) when taskcg is not longer than querycg.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
228
758ad80c
SH
229/*
230 * check whether a fuse context may access a cgroup dir or file
231 *
232 * If file is not null, it is a cgroup file to check under cg.
233 * If file is null, then we are checking perms on cg itself.
234 *
235 * For files we can check the mode of the list_keys result.
236 * For cgroups, we must make assumptions based on the files under the
237 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
238 * yet.
239 */
240static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
241{
242 nih_local struct cgm_keys **list = NULL;
243 int i;
244
245 if (!file)
246 file = "tasks";
247
248 if (*file == '/')
249 file++;
250
251 if (!cgm_list_keys(contrl, cg, &list))
252 return false;
253 for (i = 0; list[i]; i++) {
254 if (strcmp(list[i]->name, file) == 0) {
255 struct cgm_keys *k = list[i];
341b21ad 256 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
758ad80c
SH
257 if (perms_include(k->mode >> 6, mode))
258 return true;
259 }
260 if (fc->gid == k->gid) {
261 if (perms_include(k->mode >> 3, mode))
262 return true;
263 }
264 return perms_include(k->mode, mode);
265 }
266 }
267
268 return false;
269}
270
3db25a35
SH
/* Remove a single trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
277
278/*
279 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
280 * If caller is in /a, he may act on /a/b, but not on /b.
281 * if the answer is false and nextcg is not NULL, then *nextcg will point
282 * to a nih_alloc'd string containing the next cgroup directory under cg
283 */
284static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
285{
286 nih_local char *fnam = NULL;
287 FILE *f;
288 bool answer = false;
289 char *line = NULL;
290 size_t len = 0;
291
292 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
293 if (!(f = fopen(fnam, "r")))
294 return false;
295
296 while (getline(&line, &len, f) != -1) {
297 char *c1, *c2, *linecmp;
298 if (!line[0])
299 continue;
300 c1 = strchr(line, ':');
301 if (!c1)
302 goto out;
303 c1++;
304 c2 = strchr(c1, ':');
305 if (!c2)
306 goto out;
307 *c2 = '\0';
308 if (strcmp(c1, contrl) != 0)
309 continue;
310 c2++;
311 stripnewline(c2);
312 /*
313 * callers pass in '/' for root cgroup, otherwise they pass
314 * in a cgroup without leading '/'
315 */
316 linecmp = *cg == '/' ? c2 : c2+1;
317 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
318 if (nextcg)
319 *nextcg = get_next_cgroup_dir(linecmp, cg);
320 goto out;
321 }
322 answer = true;
323 goto out;
324 }
325
326out:
327 fclose(f);
328 free(line);
329 return answer;
330}
331
758ad80c
SH
332/*
333 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
334 * and needs to be nih_freed.
335 */
336static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
337{
338 const char *p1;
339 char *ret, *slash;
340
341 if (strlen(path) < 9)
342 return NULL;
ac5d9d48
SH
343 if (*(path+7) != '/')
344 return NULL;
758ad80c
SH
345 p1 = path+8;
346 ret = nih_strdup(NULL, p1);
347 if (!ret)
348 return ret;
349 slash = strstr(ret, "/");
350 if (slash)
351 *slash = '\0';
352
353 /* verify that it is a subsystem */
354 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
355 int i;
356 if (!list) {
357 nih_free(ret);
358 return NULL;
359 }
360 for (i = 0; list[i]; i++) {
361 if (strcmp(list[i], ret) == 0)
362 return ret;
363 }
364 nih_free(ret);
365 return NULL;
366}
367
/*
 * Find the start of the cgroup part in /cgroup/controller/the/cgroup/path.
 * The returned pointer refers into path itself and may include files
 * (keynames) etc.
 *
 * Returns NULL when path is shorter than 9 characters or has no '/'
 * from offset 8 onwards (i.e. it names only /cgroup/controller).
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
383
384static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
385{
386 nih_local char **list = NULL;
387 int i;
388
389 if (!f)
390 return false;
391 if (*f == '/')
392 f++;
393
394 if (!cgm_list_children(contr, dir, &list))
395 return false;
396 for (i = 0; list[i]; i++) {
397 if (strcmp(list[i], f) == 0)
398 return true;
399 }
400
401 return false;
402}
403
404static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
405{
406 nih_local struct cgm_keys **list = NULL;
407 struct cgm_keys *k;
408 int i;
409
410 if (!f)
411 return NULL;
412 if (*f == '/')
413 f++;
414 if (!cgm_list_keys(contr, dir, &list))
415 return NULL;
416 for (i = 0; list[i]; i++) {
417 if (strcmp(list[i]->name, f) == 0) {
418 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
419 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
420 k->uid = list[i]->uid;
421 k->gid = list[i]->gid;
422 k->mode = list[i]->mode;
423 return k;
424 }
425 }
426
427 return NULL;
428}
429
/*
 * Split cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of cg, cut just before the last
 * '/' when there is one.  *file is set to point at that last '/' inside
 * cg itself (so it starts with '/'), or to NULL when cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = strrchr(cg, '/');
	if (*file)
		*strrchr(*dir, '/') = '\0';
}
443
444/*
2ad6d2bd 445 * FUSE ops for /cgroup
758ad80c 446 */
2ad6d2bd 447
758ad80c
SH
448static int cg_getattr(const char *path, struct stat *sb)
449{
450 struct timespec now;
451 struct fuse_context *fc = fuse_get_context();
452 nih_local char * cgdir = NULL;
453 char *fpath = NULL, *path1, *path2;
454 nih_local struct cgm_keys *k = NULL;
455 const char *cgroup;
456 nih_local char *controller = NULL;
457
458
459 if (!fc)
460 return -EIO;
461
462 memset(sb, 0, sizeof(struct stat));
463
464 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
465 return -EINVAL;
466
467 sb->st_uid = sb->st_gid = 0;
468 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
469 sb->st_size = 0;
470
471 if (strcmp(path, "/cgroup") == 0) {
472 sb->st_mode = S_IFDIR | 00755;
473 sb->st_nlink = 2;
474 return 0;
475 }
476
477 controller = pick_controller_from_path(fc, path);
478 if (!controller)
479 return -EIO;
758ad80c
SH
480 cgroup = find_cgroup_in_path(path);
481 if (!cgroup) {
482 /* this is just /cgroup/controller, return it as a dir */
483 sb->st_mode = S_IFDIR | 00755;
484 sb->st_nlink = 2;
485 return 0;
486 }
341b21ad 487
758ad80c
SH
488 get_cgdir_and_path(cgroup, &cgdir, &fpath);
489
490 if (!fpath) {
491 path1 = "/";
492 path2 = cgdir;
493 } else {
494 path1 = cgdir;
495 path2 = fpath;
496 }
497
758ad80c
SH
498 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
499 * Then check that caller's cgroup is under path if fpath is a child
500 * cgroup, or cgdir if fpath is a file */
501
502 if (is_child_cgroup(controller, path1, path2)) {
f9a05025
SH
503 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
504 /* this is just /cgroup/controller, return it as a dir */
505 sb->st_mode = S_IFDIR | 00555;
506 sb->st_nlink = 2;
507 return 0;
508 }
758ad80c 509 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
f9a05025 510 return -EACCES;
758ad80c 511
053a659d
SH
512 // get uid, gid, from '/tasks' file and make up a mode
513 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
514 sb->st_mode = S_IFDIR | 00755;
515 k = get_cgroup_key(controller, cgroup, "tasks");
516 if (!k) {
053a659d
SH
517 sb->st_uid = sb->st_gid = 0;
518 } else {
053a659d
SH
519 sb->st_uid = k->uid;
520 sb->st_gid = k->gid;
521 }
758ad80c
SH
522 sb->st_nlink = 2;
523 return 0;
524 }
525
526 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
3db25a35
SH
527 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
528 return -ENOENT;
758ad80c 529 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
f9a05025 530 return -EACCES;
758ad80c 531
758ad80c 532 sb->st_mode = S_IFREG | k->mode;
053a659d 533 sb->st_nlink = 1;
758ad80c
SH
534 sb->st_uid = k->uid;
535 sb->st_gid = k->gid;
7253e0a4 536 sb->st_size = 0;
758ad80c
SH
537 return 0;
538 }
539
ab54b798 540 return -ENOENT;
758ad80c 541}
2183082c 542
7f163b71
SH
543/*
544 * TODO - cache these results in a table for use in opendir, free
545 * in releasedir
546 */
758ad80c 547static int cg_opendir(const char *path, struct fuse_file_info *fi)
2183082c 548{
7f163b71
SH
549 struct fuse_context *fc = fuse_get_context();
550 nih_local struct cgm_keys **list = NULL;
551 const char *cgroup;
c688e1b3 552 struct file_info *dir_info;
7f163b71 553 nih_local char *controller = NULL;
7f163b71
SH
554
555 if (!fc)
556 return -EIO;
557
c688e1b3
SH
558 if (strcmp(path, "/cgroup") == 0) {
559 cgroup = NULL;
560 controller = NULL;
561 } else {
562 // return list of keys for the controller, and list of child cgroups
563 controller = pick_controller_from_path(fc, path);
564 if (!controller)
565 return -EIO;
7f163b71 566
c688e1b3
SH
567 cgroup = find_cgroup_in_path(path);
568 if (!cgroup) {
569 /* this is just /cgroup/controller, return its contents */
570 cgroup = "/";
571 }
7f163b71
SH
572 }
573
3a6e1a76 574 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
7f163b71 575 return -EACCES;
c688e1b3
SH
576
577 /* we'll free this at cg_releasedir */
578 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
bae07053
SH
579 dir_info->controller = must_copy_string(dir_info, controller);
580 dir_info->cgroup = must_copy_string(dir_info, cgroup);
443d13f5 581 dir_info->type = LXC_TYPE_CGDIR;
c688e1b3 582 dir_info->buf = NULL;
8f6e8f5e 583 dir_info->file = NULL;
c688e1b3
SH
584 dir_info->buflen = 0;
585
586 fi->fh = (unsigned long)dir_info;
758ad80c
SH
587 return 0;
588}
589
758ad80c
SH
590static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
591 struct fuse_file_info *fi)
592{
c688e1b3
SH
593 struct file_info *d = (struct file_info *)fi->fh;
594 nih_local struct cgm_keys **list = NULL;
595 int i;
596 nih_local char *nextcg = NULL;
758ad80c
SH
597 struct fuse_context *fc = fuse_get_context();
598
443d13f5 599 if (d->type != LXC_TYPE_CGDIR) {
b845ad01
SH
600 fprintf(stderr, "Internal error: file cache info used in readdir\n");
601 return -EIO;
602 }
c688e1b3
SH
603 if (!d->cgroup && !d->controller) {
604 // ls /var/lib/lxcfs/cgroup - just show list of controllers
758ad80c
SH
605 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
606 int i;
607
608 if (!list)
609 return -EIO;
7f163b71 610
758ad80c
SH
611 for (i = 0; list[i]; i++) {
612 if (filler(buf, list[i], NULL, 0) != 0) {
613 return -EIO;
614 }
615 }
616 return 0;
617 }
618
c688e1b3 619 if (!cgm_list_keys(d->controller, d->cgroup, &list))
3db25a35 620 // not a valid cgroup
758ad80c 621 return -EINVAL;
3db25a35 622
c688e1b3 623 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
3db25a35
SH
624 if (nextcg) {
625 int ret;
626 ret = filler(buf, nextcg, NULL, 0);
627 if (ret != 0)
628 return -EIO;
629 }
630 return 0;
631 }
632
758ad80c 633 for (i = 0; list[i]; i++) {
758ad80c
SH
634 if (filler(buf, list[i]->name, NULL, 0) != 0) {
635 return -EIO;
636 }
637 }
638
639 // now get the list of child cgroups
422aa4a5 640 nih_local char **clist = NULL;
758ad80c 641
c688e1b3 642 if (!cgm_list_children(d->controller, d->cgroup, &clist))
758ad80c
SH
643 return 0;
644 for (i = 0; clist[i]; i++) {
758ad80c
SH
645 if (filler(buf, clist[i], NULL, 0) != 0) {
646 return -EIO;
647 }
648 }
649 return 0;
650}
651
8f6e8f5e
SH
/*
 * Free a file_info handle.  Every field that was nih_alloc()d with f as
 * its parent is released automatically along with it.
 */
static void do_release_file_info(struct file_info *f)
{
	nih_free(f);
}
660
758ad80c
SH
661static int cg_releasedir(const char *path, struct fuse_file_info *fi)
662{
c688e1b3
SH
663 struct file_info *d = (struct file_info *)fi->fh;
664
8f6e8f5e 665 do_release_file_info(d);
758ad80c
SH
666 return 0;
667}
668
99978832
SH
669static int cg_open(const char *path, struct fuse_file_info *fi)
670{
671 nih_local char *controller = NULL;
672 const char *cgroup;
673 char *fpath = NULL, *path1, *path2;
674 nih_local char * cgdir = NULL;
675 nih_local struct cgm_keys *k = NULL;
8f6e8f5e 676 struct file_info *file_info;
99978832
SH
677 struct fuse_context *fc = fuse_get_context();
678
679 if (!fc)
680 return -EIO;
681
682 controller = pick_controller_from_path(fc, path);
683 if (!controller)
684 return -EIO;
685 cgroup = find_cgroup_in_path(path);
686 if (!cgroup)
687 return -EINVAL;
688
689 get_cgdir_and_path(cgroup, &cgdir, &fpath);
690 if (!fpath) {
691 path1 = "/";
692 path2 = cgdir;
693 } else {
694 path1 = cgdir;
695 path2 = fpath;
696 }
697
8f6e8f5e
SH
698 k = get_cgroup_key(controller, path1, path2);
699 if (!k)
700 return -EINVAL;
99978832 701
8f6e8f5e
SH
702 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
703 // should never get here
704 return -EACCES;
99978832 705
8f6e8f5e
SH
706 /* we'll free this at cg_release */
707 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
bae07053
SH
708 file_info->controller = must_copy_string(file_info, controller);
709 file_info->cgroup = must_copy_string(file_info, path1);
710 file_info->file = must_copy_string(file_info, path2);
443d13f5 711 file_info->type = LXC_TYPE_CGFILE;
8f6e8f5e
SH
712 file_info->buf = NULL;
713 file_info->buflen = 0;
714
715 fi->fh = (unsigned long)file_info;
716 return 0;
717}
718
719static int cg_release(const char *path, struct fuse_file_info *fi)
720{
721 struct file_info *f = (struct file_info *)fi->fh;
722
723 do_release_file_info(f);
724 return 0;
99978832
SH
725}
726
a05660a6
SH
/*
 * Wait up to two seconds for sockfd to become readable, then receive up
 * to len bytes into buf without blocking.
 *
 * Returns the result of recv(), or -1 when select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
741
01e71852
SH
742#define SEND_CREDS_OK 0
743#define SEND_CREDS_NOTSK 1
744#define SEND_CREDS_FAIL 2
745static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
a05660a6
SH
746{
747 struct msghdr msg = { 0 };
748 struct iovec iov;
749 struct cmsghdr *cmsg;
750 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
751 char buf[1];
752 buf[0] = 'p';
753
01e71852
SH
754 if (pingfirst) {
755 if (msgrecv(sock, buf, 1) != 1) {
1420baf8 756 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
01e71852
SH
757 __func__);
758 return SEND_CREDS_FAIL;
759 }
a05660a6
SH
760 }
761
762 msg.msg_control = cmsgbuf;
763 msg.msg_controllen = sizeof(cmsgbuf);
764
765 cmsg = CMSG_FIRSTHDR(&msg);
766 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
767 cmsg->cmsg_level = SOL_SOCKET;
768 cmsg->cmsg_type = SCM_CREDENTIALS;
769 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
770
771 msg.msg_name = NULL;
772 msg.msg_namelen = 0;
773
774 buf[0] = v;
775 iov.iov_base = buf;
776 iov.iov_len = sizeof(buf);
777 msg.msg_iov = &iov;
778 msg.msg_iovlen = 1;
779
780 if (sendmsg(sock, &msg, 0) < 0) {
1420baf8 781 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
a05660a6
SH
782 strerror(errno));
783 if (errno == 3)
01e71852
SH
784 return SEND_CREDS_NOTSK;
785 return SEND_CREDS_FAIL;
a05660a6
SH
786 }
787
01e71852 788 return SEND_CREDS_OK;
a05660a6
SH
789}
790
791static bool recv_creds(int sock, struct ucred *cred, char *v)
792{
793 struct msghdr msg = { 0 };
794 struct iovec iov;
795 struct cmsghdr *cmsg;
796 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
797 char buf[1];
798 int ret;
799 int optval = 1;
6ee867dc
SH
800 struct timeval tv;
801 fd_set rfds;
a05660a6
SH
802
803 *v = '1';
804
805 cred->pid = -1;
806 cred->uid = -1;
807 cred->gid = -1;
808
809 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
1420baf8 810 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
a05660a6
SH
811 return false;
812 }
813 buf[0] = '1';
814 if (write(sock, buf, 1) != 1) {
1420baf8 815 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
a05660a6
SH
816 return false;
817 }
818
819 msg.msg_name = NULL;
820 msg.msg_namelen = 0;
821 msg.msg_control = cmsgbuf;
822 msg.msg_controllen = sizeof(cmsgbuf);
823
824 iov.iov_base = buf;
825 iov.iov_len = sizeof(buf);
826 msg.msg_iov = &iov;
827 msg.msg_iovlen = 1;
828
6ee867dc
SH
829 FD_ZERO(&rfds);
830 FD_SET(sock, &rfds);
831 tv.tv_sec = 2;
832 tv.tv_usec = 0;
ea56f722 833 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
6ee867dc
SH
834 fprintf(stderr, "Failed to select for scm_cred: %s\n",
835 strerror(errno));
836 return false;
837 }
838 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
a05660a6 839 if (ret < 0) {
1420baf8 840 fprintf(stderr, "Failed to receive scm_cred: %s\n",
a05660a6
SH
841 strerror(errno));
842 return false;
843 }
844
845 cmsg = CMSG_FIRSTHDR(&msg);
846
847 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
848 cmsg->cmsg_level == SOL_SOCKET &&
849 cmsg->cmsg_type == SCM_CREDENTIALS) {
850 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
851 }
852 *v = buf[0];
853
854 return true;
855}
856
857
858/*
4775fba1
SH
859 * pid_to_ns - reads pids from a ucred over a socket, then writes the
860 * int value back over the socket. This shifts the pid from the
861 * sender's pidns into tpid's pidns.
a05660a6 862 */
4775fba1 863static void pid_to_ns(int sock, pid_t tpid)
a05660a6
SH
864{
865 char v = '0';
866 struct ucred cred;
867
868 while (recv_creds(sock, &cred, &v)) {
869 if (v == '1')
870 exit(0);
a05660a6
SH
871 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
872 exit(1);
873 }
874 exit(0);
875}
876
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If that
 * ack does not arrive within one second the child is killed and a new
 * one is forked.
 *
 * Never returns: exits 0 when the converting child exited cleanly and 1
 * on any setup failure or when the child failed.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no ack in time: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/*
	 * wait_for_pid() returns 0 when the child exited cleanly, so that is
	 * the case in which we report success ourselves.
	 */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
943
944/*
945 * To read cgroup files with a particular pid, we will setns into the child
946 * pidns, open a pipe, fork a child - which will be the first to really be in
947 * the child ns - which does the cgm_get_value and writes the data to the pipe.
948 */
949static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
950{
951 int sock[2] = {-1, -1};
952 nih_local char *tmpdata = NULL;
953 int ret;
954 pid_t qpid, cpid = -1;
955 bool answer = false;
956 char v = '0';
957 struct ucred cred;
958 struct timeval tv;
959 fd_set s;
960
961 if (!cgm_get_value(contrl, cg, file, &tmpdata))
962 return false;
963
964 /*
965 * Now we read the pids from returned data one by one, pass
966 * them into a child in the target namespace, read back the
967 * translated pids, and put them into our to-return data
968 */
969
970 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
971 perror("socketpair");
972 exit(1);
973 }
974
975 cpid = fork();
976 if (cpid == -1)
977 goto out;
978
979 if (!cpid) // child
4775fba1 980 pid_to_ns_wrapper(sock[1], tpid);
a05660a6
SH
981
982 char *ptr = tmpdata;
983 cred.uid = 0;
984 cred.gid = 0;
985 while (sscanf(ptr, "%d\n", &qpid) == 1) {
986 cred.pid = qpid;
01e71852
SH
987 ret = send_creds(sock[0], &cred, v, true);
988
989 if (ret == SEND_CREDS_NOTSK)
990 goto next;
991 if (ret == SEND_CREDS_FAIL)
a05660a6
SH
992 goto out;
993
994 // read converted results
995 FD_ZERO(&s);
996 FD_SET(sock[0], &s);
6ee867dc 997 tv.tv_sec = 2;
a05660a6
SH
998 tv.tv_usec = 0;
999 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1000 if (ret <= 0) {
6ee867dc
SH
1001 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1002 __func__, strerror(errno));
a05660a6
SH
1003 goto out;
1004 }
1005 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1006 fprintf(stderr, "%s: error reading pid from child: %s\n",
1007 __func__, strerror(errno));
a05660a6
SH
1008 goto out;
1009 }
a05660a6 1010 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
01e71852 1011next:
a05660a6
SH
1012 ptr = strchr(ptr, '\n');
1013 if (!ptr)
1014 break;
1015 ptr++;
1016 }
1017
1018 cred.pid = getpid();
1019 v = '1';
01e71852 1020 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
a05660a6 1021 // failed to ask child to exit
6ee867dc
SH
1022 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1023 __func__, strerror(errno));
a05660a6
SH
1024 goto out;
1025 }
1026
1027 answer = true;
1028
1029out:
1030 if (cpid != -1)
1031 wait_for_pid(cpid);
1032 if (sock[0] != -1) {
1033 close(sock[0]);
1034 close(sock[1]);
1035 }
1036 return answer;
1037}
1038
99978832
SH
1039static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1040 struct fuse_file_info *fi)
1041{
99978832 1042 struct fuse_context *fc = fuse_get_context();
8f6e8f5e 1043 struct file_info *f = (struct file_info *)fi->fh;
99978832
SH
1044 nih_local struct cgm_keys *k = NULL;
1045
443d13f5 1046 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1047 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1048 return -EIO;
1049 }
1050
99978832 1051 if (offset)
7253e0a4 1052 return 0;
99978832
SH
1053
1054 if (!fc)
1055 return -EIO;
1056
8f6e8f5e 1057 if (!f->controller)
99978832
SH
1058 return -EINVAL;
1059
8f6e8f5e 1060 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
99978832 1061 nih_local char *data = NULL;
4775fba1
SH
1062 int s;
1063 bool r;
99978832 1064
8f6e8f5e 1065 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
f9a05025
SH
1066 // should never get here
1067 return -EACCES;
99978832 1068
8f6e8f5e
SH
1069 if (strcmp(f->file, "tasks") == 0 ||
1070 strcmp(f->file, "/tasks") == 0 ||
1071 strcmp(f->file, "/cgroup.procs") == 0 ||
1072 strcmp(f->file, "cgroup.procs") == 0)
a05660a6 1073 // special case - we have to translate the pids
8f6e8f5e 1074 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
a05660a6 1075 else
8f6e8f5e 1076 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
a05660a6 1077
4775fba1 1078 if (!r)
99978832
SH
1079 return -EINVAL;
1080
4775fba1
SH
1081 if (!data)
1082 return 0;
99978832
SH
1083 s = strlen(data);
1084 if (s > size)
1085 s = size;
1086 memcpy(buf, data, s);
5ea0727e
SH
1087 if (s > 0 && s < size && data[s-1] != '\n')
1088 buf[s++] = '\n';
99978832 1089
99978832
SH
1090 return s;
1091 }
1092
1093 return -EINVAL;
1094}
1095
4775fba1
SH
1096static void pid_from_ns(int sock, pid_t tpid)
1097{
1098 pid_t vpid;
1099 struct ucred cred;
1100 char v;
6ee867dc
SH
1101 struct timeval tv;
1102 fd_set s;
1103 int ret;
4775fba1
SH
1104
1105 cred.uid = 0;
1106 cred.gid = 0;
6ee867dc
SH
1107 while (1) {
1108 FD_ZERO(&s);
1109 FD_SET(sock, &s);
1110 tv.tv_sec = 2;
1111 tv.tv_usec = 0;
1112 ret = select(sock+1, &s, NULL, NULL, &tv);
ea56f722
SH
1113 if (ret <= 0) {
1114 fprintf(stderr, "%s: bad select before read from parent: %s\n",
6ee867dc
SH
1115 __func__, strerror(errno));
1116 exit(1);
1117 }
1118 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1119 fprintf(stderr, "%s: bad read from parent: %s\n",
1120 __func__, strerror(errno));
1121 exit(1);
1122 }
4775fba1 1123 if (vpid == -1) // done
01e71852 1124 break;
4775fba1
SH
1125 v = '0';
1126 cred.pid = vpid;
01e71852 1127 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
4775fba1
SH
1128 v = '1';
1129 cred.pid = getpid();
01e71852 1130 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
4775fba1
SH
1131 exit(1);
1132 }
1133 }
1134 exit(0);
1135}
1136
/*
 * setns() into the pid namespace of tpid, then fork a child that runs
 * pid_from_ns() on sock.  The fork is needed because only children
 * forked after the setns() actually live in the target pidns.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If that
 * ack does not arrive within one second the child is killed and a new
 * one is forked.
 *
 * Never returns: exits 0 when the child exited cleanly and 1 on any
 * setup failure or when the child failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no ack in time: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/*
	 * wait_for_pid() returns 0 when the child exited cleanly, so that is
	 * the case in which we report success ourselves.
	 */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1199
1200static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1201{
1202 int sock[2] = {-1, -1};
1203 pid_t qpid, cpid = -1;
1204 bool answer = false, fail = false;
1205
1206 /*
1207 * write the pids to a socket, have helper in writer's pidns
1208 * call movepid for us
1209 */
1210 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1211 perror("socketpair");
1212 exit(1);
1213 }
1214
1215 cpid = fork();
1216 if (cpid == -1)
1217 goto out;
1218
1219 if (!cpid) // child
1220 pid_from_ns_wrapper(sock[1], tpid);
1221
1222 const char *ptr = buf;
1223 while (sscanf(ptr, "%d", &qpid) == 1) {
1224 struct ucred cred;
1225 char v;
1226
1227 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
6ee867dc
SH
1228 fprintf(stderr, "%s: error writing pid to child: %s\n",
1229 __func__, strerror(errno));
4775fba1
SH
1230 goto out;
1231 }
1232
01e71852
SH
1233 if (recv_creds(sock[0], &cred, &v)) {
1234 if (v == '0') {
1235 if (!cgm_move_pid(contrl, cg, cred.pid))
1236 fail = true;
1237 }
4775fba1
SH
1238 }
1239
1240 ptr = strchr(ptr, '\n');
1241 if (!ptr)
1242 break;
1243 ptr++;
1244 }
1245
1246 /* All good, write the value */
1247 qpid = -1;
1248 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1420baf8 1249 fprintf(stderr, "Warning: failed to ask child to exit\n");
4775fba1
SH
1250
1251 if (!fail)
1252 answer = true;
1253
1254out:
1255 if (cpid != -1)
1256 wait_for_pid(cpid);
1257 if (sock[0] != -1) {
1258 close(sock[0]);
1259 close(sock[1]);
1260 }
1261 return answer;
1262}
1263
2ad6d2bd
SH
1264int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1265 struct fuse_file_info *fi)
1266{
2ad6d2bd 1267 struct fuse_context *fc = fuse_get_context();
47cbf0e5 1268 nih_local char *localbuf = NULL;
8f6e8f5e
SH
1269 nih_local struct cgm_keys *k = NULL;
1270 struct file_info *f = (struct file_info *)fi->fh;
2ad6d2bd 1271
443d13f5 1272 if (f->type != LXC_TYPE_CGFILE) {
b845ad01
SH
1273 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1274 return -EIO;
1275 }
1276
2ad6d2bd 1277 if (offset)
7253e0a4 1278 return 0;
2ad6d2bd
SH
1279
1280 if (!fc)
1281 return -EIO;
1282
47cbf0e5
SH
1283 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1284 localbuf[size] = '\0';
1285 memcpy(localbuf, buf, size);
2ad6d2bd 1286
8f6e8f5e 1287 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
4775fba1
SH
1288 bool r;
1289
8f6e8f5e 1290 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
f9a05025 1291 return -EACCES;
2ad6d2bd 1292
8f6e8f5e
SH
1293 if (strcmp(f->file, "tasks") == 0 ||
1294 strcmp(f->file, "/tasks") == 0 ||
1295 strcmp(f->file, "/cgroup.procs") == 0 ||
1296 strcmp(f->file, "cgroup.procs") == 0)
4775fba1 1297 // special case - we have to translate the pids
8f6e8f5e 1298 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
4775fba1 1299 else
8f6e8f5e 1300 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
4775fba1
SH
1301
1302 if (!r)
2ad6d2bd
SH
1303 return -EINVAL;
1304
1305 return size;
1306 }
1307
1308 return -EINVAL;
1309}
1310
341b21ad
SH
1311int cg_chown(const char *path, uid_t uid, gid_t gid)
1312{
1313 struct fuse_context *fc = fuse_get_context();
1314 nih_local char * cgdir = NULL;
1315 char *fpath = NULL, *path1, *path2;
1316 nih_local struct cgm_keys *k = NULL;
1317 const char *cgroup;
1318 nih_local char *controller = NULL;
1319
1320
1321 if (!fc)
1322 return -EIO;
1323
1324 if (strcmp(path, "/cgroup") == 0)
1325 return -EINVAL;
1326
1327 controller = pick_controller_from_path(fc, path);
1328 if (!controller)
f9a05025 1329 return -EINVAL;
341b21ad
SH
1330 cgroup = find_cgroup_in_path(path);
1331 if (!cgroup)
1332 /* this is just /cgroup/controller */
1333 return -EINVAL;
1334
1335 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1336
1337 if (!fpath) {
1338 path1 = "/";
1339 path2 = cgdir;
1340 } else {
1341 path1 = cgdir;
1342 path2 = fpath;
1343 }
1344
1345 if (is_child_cgroup(controller, path1, path2)) {
1346 // get uid, gid, from '/tasks' file and make up a mode
1347 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1348 k = get_cgroup_key(controller, cgroup, "tasks");
1349
1350 } else
1351 k = get_cgroup_key(controller, path1, path2);
1352
1353 if (!k)
1354 return -EINVAL;
1355
1356 /*
1357 * This being a fuse request, the uid and gid must be valid
1358 * in the caller's namespace. So we can just check to make
1359 * sure that the caller is root in his uid, and privileged
1360 * over the file's current owner.
1361 */
1362 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
f9a05025 1363 return -EACCES;
341b21ad
SH
1364
1365 if (!cgm_chown_file(controller, cgroup, uid, gid))
1366 return -EINVAL;
1367 return 0;
1368}
2ad6d2bd 1369
fd2e4e03
SH
1370int cg_chmod(const char *path, mode_t mode)
1371{
0a1bb5ea
SH
1372 struct fuse_context *fc = fuse_get_context();
1373 nih_local char * cgdir = NULL;
1374 char *fpath = NULL, *path1, *path2;
1375 nih_local struct cgm_keys *k = NULL;
1376 const char *cgroup;
1377 nih_local char *controller = NULL;
1378
1379 if (!fc)
1380 return -EIO;
1381
1382 if (strcmp(path, "/cgroup") == 0)
1383 return -EINVAL;
1384
1385 controller = pick_controller_from_path(fc, path);
1386 if (!controller)
f9a05025 1387 return -EINVAL;
0a1bb5ea
SH
1388 cgroup = find_cgroup_in_path(path);
1389 if (!cgroup)
1390 /* this is just /cgroup/controller */
1391 return -EINVAL;
1392
1393 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1394
1395 if (!fpath) {
1396 path1 = "/";
1397 path2 = cgdir;
1398 } else {
1399 path1 = cgdir;
1400 path2 = fpath;
1401 }
1402
1403 if (is_child_cgroup(controller, path1, path2)) {
1404 // get uid, gid, from '/tasks' file and make up a mode
1405 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1406 k = get_cgroup_key(controller, cgroup, "tasks");
1407
1408 } else
1409 k = get_cgroup_key(controller, path1, path2);
1410
1411 if (!k)
1412 return -EINVAL;
1413
1414 /*
1415 * This being a fuse request, the uid and gid must be valid
1416 * in the caller's namespace. So we can just check to make
1417 * sure that the caller is root in his uid, and privileged
1418 * over the file's current owner.
1419 */
1420 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1421 return -EPERM;
1422
1423 if (!cgm_chmod_file(controller, cgroup, mode))
1424 return -EINVAL;
1425 return 0;
fd2e4e03
SH
1426}
1427
ab54b798
SH
1428int cg_mkdir(const char *path, mode_t mode)
1429{
1430 struct fuse_context *fc = fuse_get_context();
1431 nih_local struct cgm_keys **list = NULL;
1432 char *fpath = NULL, *path1;
1433 nih_local char * cgdir = NULL;
1434 const char *cgroup;
1435 nih_local char *controller = NULL;
1436
ab54b798
SH
1437 if (!fc)
1438 return -EIO;
1439
1440
1441 controller = pick_controller_from_path(fc, path);
1442 if (!controller)
f9a05025 1443 return -EINVAL;
ab54b798
SH
1444
1445 cgroup = find_cgroup_in_path(path);
1446 if (!cgroup)
f9a05025 1447 return -EINVAL;
ab54b798
SH
1448
1449 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1450 if (!fpath)
1451 path1 = "/";
1452 else
1453 path1 = cgdir;
1454
1455 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
f9a05025 1456 return -EACCES;
ab54b798
SH
1457
1458
1459 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1460 return -EINVAL;
1461
1462 return 0;
1463}
1464
50d8d5b5
SH
1465static int cg_rmdir(const char *path)
1466{
1467 struct fuse_context *fc = fuse_get_context();
1468 nih_local struct cgm_keys **list = NULL;
1469 char *fpath = NULL;
1470 nih_local char * cgdir = NULL;
1471 const char *cgroup;
1472 nih_local char *controller = NULL;
1473
1474 if (!fc)
1475 return -EIO;
1476
1477
1478 controller = pick_controller_from_path(fc, path);
1479 if (!controller)
f9a05025 1480 return -EINVAL;
50d8d5b5
SH
1481
1482 cgroup = find_cgroup_in_path(path);
1483 if (!cgroup)
f9a05025 1484 return -EINVAL;
50d8d5b5
SH
1485
1486 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1487 if (!fpath)
1488 return -EINVAL;
1489
1490 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
f9a05025 1491 return -EACCES;
50d8d5b5
SH
1492
1493 if (!cgm_remove(controller, cgroup))
1494 return -EINVAL;
1495
1496 return 0;
1497}
1498
2dc17609
SH
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1505
/*
 * Scan the memory.stat text in @memstat line by line for the
 * "total_cache" entry and store its value divided by 1024 in @v.
 * @v is set to 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cursor = memstat;

	*v = 0;
	while (*cursor) {
		char *nl;

		if (strncmp(cursor, key, keylen) == 0) {
			sscanf(cursor + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* move on to the start of the next line */
		nl = strchr(cursor, '\n');
		if (!nl)
			return;
		cursor = nl + 1;
	}
}
1523
/*
 * Look up one counter in a blkio statistics blob.
 *
 * @str holds lines that start with "<major>:<minor> <iotype>" followed by
 * a number.  The number from the first line matching @major, @minor and
 * @iotype is stored in @v; @v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *p = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);
	*v = 0;

	while (*p) {
		if (strncmp(p, key, keylen) == 0) {
			sscanf(p + keylen, "%lu", v);
			return;
		}

		/* skip to the beginning of the next line */
		p = strchr(p, '\n');
		if (!p)
			return;
		p++;
	}
}
1546
2dc17609
SH
1547static char *get_pid_cgroup(pid_t pid, const char *contrl)
1548{
1549 nih_local char *fnam = NULL;
1550 FILE *f;
1551 char *answer = NULL;
1552 char *line = NULL;
1553 size_t len = 0;
1554
1555 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1556 if (!(f = fopen(fnam, "r")))
1557 return false;
1558
1559 while (getline(&line, &len, f) != -1) {
1560 char *c1, *c2;
1561 if (!line[0])
1562 continue;
1563 c1 = strchr(line, ':');
1564 if (!c1)
1565 goto out;
1566 c1++;
1567 c2 = strchr(c1, ':');
1568 if (!c2)
1569 goto out;
1570 *c2 = '\0';
1571 if (strcmp(c1, contrl) != 0)
1572 continue;
1573 c2++;
1574 stripnewline(c2);
1575 answer = NIH_MUST( nih_strdup(NULL, c2) );
1576 goto out;
1577 }
1578
1579out:
1580 fclose(f);
1581 free(line);
1582 return answer;
1583}
1584
758ad80c 1585/*
2ad6d2bd 1586 * FUSE ops for /proc
758ad80c 1587 */
758ad80c 1588
23ce2127
SH
1589static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1590 struct fuse_file_info *fi)
1591{
2dc17609 1592 struct fuse_context *fc = fuse_get_context();
97f1f27b 1593 struct file_info *d = (struct file_info *)fi->fh;
2dc17609
SH
1594 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1595 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1596 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1597 char *line = NULL;
1598 size_t linelen = 0, total_len = 0;
97f1f27b
YY
1599 char *cache = d->buf;
1600 size_t cache_size = d->buflen;
2dc17609
SH
1601 FILE *f;
1602
97f1f27b
YY
1603 if (offset){
1604 if (offset > d->size)
1605 return -EINVAL;
1606 int left = d->size - offset;
1607 total_len = left > size ? size: left;
1608 memcpy(buf, cache + offset, total_len);
1609 return total_len;
1610 }
2dc17609
SH
1611
1612 if (!cg)
1613 return 0;
1614
1615 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1616 return 0;
1617 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1618 return 0;
1619 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1620 return 0;
1621 memlimit = strtoul(memlimit_str, NULL, 10);
1622 memusage = strtoul(memusage_str, NULL, 10);
1623 memlimit /= 1024;
1624 memusage /= 1024;
1625 get_mem_cached(memstat_str, &cached);
1626
1627 f = fopen("/proc/meminfo", "r");
1628 if (!f)
1629 return 0;
1630
1631 while (getline(&line, &linelen, f) != -1) {
1632 size_t l;
1633 char *printme, lbuf[100];
1634
1635 memset(lbuf, 0, 100);
1636 if (startswith(line, "MemTotal:")) {
1637 sscanf(line+14, "%lu", &hosttotal);
1638 if (hosttotal < memlimit)
1639 memlimit = hosttotal;
1640 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1641 printme = lbuf;
1642 } else if (startswith(line, "MemFree:")) {
1643 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1644 printme = lbuf;
1645 } else if (startswith(line, "MemAvailable:")) {
1646 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1647 printme = lbuf;
1648 } else if (startswith(line, "Buffers:")) {
1649 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1650 printme = lbuf;
1651 } else if (startswith(line, "Cached:")) {
1652 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1653 printme = lbuf;
1654 } else if (startswith(line, "SwapCached:")) {
1655 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1656 printme = lbuf;
1657 } else
1658 printme = line;
97f1f27b
YY
1659
1660 l = snprintf(cache, cache_size, "%s", printme);
1661 cache += l;
1662 cache_size -= l;
2f919d9d 1663 total_len += l;
2dc17609
SH
1664 }
1665
97f1f27b
YY
1666 d->size = total_len;
1667 if (total_len > size ) total_len = size;
1668 memcpy(buf, d->buf, total_len);
1669
92c84dc4
SH
1670 fclose(f);
1671 free(line);
2dc17609 1672 return total_len;
23ce2127
SH
1673}
1674
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg from the cpuset controller.
 * The result is a nih-allocated string, or NULL when the lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1687
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse the cpu number from a "processor : N" line of /proc/cpuinfo and
 * report whether that cpu is part of @cpuset.  Lines that do not have
 * this form yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
1698
23ce2127
SH
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. the keyword followed by a colon and a number.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
1710
23ce2127
SH
1711static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1712 struct fuse_file_info *fi)
1713{
1714 struct fuse_context *fc = fuse_get_context();
97f1f27b 1715 struct file_info *d = (struct file_info *)fi->fh;
23ce2127
SH
1716 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1717 nih_local char *cpuset = NULL;
1718 char *line = NULL;
1719 size_t linelen = 0, total_len = 0;
1720 bool am_printing = false;
1721 int curcpu = -1;
97f1f27b
YY
1722 char *cache = d->buf;
1723 size_t cache_size = d->buflen;
23ce2127
SH
1724 FILE *f;
1725
97f1f27b
YY
1726 if (offset){
1727 if (offset > d->size)
1728 return -EINVAL;
1729 int left = d->size - offset;
1730 total_len = left > size ? size: left;
1731 memcpy(buf, cache + offset, total_len);
2f919d9d 1732 return total_len;
97f1f27b 1733 }
23ce2127
SH
1734
1735 if (!cg)
1736 return 0;
1737
1738 cpuset = get_cpuset(cg);
1739 if (!cpuset)
1740 return 0;
1741
1742 f = fopen("/proc/cpuinfo", "r");
1743 if (!f)
1744 return 0;
1745
1746 while (getline(&line, &linelen, f) != -1) {
1747 size_t l;
1748 if (is_processor_line(line)) {
aeb56147 1749 am_printing = cpuline_in_cpuset(line, cpuset);
23ce2127
SH
1750 if (am_printing) {
1751 curcpu ++;
97f1f27b
YY
1752 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1753 if (l < cache_size){
1754 cache += l;
1755 cache_size -= l;
1756 total_len += l;
1757 }else{
1758 cache += cache_size;
1759 total_len += cache_size;
1760 cache_size = 0;
1761 break;
1762 }
23ce2127
SH
1763 }
1764 continue;
1765 }
1766 if (am_printing) {
97f1f27b
YY
1767 l = snprintf(cache, cache_size, "%s", line);
1768 if (l < cache_size) {
1769 cache += l;
1770 cache_size -= l;
1771 total_len += l;
1772 } else {
1773 cache += cache_size;
1774 total_len += cache_size;
1775 cache_size = 0;
1776 break;
1777 }
23ce2127
SH
1778 }
1779 }
1780
97f1f27b
YY
1781 d->size = total_len;
1782 if (total_len > size ) total_len = size;
1783
1784 /* read from off 0 */
1785 memcpy(buf, d->buf, total_len);
1786
92c84dc4
SH
1787 fclose(f);
1788 free(line);
23ce2127
SH
1789 return total_len;
1790}
1791
1792static int proc_stat_read(char *buf, size_t size, off_t offset,
1793 struct fuse_file_info *fi)
1794{
aeb56147 1795 struct fuse_context *fc = fuse_get_context();
97f1f27b 1796 struct file_info *d = (struct file_info *)fi->fh;
aeb56147
SH
1797 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1798 nih_local char *cpuset = NULL;
1799 char *line = NULL;
1800 size_t linelen = 0, total_len = 0;
2a0fde62 1801 int curcpu = -1; /* cpu numbering starts at 0 */
97f1f27b
YY
1802 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1803 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1804 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1805#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1806 char cpuall[CPUALL_MAX_SIZE];
1807 /* reserve for cpu all */
1808 char *cache = d->buf + CPUALL_MAX_SIZE;
1809 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
aeb56147
SH
1810 FILE *f;
1811
97f1f27b
YY
1812 if (offset){
1813 if (offset > d->size)
1814 return -EINVAL;
1815 int left = d->size - offset;
1816 total_len = left > size ? size: left;
1817 memcpy(buf, d->buf + offset, total_len);
2f919d9d 1818 return total_len;
97f1f27b 1819 }
aeb56147
SH
1820
1821 if (!cg)
1822 return 0;
1823
1824 cpuset = get_cpuset(cg);
1825 if (!cpuset)
1826 return 0;
1827
1828 f = fopen("/proc/stat", "r");
1829 if (!f)
1830 return 0;
1831
97f1f27b
YY
1832 //skip first line
1833 if (getline(&line, &linelen, f) < 0) {
1834 fprintf(stderr, "proc_stat_read read first line failed\n");
1835 goto out;
1836 }
1837
aeb56147
SH
1838 while (getline(&line, &linelen, f) != -1) {
1839 size_t l;
1840 int cpu;
2a0fde62 1841 char cpu_char[10]; /* That's a lot of cores */
aeb56147
SH
1842 char *c;
1843
2a0fde62
CB
1844 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1845 /* not a ^cpuN line containing a number N, just print it */
97f1f27b
YY
1846 l = snprintf(cache, cache_size, "%s", line);
1847 if (l < cache_size){
1848 cache += l;
1849 cache_size -= l;
1850 total_len += l;
1851 continue;
1852 }else{
1853 //no more space, break it
1854 cache += cache_size;
1855 total_len += cache_size;
1856 cache_size = 0;
1857 break;
1858 }
aeb56147 1859 }
2a0fde62
CB
1860
1861 if (sscanf(cpu_char, "%d", &cpu) != 1)
1862 continue;
aeb56147
SH
1863 if (!cpu_in_cpuset(cpu, cpuset))
1864 continue;
1865 curcpu ++;
1866
1867 c = strchr(line, ' ');
1868 if (!c)
1869 continue;
25c5e8fb 1870 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
97f1f27b
YY
1871 cache += l;
1872 cache_size -= l;
aeb56147 1873 total_len += l;
2f919d9d 1874
97f1f27b
YY
1875 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1876 &softirq, &steal, &guest) != 9)
1877 continue;
1878 user_sum += user;
1879 nice_sum += nice;
1880 system_sum += system;
1881 idle_sum += idle;
1882 iowait_sum += iowait;
1883 irq_sum += irq;
1884 softirq_sum += softirq;
1885 steal_sum += steal;
2f919d9d 1886 guest_sum += guest;
97f1f27b
YY
1887 }
1888
1889 cache = d->buf;
1890
2f919d9d 1891 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
97f1f27b
YY
1892 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1893 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1894 memcpy(cache, cpuall, cpuall_len);
2f919d9d 1895 cache += cpuall_len;
97f1f27b
YY
1896 }else{
1897 /* shouldn't happen */
1898 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1899 cpuall_len = 0;
1900 }
1901
1902 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1903 total_len += cpuall_len;
1904 d->size = total_len;
1905 if (total_len > size ) total_len = size;
1906
1907 memcpy(buf, d->buf, total_len);
97f1f27b 1908out:
92c84dc4
SH
1909 fclose(f);
1910 free(line);
aeb56147 1911 return total_len;
23ce2127
SH
1912}
1913
7bbf2246
SH
1914/*
1915 * How to guess what to present for uptime?
1916 * One thing we could do would be to take the date on the caller's
1917 * memory.usage_in_bytes file, which should equal the time of creation
1918 * of his cgroup. However, a task could be in a sub-cgroup of the
1919 * container. The same problem exists if we try to look at the ages
1920 * of processes in the caller's cgroup.
1921 *
1922 * So we'll fork a task that will enter the caller's pidns, mount a
1923 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1924 *
1925 * For the second uptime #, we'll do as Stéphane had done, just copy
1926 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1927 * time. Maybe someone can come up with a good algorithm and submit a
1928 * patch. Maybe something based on cpushare info?
1929 */
41bb9357
SH
1930
1931/* return age of the reaper for $pid, taken from ctime of its procdir */
1932static long int get_pid1_time(pid_t pid)
1933{
1934 char fnam[100];
ea56f722 1935 int fd, cpipe[2], ret;
41bb9357 1936 struct stat sb;
ea56f722
SH
1937 pid_t cpid;
1938 struct timeval tv;
1939 fd_set s;
1940 char v;
41bb9357
SH
1941
1942 if (unshare(CLONE_NEWNS))
1943 return 0;
1944
5ca64c2a
SG
1945 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1946 perror("rslave mount failed");
1947 return 0;
1948 }
1949
c0adec85
SH
1950 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
1951 if (ret < 0 || ret >= sizeof(fnam))
1952 return 0;
1953
41bb9357
SH
1954 fd = open(fnam, O_RDONLY);
1955 if (fd < 0) {
1956 perror("get_pid1_time open of ns/pid");
1957 return 0;
1958 }
1959 if (setns(fd, 0)) {
1960 perror("get_pid1_time setns 1");
1961 close(fd);
1962 return 0;
1963 }
1964 close(fd);
41bb9357 1965
ea56f722
SH
1966 if (pipe(cpipe) < 0)
1967 exit(1);
41bb9357 1968
ea56f722
SH
1969loop:
1970 cpid = fork();
1971 if (cpid < 0)
41bb9357 1972 return 0;
ea56f722
SH
1973
1974 if (!cpid) {
1975 char b = '1';
1976 close(cpipe[0]);
1977 if (write(cpipe[1], &b, sizeof(char)) < 0) {
1978 fprintf(stderr, "%s (child): erorr on write: %s\n",
1979 __func__, strerror(errno));
1980 }
1981 close(cpipe[1]);
1982 umount2("/proc", MNT_DETACH);
1983 if (mount("proc", "/proc", "proc", 0, NULL)) {
1984 perror("get_pid1_time mount");
1985 return 0;
1986 }
1987 ret = lstat("/proc/1", &sb);
1988 if (ret) {
1989 perror("get_pid1_time lstat");
1990 return 0;
1991 }
1992 return time(NULL) - sb.st_ctime;
41bb9357 1993 }
ea56f722
SH
1994
1995 // give the child 1 second to be done forking and
1996 // write it's ack
1997 FD_ZERO(&s);
1998 FD_SET(cpipe[0], &s);
1999 tv.tv_sec = 1;
2000 tv.tv_usec = 0;
2001 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2002 if (ret <= 0)
2003 goto again;
2004 ret = read(cpipe[0], &v, 1);
2005 if (ret != sizeof(char) || v != '1') {
2006 goto again;
41bb9357 2007 }
ea56f722
SH
2008
2009 wait_for_pid(cpid);
2010 exit(0);
2011
2012again:
2013 kill(cpid, SIGKILL);
2014 wait_for_pid(cpid);
2015 goto loop;
41bb9357
SH
2016}
2017
/*
 * Return the age in seconds of the init process of @qpid's pid
 * namespace, or 0 when it cannot be determined.
 *
 * The namespace switching is done in a forked helper (get_pid1_time),
 * which reports the result back over a pipe.  We wait at most one second
 * for the answer, then reap the helper.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no helper: nothing to wait for, just release the pipe */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* select() returns 0 on timeout; errno is meaningless here */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2063
/*
 * Return the idle time (second field of /proc/uptime) in whole seconds,
 * or 0 when the file cannot be read or parsed.
 *
 * Both fields of /proc/uptime are decimal numbers with a fractional
 * part, so they have to be parsed as floating point; an integer
 * conversion stops at the first '.' and the second field never matches.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	/* callers expect whole seconds */
	return (long int)idle;
}
2077
2078/*
2079 * We read /proc/uptime and reuse its second field.
2080 * For the first field, we use the mtime for the reaper for
2081 * the calling pid as returned by getreaperage
2082 */
23ce2127
SH
2083static int proc_uptime_read(char *buf, size_t size, off_t offset,
2084 struct fuse_file_info *fi)
2085{
41bb9357 2086 struct fuse_context *fc = fuse_get_context();
97f1f27b 2087 struct file_info *d = (struct file_info *)fi->fh;
41bb9357
SH
2088 long int reaperage = getreaperage(fc->pid);;
2089 long int idletime = getprocidle();
97f1f27b 2090 size_t total_len = 0;
41bb9357 2091
97f1f27b
YY
2092 if (offset){
2093 if (offset > d->size)
2094 return -EINVAL;
2095 return 0;
2096 }
2097
2098 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2099 d->size = total_len;
2100 return total_len;
23ce2127
SH
2101}
2102
49878439
YY
2103static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2104 struct fuse_file_info *fi)
2105{
2106 char dev_name[72];
2107 struct fuse_context *fc = fuse_get_context();
97f1f27b 2108 struct file_info *d = (struct file_info *)fi->fh;
49878439
YY
2109 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2110 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2111 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2112 unsigned long read = 0, write = 0;
2113 unsigned long read_merged = 0, write_merged = 0;
2114 unsigned long read_sectors = 0, write_sectors = 0;
2115 unsigned long read_ticks = 0, write_ticks = 0;
2116 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2117 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2118 char *line = NULL;
2119 size_t linelen = 0, total_len = 0;
2120 unsigned int major = 0, minor = 0;
2121 int i = 0;
2122 FILE *f;
2123
97f1f27b
YY
2124 if (offset){
2125 if (offset > d->size)
2126 return -EINVAL;
2127 return 0;
2128 }
49878439
YY
2129
2130 if (!cg)
2131 return 0;
2132
2133 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2134 return 0;
2135 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2136 return 0;
2137 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2138 return 0;
2139 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2140 return 0;
2141 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2142 return 0;
2143
2144
2145 f = fopen("/proc/diskstats", "r");
2146 if (!f)
2147 return 0;
2148
2149 while (getline(&line, &linelen, f) != -1) {
2150 size_t l;
2151 char *printme, lbuf[256];
2152
c0adec85 2153 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
49878439
YY
2154 if(i == 3){
2155 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2156 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2157 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2158 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2159 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2160 read_sectors = read_sectors/512;
2161 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2162 write_sectors = write_sectors/512;
2f919d9d 2163
49878439
YY
2164 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2165 rd_svctm = rd_svctm/1000000;
2166 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2167 rd_wait = rd_wait/1000000;
2168 read_ticks = rd_svctm + rd_wait;
2169
2170 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2171 wr_svctm = wr_svctm/1000000;
2172 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2173 wr_wait = wr_wait/1000000;
2174 write_ticks = wr_svctm + wr_wait;
2175
2176 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2177 tot_ticks = tot_ticks/1000000;
2178 }else{
2179 continue;
2180 }
2181
2182 memset(lbuf, 0, 256);
2183 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2f919d9d 2184 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
49878439
YY
2185 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2186 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2187 printme = lbuf;
2188 } else
2189 continue;
2190
2191 l = snprintf(buf, size, "%s", printme);
2192 buf += l;
2193 size -= l;
2194 total_len += l;
2195 }
2196
97f1f27b
YY
2197 d->size = total_len;
2198
49878439
YY
2199 fclose(f);
2200 free(line);
2201 return total_len;
2202}
2203
23ce2127
SH
/*
 * Return the total number of bytes that can currently be read from the
 * file @which, determined by reading it line by line.  Returns 0 when
 * the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t cap = 0;
	ssize_t total = 0;

	if (!f)
		return 0;

	for (;;) {
		ssize_t got = getline(&line, &cap, f);

		if (got == -1)
			break;
		total += got;
	}

	fclose (f);
	free(line);

	return total;
}
2220
758ad80c
SH
/*
 * getattr for /proc and the files emulated below it.
 *
 * All entries are owned by uid/gid 0 and carry the current time as
 * atime, mtime and ctime.  "/proc" is a 0555 directory; the emulated
 * files are 0444 regular files reported with size 0.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and
 * -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated) / sizeof(emulated[0]); i++) {
		if (strcmp(path, emulated[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2248
2249static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2250 struct fuse_file_info *fi)
2251{
2252 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2253 filler(buf, "meminfo", NULL, 0) != 0 ||
2254 filler(buf, "stat", NULL, 0) != 0 ||
49878439
YY
2255 filler(buf, "uptime", NULL, 0) != 0 ||
2256 filler(buf, "diskstats", NULL, 0) != 0)
758ad80c 2257 return -EINVAL;
758ad80c
SH
2258 return 0;
2259}
2260
35629743
SH
2261static int proc_open(const char *path, struct fuse_file_info *fi)
2262{
96fc5ee6
SH
2263 int type = -1;
2264 struct file_info *info;
2265
2266 if (strcmp(path, "/proc/meminfo") == 0)
2267 type = LXC_TYPE_PROC_MEMINFO;
2268 else if (strcmp(path, "/proc/cpuinfo") == 0)
2269 type = LXC_TYPE_PROC_CPUINFO;
2270 else if (strcmp(path, "/proc/uptime") == 0)
2271 type = LXC_TYPE_PROC_UPTIME;
2272 else if (strcmp(path, "/proc/stat") == 0)
2273 type = LXC_TYPE_PROC_STAT;
2274 else if (strcmp(path, "/proc/diskstats") == 0)
2275 type = LXC_TYPE_PROC_DISKSTATS;
2276 if (type == -1)
2277 return -ENOENT;
2278
2279 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2280 memset(info, 0, sizeof(*info));
2281 info->type = type;
2282
97f1f27b 2283 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
25c5e8fb 2284 info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
97f1f27b
YY
2285 memset(info->buf, 0, info->buflen);
2286 /* set actual size to buffer size */
2f919d9d 2287 info->size = info->buflen;
97f1f27b 2288
96fc5ee6
SH
2289 fi->fh = (unsigned long)info;
2290 return 0;
2291}
2292
2293static int proc_release(const char *path, struct fuse_file_info *fi)
2294{
2295 struct file_info *f = (struct file_info *)fi->fh;
2296
2297 do_release_file_info(f);
2298 return 0;
35629743
SH
2299}
2300
35629743
SH
2301static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2302 struct fuse_file_info *fi)
2303{
96fc5ee6
SH
2304 struct file_info *f = (struct file_info *) fi->fh;
2305
2306 switch (f->type) {
2f919d9d 2307 case LXC_TYPE_PROC_MEMINFO:
23ce2127 2308 return proc_meminfo_read(buf, size, offset, fi);
96fc5ee6 2309 case LXC_TYPE_PROC_CPUINFO:
23ce2127 2310 return proc_cpuinfo_read(buf, size, offset, fi);
96fc5ee6 2311 case LXC_TYPE_PROC_UPTIME:
23ce2127 2312 return proc_uptime_read(buf, size, offset, fi);
96fc5ee6 2313 case LXC_TYPE_PROC_STAT:
23ce2127 2314 return proc_stat_read(buf, size, offset, fi);
96fc5ee6 2315 case LXC_TYPE_PROC_DISKSTATS:
49878439 2316 return proc_diskstats_read(buf, size, offset, fi);
96fc5ee6
SH
2317 default:
2318 return -EINVAL;
2319 }
35629743
SH
2320}
2321
2ad6d2bd
SH
2322/*
2323 * FUSE ops for /
2324 * these just delegate to the /proc and /cgroup ops as
2325 * needed
2326 */
758ad80c
SH
2327
2328static int lxcfs_getattr(const char *path, struct stat *sb)
2329{
2330 if (strcmp(path, "/") == 0) {
2331 sb->st_mode = S_IFDIR | 00755;
2332 sb->st_nlink = 2;
2333 return 0;
2334 }
2335 if (strncmp(path, "/cgroup", 7) == 0) {
2336 return cg_getattr(path, sb);
2337 }
35629743 2338 if (strncmp(path, "/proc", 5) == 0) {
758ad80c
SH
2339 return proc_getattr(path, sb);
2340 }
2341 return -EINVAL;
2342}
2343
/*
 * FUSE opendir entry point.
 *
 * "/" and "/proc" need no per-open state and succeed immediately; paths
 * starting with "/cgroup" are delegated to cg_opendir().  Anything else
 * yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) != 0)
		return -ENOENT;

	return cg_opendir(path, fi);
}
2356
2357static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2358 struct fuse_file_info *fi)
2359{
2360 if (strcmp(path, "/") == 0) {
2361 if (filler(buf, "proc", NULL, 0) != 0 ||
2362 filler(buf, "cgroup", NULL, 0) != 0)
2363 return -EINVAL;
2364 return 0;
2365 }
35629743 2366 if (strncmp(path, "/cgroup", 7) == 0)
758ad80c 2367 return cg_readdir(path, buf, filler, offset, fi);
35629743
SH
2368 if (strcmp(path, "/proc") == 0)
2369 return proc_readdir(path, buf, filler, offset, fi);
758ad80c
SH
2370 return -EINVAL;
2371}
2372
/*
 * FUSE releasedir entry point.
 *
 * "/" and "/proc" have nothing to release and return 0; paths starting
 * with "/cgroup" are delegated to cg_releasedir().  Anything else yields
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_releasedir(path, fi);
}
2384
99978832
SH
/*
 * FUSE open entry point: dispatch on the path prefix to cg_open()
 * ("/cgroup") or proc_open() ("/proc").  Any other path yields -EINVAL.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool is_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (is_cgroup)
		return cg_open(path, fi);
	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_open(path, fi);
}
2394
/*
 * FUSE read entry point: dispatch on the path prefix to cg_read()
 * ("/cgroup") or proc_read() ("/proc") and return their result.
 * Any other path yields -EINVAL.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool is_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (is_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_read(path, buf, size, offset, fi);
}
2405
2ad6d2bd
SH
/*
 * FUSE write entry point.  Only paths starting with "/cgroup" can be
 * written; they are delegated to cg_write().  Everything else yields
 * -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2415
99978832
SH
/*
 * FUSE flush entry point.  Nothing is done here; always reports
 * success.
 */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;
	return 0;
}
2420
/*
 * FUSE release entry point: dispatch on the path prefix to cg_release()
 * ("/cgroup") or proc_release() ("/proc").  Any other path yields
 * -EINVAL.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool is_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (is_cgroup)
		return cg_release(path, fi);
	if (strncmp(path, "/proc", 5) != 0)
		return -EINVAL;

	return proc_release(path, fi);
}
2430
/*
 * FUSE fsync entry point.  Nothing is done here; always reports
 * success.
 */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;
	return 0;
}
2435
ab54b798
SH
/*
 * FUSE mkdir entry point.  Directories can only be created below
 * "/cgroup" (delegated to cg_mkdir()); any other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2443
341b21ad
SH
/*
 * FUSE chown entry point.  Ownership changes are only supported below
 * "/cgroup" (delegated to cg_chown()); any other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2451
2ad6d2bd
SH
/*
 * FUSE truncate entry point.
 *
 * cat first does a truncate before doing ops->write, which has no
 * sensible meaning for cgroup files.  For paths starting with "/cgroup"
 * we therefore report success without doing anything; every other path
 * yields -EINVAL.  @newsize is ignored.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2463
50d8d5b5
SH
/*
 * FUSE rmdir entry point.  Directories can only be removed below
 * "/cgroup" (delegated to cg_rmdir()); any other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2470
fd2e4e03
SH
/*
 * FUSE chmod entry point.  Mode changes are only supported below
 * "/cgroup" (delegated to cg_chmod()); any other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2477
758ad80c
SH
2478const struct fuse_operations lxcfs_ops = {
2479 .getattr = lxcfs_getattr,
2480 .readlink = NULL,
2481 .getdir = NULL,
2482 .mknod = NULL,
ab54b798 2483 .mkdir = lxcfs_mkdir,
758ad80c 2484 .unlink = NULL,
50d8d5b5 2485 .rmdir = lxcfs_rmdir,
758ad80c
SH
2486 .symlink = NULL,
2487 .rename = NULL,
2488 .link = NULL,
fd2e4e03 2489 .chmod = lxcfs_chmod,
341b21ad 2490 .chown = lxcfs_chown,
2ad6d2bd 2491 .truncate = lxcfs_truncate,
758ad80c 2492 .utime = NULL,
99978832
SH
2493
2494 .open = lxcfs_open,
2495 .read = lxcfs_read,
2496 .release = lxcfs_release,
2ad6d2bd 2497 .write = lxcfs_write,
99978832 2498
758ad80c 2499 .statfs = NULL,
99978832
SH
2500 .flush = lxcfs_flush,
2501 .fsync = lxcfs_fsync,
758ad80c
SH
2502
2503 .setxattr = NULL,
2504 .getxattr = NULL,
2505 .listxattr = NULL,
2506 .removexattr = NULL,
2507
2508 .opendir = lxcfs_opendir,
2509 .readdir = lxcfs_readdir,
2510 .releasedir = lxcfs_releasedir,
2511
2512 .fsyncdir = NULL,
2513 .init = NULL,
2514 .destroy = NULL,
2515 .access = NULL,
2516 .create = NULL,
2517 .ftruncate = NULL,
2518 .fgetattr = NULL,
2519};
2520
/*
 * Print a short usage summary to stderr and terminate the process with
 * exit status 1.  @me is the program name shown in the summary.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
2529
/*
 * Return true if @w is one of the accepted spellings of the help
 * request: "-h", "--help", "-help" or "help".
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h", "--help", "-help", "help",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(spellings) / sizeof(spellings[0]); idx++) {
		if (strcmp(w, spellings[idx]) == 0)
			return true;
	}
	return false;
}
2539
0b0f73db
SH
/*
 * Remove the first occurrence of the argument @which from the
 * NULL-terminated vector @argv (argv[0] is never examined for a match).
 *
 * The following entries, including the terminating NULL, are shifted
 * down by one and *@argcp is decremented.  If @which is not present,
 * nothing is changed.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* locate the first matching argument, if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap */
	for (; *slot; slot++)
		slot[0] = slot[1];
	--*argcp;
}
2554
/*
 * Remove the first "@opt <value>" pair from the NULL-terminated vector
 * @argv (argv[0] is never examined for a match).
 *
 * If @opt is found and followed by exactly @v, both entries are removed,
 * the remaining entries are shifted down by two and *@argcp is reduced by
 * two.  If @opt is followed by any other value, the offending value is
 * reported on stderr and the process exits with status 1.  An @opt that
 * is the last argument (no value follows) is left untouched, as is a
 * vector that does not contain @opt at all.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	/* an option needs a value, so stop once no successor is left */
	for (i = 1; argv[i] && argv[i+1]; i++) {
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually passed, not what we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/*
		 * argv[i+1] being non-NULL guarantees that argv[i+2] is still
		 * inside the vector (at worst it is the terminating NULL).
		 */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2575
758ad80c
SH
2576int main(int argc, char *argv[])
2577{
c0adec85 2578 int ret = -1;
e5d26e0b 2579 struct lxcfs_state *d = NULL;
0b0f73db
SH
2580 /*
2581 * what we pass to fuse_main is:
2582 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2583 */
2584#define NARGS 7
2585 char *newargv[7];
758ad80c 2586
0b0f73db
SH
2587 /* accomodate older init scripts */
2588 swallow_arg(&argc, argv, "-s");
2589 swallow_arg(&argc, argv, "-f");
2590 swallow_option(&argc, argv, "-o", "allow_other");
2591
2e9c0b32
SH
2592 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
2593 fprintf(stderr, "%s\n", VERSION);
2594 exit(0);
2595 }
0b0f73db 2596 if (argc != 2 || is_help(argv[1]))
758ad80c
SH
2597 usage(argv[0]);
2598
0b0f73db
SH
2599 d = NIH_MUST( malloc(sizeof(*d)) );
2600
2601 newargv[0] = argv[0];
2602 newargv[1] = "-s";
2603 newargv[2] = "-f";
2604 newargv[3] = "-o";
7253e0a4 2605 newargv[4] = "allow_other,direct_io";
0b0f73db
SH
2606 newargv[5] = argv[1];
2607 newargv[6] = NULL;
758ad80c
SH
2608
2609 if (!cgm_escape_cgroup())
2610 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2611
2612 if (!cgm_get_controllers(&d->subsystems))
c0adec85 2613 goto out;
758ad80c 2614
0b0f73db 2615 ret = fuse_main(NARGS - 1, newargv, &lxcfs_ops, d);
758ad80c 2616
c0adec85 2617out:
e5d26e0b 2618 free(d);
758ad80c 2619 return ret;
2183082c 2620}