]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
add cache for proc file, for support multiple read
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13 #define FUSE_USE_VERSION 26
14
15 #include <stdio.h>
16 #include <dirent.h>
17 #include <fcntl.h>
18 #include <fuse.h>
19 #include <unistd.h>
20 #include <errno.h>
21 #include <stdbool.h>
22 #include <time.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <libgen.h>
26 #include <sched.h>
27 #include <linux/sched.h>
28 #include <sys/socket.h>
29 #include <sys/mount.h>
30 #include <wait.h>
31
32 #include <nih/alloc.h>
33 #include <nih/string.h>
34
35 #include "cgmanager.h"
36
/*
 * Filesystem-wide state. It is retrieved through the private_data pointer
 * of the current FUSE context (see LXCFS_DATA).
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array with the names of the
	 * mounted subsystems; detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
45
/*
 * Tag stored in file_info.type, identifying what an open handle refers to.
 */
enum {
	LXC_TYPE_CGDIR = 0,		/* set by cg_opendir() */
	LXC_TYPE_CGFILE,		/* set by cg_open() */
	/* presumably tags for the matching /proc files - users not shown here */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
55
/*
 * Per-handle bookkeeping. The open callbacks allocate one of these and
 * store its address in fuse_file_info->fh; the release callbacks free it.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* values */
	char *buf;	// unused as of yet
	int buflen;
	int size;	//actual data size
};

/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256
68
/*
 * Duplicate str with nih_strdup(), making the copy a nih child of parent.
 * A NULL str is passed through as NULL; the allocation itself is wrapped
 * in NIH_MUST.
 */
static char *must_copy_string(void *parent, const char *str)
{
	char *copy = NULL;

	if (str)
		copy = NIH_MUST( nih_strdup(parent, str) );
	return copy;
}
75
/*
 * Reap child pid and report how it ended.
 *
 * Returns 0 only if the child exited normally with status 0, and -1 if
 * waitpid() failed (other than EINTR, which is retried) or the child ended
 * any other way.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		pid_t reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)
		return 0;
	return -1;
}
97
/*
 * Translate an id through an open /proc/pid/{u,g}id_map stream.
 *
 * idfile is rewound and read line by line; each line is expected to hold
 * "<ns base> <host base> <count>". in_id is an id valid in the caller's
 * namespace (the "host" column). Lines that do not parse are skipped.
 *
 * Returns the id as seen in pid's namespace, or -1 (i.e. UINT_MAX, since
 * the return type is unsigned) if no range contains in_id or a range
 * wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];
	unsigned int ns_base,	// first id of the range inside the namespace
		host_base,	// first id of the range in the caller's namespace
		range_len;	// how many ids the range covers

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &range_len) != 3)
			continue;

		/* a wrapping range is not expected from a procfile: give up */
		if (host_base + range_len < host_base || ns_base + range_len < ns_base) {
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range_len, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range_len)
			continue;

		/*
		 * host_base <= in_id < host_base + range_len and neither
		 * range wraps, so this sum stays below ns_base + range_len
		 * and cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no matching range
	return -1;
}
141
142 /*
143 * for is_privileged_over,
144 * specify whether we require the calling uid to be root in his
145 * namespace
146 */
147 #define NS_ROOT_REQD true
148 #define NS_ROOT_OPT false
149
150 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
151 {
152 nih_local char *fpath = NULL;
153 bool answer = false;
154 uid_t nsuid;
155
156 if (victim == -1 || uid == -1)
157 return false;
158
159 /*
160 * If the request is one not requiring root in the namespace,
161 * then having the same uid suffices. (i.e. uid 1000 has write
162 * access to files owned by uid 1000
163 */
164 if (!req_ns_root && uid == victim)
165 return true;
166
167 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
168 FILE *f = fopen(fpath, "r");
169 if (!f)
170 return false;
171
172 /* if caller's not root in his namespace, reject */
173 nsuid = convert_id_to_ns(f, uid);
174 if (nsuid)
175 goto out;
176
177 /*
178 * If victim is not mapped into caller's ns, reject.
179 * XXX I'm not sure this check is needed given that fuse
180 * will be sending requests where the vfs has converted
181 */
182 nsuid = convert_id_to_ns(f, victim);
183 if (nsuid == -1)
184 goto out;
185
186 answer = true;
187
188 out:
189 fclose(f);
190 return answer;
191 }
192
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the open(2)-style flags in req_mode.
 *
 * Only the O_ACCMODE part of req_mode is considered. Callers select the
 * owner or group triplet by shifting fmode right before the call. An
 * access mode that is none of O_RDONLY/O_WRONLY/O_RDWR is refused.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	const mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
212
/*
 * Return the first path component of taskcg that follows querycg, as a
 * freshly nih-allocated string (no nih parent).
 *
 * The bytes of querycg plus one separator are skipped at the start of
 * taskcg (a single byte when querycg is "/"), and the result is cut at the
 * next '/'. taskcg must be strictly longer than querycg, otherwise an
 * error is logged and NULL is returned.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *sep;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	sep = strchr(next, '/');
	if (sep != NULL)
		*sep = '\0';
	return next;
}
231
232 /*
233 * check whether a fuse context may access a cgroup dir or file
234 *
235 * If file is not null, it is a cgroup file to check under cg.
236 * If file is null, then we are checking perms on cg itself.
237 *
238 * For files we can check the mode of the list_keys result.
239 * For cgroups, we must make assumptions based on the files under the
240 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
241 * yet.
242 */
243 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
244 {
245 nih_local struct cgm_keys **list = NULL;
246 int i;
247
248 if (!file)
249 file = "tasks";
250
251 if (*file == '/')
252 file++;
253
254 if (!cgm_list_keys(contrl, cg, &list))
255 return false;
256 for (i = 0; list[i]; i++) {
257 if (strcmp(list[i]->name, file) == 0) {
258 struct cgm_keys *k = list[i];
259 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
260 if (perms_include(k->mode >> 6, mode))
261 return true;
262 }
263 if (fc->gid == k->gid) {
264 if (perms_include(k->mode >> 3, mode))
265 return true;
266 }
267 return perms_include(k->mode, mode);
268 }
269 }
270
271 return false;
272 }
273
/* Remove one trailing '\n' from x, in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
280
281 /*
282 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
283 * If caller is in /a, he may act on /a/b, but not on /b.
284 * if the answer is false and nextcg is not NULL, then *nextcg will point
285 * to a nih_alloc'd string containing the next cgroup directory under cg
286 */
287 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
288 {
289 nih_local char *fnam = NULL;
290 FILE *f;
291 bool answer = false;
292 char *line = NULL;
293 size_t len = 0;
294
295 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
296 if (!(f = fopen(fnam, "r")))
297 return false;
298
299 while (getline(&line, &len, f) != -1) {
300 char *c1, *c2, *linecmp;
301 if (!line[0])
302 continue;
303 c1 = strchr(line, ':');
304 if (!c1)
305 goto out;
306 c1++;
307 c2 = strchr(c1, ':');
308 if (!c2)
309 goto out;
310 *c2 = '\0';
311 if (strcmp(c1, contrl) != 0)
312 continue;
313 c2++;
314 stripnewline(c2);
315 /*
316 * callers pass in '/' for root cgroup, otherwise they pass
317 * in a cgroup without leading '/'
318 */
319 linecmp = *cg == '/' ? c2 : c2+1;
320 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
321 if (nextcg)
322 *nextcg = get_next_cgroup_dir(linecmp, cg);
323 goto out;
324 }
325 answer = true;
326 goto out;
327 }
328
329 out:
330 fclose(f);
331 free(line);
332 return answer;
333 }
334
335 /*
336 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
337 * and needs to be nih_freed.
338 */
339 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
340 {
341 const char *p1;
342 char *ret, *slash;
343
344 if (strlen(path) < 9)
345 return NULL;
346 p1 = path+8;
347 ret = nih_strdup(NULL, p1);
348 if (!ret)
349 return ret;
350 slash = strstr(ret, "/");
351 if (slash)
352 *slash = '\0';
353
354 /* verify that it is a subsystem */
355 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
356 int i;
357 if (!list) {
358 nih_free(ret);
359 return NULL;
360 }
361 for (i = 0; list[i]; i++) {
362 if (strcmp(list[i], ret) == 0)
363 return ret;
364 }
365 nih_free(ret);
366 return NULL;
367 }
368
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into path, just past the first '/' found from offset
 * 8 onwards, or NULL if path is shorter than 9 bytes or has no such '/'.
 * Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
384
385 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
386 {
387 nih_local char **list = NULL;
388 int i;
389
390 if (!f)
391 return false;
392 if (*f == '/')
393 f++;
394
395 if (!cgm_list_children(contr, dir, &list))
396 return false;
397 for (i = 0; list[i]; i++) {
398 if (strcmp(list[i], f) == 0)
399 return true;
400 }
401
402 return false;
403 }
404
405 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
406 {
407 nih_local struct cgm_keys **list = NULL;
408 struct cgm_keys *k;
409 int i;
410
411 if (!f)
412 return NULL;
413 if (*f == '/')
414 f++;
415 if (!cgm_list_keys(contr, dir, &list))
416 return NULL;
417 for (i = 0; list[i]; i++) {
418 if (strcmp(list[i]->name, f) == 0) {
419 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
420 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
421 k->uid = list[i]->uid;
422 k->gid = list[i]->gid;
423 k->mode = list[i]->mode;
424 return k;
425 }
426 }
427
428 return NULL;
429 }
430
/*
 * Split cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of cg (no nih parent), truncated at
 * the last '/'. *file is set to point at that last '/' inside the
 * caller's cg string - it is not a copy and still starts with the '/'.
 * If cg contains no '/', *dir is the full copy and *file is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy = NIH_MUST( nih_strdup(NULL, cg) );
	char *sep = strrchr(cg, '/');

	*dir = copy;
	*file = sep;
	if (sep == NULL)
		return;

	/* cut the copy at the same offset as the last '/' in cg */
	copy[sep - cg] = '\0';
}
444
445 static size_t get_file_size(const char *contrl, const char *cg, const char *f)
446 {
447 nih_local char *data = NULL;
448 size_t s;
449 if (!cgm_get_value(contrl, cg, f, &data))
450 return -EINVAL;
451 s = strlen(data);
452 return s;
453 }
454
455 /*
456 * FUSE ops for /cgroup
457 */
458
459 static int cg_getattr(const char *path, struct stat *sb)
460 {
461 struct timespec now;
462 struct fuse_context *fc = fuse_get_context();
463 nih_local char * cgdir = NULL;
464 char *fpath = NULL, *path1, *path2;
465 nih_local struct cgm_keys *k = NULL;
466 const char *cgroup;
467 nih_local char *controller = NULL;
468
469
470 if (!fc)
471 return -EIO;
472
473 memset(sb, 0, sizeof(struct stat));
474
475 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
476 return -EINVAL;
477
478 sb->st_uid = sb->st_gid = 0;
479 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
480 sb->st_size = 0;
481
482 if (strcmp(path, "/cgroup") == 0) {
483 sb->st_mode = S_IFDIR | 00755;
484 sb->st_nlink = 2;
485 return 0;
486 }
487
488 controller = pick_controller_from_path(fc, path);
489 if (!controller)
490 return -EIO;
491 cgroup = find_cgroup_in_path(path);
492 if (!cgroup) {
493 /* this is just /cgroup/controller, return it as a dir */
494 sb->st_mode = S_IFDIR | 00755;
495 sb->st_nlink = 2;
496 return 0;
497 }
498
499 get_cgdir_and_path(cgroup, &cgdir, &fpath);
500
501 if (!fpath) {
502 path1 = "/";
503 path2 = cgdir;
504 } else {
505 path1 = cgdir;
506 path2 = fpath;
507 }
508
509 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
510 * Then check that caller's cgroup is under path if fpath is a child
511 * cgroup, or cgdir if fpath is a file */
512
513 if (is_child_cgroup(controller, path1, path2)) {
514 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
515 /* this is just /cgroup/controller, return it as a dir */
516 sb->st_mode = S_IFDIR | 00555;
517 sb->st_nlink = 2;
518 return 0;
519 }
520 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
521 return -EACCES;
522
523 // get uid, gid, from '/tasks' file and make up a mode
524 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
525 sb->st_mode = S_IFDIR | 00755;
526 k = get_cgroup_key(controller, cgroup, "tasks");
527 if (!k) {
528 sb->st_uid = sb->st_gid = 0;
529 } else {
530 sb->st_uid = k->uid;
531 sb->st_gid = k->gid;
532 }
533 sb->st_nlink = 2;
534 return 0;
535 }
536
537 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
538 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
539 return -ENOENT;
540 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
541 return -EACCES;
542
543 sb->st_mode = S_IFREG | k->mode;
544 sb->st_nlink = 1;
545 sb->st_uid = k->uid;
546 sb->st_gid = k->gid;
547 sb->st_size = get_file_size(controller, path1, path2);
548 return 0;
549 }
550
551 return -ENOENT;
552 }
553
554 /*
555 * TODO - cache these results in a table for use in opendir, free
556 * in releasedir
557 */
558 static int cg_opendir(const char *path, struct fuse_file_info *fi)
559 {
560 struct fuse_context *fc = fuse_get_context();
561 nih_local struct cgm_keys **list = NULL;
562 const char *cgroup;
563 struct file_info *dir_info;
564 nih_local char *controller = NULL;
565
566 if (!fc)
567 return -EIO;
568
569 if (strcmp(path, "/cgroup") == 0) {
570 cgroup = NULL;
571 controller = NULL;
572 } else {
573 // return list of keys for the controller, and list of child cgroups
574 controller = pick_controller_from_path(fc, path);
575 if (!controller)
576 return -EIO;
577
578 cgroup = find_cgroup_in_path(path);
579 if (!cgroup) {
580 /* this is just /cgroup/controller, return its contents */
581 cgroup = "/";
582 }
583 }
584
585 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
586 return -EACCES;
587
588 /* we'll free this at cg_releasedir */
589 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
590 dir_info->controller = must_copy_string(dir_info, controller);
591 dir_info->cgroup = must_copy_string(dir_info, cgroup);
592 dir_info->type = LXC_TYPE_CGDIR;
593 dir_info->buf = NULL;
594 dir_info->file = NULL;
595 dir_info->buflen = 0;
596
597 fi->fh = (unsigned long)dir_info;
598 return 0;
599 }
600
601 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
602 struct fuse_file_info *fi)
603 {
604 struct file_info *d = (struct file_info *)fi->fh;
605 nih_local struct cgm_keys **list = NULL;
606 int i;
607 nih_local char *nextcg = NULL;
608 struct fuse_context *fc = fuse_get_context();
609
610 if (d->type != LXC_TYPE_CGDIR) {
611 fprintf(stderr, "Internal error: file cache info used in readdir\n");
612 return -EIO;
613 }
614 if (!d->cgroup && !d->controller) {
615 // ls /var/lib/lxcfs/cgroup - just show list of controllers
616 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
617 int i;
618
619 if (!list)
620 return -EIO;
621
622 for (i = 0; list[i]; i++) {
623 if (filler(buf, list[i], NULL, 0) != 0) {
624 return -EIO;
625 }
626 }
627 return 0;
628 }
629
630 if (!cgm_list_keys(d->controller, d->cgroup, &list))
631 // not a valid cgroup
632 return -EINVAL;
633
634 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
635 if (nextcg) {
636 int ret;
637 ret = filler(buf, nextcg, NULL, 0);
638 if (ret != 0)
639 return -EIO;
640 }
641 return 0;
642 }
643
644 for (i = 0; list[i]; i++) {
645 if (filler(buf, list[i]->name, NULL, 0) != 0) {
646 return -EIO;
647 }
648 }
649
650 // now get the list of child cgroups
651 nih_local char **clist = NULL;
652
653 if (!cgm_list_children(d->controller, d->cgroup, &clist))
654 return 0;
655 for (i = 0; clist[i]; i++) {
656 if (filler(buf, clist[i], NULL, 0) != 0) {
657 return -EIO;
658 }
659 }
660 return 0;
661 }
662
663 static void do_release_file_info(struct file_info *f)
664 {
665 /*
666 * all file_info fields which are nih_alloc()d with f as parent
667 * will be automatically freed
668 */
669 if (!f->buf) {
670 nih_free(f->buf);
671 f->buf = NULL;
672 }
673 nih_free(f);
674 }
675
676 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
677 {
678 struct file_info *d = (struct file_info *)fi->fh;
679
680 do_release_file_info(d);
681 return 0;
682 }
683
684 static int cg_open(const char *path, struct fuse_file_info *fi)
685 {
686 nih_local char *controller = NULL;
687 const char *cgroup;
688 char *fpath = NULL, *path1, *path2;
689 nih_local char * cgdir = NULL;
690 nih_local struct cgm_keys *k = NULL;
691 struct file_info *file_info;
692 struct fuse_context *fc = fuse_get_context();
693
694 if (!fc)
695 return -EIO;
696
697 controller = pick_controller_from_path(fc, path);
698 if (!controller)
699 return -EIO;
700 cgroup = find_cgroup_in_path(path);
701 if (!cgroup)
702 return -EINVAL;
703
704 get_cgdir_and_path(cgroup, &cgdir, &fpath);
705 if (!fpath) {
706 path1 = "/";
707 path2 = cgdir;
708 } else {
709 path1 = cgdir;
710 path2 = fpath;
711 }
712
713 k = get_cgroup_key(controller, path1, path2);
714 if (!k)
715 return -EINVAL;
716
717 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
718 // should never get here
719 return -EACCES;
720
721 /* we'll free this at cg_release */
722 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
723 file_info->controller = must_copy_string(file_info, controller);
724 file_info->cgroup = must_copy_string(file_info, path1);
725 file_info->file = must_copy_string(file_info, path2);
726 file_info->type = LXC_TYPE_CGFILE;
727 file_info->buf = NULL;
728 file_info->buflen = 0;
729
730 fi->fh = (unsigned long)file_info;
731 return 0;
732 }
733
734 static int cg_release(const char *path, struct fuse_file_info *fi)
735 {
736 struct file_info *f = (struct file_info *)fi->fh;
737
738 do_release_file_info(f);
739 return 0;
740 }
741
/*
 * Receive up to len bytes from sockfd into buf, waiting at most two
 * seconds for the socket to become readable.
 *
 * Returns what recv() returns, or -1 if select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd + 1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
756
/* Results of send_creds() */
#define SEND_CREDS_OK 0		/* message sent */
#define SEND_CREDS_NOTSK 1	/* sendmsg() failed with ESRCH */
#define SEND_CREDS_FAIL 2	/* any other failure */

/*
 * Send the one-byte payload v over sock, with *cred attached as an
 * SCM_CREDENTIALS control message.
 *
 * If pingfirst is set, one byte is first awaited from the peer (see
 * msgrecv()); not getting it is a SEND_CREDS_FAIL.
 *
 * Returns SEND_CREDS_OK, SEND_CREDS_NOTSK or SEND_CREDS_FAIL.
 */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	buf[0] = 'p';

	if (pingfirst) {
		if (msgrecv(sock, buf, 1) != 1) {
			fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
				  __func__);
			return SEND_CREDS_FAIL;
		}
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDENTIALS;
	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));

	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	buf[0] = v;
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (sendmsg(sock, &msg, 0) < 0) {
		/* fprintf()/strerror() may overwrite errno: keep our own copy */
		int saved_errno = errno;

		fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
			  strerror(saved_errno));
		if (saved_errno == ESRCH)
			return SEND_CREDS_NOTSK;
		return SEND_CREDS_FAIL;
	}

	return SEND_CREDS_OK;
}
805
/*
 * Receive a one-byte payload with attached credentials from sock.
 *
 * SO_PASSCRED is enabled on sock and a '1' byte is written to it first
 * (send_creds() with pingfirst set waits for such a byte). The reply is
 * awaited for at most two seconds.
 *
 * *v starts out as '1' and all fields of *cred as -1. On success *v is
 * the received payload byte, and *cred is filled in only if the message
 * carried a well-formed SCM_CREDENTIALS control message.
 *
 * Returns true once a message was received, false on any error or timeout.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char byte[1] = { '1' };
	struct iovec iov = { .iov_base = byte, .iov_len = sizeof(byte) };
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	struct msghdr msg = { 0 };
	struct cmsghdr *hdr;
	fd_set readable;
	int on = 1;

	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	if (write(sock, byte, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	FD_ZERO(&readable);
	FD_SET(sock, &readable);
	if (select(sock + 1, &readable, NULL, NULL, &timeout) <= 0) {
		fprintf(stderr, "Failed to select for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	hdr = CMSG_FIRSTHDR(&msg);
	if (hdr != NULL &&
	    hdr->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    hdr->cmsg_level == SOL_SOCKET &&
	    hdr->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(hdr), sizeof(*cred));

	*v = byte[0];
	return true;
}
871
872
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Never returns: the process exits with 0 once a payload of '1' arrives
 * or receiving fails, and with 1 if a pid cannot be written back.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			break;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			exit(1);
	}
	exit(0);
}
891
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe. If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns. The exit status is 0 if the worker child exited cleanly
 * and 1 otherwise. wait_for_pid() returns 0 on success, so the earlier
 * "!wait_for_pid()" test reported the opposite.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();
	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		/* does not return */
		pid_to_ns(sock, tpid);
	}
	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
956
957 /*
958 * To read cgroup files with a particular pid, we will setns into the child
959 * pidns, open a pipe, fork a child - which will be the first to really be in
960 * the child ns - which does the cgm_get_value and writes the data to the pipe.
961 */
962 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
963 {
964 int sock[2] = {-1, -1};
965 nih_local char *tmpdata = NULL;
966 int ret;
967 pid_t qpid, cpid = -1;
968 bool answer = false;
969 char v = '0';
970 struct ucred cred;
971 struct timeval tv;
972 fd_set s;
973
974 if (!cgm_get_value(contrl, cg, file, &tmpdata))
975 return false;
976
977 /*
978 * Now we read the pids from returned data one by one, pass
979 * them into a child in the target namespace, read back the
980 * translated pids, and put them into our to-return data
981 */
982
983 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
984 perror("socketpair");
985 exit(1);
986 }
987
988 cpid = fork();
989 if (cpid == -1)
990 goto out;
991
992 if (!cpid) // child
993 pid_to_ns_wrapper(sock[1], tpid);
994
995 char *ptr = tmpdata;
996 cred.uid = 0;
997 cred.gid = 0;
998 while (sscanf(ptr, "%d\n", &qpid) == 1) {
999 cred.pid = qpid;
1000 ret = send_creds(sock[0], &cred, v, true);
1001
1002 if (ret == SEND_CREDS_NOTSK)
1003 goto next;
1004 if (ret == SEND_CREDS_FAIL)
1005 goto out;
1006
1007 // read converted results
1008 FD_ZERO(&s);
1009 FD_SET(sock[0], &s);
1010 tv.tv_sec = 2;
1011 tv.tv_usec = 0;
1012 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1013 if (ret <= 0) {
1014 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1015 __func__, strerror(errno));
1016 goto out;
1017 }
1018 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1019 fprintf(stderr, "%s: error reading pid from child: %s\n",
1020 __func__, strerror(errno));
1021 goto out;
1022 }
1023 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
1024 next:
1025 ptr = strchr(ptr, '\n');
1026 if (!ptr)
1027 break;
1028 ptr++;
1029 }
1030
1031 cred.pid = getpid();
1032 v = '1';
1033 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1034 // failed to ask child to exit
1035 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1036 __func__, strerror(errno));
1037 goto out;
1038 }
1039
1040 answer = true;
1041
1042 out:
1043 if (cpid != -1)
1044 wait_for_pid(cpid);
1045 if (sock[0] != -1) {
1046 close(sock[0]);
1047 close(sock[1]);
1048 }
1049 return answer;
1050 }
1051
1052 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1053 struct fuse_file_info *fi)
1054 {
1055 struct fuse_context *fc = fuse_get_context();
1056 struct file_info *f = (struct file_info *)fi->fh;
1057 nih_local struct cgm_keys *k = NULL;
1058
1059 if (f->type != LXC_TYPE_CGFILE) {
1060 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1061 return -EIO;
1062 }
1063
1064 if (offset)
1065 return -EIO;
1066
1067 if (!fc)
1068 return -EIO;
1069
1070 if (!f->controller)
1071 return -EINVAL;
1072
1073 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1074 nih_local char *data = NULL;
1075 int s;
1076 bool r;
1077
1078 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
1079 // should never get here
1080 return -EACCES;
1081
1082 if (strcmp(f->file, "tasks") == 0 ||
1083 strcmp(f->file, "/tasks") == 0 ||
1084 strcmp(f->file, "/cgroup.procs") == 0 ||
1085 strcmp(f->file, "cgroup.procs") == 0)
1086 // special case - we have to translate the pids
1087 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1088 else
1089 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1090
1091 if (!r)
1092 return -EINVAL;
1093
1094 if (!data)
1095 return 0;
1096 s = strlen(data);
1097 if (s > size)
1098 s = size;
1099 memcpy(buf, data, s);
1100
1101 return s;
1102 }
1103
1104 return -EINVAL;
1105 }
1106
1107 static void pid_from_ns(int sock, pid_t tpid)
1108 {
1109 pid_t vpid;
1110 struct ucred cred;
1111 char v;
1112 struct timeval tv;
1113 fd_set s;
1114 int ret;
1115
1116 cred.uid = 0;
1117 cred.gid = 0;
1118 while (1) {
1119 FD_ZERO(&s);
1120 FD_SET(sock, &s);
1121 tv.tv_sec = 2;
1122 tv.tv_usec = 0;
1123 ret = select(sock+1, &s, NULL, NULL, &tv);
1124 if (ret <= 0) {
1125 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1126 __func__, strerror(errno));
1127 exit(1);
1128 }
1129 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1130 fprintf(stderr, "%s: bad read from parent: %s\n",
1131 __func__, strerror(errno));
1132 exit(1);
1133 }
1134 if (vpid == -1) // done
1135 break;
1136 v = '0';
1137 cred.pid = vpid;
1138 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1139 v = '1';
1140 cred.pid = getpid();
1141 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1142 exit(1);
1143 }
1144 }
1145 exit(0);
1146 }
1147
/*
 * Enter tpid's pid namespace and fork the child that runs pid_from_ns().
 * Only a child forked after the setns is really inside the target
 * namespace.
 *
 * The child acknowledges its start by writing '1' to a pipe. If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns. The exit status is 0 if the worker child exited cleanly
 * and 1 otherwise. wait_for_pid() returns 0 on success, so the earlier
 * "!wait_for_pid()" test reported the opposite.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();

	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		/* does not return */
		pid_from_ns(sock, tpid);
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
1208
/*
 * Move the pids listed in buf into cgroup cg of controller contrl, on
 * behalf of process tpid.
 *
 * A helper is forked which enters tpid's pid namespace (see
 * pid_from_ns_wrapper()). Each pid parsed from buf is written to it and
 * comes back as SCM_CREDENTIALS. When the reply carries payload '0', the
 * pid from those credentials is passed to cgm_move_pid(). Finally a pid
 * of -1 tells the helper to exit.
 *
 * Returns true if no cgm_move_pid() call failed. Returns false if one
 * did, if the socketpair or helper cannot be created, or if a pid cannot
 * be written to the helper. A socketpair() failure is reported to the
 * caller instead of terminating the whole process.
 */
static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	bool answer = false, fail = false;
	const char *ptr = buf;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child
		pid_from_ns_wrapper(sock[1], tpid);

	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!cgm_move_pid(contrl, cg, cred.pid))
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* all pids handled: -1 asks the helper to exit */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
1272
1273 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1274 struct fuse_file_info *fi)
1275 {
1276 struct fuse_context *fc = fuse_get_context();
1277 nih_local char *localbuf = NULL;
1278 nih_local struct cgm_keys *k = NULL;
1279 struct file_info *f = (struct file_info *)fi->fh;
1280
1281 if (f->type != LXC_TYPE_CGFILE) {
1282 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1283 return -EIO;
1284 }
1285
1286 if (offset)
1287 return -EINVAL;
1288
1289 if (!fc)
1290 return -EIO;
1291
1292 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1293 localbuf[size] = '\0';
1294 memcpy(localbuf, buf, size);
1295
1296 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1297 bool r;
1298
1299 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
1300 return -EACCES;
1301
1302 if (strcmp(f->file, "tasks") == 0 ||
1303 strcmp(f->file, "/tasks") == 0 ||
1304 strcmp(f->file, "/cgroup.procs") == 0 ||
1305 strcmp(f->file, "cgroup.procs") == 0)
1306 // special case - we have to translate the pids
1307 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1308 else
1309 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1310
1311 if (!r)
1312 return -EINVAL;
1313
1314 return size;
1315 }
1316
1317 return -EINVAL;
1318 }
1319
1320 int cg_chown(const char *path, uid_t uid, gid_t gid)
1321 {
1322 struct fuse_context *fc = fuse_get_context();
1323 nih_local char * cgdir = NULL;
1324 char *fpath = NULL, *path1, *path2;
1325 nih_local struct cgm_keys *k = NULL;
1326 const char *cgroup;
1327 nih_local char *controller = NULL;
1328
1329
1330 if (!fc)
1331 return -EIO;
1332
1333 if (strcmp(path, "/cgroup") == 0)
1334 return -EINVAL;
1335
1336 controller = pick_controller_from_path(fc, path);
1337 if (!controller)
1338 return -EINVAL;
1339 cgroup = find_cgroup_in_path(path);
1340 if (!cgroup)
1341 /* this is just /cgroup/controller */
1342 return -EINVAL;
1343
1344 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1345
1346 if (!fpath) {
1347 path1 = "/";
1348 path2 = cgdir;
1349 } else {
1350 path1 = cgdir;
1351 path2 = fpath;
1352 }
1353
1354 if (is_child_cgroup(controller, path1, path2)) {
1355 // get uid, gid, from '/tasks' file and make up a mode
1356 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1357 k = get_cgroup_key(controller, cgroup, "tasks");
1358
1359 } else
1360 k = get_cgroup_key(controller, path1, path2);
1361
1362 if (!k)
1363 return -EINVAL;
1364
1365 /*
1366 * This being a fuse request, the uid and gid must be valid
1367 * in the caller's namespace. So we can just check to make
1368 * sure that the caller is root in his uid, and privileged
1369 * over the file's current owner.
1370 */
1371 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1372 return -EACCES;
1373
1374 if (!cgm_chown_file(controller, cgroup, uid, gid))
1375 return -EINVAL;
1376 return 0;
1377 }
1378
1379 int cg_chmod(const char *path, mode_t mode)
1380 {
1381 struct fuse_context *fc = fuse_get_context();
1382 nih_local char * cgdir = NULL;
1383 char *fpath = NULL, *path1, *path2;
1384 nih_local struct cgm_keys *k = NULL;
1385 const char *cgroup;
1386 nih_local char *controller = NULL;
1387
1388 if (!fc)
1389 return -EIO;
1390
1391 if (strcmp(path, "/cgroup") == 0)
1392 return -EINVAL;
1393
1394 controller = pick_controller_from_path(fc, path);
1395 if (!controller)
1396 return -EINVAL;
1397 cgroup = find_cgroup_in_path(path);
1398 if (!cgroup)
1399 /* this is just /cgroup/controller */
1400 return -EINVAL;
1401
1402 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1403
1404 if (!fpath) {
1405 path1 = "/";
1406 path2 = cgdir;
1407 } else {
1408 path1 = cgdir;
1409 path2 = fpath;
1410 }
1411
1412 if (is_child_cgroup(controller, path1, path2)) {
1413 // get uid, gid, from '/tasks' file and make up a mode
1414 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1415 k = get_cgroup_key(controller, cgroup, "tasks");
1416
1417 } else
1418 k = get_cgroup_key(controller, path1, path2);
1419
1420 if (!k)
1421 return -EINVAL;
1422
1423 /*
1424 * This being a fuse request, the uid and gid must be valid
1425 * in the caller's namespace. So we can just check to make
1426 * sure that the caller is root in his uid, and privileged
1427 * over the file's current owner.
1428 */
1429 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1430 return -EPERM;
1431
1432 if (!cgm_chmod_file(controller, cgroup, mode))
1433 return -EINVAL;
1434 return 0;
1435 }
1436
1437 int cg_mkdir(const char *path, mode_t mode)
1438 {
1439 struct fuse_context *fc = fuse_get_context();
1440 nih_local struct cgm_keys **list = NULL;
1441 char *fpath = NULL, *path1;
1442 nih_local char * cgdir = NULL;
1443 const char *cgroup;
1444 nih_local char *controller = NULL;
1445
1446 if (!fc)
1447 return -EIO;
1448
1449
1450 controller = pick_controller_from_path(fc, path);
1451 if (!controller)
1452 return -EINVAL;
1453
1454 cgroup = find_cgroup_in_path(path);
1455 if (!cgroup)
1456 return -EINVAL;
1457
1458 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1459 if (!fpath)
1460 path1 = "/";
1461 else
1462 path1 = cgdir;
1463
1464 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1465 return -EACCES;
1466
1467
1468 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1469 return -EINVAL;
1470
1471 return 0;
1472 }
1473
1474 static int cg_rmdir(const char *path)
1475 {
1476 struct fuse_context *fc = fuse_get_context();
1477 nih_local struct cgm_keys **list = NULL;
1478 char *fpath = NULL;
1479 nih_local char * cgdir = NULL;
1480 const char *cgroup;
1481 nih_local char *controller = NULL;
1482
1483 if (!fc)
1484 return -EIO;
1485
1486
1487 controller = pick_controller_from_path(fc, path);
1488 if (!controller)
1489 return -EINVAL;
1490
1491 cgroup = find_cgroup_in_path(path);
1492 if (!cgroup)
1493 return -EINVAL;
1494
1495 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1496 if (!fpath)
1497 return -EINVAL;
1498
1499 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1500 return -EACCES;
1501
1502 if (!cgm_remove(controller, cgroup))
1503 return -EINVAL;
1504
1505 return 0;
1506 }
1507
/* Return true iff @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1514
/*
 * Scan the newline-separated text in @memstat for the first line starting
 * with "total_cache", parse the unsigned number that follows, and store it
 * divided by 1024 in @v.  @v is set to 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		/* advance to the start of the next line, if any */
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1532
/*
 * Look up one counter in newline-separated blkio statistics text.
 *
 * Builds the key "<major>:<minor> <iotype>", finds the first line of @str
 * that starts with it, and stores the unsigned number following the key in
 * @v.  @v is set to 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *pos = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*pos != '\0') {
		if (strncmp(pos, key, keylen) == 0) {
			sscanf(pos + keylen, "%lu", v);
			return;
		}

		/* move on to the next line, if there is one */
		pos = strchr(pos, '\n');
		if (pos == NULL)
			return;
		pos++;
	}
}
1555
1556 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1557 {
1558 nih_local char *fnam = NULL;
1559 FILE *f;
1560 char *answer = NULL;
1561 char *line = NULL;
1562 size_t len = 0;
1563
1564 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1565 if (!(f = fopen(fnam, "r")))
1566 return false;
1567
1568 while (getline(&line, &len, f) != -1) {
1569 char *c1, *c2;
1570 if (!line[0])
1571 continue;
1572 c1 = strchr(line, ':');
1573 if (!c1)
1574 goto out;
1575 c1++;
1576 c2 = strchr(c1, ':');
1577 if (!c2)
1578 goto out;
1579 *c2 = '\0';
1580 if (strcmp(c1, contrl) != 0)
1581 continue;
1582 c2++;
1583 stripnewline(c2);
1584 answer = NIH_MUST( nih_strdup(NULL, c2) );
1585 goto out;
1586 }
1587
1588 out:
1589 fclose(f);
1590 free(line);
1591 return answer;
1592 }
1593
1594 /*
1595 * FUSE ops for /proc
1596 */
1597
1598 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1599 struct fuse_file_info *fi)
1600 {
1601 struct fuse_context *fc = fuse_get_context();
1602 struct file_info *d = (struct file_info *)fi->fh;
1603 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1604 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1605 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1606 char *line = NULL;
1607 size_t linelen = 0, total_len = 0;
1608 char *cache = d->buf;
1609 size_t cache_size = d->buflen;
1610 FILE *f;
1611
1612 if (offset){
1613 if (offset > d->size)
1614 return -EINVAL;
1615 int left = d->size - offset;
1616 total_len = left > size ? size: left;
1617 memcpy(buf, cache + offset, total_len);
1618 return total_len;
1619 }
1620
1621 if (!cg)
1622 return 0;
1623
1624 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1625 return 0;
1626 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1627 return 0;
1628 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1629 return 0;
1630 memlimit = strtoul(memlimit_str, NULL, 10);
1631 memusage = strtoul(memusage_str, NULL, 10);
1632 memlimit /= 1024;
1633 memusage /= 1024;
1634 get_mem_cached(memstat_str, &cached);
1635
1636 f = fopen("/proc/meminfo", "r");
1637 if (!f)
1638 return 0;
1639
1640 while (getline(&line, &linelen, f) != -1) {
1641 size_t l;
1642 char *printme, lbuf[100];
1643
1644 memset(lbuf, 0, 100);
1645 if (startswith(line, "MemTotal:")) {
1646 sscanf(line+14, "%lu", &hosttotal);
1647 if (hosttotal < memlimit)
1648 memlimit = hosttotal;
1649 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1650 printme = lbuf;
1651 } else if (startswith(line, "MemFree:")) {
1652 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1653 printme = lbuf;
1654 } else if (startswith(line, "MemAvailable:")) {
1655 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1656 printme = lbuf;
1657 } else if (startswith(line, "Buffers:")) {
1658 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1659 printme = lbuf;
1660 } else if (startswith(line, "Cached:")) {
1661 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1662 printme = lbuf;
1663 } else if (startswith(line, "SwapCached:")) {
1664 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1665 printme = lbuf;
1666 } else
1667 printme = line;
1668
1669 l = snprintf(cache, cache_size, "%s", printme);
1670 cache += l;
1671 cache_size -= l;
1672 total_len += l;
1673 }
1674
1675 d->size = total_len;
1676 if (total_len > size ) total_len = size;
1677 memcpy(buf, d->buf, total_len);
1678
1679 fclose(f);
1680 free(line);
1681 return total_len;
1682 }
1683
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg via cgm_get_value().
 * Returns the string produced by cgm_get_value(), or NULL if the lookup
 * fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1696
/*
 * Helper functions for parsing cpuset lists
 */
/*
 * Return a pointer to the character following the first ',' found after
 * the first character of @c, or NULL if there is none.
 *
 * A NULL or empty @c yields NULL; without that check the search would start
 * one byte past the string's terminator.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	if (!c || *c == '\0')
		return NULL;

	r = strchr(c + 1, ',');
	return r ? r + 1 : NULL;
}
1707
/*
 * Parse the leading "A" or "A-B" of @c into *@a and *@b.
 * Returns sscanf()'s result: 2 for a range, 1 when only the first number
 * was read, 0 or EOF when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1715
/*
 * Test whether @cpu is a member of @cpuset.
 *
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited tokens, each a single cpu or an inclusive range.
 *
 * Every token is examined: a single-cpu token that does not match no longer
 * ends the search, and tokens that cannot be parsed are skipped.  The
 * tokenising is done inline so that an empty string or empty token is
 * handled without reading past the end of the string.
 *
 * Returns true if some token covers @cpu, false otherwise (including for a
 * NULL or empty @cpuset).
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *tok = cpuset;

	while (tok && *tok) {
		const char *comma;
		int lo, hi;
		int n = sscanf(tok, "%d-%d", &lo, &hi);

		if (n == 1 && cpu == lo)		/* "N" */
			return true;
		if (n == 2 && cpu >= lo && cpu <= hi)	/* "N-M" */
			return true;

		comma = strchr(tok, ',');
		tok = comma ? comma + 1 : NULL;
	}

	return false;
}
1738
/*
 * Return true if @line is a /proc/cpuinfo "processor : N" line and cpu N
 * is in @cpuset (as decided by cpu_in_cpuset()); false otherwise.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	bool parsed = sscanf(line, "processor : %d", &cpu) == 1;

	return parsed && cpu_in_cpuset(cpu, cpuset);
}
1747
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. it starts with "processor", a colon and a decimal number.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1759
1760 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1761 struct fuse_file_info *fi)
1762 {
1763 struct fuse_context *fc = fuse_get_context();
1764 struct file_info *d = (struct file_info *)fi->fh;
1765 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1766 nih_local char *cpuset = NULL;
1767 char *line = NULL;
1768 size_t linelen = 0, total_len = 0;
1769 bool am_printing = false;
1770 int curcpu = -1;
1771 char *cache = d->buf;
1772 size_t cache_size = d->buflen;
1773 FILE *f;
1774
1775 if (offset){
1776 if (offset > d->size)
1777 return -EINVAL;
1778 int left = d->size - offset;
1779 total_len = left > size ? size: left;
1780 memcpy(buf, cache + offset, total_len);
1781 return total_len;
1782 }
1783
1784 if (!cg)
1785 return 0;
1786
1787 cpuset = get_cpuset(cg);
1788 if (!cpuset)
1789 return 0;
1790
1791 f = fopen("/proc/cpuinfo", "r");
1792 if (!f)
1793 return 0;
1794
1795 while (getline(&line, &linelen, f) != -1) {
1796 size_t l;
1797 if (is_processor_line(line)) {
1798 am_printing = cpuline_in_cpuset(line, cpuset);
1799 if (am_printing) {
1800 curcpu ++;
1801 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1802 if (l < cache_size){
1803 cache += l;
1804 cache_size -= l;
1805 total_len += l;
1806 }else{
1807 cache += cache_size;
1808 total_len += cache_size;
1809 cache_size = 0;
1810 break;
1811 }
1812 }
1813 continue;
1814 }
1815 if (am_printing) {
1816 l = snprintf(cache, cache_size, "%s", line);
1817 if (l < cache_size) {
1818 cache += l;
1819 cache_size -= l;
1820 total_len += l;
1821 } else {
1822 cache += cache_size;
1823 total_len += cache_size;
1824 cache_size = 0;
1825 break;
1826 }
1827 }
1828 }
1829
1830 d->size = total_len;
1831 if (total_len > size ) total_len = size;
1832
1833 /* read from off 0 */
1834 memcpy(buf, d->buf, total_len);
1835
1836 fclose(f);
1837 free(line);
1838 return total_len;
1839 }
1840
1841 static int proc_stat_read(char *buf, size_t size, off_t offset,
1842 struct fuse_file_info *fi)
1843 {
1844 struct fuse_context *fc = fuse_get_context();
1845 struct file_info *d = (struct file_info *)fi->fh;
1846 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1847 nih_local char *cpuset = NULL;
1848 char *line = NULL;
1849 size_t linelen = 0, total_len = 0;
1850 int curcpu = -1; /* cpu numbering starts at 0 */
1851 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1852 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1853 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1854 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1855 char cpuall[CPUALL_MAX_SIZE];
1856 /* reserve for cpu all */
1857 char *cache = d->buf + CPUALL_MAX_SIZE;
1858 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
1859 FILE *f;
1860
1861 if (offset){
1862 if (offset > d->size)
1863 return -EINVAL;
1864 int left = d->size - offset;
1865 total_len = left > size ? size: left;
1866 memcpy(buf, d->buf + offset, total_len);
1867 return total_len;
1868 }
1869
1870 if (!cg)
1871 return 0;
1872
1873 cpuset = get_cpuset(cg);
1874 if (!cpuset)
1875 return 0;
1876
1877 f = fopen("/proc/stat", "r");
1878 if (!f)
1879 return 0;
1880
1881 //skip first line
1882 if (getline(&line, &linelen, f) < 0) {
1883 fprintf(stderr, "proc_stat_read read first line failed\n");
1884 goto out;
1885 }
1886
1887 while (getline(&line, &linelen, f) != -1) {
1888 size_t l;
1889 int cpu;
1890 char cpu_char[10]; /* That's a lot of cores */
1891 char *c;
1892
1893 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1894 /* not a ^cpuN line containing a number N, just print it */
1895 l = snprintf(cache, cache_size, "%s", line);
1896 if (l < cache_size){
1897 cache += l;
1898 cache_size -= l;
1899 total_len += l;
1900 continue;
1901 }else{
1902 //no more space, break it
1903 cache += cache_size;
1904 total_len += cache_size;
1905 cache_size = 0;
1906 break;
1907 }
1908 }
1909
1910 if (sscanf(cpu_char, "%d", &cpu) != 1)
1911 continue;
1912 if (!cpu_in_cpuset(cpu, cpuset))
1913 continue;
1914 curcpu ++;
1915
1916 c = strchr(line, ' ');
1917 if (!c)
1918 continue;
1919 l = snprintf(cache, cache_size, "cpu%d %s", curcpu, c);
1920 cache += l;
1921 cache_size -= l;
1922 total_len += l;
1923
1924 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1925 &softirq, &steal, &guest) != 9)
1926 continue;
1927 user_sum += user;
1928 nice_sum += nice;
1929 system_sum += system;
1930 idle_sum += idle;
1931 iowait_sum += iowait;
1932 irq_sum += irq;
1933 softirq_sum += softirq;
1934 steal_sum += steal;
1935 guest_sum += guest;
1936 }
1937
1938 cache = d->buf;
1939
1940 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1941 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1942 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1943 memcpy(cache, cpuall, cpuall_len);
1944 cache += cpuall_len;
1945 }else{
1946 /* shouldn't happen */
1947 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1948 cpuall_len = 0;
1949 }
1950
1951 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1952 total_len += cpuall_len;
1953 d->size = total_len;
1954 if (total_len > size ) total_len = size;
1955
1956 memcpy(buf, d->buf, total_len);
1957 fprintf(stderr, "total_len = %d, buflen = %d\n", d->size, d->buflen);
1958 out:
1959 fclose(f);
1960 free(line);
1961 return total_len;
1962 }
1963
1964 /*
1965 * How to guess what to present for uptime?
1966 * One thing we could do would be to take the date on the caller's
1967 * memory.usage_in_bytes file, which should equal the time of creation
1968 * of his cgroup. However, a task could be in a sub-cgroup of the
1969 * container. The same problem exists if we try to look at the ages
1970 * of processes in the caller's cgroup.
1971 *
1972 * So we'll fork a task that will enter the caller's pidns, mount a
1973 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1974 *
1975 * For the second uptime #, we'll do as Stéphane had done, just copy
1976 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1977 * time. Maybe someone can come up with a good algorithm and submit a
1978 * patch. Maybe something based on cpushare info?
1979 */
1980
/*
 * Return the age of the reaper for $pid, taken from the ctime of its procdir.
 *
 * The calling process unshares its mount namespace, marks "/" recursively
 * slave, and joins the pid namespace of @pid.  It then forks:
 *   - the child acknowledges over a pipe, mounts a fresh procfs on /proc
 *     and returns time(NULL) minus the ctime of /proc/1 (0 on error);
 *   - the parent waits up to one second for the acknowledgement.  If it
 *     arrives it reaps the child and exits; otherwise it kills the child
 *     and forks again.
 * So a value is only ever returned in a forked child (or 0 in the original
 * process when an early setup step or fork fails).
 */
static long int get_pid1_time(pid_t pid)
{
	char nspath[100];
	struct stat st;
	struct timeval timeout;
	fd_set rfds;
	pid_t child;
	int nsfd, ackpipe[2], rc;
	char ack;

	if (unshare(CLONE_NEWNS) != 0)
		return 0;

	if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) != 0) {
		perror("rslave mount failed");
		return 0;
	}

	sprintf(nspath, "/proc/%d/ns/pid", pid);
	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0) {
		perror("get_pid1_time open of ns/pid");
		return 0;
	}
	if (setns(nsfd, 0) != 0) {
		perror("get_pid1_time setns 1");
		close(nsfd);
		return 0;
	}
	close(nsfd);

	if (pipe(ackpipe) < 0)
		exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			return 0;

		if (child == 0) {
			char b = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			umount2("/proc", MNT_DETACH);
			if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
				perror("get_pid1_time mount");
				return 0;
			}
			rc = lstat("/proc/1", &st);
			if (rc != 0) {
				perror("get_pid1_time lstat");
				return 0;
			}
			return time(NULL) - st.st_ctime;
		}

		/* give the child 1 second to be done forking and write its ack */
		FD_ZERO(&rfds);
		FD_SET(ackpipe[0], &rfds);
		timeout.tv_sec = 1;
		timeout.tv_usec = 0;
		rc = select(ackpipe[0]+1, &rfds, NULL, NULL, &timeout);
		if (rc > 0) {
			rc = read(ackpipe[0], &ack, 1);
			if (rc == sizeof(char) && ack == '1') {
				wait_for_pid(child);
				exit(0);
			}
		}

		/* no (valid) ack: get rid of this child and try again */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}
}
2064
/*
 * Return the value computed by get_pid1_time(@qpid), obtained from a
 * forked helper.
 *
 * The helper runs get_pid1_time() and writes the result to a pipe; the
 * parent waits up to one second for it to become readable.  On fork
 * failure, select error, timeout or short read the answer is 0.
 *
 * Note: on timeout the parent still blocks in wait_for_pid() until the
 * helper exits.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/*
		 * Without this check the code below would end up calling
		 * wait_for_pid(-1), i.e. wait for an arbitrary child.
		 */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* select() returns 0 on timeout; errno is not meaningful here */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2110
/*
 * Return the second field of the host's /proc/uptime, truncated to whole
 * seconds, or 0 if the file cannot be opened or parsed.
 *
 * Both fields are decimal fractions ("12345.67 54321.89"), so they are read
 * as floating point; an integer conversion stops at the first '.' and the
 * second field would never be parsed.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2124
2125 /*
2126 * We read /proc/uptime and reuse its second field.
2127 * For the first field, we use the mtime for the reaper for
2128 * the calling pid as returned by getreaperage
2129 */
2130 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2131 struct fuse_file_info *fi)
2132 {
2133 struct fuse_context *fc = fuse_get_context();
2134 struct file_info *d = (struct file_info *)fi->fh;
2135 long int reaperage = getreaperage(fc->pid);;
2136 long int idletime = getprocidle();
2137 size_t total_len = 0;
2138
2139 if (offset){
2140 if (offset > d->size)
2141 return -EINVAL;
2142 return 0;
2143 }
2144
2145 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2146 d->size = total_len;
2147 return total_len;
2148 }
2149
2150 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2151 struct fuse_file_info *fi)
2152 {
2153 char dev_name[72];
2154 struct fuse_context *fc = fuse_get_context();
2155 struct file_info *d = (struct file_info *)fi->fh;
2156 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2157 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2158 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2159 unsigned long read = 0, write = 0;
2160 unsigned long read_merged = 0, write_merged = 0;
2161 unsigned long read_sectors = 0, write_sectors = 0;
2162 unsigned long read_ticks = 0, write_ticks = 0;
2163 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2164 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2165 char *line = NULL;
2166 size_t linelen = 0, total_len = 0;
2167 unsigned int major = 0, minor = 0;
2168 int i = 0;
2169 FILE *f;
2170
2171 if (offset){
2172 if (offset > d->size)
2173 return -EINVAL;
2174 return 0;
2175 }
2176
2177 if (!cg)
2178 return 0;
2179
2180 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2181 return 0;
2182 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2183 return 0;
2184 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2185 return 0;
2186 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2187 return 0;
2188 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2189 return 0;
2190
2191
2192 f = fopen("/proc/diskstats", "r");
2193 if (!f)
2194 return 0;
2195
2196 while (getline(&line, &linelen, f) != -1) {
2197 size_t l;
2198 char *printme, lbuf[256];
2199
2200 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2201 if(i == 3){
2202 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2203 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2204 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2205 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2206 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2207 read_sectors = read_sectors/512;
2208 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2209 write_sectors = write_sectors/512;
2210
2211 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2212 rd_svctm = rd_svctm/1000000;
2213 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2214 rd_wait = rd_wait/1000000;
2215 read_ticks = rd_svctm + rd_wait;
2216
2217 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2218 wr_svctm = wr_svctm/1000000;
2219 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2220 wr_wait = wr_wait/1000000;
2221 write_ticks = wr_svctm + wr_wait;
2222
2223 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2224 tot_ticks = tot_ticks/1000000;
2225 }else{
2226 continue;
2227 }
2228
2229 memset(lbuf, 0, 256);
2230 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2231 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2232 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2233 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2234 printme = lbuf;
2235 } else
2236 continue;
2237
2238 l = snprintf(buf, size, "%s", printme);
2239 buf += l;
2240 size -= l;
2241 total_len += l;
2242 }
2243
2244 d->size = total_len;
2245
2246 fclose(f);
2247 free(line);
2248 return total_len;
2249 }
2250
/*
 * Return the number of bytes in file @which, measured by reading it line by
 * line and summing the line lengths.  Returns 0 if the file cannot be
 * opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f = fopen(which, "r");
	char *line = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;

	if (f == NULL)
		return 0;

	for (;;) {
		nread = getline(&line, &cap, f);
		if (nread == -1)
			break;
		total += nread;
	}

	free(line);
	fclose(f);

	return total;
}
2267
/*
 * getattr handler for /proc and the files emulated below it.
 *
 * @sb is zeroed, all three timestamps are set to the current time and
 * owner/group to 0.  "/proc" is reported as a 0555 directory; each known
 * file as a 0444 regular file whose size is get_procfile_size() of the same
 * path.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, -ENOENT for any
 * other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		if (strcmp(path, files[i]) != 0)
			continue;
		sb->st_size = get_procfile_size(path);
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2295
2296 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2297 struct fuse_file_info *fi)
2298 {
2299 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2300 filler(buf, "meminfo", NULL, 0) != 0 ||
2301 filler(buf, "stat", NULL, 0) != 0 ||
2302 filler(buf, "uptime", NULL, 0) != 0 ||
2303 filler(buf, "diskstats", NULL, 0) != 0)
2304 return -EINVAL;
2305 return 0;
2306 }
2307
2308 static int proc_open(const char *path, struct fuse_file_info *fi)
2309 {
2310 int type = -1;
2311 struct file_info *info;
2312
2313 if (strcmp(path, "/proc/meminfo") == 0)
2314 type = LXC_TYPE_PROC_MEMINFO;
2315 else if (strcmp(path, "/proc/cpuinfo") == 0)
2316 type = LXC_TYPE_PROC_CPUINFO;
2317 else if (strcmp(path, "/proc/uptime") == 0)
2318 type = LXC_TYPE_PROC_UPTIME;
2319 else if (strcmp(path, "/proc/stat") == 0)
2320 type = LXC_TYPE_PROC_STAT;
2321 else if (strcmp(path, "/proc/diskstats") == 0)
2322 type = LXC_TYPE_PROC_DISKSTATS;
2323 if (type == -1)
2324 return -ENOENT;
2325
2326 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2327 memset(info, 0, sizeof(*info));
2328 info->type = type;
2329
2330 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2331 info->buf = NIH_MUST( nih_alloc(NULL, info->buflen) );
2332 memset(info->buf, 0, info->buflen);
2333 /* set actual size to buffer size */
2334 info->size = info->buflen;
2335
2336 fi->fh = (unsigned long)info;
2337 return 0;
2338 }
2339
2340 static int proc_release(const char *path, struct fuse_file_info *fi)
2341 {
2342 struct file_info *f = (struct file_info *)fi->fh;
2343
2344 do_release_file_info(f);
2345 return 0;
2346 }
2347
2348 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2349 struct fuse_file_info *fi)
2350 {
2351 struct file_info *f = (struct file_info *) fi->fh;
2352
2353 switch (f->type) {
2354 case LXC_TYPE_PROC_MEMINFO:
2355 return proc_meminfo_read(buf, size, offset, fi);
2356 case LXC_TYPE_PROC_CPUINFO:
2357 return proc_cpuinfo_read(buf, size, offset, fi);
2358 case LXC_TYPE_PROC_UPTIME:
2359 return proc_uptime_read(buf, size, offset, fi);
2360 case LXC_TYPE_PROC_STAT:
2361 return proc_stat_read(buf, size, offset, fi);
2362 case LXC_TYPE_PROC_DISKSTATS:
2363 return proc_diskstats_read(buf, size, offset, fi);
2364 default:
2365 return -EINVAL;
2366 }
2367 }
2368
2369 /*
2370 * FUSE ops for /
2371 * these just delegate to the /proc and /cgroup ops as
2372 * needed
2373 */
2374
2375 static int lxcfs_getattr(const char *path, struct stat *sb)
2376 {
2377 if (strcmp(path, "/") == 0) {
2378 sb->st_mode = S_IFDIR | 00755;
2379 sb->st_nlink = 2;
2380 return 0;
2381 }
2382 if (strncmp(path, "/cgroup", 7) == 0) {
2383 return cg_getattr(path, sb);
2384 }
2385 if (strncmp(path, "/proc", 5) == 0) {
2386 return proc_getattr(path, sb);
2387 }
2388 return -EINVAL;
2389 }
2390
/*
 * Top-level opendir: "/" and "/proc" need no state and succeed
 * immediately; paths beginning with "/cgroup" go to cg_opendir().
 * Anything else yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return -ENOENT;
}
2403
2404 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2405 struct fuse_file_info *fi)
2406 {
2407 if (strcmp(path, "/") == 0) {
2408 if (filler(buf, "proc", NULL, 0) != 0 ||
2409 filler(buf, "cgroup", NULL, 0) != 0)
2410 return -EINVAL;
2411 return 0;
2412 }
2413 if (strncmp(path, "/cgroup", 7) == 0)
2414 return cg_readdir(path, buf, filler, offset, fi);
2415 if (strcmp(path, "/proc") == 0)
2416 return proc_readdir(path, buf, filler, offset, fi);
2417 return -EINVAL;
2418 }
2419
/*
 * Top-level releasedir: nothing to do for "/" and "/proc"; paths beginning
 * with "/cgroup" go to cg_releasedir().  Anything else yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return -EINVAL;
}
2431
/*
 * Top-level open: delegate to cg_open() or proc_open() according to the
 * path prefix; -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_open(path, fi);
	if (in_proc)
		return proc_open(path, fi);

	return -EINVAL;
}
2441
/*
 * Top-level read: delegate to cg_read() or proc_read() according to the
 * path prefix; -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (in_proc)
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2452
/*
 * Top-level write: only paths beginning with "/cgroup" are writable and are
 * handed to cg_write(); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2462
/* flush handler: there is nothing to flush, so always report success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2467
/*
 * Top-level release: delegate to cg_release() or proc_release() according
 * to the path prefix; -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_release(path, fi);
	if (in_proc)
		return proc_release(path, fi);

	return -EINVAL;
}
2477
/*
 * FUSE fsync handler: a no-op that always reports success.
 * None of the arguments are examined.
 */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2482
/*
 * FUSE mkdir handler.
 *
 * Directories can only be created under "/cgroup"; such requests are
 * forwarded to cg_mkdir() and its result is returned.  Any other path
 * yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_mkdir(path, mode)
						: -EINVAL;
}
2490
/*
 * FUSE chown handler.
 *
 * Ownership changes are only supported under "/cgroup", where they are
 * forwarded to cg_chown().  Any other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2498
/*
 * FUSE truncate handler.
 *
 * cat issues a truncate before it calls ops->write.  Truncating does not
 * really make sense for cgroup files, so for any path beginning with
 * "/cgroup" this reports success without doing anything; @newsize is
 * ignored.  Every other path is rejected with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) ? -EINVAL : 0;
}
2510
/*
 * FUSE rmdir handler.
 *
 * Directories can only be removed under "/cgroup"; such requests are
 * forwarded to cg_rmdir() and its result is returned.  Any other path
 * yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	int rc = -EINVAL;

	if (!strncmp(path, "/cgroup", 7))
		rc = cg_rmdir(path);

	return rc;
}
2517
/*
 * FUSE chmod handler.
 *
 * Mode changes are only supported under "/cgroup", where they are
 * forwarded to cg_chmod().  Any other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2524
2525 const struct fuse_operations lxcfs_ops = {
2526 .getattr = lxcfs_getattr,
2527 .readlink = NULL,
2528 .getdir = NULL,
2529 .mknod = NULL,
2530 .mkdir = lxcfs_mkdir,
2531 .unlink = NULL,
2532 .rmdir = lxcfs_rmdir,
2533 .symlink = NULL,
2534 .rename = NULL,
2535 .link = NULL,
2536 .chmod = lxcfs_chmod,
2537 .chown = lxcfs_chown,
2538 .truncate = lxcfs_truncate,
2539 .utime = NULL,
2540
2541 .open = lxcfs_open,
2542 .read = lxcfs_read,
2543 .release = lxcfs_release,
2544 .write = lxcfs_write,
2545
2546 .statfs = NULL,
2547 .flush = lxcfs_flush,
2548 .fsync = lxcfs_fsync,
2549
2550 .setxattr = NULL,
2551 .getxattr = NULL,
2552 .listxattr = NULL,
2553 .removexattr = NULL,
2554
2555 .opendir = lxcfs_opendir,
2556 .readdir = lxcfs_readdir,
2557 .releasedir = lxcfs_releasedir,
2558
2559 .fsyncdir = NULL,
2560 .init = NULL,
2561 .destroy = NULL,
2562 .access = NULL,
2563 .create = NULL,
2564 .ftruncate = NULL,
2565 .fgetattr = NULL,
2566 };
2567
/*
 * Print a short usage message to stderr and terminate the process with
 * exit status 1.  @me is the program name shown in the synopsis line.
 * This function does not return.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);

	exit(1);
}
2575
/*
 * Return true if @w is one of the recognised spellings of a help request
 * ("-h", "--help", "-help" or "help"), false otherwise.  The comparison is
 * exact and case-sensitive.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(help_words) / sizeof(help_words[0]); i++) {
		if (strcmp(w, help_words[i]) == 0)
			return true;
	}

	return false;
}
2585
2586 int main(int argc, char *argv[])
2587 {
2588 int ret;
2589 struct lxcfs_state *d;
2590
2591 if (argc < 2 || is_help(argv[1]))
2592 usage(argv[0]);
2593
2594 d = malloc(sizeof(*d));
2595 if (!d)
2596 return -1;
2597
2598 if (!cgm_escape_cgroup())
2599 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2600
2601 if (!cgm_get_controllers(&d->subsystems))
2602 return -1;
2603
2604 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2605
2606 return ret;
2607 }