]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
cache: use file_dir object as parent for alloc'ed objects
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /*
10 * NOTES - make sure to run this as -s to avoid threading.
11 * TODO - can we enforce that here from the code?
12 */
13 #define FUSE_USE_VERSION 26
14
15 #include <stdio.h>
16 #include <dirent.h>
17 #include <fcntl.h>
18 #include <fuse.h>
19 #include <unistd.h>
20 #include <errno.h>
21 #include <stdbool.h>
22 #include <time.h>
23 #include <string.h>
24 #include <stdlib.h>
25 #include <libgen.h>
26 #include <sched.h>
27 #include <linux/sched.h>
28 #include <sys/socket.h>
29 #include <sys/mount.h>
30 #include <wait.h>
31
32 #include <nih/alloc.h>
33 #include <nih/string.h>
34
35 #include "cgmanager.h"
36
/* Daemon-wide state; reached through the FUSE context's private_data. */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted
	 * subsystems.  It is detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *)(fuse_get_context()->private_data))
45
/* Tag stored in file_info.type to say what kind of handle it is. */
enum {
	LXC_TYPE_CGDIR = 0,		/* set by cg_opendir() */
	LXC_TYPE_CGFILE = 1,		/* set by cg_open() */
	LXC_TYPE_PROC_MEMINFO = 2,
	LXC_TYPE_PROC_CPUINFO = 3,
	LXC_TYPE_PROC_UPTIME = 4,
	LXC_TYPE_PROC_STAT = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
};
55
/*
 * Per-open-handle bookkeeping, stored in fuse_file_info.fh by the open
 * handlers and released again by the release handlers.
 */
struct file_info {
	char *controller, *cgroup, *file;
	int type;	/* one of the LXC_TYPE_* values */
	char *buf;	/* unused as of yet */
	int buflen;
};
64
/*
 * Duplicate @str with @parent as its nih parent.  A NULL @str is passed
 * through as NULL; otherwise the nih_strdup() call is wrapped in NIH_MUST().
 */
static char *must_copy_string(void *parent, const char *str)
{
	return str ? NIH_MUST( nih_strdup(parent, str) ) : NULL;
}
71
/*
 * Reap child @pid, retrying when waitpid() is interrupted or reports a
 * different pid.
 *
 * Returns 0 if the child exited normally with status 0, and -1 otherwise
 * (waitpid() error, death by signal, or non-zero exit status).
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
93
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open FILE * on /proc/pid/{u,g}id_map.
 *
 * The stream is rewound on every call, so one FILE * can serve several
 * lookups.  Lines that do not parse as three unsigned numbers are skipped.
 *
 * Returns the mapped id, or -1 (as an unsigned int) when no range covers
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];

	fseek(idfile, 0L, SEEK_SET);

	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		unsigned int ns_base,	/* first id of the range inside the namespace */
			     host_base,	/* first id of the range as the caller sees it */
			     span;	/* number of ids in the range */

		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &span) != 3)
			continue;

		/* a wrapping range is unexpected in a proc file, so give up */
		if (host_base + span < host_base || ns_base + span < ns_base) {
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, span, entry);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + span)
			continue;

		/*
		 * host_base <= in_id < host_base + span and neither sum
		 * wraps, so the offset added to ns_base cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
137
138 /*
139 * for is_privileged_over,
140 * specify whether we require the calling uid to be root in his
141 * namespace
142 */
143 #define NS_ROOT_REQD true
144 #define NS_ROOT_OPT false
145
146 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
147 {
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
161 return true;
162
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
168 /* if caller's not root in his namespace, reject */
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184 out:
185 fclose(f);
186 return answer;
187 }
188
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode in @req_mode (only its O_ACCMODE part is looked at).  Callers shift
 * the owner or group bits down into the "other" position first.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t acc = req_mode & O_ACCMODE;
	mode_t wanted;

	if (acc == O_RDONLY)
		wanted = S_IROTH;
	else if (acc == O_WRONLY)
		wanted = S_IWOTH;
	else if (acc == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
208
/*
 * Return the single path component of @taskcg that follows the @querycg
 * prefix, as a nih-allocated string the caller must free.
 *
 * @taskcg has to be strictly longer than @querycg; otherwise a complaint
 * is printed and NULL is returned.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t qlen = strlen(querycg);
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= qlen) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* for "/" only the leading slash is skipped, else prefix plus slash */
	skip = strcmp(querycg, "/") == 0 ? 1 : qlen + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
227
228 /*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240 {
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268 }
269
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
276
277 /*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
283 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284 {
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325 out:
326 fclose(f);
327 free(line);
328 return answer;
329 }
330
331 /*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336 {
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
342 p1 = path+8;
343 ret = nih_strdup(NULL, p1);
344 if (!ret)
345 return ret;
346 slash = strstr(ret, "/");
347 if (slash)
348 *slash = '\0';
349
350 /* verify that it is a subsystem */
351 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
352 int i;
353 if (!list) {
354 nih_free(ret);
355 return NULL;
356 }
357 for (i = 0; list[i]; i++) {
358 if (strcmp(list[i], ret) == 0)
359 return ret;
360 }
361 nih_free(ret);
362 return NULL;
363 }
364
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * The result points into @path, just past the first '/' found at or after
 * offset 8.  Returns NULL if @path is shorter than 9 characters or has no
 * such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
380
381 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
382 {
383 nih_local char **list = NULL;
384 int i;
385
386 if (!f)
387 return false;
388 if (*f == '/')
389 f++;
390
391 if (!cgm_list_children(contr, dir, &list))
392 return false;
393 for (i = 0; list[i]; i++) {
394 if (strcmp(list[i], f) == 0)
395 return true;
396 }
397
398 return false;
399 }
400
401 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
402 {
403 nih_local struct cgm_keys **list = NULL;
404 struct cgm_keys *k;
405 int i;
406
407 if (!f)
408 return NULL;
409 if (*f == '/')
410 f++;
411 if (!cgm_list_keys(contr, dir, &list))
412 return NULL;
413 for (i = 0; list[i]; i++) {
414 if (strcmp(list[i]->name, f) == 0) {
415 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
416 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
417 k->uid = list[i]->uid;
418 k->gid = list[i]->gid;
419 k->mode = list[i]->mode;
420 return k;
421 }
422 }
423
424 return NULL;
425 }
426
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of @cg, cut off at the last '/' when
 * there is one.  *file is set to point at that last '/' inside the
 * caller's own @cg string (so it still includes the slash), or to NULL
 * when @cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = strrchr(cg, '/');
	if (*file)
		*strrchr(*dir, '/') = '\0';
}
440
441 static size_t get_file_size(const char *contrl, const char *cg, const char *f)
442 {
443 nih_local char *data = NULL;
444 size_t s;
445 if (!cgm_get_value(contrl, cg, f, &data))
446 return -EINVAL;
447 s = strlen(data);
448 return s;
449 }
450
451 /*
452 * FUSE ops for /cgroup
453 */
454
455 static int cg_getattr(const char *path, struct stat *sb)
456 {
457 struct timespec now;
458 struct fuse_context *fc = fuse_get_context();
459 nih_local char * cgdir = NULL;
460 char *fpath = NULL, *path1, *path2;
461 nih_local struct cgm_keys *k = NULL;
462 const char *cgroup;
463 nih_local char *controller = NULL;
464
465
466 if (!fc)
467 return -EIO;
468
469 memset(sb, 0, sizeof(struct stat));
470
471 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
472 return -EINVAL;
473
474 sb->st_uid = sb->st_gid = 0;
475 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
476 sb->st_size = 0;
477
478 if (strcmp(path, "/cgroup") == 0) {
479 sb->st_mode = S_IFDIR | 00755;
480 sb->st_nlink = 2;
481 return 0;
482 }
483
484 controller = pick_controller_from_path(fc, path);
485 if (!controller)
486 return -EIO;
487 cgroup = find_cgroup_in_path(path);
488 if (!cgroup) {
489 /* this is just /cgroup/controller, return it as a dir */
490 sb->st_mode = S_IFDIR | 00755;
491 sb->st_nlink = 2;
492 return 0;
493 }
494
495 get_cgdir_and_path(cgroup, &cgdir, &fpath);
496
497 if (!fpath) {
498 path1 = "/";
499 path2 = cgdir;
500 } else {
501 path1 = cgdir;
502 path2 = fpath;
503 }
504
505 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
506 * Then check that caller's cgroup is under path if fpath is a child
507 * cgroup, or cgdir if fpath is a file */
508
509 if (is_child_cgroup(controller, path1, path2)) {
510 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
511 /* this is just /cgroup/controller, return it as a dir */
512 sb->st_mode = S_IFDIR | 00555;
513 sb->st_nlink = 2;
514 return 0;
515 }
516 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
517 return -EACCES;
518
519 // get uid, gid, from '/tasks' file and make up a mode
520 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
521 sb->st_mode = S_IFDIR | 00755;
522 k = get_cgroup_key(controller, cgroup, "tasks");
523 if (!k) {
524 sb->st_uid = sb->st_gid = 0;
525 } else {
526 sb->st_uid = k->uid;
527 sb->st_gid = k->gid;
528 }
529 sb->st_nlink = 2;
530 return 0;
531 }
532
533 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
534 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
535 return -ENOENT;
536 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
537 return -EACCES;
538
539 sb->st_mode = S_IFREG | k->mode;
540 sb->st_nlink = 1;
541 sb->st_uid = k->uid;
542 sb->st_gid = k->gid;
543 sb->st_size = get_file_size(controller, path1, path2);
544 return 0;
545 }
546
547 return -ENOENT;
548 }
549
550 /*
551 * TODO - cache these results in a table for use in opendir, free
552 * in releasedir
553 */
554 static int cg_opendir(const char *path, struct fuse_file_info *fi)
555 {
556 struct fuse_context *fc = fuse_get_context();
557 nih_local struct cgm_keys **list = NULL;
558 const char *cgroup;
559 struct file_info *dir_info;
560 nih_local char *controller = NULL;
561
562 if (!fc)
563 return -EIO;
564
565 if (strcmp(path, "/cgroup") == 0) {
566 cgroup = NULL;
567 controller = NULL;
568 } else {
569 // return list of keys for the controller, and list of child cgroups
570 controller = pick_controller_from_path(fc, path);
571 if (!controller)
572 return -EIO;
573
574 cgroup = find_cgroup_in_path(path);
575 if (!cgroup) {
576 /* this is just /cgroup/controller, return its contents */
577 cgroup = "/";
578 }
579 }
580
581 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
582 return -EACCES;
583
584 /* we'll free this at cg_releasedir */
585 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
586 dir_info->controller = must_copy_string(dir_info, controller);
587 dir_info->cgroup = must_copy_string(dir_info, cgroup);
588 dir_info->type = LXC_TYPE_CGDIR;
589 dir_info->buf = NULL;
590 dir_info->file = NULL;
591 dir_info->buflen = 0;
592
593 fi->fh = (unsigned long)dir_info;
594 return 0;
595 }
596
597 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
598 struct fuse_file_info *fi)
599 {
600 struct file_info *d = (struct file_info *)fi->fh;
601 nih_local struct cgm_keys **list = NULL;
602 int i;
603 nih_local char *nextcg = NULL;
604 struct fuse_context *fc = fuse_get_context();
605
606 if (d->type != LXC_TYPE_CGDIR) {
607 fprintf(stderr, "Internal error: file cache info used in readdir\n");
608 return -EIO;
609 }
610 if (!d->cgroup && !d->controller) {
611 // ls /var/lib/lxcfs/cgroup - just show list of controllers
612 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
613 int i;
614
615 if (!list)
616 return -EIO;
617
618 for (i = 0; list[i]; i++) {
619 if (filler(buf, list[i], NULL, 0) != 0) {
620 return -EIO;
621 }
622 }
623 return 0;
624 }
625
626 if (!cgm_list_keys(d->controller, d->cgroup, &list))
627 // not a valid cgroup
628 return -EINVAL;
629
630 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
631 if (nextcg) {
632 int ret;
633 ret = filler(buf, nextcg, NULL, 0);
634 if (ret != 0)
635 return -EIO;
636 }
637 return 0;
638 }
639
640 for (i = 0; list[i]; i++) {
641 if (filler(buf, list[i]->name, NULL, 0) != 0) {
642 return -EIO;
643 }
644 }
645
646 // now get the list of child cgroups
647 nih_local char **clist = NULL;
648
649 if (!cgm_list_children(d->controller, d->cgroup, &clist))
650 return 0;
651 for (i = 0; clist[i]; i++) {
652 if (filler(buf, clist[i], NULL, 0) != 0) {
653 return -EIO;
654 }
655 }
656 return 0;
657 }
658
/*
 * Free a file_info handle.  Fields that were nih_alloc()d with @f as
 * their parent are freed along with it automatically.
 */
static void do_release_file_info(struct file_info *f)
{
	nih_free(f);
}
667
668 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
669 {
670 struct file_info *d = (struct file_info *)fi->fh;
671
672 do_release_file_info(d);
673 return 0;
674 }
675
676 static int cg_open(const char *path, struct fuse_file_info *fi)
677 {
678 nih_local char *controller = NULL;
679 const char *cgroup;
680 char *fpath = NULL, *path1, *path2;
681 nih_local char * cgdir = NULL;
682 nih_local struct cgm_keys *k = NULL;
683 struct file_info *file_info;
684 struct fuse_context *fc = fuse_get_context();
685
686 if (!fc)
687 return -EIO;
688
689 controller = pick_controller_from_path(fc, path);
690 if (!controller)
691 return -EIO;
692 cgroup = find_cgroup_in_path(path);
693 if (!cgroup)
694 return -EINVAL;
695
696 get_cgdir_and_path(cgroup, &cgdir, &fpath);
697 if (!fpath) {
698 path1 = "/";
699 path2 = cgdir;
700 } else {
701 path1 = cgdir;
702 path2 = fpath;
703 }
704
705 k = get_cgroup_key(controller, path1, path2);
706 if (!k)
707 return -EINVAL;
708
709 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
710 // should never get here
711 return -EACCES;
712
713 /* we'll free this at cg_release */
714 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
715 file_info->controller = must_copy_string(file_info, controller);
716 file_info->cgroup = must_copy_string(file_info, path1);
717 file_info->file = must_copy_string(file_info, path2);
718 file_info->type = LXC_TYPE_CGFILE;
719 file_info->buf = NULL;
720 file_info->buflen = 0;
721
722 fi->fh = (unsigned long)file_info;
723 return 0;
724 }
725
726 static int cg_release(const char *path, struct fuse_file_info *fi)
727 {
728 struct file_info *f = (struct file_info *)fi->fh;
729
730 do_release_file_info(f);
731 return 0;
732 }
733
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most two
 * seconds for data to arrive.
 *
 * Returns -1 if select() fails or times out, otherwise whatever the
 * non-blocking recv() returns.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
	if (ready <= 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
748
749 #define SEND_CREDS_OK 0
750 #define SEND_CREDS_NOTSK 1
751 #define SEND_CREDS_FAIL 2
752 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
753 {
754 struct msghdr msg = { 0 };
755 struct iovec iov;
756 struct cmsghdr *cmsg;
757 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
758 char buf[1];
759 buf[0] = 'p';
760
761 if (pingfirst) {
762 if (msgrecv(sock, buf, 1) != 1) {
763 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
764 __func__);
765 return SEND_CREDS_FAIL;
766 }
767 }
768
769 msg.msg_control = cmsgbuf;
770 msg.msg_controllen = sizeof(cmsgbuf);
771
772 cmsg = CMSG_FIRSTHDR(&msg);
773 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
774 cmsg->cmsg_level = SOL_SOCKET;
775 cmsg->cmsg_type = SCM_CREDENTIALS;
776 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
777
778 msg.msg_name = NULL;
779 msg.msg_namelen = 0;
780
781 buf[0] = v;
782 iov.iov_base = buf;
783 iov.iov_len = sizeof(buf);
784 msg.msg_iov = &iov;
785 msg.msg_iovlen = 1;
786
787 if (sendmsg(sock, &msg, 0) < 0) {
788 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
789 strerror(errno));
790 if (errno == 3)
791 return SEND_CREDS_NOTSK;
792 return SEND_CREDS_FAIL;
793 }
794
795 return SEND_CREDS_OK;
796 }
797
798 static bool recv_creds(int sock, struct ucred *cred, char *v)
799 {
800 struct msghdr msg = { 0 };
801 struct iovec iov;
802 struct cmsghdr *cmsg;
803 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
804 char buf[1];
805 int ret;
806 int optval = 1;
807 struct timeval tv;
808 fd_set rfds;
809
810 *v = '1';
811
812 cred->pid = -1;
813 cred->uid = -1;
814 cred->gid = -1;
815
816 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
817 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
818 return false;
819 }
820 buf[0] = '1';
821 if (write(sock, buf, 1) != 1) {
822 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
823 return false;
824 }
825
826 msg.msg_name = NULL;
827 msg.msg_namelen = 0;
828 msg.msg_control = cmsgbuf;
829 msg.msg_controllen = sizeof(cmsgbuf);
830
831 iov.iov_base = buf;
832 iov.iov_len = sizeof(buf);
833 msg.msg_iov = &iov;
834 msg.msg_iovlen = 1;
835
836 FD_ZERO(&rfds);
837 FD_SET(sock, &rfds);
838 tv.tv_sec = 2;
839 tv.tv_usec = 0;
840 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
841 fprintf(stderr, "Failed to select for scm_cred: %s\n",
842 strerror(errno));
843 return false;
844 }
845 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
846 if (ret < 0) {
847 fprintf(stderr, "Failed to receive scm_cred: %s\n",
848 strerror(errno));
849 return false;
850 }
851
852 cmsg = CMSG_FIRSTHDR(&msg);
853
854 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
855 cmsg->cmsg_level == SOL_SOCKET &&
856 cmsg->cmsg_type == SCM_CREDENTIALS) {
857 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
858 }
859 *v = buf[0];
860
861 return true;
862 }
863
864
865 /*
866 * pid_to_ns - reads pids from a ucred over a socket, then writes the
867 * int value back over the socket. This shifts the pid from the
868 * sender's pidns into tpid's pidns.
869 */
870 static void pid_to_ns(int sock, pid_t tpid)
871 {
872 char v = '0';
873 struct ucred cred;
874
875 while (recv_creds(sock, &cred, &v)) {
876 if (v == '1')
877 exit(0);
878 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
879 exit(1);
880 }
881 exit(0);
882 }
883
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns: exits 0 if the converting child exited successfully and
 * 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();
	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_to_ns(sock, tpid);	/* does not return */
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	/* wait_for_pid() returns 0 on success: pass the child's outcome on */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
948
949 /*
950 * To read cgroup files with a particular pid, we will setns into the child
951 * pidns, open a pipe, fork a child - which will be the first to really be in
952 * the child ns - which does the cgm_get_value and writes the data to the pipe.
953 */
954 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
955 {
956 int sock[2] = {-1, -1};
957 nih_local char *tmpdata = NULL;
958 int ret;
959 pid_t qpid, cpid = -1;
960 bool answer = false;
961 char v = '0';
962 struct ucred cred;
963 struct timeval tv;
964 fd_set s;
965
966 if (!cgm_get_value(contrl, cg, file, &tmpdata))
967 return false;
968
969 /*
970 * Now we read the pids from returned data one by one, pass
971 * them into a child in the target namespace, read back the
972 * translated pids, and put them into our to-return data
973 */
974
975 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
976 perror("socketpair");
977 exit(1);
978 }
979
980 cpid = fork();
981 if (cpid == -1)
982 goto out;
983
984 if (!cpid) // child
985 pid_to_ns_wrapper(sock[1], tpid);
986
987 char *ptr = tmpdata;
988 cred.uid = 0;
989 cred.gid = 0;
990 while (sscanf(ptr, "%d\n", &qpid) == 1) {
991 cred.pid = qpid;
992 ret = send_creds(sock[0], &cred, v, true);
993
994 if (ret == SEND_CREDS_NOTSK)
995 goto next;
996 if (ret == SEND_CREDS_FAIL)
997 goto out;
998
999 // read converted results
1000 FD_ZERO(&s);
1001 FD_SET(sock[0], &s);
1002 tv.tv_sec = 2;
1003 tv.tv_usec = 0;
1004 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1005 if (ret <= 0) {
1006 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1007 __func__, strerror(errno));
1008 goto out;
1009 }
1010 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1011 fprintf(stderr, "%s: error reading pid from child: %s\n",
1012 __func__, strerror(errno));
1013 goto out;
1014 }
1015 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
1016 next:
1017 ptr = strchr(ptr, '\n');
1018 if (!ptr)
1019 break;
1020 ptr++;
1021 }
1022
1023 cred.pid = getpid();
1024 v = '1';
1025 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1026 // failed to ask child to exit
1027 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1028 __func__, strerror(errno));
1029 goto out;
1030 }
1031
1032 answer = true;
1033
1034 out:
1035 if (cpid != -1)
1036 wait_for_pid(cpid);
1037 if (sock[0] != -1) {
1038 close(sock[0]);
1039 close(sock[1]);
1040 }
1041 return answer;
1042 }
1043
1044 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1045 struct fuse_file_info *fi)
1046 {
1047 struct fuse_context *fc = fuse_get_context();
1048 struct file_info *f = (struct file_info *)fi->fh;
1049 nih_local struct cgm_keys *k = NULL;
1050
1051 if (f->type != LXC_TYPE_CGFILE) {
1052 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1053 return -EIO;
1054 }
1055
1056 if (offset)
1057 return -EIO;
1058
1059 if (!fc)
1060 return -EIO;
1061
1062 if (!f->controller)
1063 return -EINVAL;
1064
1065 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1066 nih_local char *data = NULL;
1067 int s;
1068 bool r;
1069
1070 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
1071 // should never get here
1072 return -EACCES;
1073
1074 if (strcmp(f->file, "tasks") == 0 ||
1075 strcmp(f->file, "/tasks") == 0 ||
1076 strcmp(f->file, "/cgroup.procs") == 0 ||
1077 strcmp(f->file, "cgroup.procs") == 0)
1078 // special case - we have to translate the pids
1079 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1080 else
1081 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1082
1083 if (!r)
1084 return -EINVAL;
1085
1086 if (!data)
1087 return 0;
1088 s = strlen(data);
1089 if (s > size)
1090 s = size;
1091 memcpy(buf, data, s);
1092
1093 return s;
1094 }
1095
1096 return -EINVAL;
1097 }
1098
1099 static void pid_from_ns(int sock, pid_t tpid)
1100 {
1101 pid_t vpid;
1102 struct ucred cred;
1103 char v;
1104 struct timeval tv;
1105 fd_set s;
1106 int ret;
1107
1108 cred.uid = 0;
1109 cred.gid = 0;
1110 while (1) {
1111 FD_ZERO(&s);
1112 FD_SET(sock, &s);
1113 tv.tv_sec = 2;
1114 tv.tv_usec = 0;
1115 ret = select(sock+1, &s, NULL, NULL, &tv);
1116 if (ret <= 0) {
1117 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1118 __func__, strerror(errno));
1119 exit(1);
1120 }
1121 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1122 fprintf(stderr, "%s: bad read from parent: %s\n",
1123 __func__, strerror(errno));
1124 exit(1);
1125 }
1126 if (vpid == -1) // done
1127 break;
1128 v = '0';
1129 cred.pid = vpid;
1130 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1131 v = '1';
1132 cred.pid = getpid();
1133 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1134 exit(1);
1135 }
1136 }
1137 exit(0);
1138 }
1139
/*
 * Enter the pid namespace of @tpid with setns(), then fork a child to run
 * pid_from_ns() on @sock.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns: exits 0 if the child exited successfully and 1 on any
 * failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

loop:
	cpid = fork();

	if (cpid < 0)
		exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_from_ns(sock, tpid);	/* does not return */
	}

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		goto again;
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1') {
		goto again;
	}

	/* wait_for_pid() returns 0 on success: pass the child's outcome on */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);

again:
	kill(cpid, SIGKILL);
	wait_for_pid(cpid);
	goto loop;
}
1200
1201 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1202 {
1203 int sock[2] = {-1, -1};
1204 pid_t qpid, cpid = -1;
1205 bool answer = false, fail = false;
1206
1207 /*
1208 * write the pids to a socket, have helper in writer's pidns
1209 * call movepid for us
1210 */
1211 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1212 perror("socketpair");
1213 exit(1);
1214 }
1215
1216 cpid = fork();
1217 if (cpid == -1)
1218 goto out;
1219
1220 if (!cpid) // child
1221 pid_from_ns_wrapper(sock[1], tpid);
1222
1223 const char *ptr = buf;
1224 while (sscanf(ptr, "%d", &qpid) == 1) {
1225 struct ucred cred;
1226 char v;
1227
1228 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1229 fprintf(stderr, "%s: error writing pid to child: %s\n",
1230 __func__, strerror(errno));
1231 goto out;
1232 }
1233
1234 if (recv_creds(sock[0], &cred, &v)) {
1235 if (v == '0') {
1236 if (!cgm_move_pid(contrl, cg, cred.pid))
1237 fail = true;
1238 }
1239 }
1240
1241 ptr = strchr(ptr, '\n');
1242 if (!ptr)
1243 break;
1244 ptr++;
1245 }
1246
1247 /* All good, write the value */
1248 qpid = -1;
1249 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1250 fprintf(stderr, "Warning: failed to ask child to exit\n");
1251
1252 if (!fail)
1253 answer = true;
1254
1255 out:
1256 if (cpid != -1)
1257 wait_for_pid(cpid);
1258 if (sock[0] != -1) {
1259 close(sock[0]);
1260 close(sock[1]);
1261 }
1262 return answer;
1263 }
1264
1265 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1266 struct fuse_file_info *fi)
1267 {
1268 struct fuse_context *fc = fuse_get_context();
1269 nih_local char *localbuf = NULL;
1270 nih_local struct cgm_keys *k = NULL;
1271 struct file_info *f = (struct file_info *)fi->fh;
1272
1273 if (f->type != LXC_TYPE_CGFILE) {
1274 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1275 return -EIO;
1276 }
1277
1278 if (offset)
1279 return -EINVAL;
1280
1281 if (!fc)
1282 return -EIO;
1283
1284 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1285 localbuf[size] = '\0';
1286 memcpy(localbuf, buf, size);
1287
1288 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1289 bool r;
1290
1291 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
1292 return -EACCES;
1293
1294 if (strcmp(f->file, "tasks") == 0 ||
1295 strcmp(f->file, "/tasks") == 0 ||
1296 strcmp(f->file, "/cgroup.procs") == 0 ||
1297 strcmp(f->file, "cgroup.procs") == 0)
1298 // special case - we have to translate the pids
1299 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1300 else
1301 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1302
1303 if (!r)
1304 return -EINVAL;
1305
1306 return size;
1307 }
1308
1309 return -EINVAL;
1310 }
1311
1312 int cg_chown(const char *path, uid_t uid, gid_t gid)
1313 {
1314 struct fuse_context *fc = fuse_get_context();
1315 nih_local char * cgdir = NULL;
1316 char *fpath = NULL, *path1, *path2;
1317 nih_local struct cgm_keys *k = NULL;
1318 const char *cgroup;
1319 nih_local char *controller = NULL;
1320
1321
1322 if (!fc)
1323 return -EIO;
1324
1325 if (strcmp(path, "/cgroup") == 0)
1326 return -EINVAL;
1327
1328 controller = pick_controller_from_path(fc, path);
1329 if (!controller)
1330 return -EINVAL;
1331 cgroup = find_cgroup_in_path(path);
1332 if (!cgroup)
1333 /* this is just /cgroup/controller */
1334 return -EINVAL;
1335
1336 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1337
1338 if (!fpath) {
1339 path1 = "/";
1340 path2 = cgdir;
1341 } else {
1342 path1 = cgdir;
1343 path2 = fpath;
1344 }
1345
1346 if (is_child_cgroup(controller, path1, path2)) {
1347 // get uid, gid, from '/tasks' file and make up a mode
1348 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1349 k = get_cgroup_key(controller, cgroup, "tasks");
1350
1351 } else
1352 k = get_cgroup_key(controller, path1, path2);
1353
1354 if (!k)
1355 return -EINVAL;
1356
1357 /*
1358 * This being a fuse request, the uid and gid must be valid
1359 * in the caller's namespace. So we can just check to make
1360 * sure that the caller is root in his uid, and privileged
1361 * over the file's current owner.
1362 */
1363 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1364 return -EACCES;
1365
1366 if (!cgm_chown_file(controller, cgroup, uid, gid))
1367 return -EINVAL;
1368 return 0;
1369 }
1370
1371 int cg_chmod(const char *path, mode_t mode)
1372 {
1373 struct fuse_context *fc = fuse_get_context();
1374 nih_local char * cgdir = NULL;
1375 char *fpath = NULL, *path1, *path2;
1376 nih_local struct cgm_keys *k = NULL;
1377 const char *cgroup;
1378 nih_local char *controller = NULL;
1379
1380 if (!fc)
1381 return -EIO;
1382
1383 if (strcmp(path, "/cgroup") == 0)
1384 return -EINVAL;
1385
1386 controller = pick_controller_from_path(fc, path);
1387 if (!controller)
1388 return -EINVAL;
1389 cgroup = find_cgroup_in_path(path);
1390 if (!cgroup)
1391 /* this is just /cgroup/controller */
1392 return -EINVAL;
1393
1394 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1395
1396 if (!fpath) {
1397 path1 = "/";
1398 path2 = cgdir;
1399 } else {
1400 path1 = cgdir;
1401 path2 = fpath;
1402 }
1403
1404 if (is_child_cgroup(controller, path1, path2)) {
1405 // get uid, gid, from '/tasks' file and make up a mode
1406 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1407 k = get_cgroup_key(controller, cgroup, "tasks");
1408
1409 } else
1410 k = get_cgroup_key(controller, path1, path2);
1411
1412 if (!k)
1413 return -EINVAL;
1414
1415 /*
1416 * This being a fuse request, the uid and gid must be valid
1417 * in the caller's namespace. So we can just check to make
1418 * sure that the caller is root in his uid, and privileged
1419 * over the file's current owner.
1420 */
1421 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1422 return -EPERM;
1423
1424 if (!cgm_chmod_file(controller, cgroup, mode))
1425 return -EINVAL;
1426 return 0;
1427 }
1428
1429 int cg_mkdir(const char *path, mode_t mode)
1430 {
1431 struct fuse_context *fc = fuse_get_context();
1432 nih_local struct cgm_keys **list = NULL;
1433 char *fpath = NULL, *path1;
1434 nih_local char * cgdir = NULL;
1435 const char *cgroup;
1436 nih_local char *controller = NULL;
1437
1438 if (!fc)
1439 return -EIO;
1440
1441
1442 controller = pick_controller_from_path(fc, path);
1443 if (!controller)
1444 return -EINVAL;
1445
1446 cgroup = find_cgroup_in_path(path);
1447 if (!cgroup)
1448 return -EINVAL;
1449
1450 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1451 if (!fpath)
1452 path1 = "/";
1453 else
1454 path1 = cgdir;
1455
1456 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1457 return -EACCES;
1458
1459
1460 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1461 return -EINVAL;
1462
1463 return 0;
1464 }
1465
1466 static int cg_rmdir(const char *path)
1467 {
1468 struct fuse_context *fc = fuse_get_context();
1469 nih_local struct cgm_keys **list = NULL;
1470 char *fpath = NULL;
1471 nih_local char * cgdir = NULL;
1472 const char *cgroup;
1473 nih_local char *controller = NULL;
1474
1475 if (!fc)
1476 return -EIO;
1477
1478
1479 controller = pick_controller_from_path(fc, path);
1480 if (!controller)
1481 return -EINVAL;
1482
1483 cgroup = find_cgroup_in_path(path);
1484 if (!cgroup)
1485 return -EINVAL;
1486
1487 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1488 if (!fpath)
1489 return -EINVAL;
1490
1491 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1492 return -EACCES;
1493
1494 if (!cgm_remove(controller, cgroup))
1495 return -EINVAL;
1496
1497 return 0;
1498 }
1499
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1506
/*
 * Scan @memstat line by line for the first line starting with
 * "total_cache" and store the number that follows it, divided by 1024,
 * in *@v.  *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (startswith(cur, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1524
/*
 * Scan @str line by line for the first line starting with
 * "<major>:<minor> <iotype>" and store the number that follows in *@v.
 * *@v is 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t keylen;
	char *cur = str;

	memset(key, 0, 32);
	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1547
1548 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1549 {
1550 nih_local char *fnam = NULL;
1551 FILE *f;
1552 char *answer = NULL;
1553 char *line = NULL;
1554 size_t len = 0;
1555
1556 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1557 if (!(f = fopen(fnam, "r")))
1558 return false;
1559
1560 while (getline(&line, &len, f) != -1) {
1561 char *c1, *c2;
1562 if (!line[0])
1563 continue;
1564 c1 = strchr(line, ':');
1565 if (!c1)
1566 goto out;
1567 c1++;
1568 c2 = strchr(c1, ':');
1569 if (!c2)
1570 goto out;
1571 *c2 = '\0';
1572 if (strcmp(c1, contrl) != 0)
1573 continue;
1574 c2++;
1575 stripnewline(c2);
1576 answer = NIH_MUST( nih_strdup(NULL, c2) );
1577 goto out;
1578 }
1579
1580 out:
1581 fclose(f);
1582 free(line);
1583 return answer;
1584 }
1585
1586 /*
1587 * FUSE ops for /proc
1588 */
1589
1590 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1591 struct fuse_file_info *fi)
1592 {
1593 struct fuse_context *fc = fuse_get_context();
1594 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1595 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1596 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1597 char *line = NULL;
1598 size_t linelen = 0, total_len = 0;
1599 FILE *f;
1600
1601 if (offset)
1602 return -EINVAL;
1603
1604 if (!cg)
1605 return 0;
1606
1607 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1608 return 0;
1609 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1610 return 0;
1611 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1612 return 0;
1613 memlimit = strtoul(memlimit_str, NULL, 10);
1614 memusage = strtoul(memusage_str, NULL, 10);
1615 memlimit /= 1024;
1616 memusage /= 1024;
1617 get_mem_cached(memstat_str, &cached);
1618
1619 f = fopen("/proc/meminfo", "r");
1620 if (!f)
1621 return 0;
1622
1623 while (getline(&line, &linelen, f) != -1) {
1624 size_t l;
1625 char *printme, lbuf[100];
1626
1627 memset(lbuf, 0, 100);
1628 if (startswith(line, "MemTotal:")) {
1629 sscanf(line+14, "%lu", &hosttotal);
1630 if (hosttotal < memlimit)
1631 memlimit = hosttotal;
1632 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1633 printme = lbuf;
1634 } else if (startswith(line, "MemFree:")) {
1635 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1636 printme = lbuf;
1637 } else if (startswith(line, "MemAvailable:")) {
1638 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1639 printme = lbuf;
1640 } else if (startswith(line, "Buffers:")) {
1641 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1642 printme = lbuf;
1643 } else if (startswith(line, "Cached:")) {
1644 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1645 printme = lbuf;
1646 } else if (startswith(line, "SwapCached:")) {
1647 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1648 printme = lbuf;
1649 } else
1650 printme = line;
1651 l = snprintf(buf, size, "%s", printme);
1652 buf += l;
1653 size -= l;
1654 total_len += l;
1655 }
1656
1657 fclose(f);
1658 free(line);
1659 return total_len;
1660 }
1661
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg from the cpuset controller.
 * Returns the string handed back by cgm_get_value() (nih-allocated), or
 * NULL when the lookup fails.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1674
/*
 * Helper for cpu_in_cpuset(): advance to the next entry of a
 * comma-separated cpuset list.
 *
 * The search for the separator starts at the second character of @c.
 * Returns a pointer to the character after the next ',' or NULL when there
 * is none.  An empty string yields NULL instead of being read past its
 * terminator.
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	if (*c == '\0')
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}
1685
/*
 * Parse the cpuset entry at the start of @c, either "A" or "A-B".
 * Returns what sscanf() returns: 2 when both *@a and *@b were stored,
 * 1 when only *@a was, 0 or EOF when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1693
/*
 * Return true when @cpu is listed in @cpuset.
 *
 * cpusets are in format "1,2-3,4", iow comma-delimited entries, each of
 * them a single cpu or an inclusive range.  Every entry is examined until
 * one matches; an entry that cannot be parsed ends the search with false.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c;

	for (c = cpuset; c; c = cpuset_nexttok(c)) {
		int a, b, ret;

		ret = cpuset_getrange(c, &a, &b);
		if (ret == 1) {
			/* single cpu, e.g. "4" */
			if (cpu == a)
				return true;
		} else if (ret == 2) {
			/* range, e.g. "2-3" */
			if (cpu >= a && cpu <= b)
				return true;
		} else {
			/* bad cpuset! */
			return false;
		}
	}

	return false;
}
1716
/*
 * Given a "processor : N" line, report whether cpu N is part of @cpuset.
 * Any line that does not parse as such yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	int parsed = sscanf(line, "processor : %d", &cpu);

	return parsed == 1 ? cpu_in_cpuset(cpu, cpuset) : false;
}
1725
/*
 * Check whether @line is a "processor : N" line as found in /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
1737
1738 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1739 struct fuse_file_info *fi)
1740 {
1741 struct fuse_context *fc = fuse_get_context();
1742 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1743 nih_local char *cpuset = NULL;
1744 char *line = NULL;
1745 size_t linelen = 0, total_len = 0;
1746 bool am_printing = false;
1747 int curcpu = -1;
1748 FILE *f;
1749
1750 if (offset)
1751 return -EINVAL;
1752
1753 if (!cg)
1754 return 0;
1755
1756 cpuset = get_cpuset(cg);
1757 if (!cpuset)
1758 return 0;
1759
1760 f = fopen("/proc/cpuinfo", "r");
1761 if (!f)
1762 return 0;
1763
1764 while (getline(&line, &linelen, f) != -1) {
1765 size_t l;
1766 if (is_processor_line(line)) {
1767 am_printing = cpuline_in_cpuset(line, cpuset);
1768 if (am_printing) {
1769 curcpu ++;
1770 l = snprintf(buf, size, "processor : %d\n", curcpu);
1771 buf += l;
1772 size -= l;
1773 total_len += l;
1774 }
1775 continue;
1776 }
1777 if (am_printing) {
1778 l = snprintf(buf, size, "%s", line);
1779 buf += l;
1780 size -= l;
1781 total_len += l;
1782 }
1783 }
1784
1785 fclose(f);
1786 free(line);
1787 return total_len;
1788 }
1789
1790 static int proc_stat_read(char *buf, size_t size, off_t offset,
1791 struct fuse_file_info *fi)
1792 {
1793 struct fuse_context *fc = fuse_get_context();
1794 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1795 nih_local char *cpuset = NULL;
1796 char *line = NULL;
1797 size_t linelen = 0, total_len = 0;
1798 int curcpu = -1; /* cpu numbering starts at 0 */
1799 FILE *f;
1800
1801 if (offset)
1802 return -EINVAL;
1803
1804 if (!cg)
1805 return 0;
1806
1807 cpuset = get_cpuset(cg);
1808 if (!cpuset)
1809 return 0;
1810
1811 f = fopen("/proc/stat", "r");
1812 if (!f)
1813 return 0;
1814
1815 while (getline(&line, &linelen, f) != -1) {
1816 size_t l;
1817 int cpu;
1818 char cpu_char[10]; /* That's a lot of cores */
1819 char *c;
1820
1821 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1822 /* not a ^cpuN line containing a number N, just print it */
1823 l = snprintf(buf, size, "%s", line);
1824 buf += l;
1825 size -= l;
1826 total_len += l;
1827 continue;
1828 }
1829
1830 if (sscanf(cpu_char, "%d", &cpu) != 1)
1831 continue;
1832 if (!cpu_in_cpuset(cpu, cpuset))
1833 continue;
1834 curcpu ++;
1835
1836 c = strchr(line, ' ');
1837 if (!c)
1838 continue;
1839 l = snprintf(buf, size, "cpu%d %s", curcpu, c);
1840 buf += l;
1841 size -= l;
1842 total_len += l;
1843 }
1844
1845 fclose(f);
1846 free(line);
1847 return total_len;
1848 }
1849
1850 /*
1851 * How to guess what to present for uptime?
1852 * One thing we could do would be to take the date on the caller's
1853 * memory.usage_in_bytes file, which should equal the time of creation
1854 * of his cgroup. However, a task could be in a sub-cgroup of the
1855 * container. The same problem exists if we try to look at the ages
1856 * of processes in the caller's cgroup.
1857 *
1858 * So we'll fork a task that will enter the caller's pidns, mount a
1859 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1860 *
1861 * For the second uptime #, we'll do as Stéphane had done, just copy
1862 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1863 * time. Maybe someone can come up with a good algorithm and submit a
1864 * patch. Maybe something based on cpushare info?
1865 */
1866
1867 /* return age of the reaper for $pid, taken from ctime of its procdir */
1868 static long int get_pid1_time(pid_t pid)
1869 {
1870 char fnam[100];
1871 int fd, cpipe[2], ret;
1872 struct stat sb;
1873 pid_t cpid;
1874 struct timeval tv;
1875 fd_set s;
1876 char v;
1877
1878 if (unshare(CLONE_NEWNS))
1879 return 0;
1880
1881 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1882 perror("rslave mount failed");
1883 return 0;
1884 }
1885
1886 sprintf(fnam, "/proc/%d/ns/pid", pid);
1887 fd = open(fnam, O_RDONLY);
1888 if (fd < 0) {
1889 perror("get_pid1_time open of ns/pid");
1890 return 0;
1891 }
1892 if (setns(fd, 0)) {
1893 perror("get_pid1_time setns 1");
1894 close(fd);
1895 return 0;
1896 }
1897 close(fd);
1898
1899 if (pipe(cpipe) < 0)
1900 exit(1);
1901
1902 loop:
1903 cpid = fork();
1904 if (cpid < 0)
1905 return 0;
1906
1907 if (!cpid) {
1908 char b = '1';
1909 close(cpipe[0]);
1910 if (write(cpipe[1], &b, sizeof(char)) < 0) {
1911 fprintf(stderr, "%s (child): erorr on write: %s\n",
1912 __func__, strerror(errno));
1913 }
1914 close(cpipe[1]);
1915 umount2("/proc", MNT_DETACH);
1916 if (mount("proc", "/proc", "proc", 0, NULL)) {
1917 perror("get_pid1_time mount");
1918 return 0;
1919 }
1920 ret = lstat("/proc/1", &sb);
1921 if (ret) {
1922 perror("get_pid1_time lstat");
1923 return 0;
1924 }
1925 return time(NULL) - sb.st_ctime;
1926 }
1927
1928 // give the child 1 second to be done forking and
1929 // write it's ack
1930 FD_ZERO(&s);
1931 FD_SET(cpipe[0], &s);
1932 tv.tv_sec = 1;
1933 tv.tv_usec = 0;
1934 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
1935 if (ret <= 0)
1936 goto again;
1937 ret = read(cpipe[0], &v, 1);
1938 if (ret != sizeof(char) || v != '1') {
1939 goto again;
1940 }
1941
1942 wait_for_pid(cpid);
1943 exit(0);
1944
1945 again:
1946 kill(cpid, SIGKILL);
1947 wait_for_pid(cpid);
1948 goto loop;
1949 }
1950
/*
 * Return the age of the reaper of @qpid's pid namespace, in seconds.
 *
 * get_pid1_time() changes the namespaces of the process it runs in, so it
 * is run in a forked child which sends the result back over a pipe.  The
 * parent waits up to one second for the result to become readable.
 *
 * Returns 0 when the pipe or the fork cannot be set up, when no answer
 * arrives in time or when the answer cannot be read.
 */
static long int getreaperage(pid_t qpid)
{
	int mypipe[2], ret;
	pid_t pid;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child to wait for; waiting on -1 would reap any child */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (ret == 0) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
1996
/*
 * Return the second field of /proc/uptime, truncated to whole seconds,
 * or 0 when the file cannot be opened or parsed.
 *
 * Both fields of /proc/uptime carry a fractional part, so they are read
 * as floating point values; an integer conversion would stop at the first
 * '.' and never reach the second field.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2010
2011 /*
2012 * We read /proc/uptime and reuse its second field.
2013 * For the first field, we use the mtime for the reaper for
2014 * the calling pid as returned by getreaperage
2015 */
2016 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2017 struct fuse_file_info *fi)
2018 {
2019 struct fuse_context *fc = fuse_get_context();
2020 long int reaperage = getreaperage(fc->pid);;
2021 long int idletime = getprocidle();
2022
2023 if (offset)
2024 return -EINVAL;
2025 return snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2026 }
2027
2028 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2029 struct fuse_file_info *fi)
2030 {
2031 char dev_name[72];
2032 struct fuse_context *fc = fuse_get_context();
2033 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2034 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2035 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2036 unsigned long read = 0, write = 0;
2037 unsigned long read_merged = 0, write_merged = 0;
2038 unsigned long read_sectors = 0, write_sectors = 0;
2039 unsigned long read_ticks = 0, write_ticks = 0;
2040 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2041 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2042 char *line = NULL;
2043 size_t linelen = 0, total_len = 0;
2044 unsigned int major = 0, minor = 0;
2045 int i = 0;
2046 FILE *f;
2047
2048 if (offset)
2049 return -EINVAL;
2050
2051 if (!cg)
2052 return 0;
2053
2054 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2055 return 0;
2056 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2057 return 0;
2058 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2059 return 0;
2060 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2061 return 0;
2062 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2063 return 0;
2064
2065
2066 f = fopen("/proc/diskstats", "r");
2067 if (!f)
2068 return 0;
2069
2070 while (getline(&line, &linelen, f) != -1) {
2071 size_t l;
2072 char *printme, lbuf[256];
2073
2074 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2075 if(i == 3){
2076 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2077 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2078 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2079 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2080 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2081 read_sectors = read_sectors/512;
2082 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2083 write_sectors = write_sectors/512;
2084
2085 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2086 rd_svctm = rd_svctm/1000000;
2087 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2088 rd_wait = rd_wait/1000000;
2089 read_ticks = rd_svctm + rd_wait;
2090
2091 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2092 wr_svctm = wr_svctm/1000000;
2093 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2094 wr_wait = wr_wait/1000000;
2095 write_ticks = wr_svctm + wr_wait;
2096
2097 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2098 tot_ticks = tot_ticks/1000000;
2099 }else{
2100 continue;
2101 }
2102
2103 memset(lbuf, 0, 256);
2104 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2105 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2106 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2107 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2108 printme = lbuf;
2109 } else
2110 continue;
2111
2112 l = snprintf(buf, size, "%s", printme);
2113 buf += l;
2114 size -= l;
2115 total_len += l;
2116 }
2117
2118 fclose(f);
2119 free(line);
2120 return total_len;
2121 }
2122
/*
 * Return the number of bytes in file @which, found by reading it line by
 * line and adding up the line lengths.  Returns 0 when the file cannot be
 * opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;

	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t got = getline(&linebuf, &cap, fp);

		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2139
2140 static int proc_getattr(const char *path, struct stat *sb)
2141 {
2142 struct timespec now;
2143
2144 memset(sb, 0, sizeof(struct stat));
2145 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2146 return -EINVAL;
2147 sb->st_uid = sb->st_gid = 0;
2148 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2149 if (strcmp(path, "/proc") == 0) {
2150 sb->st_mode = S_IFDIR | 00555;
2151 sb->st_nlink = 2;
2152 return 0;
2153 }
2154 if (strcmp(path, "/proc/meminfo") == 0 ||
2155 strcmp(path, "/proc/cpuinfo") == 0 ||
2156 strcmp(path, "/proc/uptime") == 0 ||
2157 strcmp(path, "/proc/stat") == 0 ||
2158 strcmp(path, "/proc/diskstats") == 0) {
2159 sb->st_size = get_procfile_size(path);
2160 sb->st_mode = S_IFREG | 00444;
2161 sb->st_nlink = 1;
2162 return 0;
2163 }
2164
2165 return -ENOENT;
2166 }
2167
2168 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2169 struct fuse_file_info *fi)
2170 {
2171 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2172 filler(buf, "meminfo", NULL, 0) != 0 ||
2173 filler(buf, "stat", NULL, 0) != 0 ||
2174 filler(buf, "uptime", NULL, 0) != 0 ||
2175 filler(buf, "diskstats", NULL, 0) != 0)
2176 return -EINVAL;
2177 return 0;
2178 }
2179
2180 static int proc_open(const char *path, struct fuse_file_info *fi)
2181 {
2182 int type = -1;
2183 struct file_info *info;
2184
2185 if (strcmp(path, "/proc/meminfo") == 0)
2186 type = LXC_TYPE_PROC_MEMINFO;
2187 else if (strcmp(path, "/proc/cpuinfo") == 0)
2188 type = LXC_TYPE_PROC_CPUINFO;
2189 else if (strcmp(path, "/proc/uptime") == 0)
2190 type = LXC_TYPE_PROC_UPTIME;
2191 else if (strcmp(path, "/proc/stat") == 0)
2192 type = LXC_TYPE_PROC_STAT;
2193 else if (strcmp(path, "/proc/diskstats") == 0)
2194 type = LXC_TYPE_PROC_DISKSTATS;
2195 if (type == -1)
2196 return -ENOENT;
2197
2198 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2199 memset(info, 0, sizeof(*info));
2200 info->type = type;
2201
2202 fi->fh = (unsigned long)info;
2203 return 0;
2204 }
2205
2206 static int proc_release(const char *path, struct fuse_file_info *fi)
2207 {
2208 struct file_info *f = (struct file_info *)fi->fh;
2209
2210 do_release_file_info(f);
2211 return 0;
2212 }
2213
2214 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2215 struct fuse_file_info *fi)
2216 {
2217 struct file_info *f = (struct file_info *) fi->fh;
2218
2219 switch (f->type) {
2220 case LXC_TYPE_PROC_MEMINFO:
2221 return proc_meminfo_read(buf, size, offset, fi);
2222 case LXC_TYPE_PROC_CPUINFO:
2223 return proc_cpuinfo_read(buf, size, offset, fi);
2224 case LXC_TYPE_PROC_UPTIME:
2225 return proc_uptime_read(buf, size, offset, fi);
2226 case LXC_TYPE_PROC_STAT:
2227 return proc_stat_read(buf, size, offset, fi);
2228 case LXC_TYPE_PROC_DISKSTATS:
2229 return proc_diskstats_read(buf, size, offset, fi);
2230 default:
2231 return -EINVAL;
2232 }
2233 }
2234
2235 /*
2236 * FUSE ops for /
2237 * these just delegate to the /proc and /cgroup ops as
2238 * needed
2239 */
2240
2241 static int lxcfs_getattr(const char *path, struct stat *sb)
2242 {
2243 if (strcmp(path, "/") == 0) {
2244 sb->st_mode = S_IFDIR | 00755;
2245 sb->st_nlink = 2;
2246 return 0;
2247 }
2248 if (strncmp(path, "/cgroup", 7) == 0) {
2249 return cg_getattr(path, sb);
2250 }
2251 if (strncmp(path, "/proc", 5) == 0) {
2252 return proc_getattr(path, sb);
2253 }
2254 return -EINVAL;
2255 }
2256
/*
 * FUSE opendir entry point.  "/" and "/proc" need no state and succeed
 * right away; paths starting with "/cgroup" are delegated to cg_opendir().
 * Any other path yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return !strcmp(path, "/proc") ? 0 : -ENOENT;
}
2269
2270 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2271 struct fuse_file_info *fi)
2272 {
2273 if (strcmp(path, "/") == 0) {
2274 if (filler(buf, "proc", NULL, 0) != 0 ||
2275 filler(buf, "cgroup", NULL, 0) != 0)
2276 return -EINVAL;
2277 return 0;
2278 }
2279 if (strncmp(path, "/cgroup", 7) == 0)
2280 return cg_readdir(path, buf, filler, offset, fi);
2281 if (strcmp(path, "/proc") == 0)
2282 return proc_readdir(path, buf, filler, offset, fi);
2283 return -EINVAL;
2284 }
2285
/*
 * FUSE releasedir entry point.  "/" and "/proc" hold no state; paths
 * starting with "/cgroup" are delegated to cg_releasedir().
 * Any other path yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return !strcmp(path, "/proc") ? 0 : -EINVAL;
}
2297
/*
 * FUSE open entry point: delegate to cg_open() or proc_open() depending
 * on the path prefix.  Any other path yields -EINVAL.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_open(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_open(path, fi);

	return -EINVAL;
}
2307
/*
 * FUSE read entry point: delegate to cg_read() or proc_read() depending
 * on the path prefix.  Any other path yields -EINVAL.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_read(path, buf, size, offset, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2318
/*
 * FUSE write entry point.  Only paths starting with "/cgroup" are
 * writable; they are delegated to cg_write().  Any other path yields
 * -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2328
/* FUSE flush entry point: nothing is buffered, so this always succeeds. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2333
/*
 * FUSE release entry point: delegate to cg_release() or proc_release()
 * depending on the path prefix.  Any other path yields -EINVAL.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7))
		return cg_release(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_release(path, fi);

	return -EINVAL;
}
2343
/* FUSE fsync entry point: there is nothing to sync, so this always succeeds. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2348
/*
 * FUSE mkdir entry point.  Directories can only be created below
 * "/cgroup", where the request is delegated to cg_mkdir().  Any other
 * path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2356
/*
 * FUSE chown entry point.  Ownership can only be changed below "/cgroup",
 * where the request is delegated to cg_chown().  Any other path yields
 * -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2364
/*
 * FUSE truncate entry point.
 *
 * cat first does a truncate before doing ops->write.  That makes no real
 * sense for cgroup files, so for paths starting with "/cgroup" nothing is
 * done and success is reported.  Any other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2376
/*
 * FUSE rmdir entry point.  Directories can only be removed below
 * "/cgroup", where the request is delegated to cg_rmdir().  Any other
 * path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2383
/*
 * FUSE chmod entry point.  Modes can only be changed below "/cgroup",
 * where the request is delegated to cg_chmod().  Any other path yields
 * -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2390
2391 const struct fuse_operations lxcfs_ops = {
2392 .getattr = lxcfs_getattr,
2393 .readlink = NULL,
2394 .getdir = NULL,
2395 .mknod = NULL,
2396 .mkdir = lxcfs_mkdir,
2397 .unlink = NULL,
2398 .rmdir = lxcfs_rmdir,
2399 .symlink = NULL,
2400 .rename = NULL,
2401 .link = NULL,
2402 .chmod = lxcfs_chmod,
2403 .chown = lxcfs_chown,
2404 .truncate = lxcfs_truncate,
2405 .utime = NULL,
2406
2407 .open = lxcfs_open,
2408 .read = lxcfs_read,
2409 .release = lxcfs_release,
2410 .write = lxcfs_write,
2411
2412 .statfs = NULL,
2413 .flush = lxcfs_flush,
2414 .fsync = lxcfs_fsync,
2415
2416 .setxattr = NULL,
2417 .getxattr = NULL,
2418 .listxattr = NULL,
2419 .removexattr = NULL,
2420
2421 .opendir = lxcfs_opendir,
2422 .readdir = lxcfs_readdir,
2423 .releasedir = lxcfs_releasedir,
2424
2425 .fsyncdir = NULL,
2426 .init = NULL,
2427 .destroy = NULL,
2428 .access = NULL,
2429 .create = NULL,
2430 .ftruncate = NULL,
2431 .fgetattr = NULL,
2432 };
2433
/* Print the command line synopsis for program @me to stderr and exit(1). */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s [FUSE and mount options] mountpoint\n", me);
	exit(1);
}
2441
/* Return true when @w is one of the recognised spellings of a help request. */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}
	return false;
}
2451
2452 int main(int argc, char *argv[])
2453 {
2454 int ret;
2455 struct lxcfs_state *d;
2456
2457 if (argc < 2 || is_help(argv[1]))
2458 usage(argv[0]);
2459
2460 d = malloc(sizeof(*d));
2461 if (!d)
2462 return -1;
2463
2464 if (!cgm_escape_cgroup())
2465 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2466
2467 if (!cgm_get_controllers(&d->subsystems))
2468 return -1;
2469
2470 ret = fuse_main(argc, argv, &lxcfs_ops, d);
2471
2472 return ret;
2473 }