]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
Make sure that '/cgroup' and the controller are separated by /
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <linux/sched.h>
24 #include <sys/socket.h>
25 #include <sys/mount.h>
26 #include <wait.h>
27
28 #include <nih/alloc.h>
29 #include <nih/string.h>
30
31 #include "cgmanager.h"
32
/*
 * State shared by the FUSE callbacks. It is reached through the
 * private_data pointer of the current fuse context (see LXCFS_DATA).
 */
33 struct lxcfs_state {
34 /*
35 * a null-terminated, nih-allocated list of the mounted subsystems. We
36 * detect this at startup.
37 */
38 char **subsystems;
39 };
/*
 * Fetch the lxcfs_state of the request being served. Callers in this file
 * test the result against NULL before using ->subsystems.
 */
40 #define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
41
/*
 * Kind tag stored in struct file_info.type. cg_opendir() sets
 * LXC_TYPE_CGDIR and cg_open() sets LXC_TYPE_CGFILE; the readers check the
 * tag before using a handle.
 * NOTE(review): the LXC_TYPE_PROC_* values are presumably for emulated
 * /proc files handled elsewhere in the file - confirm.
 */
42 enum {
43 LXC_TYPE_CGDIR,
44 LXC_TYPE_CGFILE,
45 LXC_TYPE_PROC_MEMINFO,
46 LXC_TYPE_PROC_CPUINFO,
47 LXC_TYPE_PROC_UPTIME,
48 LXC_TYPE_PROC_STAT,
49 LXC_TYPE_PROC_DISKSTATS,
50 };
51
/*
 * Per-open-handle state. cg_opendir() and cg_open() allocate one of these
 * with nih_alloc(), store it in fuse_file_info.fh, and the release
 * callbacks free it through do_release_file_info(). The string members are
 * nih-allocated with the struct as parent, so freeing the struct frees them.
 */
52 struct file_info {
53 char *controller; /* controller name; NULL for the top-level /cgroup dir */
54 char *cgroup; /* cgroup path; NULL for the top-level /cgroup dir */
55 char *file; /* key name for an open file; NULL for a directory */
56 int type; /* one of the LXC_TYPE_* values */
57 char *buf; // unused as of yet
58 int buflen; /* set to 0 by cg_opendir()/cg_open() */
59 int size; //actual data size
60 };
61
62 /* reserve buffer size, for cpuall in /proc/stat */
63 #define BUF_RESERVE_SIZE 256
64
/*
 * Duplicate str as a nih allocation owned by parent.
 *
 * A NULL str is passed through as NULL. Otherwise the nih_strdup() call is
 * wrapped in NIH_MUST.
 */
static char *must_copy_string(void *parent, const char *str)
{
	char *copy = NULL;

	if (str)
		copy = NIH_MUST( nih_strdup(parent, str) );

	return copy;
}
71
/*
 * Reap child `pid`, retrying when waitpid() is interrupted by a signal.
 *
 * Returns 0 only when the child exited normally with status 0, and -1
 * otherwise (waitpid() failure, death by signal, or non-zero exit status).
 *
 * TODO - callers should use this return value to report errors,
 * esp. for read/write of tasks and cgroup.procs.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
93
/*
 * Map an id that is valid in the caller's namespace into the namespace
 * described by idfile, an open FILE * for /proc/pid/{u,g}id_map.
 *
 * The file is rewound and read line by line as
 * "<ns base> <host base> <count>"; lines that do not yield three unsigned
 * values are skipped.
 *
 * Returns the mapped id, or -1 (converted to unsigned int) when no range
 * contains in_id or when a range's bounds wrap around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, (int)sizeof(line), idfile)) {
		unsigned int ns_base;	/* first id of the range inside the namespace */
		unsigned int host_base;	/* first id of the range as the caller sees it */
		unsigned int range_len;	/* number of ids in the range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range_len) != 3)
			continue;

		if (host_base + range_len < host_base || ns_base + range_len < ns_base) {
			/* wrapped ranges are not expected from a proc file: give up */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range_len, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range_len)
			continue;

		/*
		 * host_base <= in_id < host_base + range_len and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
137
138 /*
139 * for is_privileged_over,
140 * specify whether we require the calling uid to be root in his
141 * namespace
142 */
143 #define NS_ROOT_REQD true
144 #define NS_ROOT_OPT false
145
146 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
147 {
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
161 return true;
162
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
168 /* if caller's not root in his namespace, reject */
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184 out:
185 fclose(f);
186 return answer;
187 }
188
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the open(2)-style flags in req_mode.
 *
 * Callers shift the owner or group bits down into the "other" position
 * before calling. Only the O_ACCMODE part of req_mode is examined; an
 * access mode that is none of O_RDONLY/O_WRONLY/O_RDWR yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
208
/*
 * Return the first path component of taskcg that follows querycg, as a
 * new nih-allocated string (no parent).
 *
 * querycg "/" skips one leading byte of taskcg; any other querycg skips
 * strlen(querycg) + 1 bytes. taskcg must be strictly longer than querycg,
 * otherwise a message is printed and NULL is returned.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
227
228 /*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240 {
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268 }
269
/* Remove one trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
276
277 /*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
/*
 * Implementation notes: reads /proc/<pid>/cgroup, finds the line whose
 * second ':'-separated field equals contrl, and compares the cgroup path
 * on that line with cg. Returns false when the file cannot be opened, when
 * a line lacks either ':' separator, or when no line matches contrl.
 * *nextcg is written only on the mismatch path and may be set to NULL by
 * get_next_cgroup_dir().
 */
283 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284 {
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
/* getline() (re)allocates line; it is released once at out: */
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
/* c1: start of the controller field, just after the first ':' */
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
/* c2: the second ':'; overwritten so that c1 is a terminated string */
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
/* c2 now points at the task's cgroup path for this controller */
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
/*
 * NOTE(review): only the first strlen(linecmp) bytes are compared, so a
 * task cgroup "a/b" is also accepted for cg "a/bc" - confirm whether a
 * path-component boundary check is intended here.
 */
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325 out:
326 fclose(f);
327 free(line);
328 return answer;
329 }
330
331 /*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336 {
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
342 if (*(path+7) != '/')
343 return NULL;
344 p1 = path+8;
345 ret = nih_strdup(NULL, p1);
346 if (!ret)
347 return ret;
348 slash = strstr(ret, "/");
349 if (slash)
350 *slash = '\0';
351
352 /* verify that it is a subsystem */
353 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
354 int i;
355 if (!list) {
356 nih_free(ret);
357 return NULL;
358 }
359 for (i = 0; list[i]; i++) {
360 if (strcmp(list[i], ret) == 0)
361 return ret;
362 }
363 nih_free(ret);
364 return NULL;
365 }
366
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into path just past the '/' that ends the controller
 * component (the first '/' at or after offset 8), or NULL when path is
 * shorter than 9 bytes or has no such '/'. The returned text may include
 * file (key) names as well as cgroup directories.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
382
383 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
384 {
385 nih_local char **list = NULL;
386 int i;
387
388 if (!f)
389 return false;
390 if (*f == '/')
391 f++;
392
393 if (!cgm_list_children(contr, dir, &list))
394 return false;
395 for (i = 0; list[i]; i++) {
396 if (strcmp(list[i], f) == 0)
397 return true;
398 }
399
400 return false;
401 }
402
403 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
404 {
405 nih_local struct cgm_keys **list = NULL;
406 struct cgm_keys *k;
407 int i;
408
409 if (!f)
410 return NULL;
411 if (*f == '/')
412 f++;
413 if (!cgm_list_keys(contr, dir, &list))
414 return NULL;
415 for (i = 0; list[i]; i++) {
416 if (strcmp(list[i]->name, f) == 0) {
417 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
418 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
419 k->uid = list[i]->uid;
420 k->gid = list[i]->gid;
421 k->mode = list[i]->mode;
422 return k;
423 }
424 }
425
426 return NULL;
427 }
428
/*
 * Split cg at its last '/'.
 *
 * *dir receives a nih-allocated copy of cg (no parent) truncated at the
 * last '/'; *file receives a pointer INTO cg at that last '/'.
 * When cg contains no '/', *dir is the whole of cg and *file is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *cut;

	*dir = NIH_MUST( nih_strdup(NULL, cg) );
	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	/* the copy has the same contents, so the same '/' is found here */
	cut = strrchr(*dir, '/');
	*cut = '\0';
}
442
443 /*
444 * FUSE ops for /cgroup
445 */
446
447 static int cg_getattr(const char *path, struct stat *sb)
448 {
449 struct timespec now;
450 struct fuse_context *fc = fuse_get_context();
451 nih_local char * cgdir = NULL;
452 char *fpath = NULL, *path1, *path2;
453 nih_local struct cgm_keys *k = NULL;
454 const char *cgroup;
455 nih_local char *controller = NULL;
456
457
458 if (!fc)
459 return -EIO;
460
461 memset(sb, 0, sizeof(struct stat));
462
463 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
464 return -EINVAL;
465
466 sb->st_uid = sb->st_gid = 0;
467 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
468 sb->st_size = 0;
469
470 if (strcmp(path, "/cgroup") == 0) {
471 sb->st_mode = S_IFDIR | 00755;
472 sb->st_nlink = 2;
473 return 0;
474 }
475
476 controller = pick_controller_from_path(fc, path);
477 if (!controller)
478 return -EIO;
479 cgroup = find_cgroup_in_path(path);
480 if (!cgroup) {
481 /* this is just /cgroup/controller, return it as a dir */
482 sb->st_mode = S_IFDIR | 00755;
483 sb->st_nlink = 2;
484 return 0;
485 }
486
487 get_cgdir_and_path(cgroup, &cgdir, &fpath);
488
489 if (!fpath) {
490 path1 = "/";
491 path2 = cgdir;
492 } else {
493 path1 = cgdir;
494 path2 = fpath;
495 }
496
497 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
498 * Then check that caller's cgroup is under path if fpath is a child
499 * cgroup, or cgdir if fpath is a file */
500
501 if (is_child_cgroup(controller, path1, path2)) {
502 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
503 /* this is just /cgroup/controller, return it as a dir */
504 sb->st_mode = S_IFDIR | 00555;
505 sb->st_nlink = 2;
506 return 0;
507 }
508 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
509 return -EACCES;
510
511 // get uid, gid, from '/tasks' file and make up a mode
512 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
513 sb->st_mode = S_IFDIR | 00755;
514 k = get_cgroup_key(controller, cgroup, "tasks");
515 if (!k) {
516 sb->st_uid = sb->st_gid = 0;
517 } else {
518 sb->st_uid = k->uid;
519 sb->st_gid = k->gid;
520 }
521 sb->st_nlink = 2;
522 return 0;
523 }
524
525 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
526 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
527 return -ENOENT;
528 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
529 return -EACCES;
530
531 sb->st_mode = S_IFREG | k->mode;
532 sb->st_nlink = 1;
533 sb->st_uid = k->uid;
534 sb->st_gid = k->gid;
535 sb->st_size = 0;
536 return 0;
537 }
538
539 return -ENOENT;
540 }
541
542 /*
543 * TODO - cache these results in a table for use in opendir, free
544 * in releasedir
545 */
546 static int cg_opendir(const char *path, struct fuse_file_info *fi)
547 {
548 struct fuse_context *fc = fuse_get_context();
549 nih_local struct cgm_keys **list = NULL;
550 const char *cgroup;
551 struct file_info *dir_info;
552 nih_local char *controller = NULL;
553
554 if (!fc)
555 return -EIO;
556
557 if (strcmp(path, "/cgroup") == 0) {
558 cgroup = NULL;
559 controller = NULL;
560 } else {
561 // return list of keys for the controller, and list of child cgroups
562 controller = pick_controller_from_path(fc, path);
563 if (!controller)
564 return -EIO;
565
566 cgroup = find_cgroup_in_path(path);
567 if (!cgroup) {
568 /* this is just /cgroup/controller, return its contents */
569 cgroup = "/";
570 }
571 }
572
573 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
574 return -EACCES;
575
576 /* we'll free this at cg_releasedir */
577 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
578 dir_info->controller = must_copy_string(dir_info, controller);
579 dir_info->cgroup = must_copy_string(dir_info, cgroup);
580 dir_info->type = LXC_TYPE_CGDIR;
581 dir_info->buf = NULL;
582 dir_info->file = NULL;
583 dir_info->buflen = 0;
584
585 fi->fh = (unsigned long)dir_info;
586 return 0;
587 }
588
589 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
590 struct fuse_file_info *fi)
591 {
592 struct file_info *d = (struct file_info *)fi->fh;
593 nih_local struct cgm_keys **list = NULL;
594 int i;
595 nih_local char *nextcg = NULL;
596 struct fuse_context *fc = fuse_get_context();
597
598 if (d->type != LXC_TYPE_CGDIR) {
599 fprintf(stderr, "Internal error: file cache info used in readdir\n");
600 return -EIO;
601 }
602 if (!d->cgroup && !d->controller) {
603 // ls /var/lib/lxcfs/cgroup - just show list of controllers
604 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
605 int i;
606
607 if (!list)
608 return -EIO;
609
610 for (i = 0; list[i]; i++) {
611 if (filler(buf, list[i], NULL, 0) != 0) {
612 return -EIO;
613 }
614 }
615 return 0;
616 }
617
618 if (!cgm_list_keys(d->controller, d->cgroup, &list))
619 // not a valid cgroup
620 return -EINVAL;
621
622 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
623 if (nextcg) {
624 int ret;
625 ret = filler(buf, nextcg, NULL, 0);
626 if (ret != 0)
627 return -EIO;
628 }
629 return 0;
630 }
631
632 for (i = 0; list[i]; i++) {
633 if (filler(buf, list[i]->name, NULL, 0) != 0) {
634 return -EIO;
635 }
636 }
637
638 // now get the list of child cgroups
639 nih_local char **clist = NULL;
640
641 if (!cgm_list_children(d->controller, d->cgroup, &clist))
642 return 0;
643 for (i = 0; clist[i]; i++) {
644 if (filler(buf, clist[i], NULL, 0) != 0) {
645 return -EIO;
646 }
647 }
648 return 0;
649 }
650
/*
 * Free a struct file_info. Fields that were nih_alloc()d with f as their
 * parent are released together with f by nih_free().
 */
static void do_release_file_info(struct file_info *f)
{
	nih_free(f);
}
659
660 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
661 {
662 struct file_info *d = (struct file_info *)fi->fh;
663
664 do_release_file_info(d);
665 return 0;
666 }
667
668 static int cg_open(const char *path, struct fuse_file_info *fi)
669 {
670 nih_local char *controller = NULL;
671 const char *cgroup;
672 char *fpath = NULL, *path1, *path2;
673 nih_local char * cgdir = NULL;
674 nih_local struct cgm_keys *k = NULL;
675 struct file_info *file_info;
676 struct fuse_context *fc = fuse_get_context();
677
678 if (!fc)
679 return -EIO;
680
681 controller = pick_controller_from_path(fc, path);
682 if (!controller)
683 return -EIO;
684 cgroup = find_cgroup_in_path(path);
685 if (!cgroup)
686 return -EINVAL;
687
688 get_cgdir_and_path(cgroup, &cgdir, &fpath);
689 if (!fpath) {
690 path1 = "/";
691 path2 = cgdir;
692 } else {
693 path1 = cgdir;
694 path2 = fpath;
695 }
696
697 k = get_cgroup_key(controller, path1, path2);
698 if (!k)
699 return -EINVAL;
700
701 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
702 // should never get here
703 return -EACCES;
704
705 /* we'll free this at cg_release */
706 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
707 file_info->controller = must_copy_string(file_info, controller);
708 file_info->cgroup = must_copy_string(file_info, path1);
709 file_info->file = must_copy_string(file_info, path2);
710 file_info->type = LXC_TYPE_CGFILE;
711 file_info->buf = NULL;
712 file_info->buflen = 0;
713
714 fi->fh = (unsigned long)file_info;
715 return 0;
716 }
717
718 static int cg_release(const char *path, struct fuse_file_info *fi)
719 {
720 struct file_info *f = (struct file_info *)fi->fh;
721
722 do_release_file_info(f);
723 return 0;
724 }
725
/*
 * Receive up to len bytes from sockfd, waiting at most 2 seconds for the
 * socket to become readable.
 *
 * Returns the recv() result, or -1 when select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout;
	fd_set readable;

	timeout.tv_sec = 2;
	timeout.tv_usec = 0;
	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	if (select(sockfd+1, &readable, NULL, NULL, &timeout) > 0)
		return recv(sockfd, buf, len, MSG_DONTWAIT);

	return -1;
}
740
741 #define SEND_CREDS_OK 0
742 #define SEND_CREDS_NOTSK 1
743 #define SEND_CREDS_FAIL 2
744 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
745 {
746 struct msghdr msg = { 0 };
747 struct iovec iov;
748 struct cmsghdr *cmsg;
749 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
750 char buf[1];
751 buf[0] = 'p';
752
753 if (pingfirst) {
754 if (msgrecv(sock, buf, 1) != 1) {
755 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
756 __func__);
757 return SEND_CREDS_FAIL;
758 }
759 }
760
761 msg.msg_control = cmsgbuf;
762 msg.msg_controllen = sizeof(cmsgbuf);
763
764 cmsg = CMSG_FIRSTHDR(&msg);
765 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
766 cmsg->cmsg_level = SOL_SOCKET;
767 cmsg->cmsg_type = SCM_CREDENTIALS;
768 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
769
770 msg.msg_name = NULL;
771 msg.msg_namelen = 0;
772
773 buf[0] = v;
774 iov.iov_base = buf;
775 iov.iov_len = sizeof(buf);
776 msg.msg_iov = &iov;
777 msg.msg_iovlen = 1;
778
779 if (sendmsg(sock, &msg, 0) < 0) {
780 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
781 strerror(errno));
782 if (errno == 3)
783 return SEND_CREDS_NOTSK;
784 return SEND_CREDS_FAIL;
785 }
786
787 return SEND_CREDS_OK;
788 }
789
790 static bool recv_creds(int sock, struct ucred *cred, char *v)
791 {
792 struct msghdr msg = { 0 };
793 struct iovec iov;
794 struct cmsghdr *cmsg;
795 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
796 char buf[1];
797 int ret;
798 int optval = 1;
799 struct timeval tv;
800 fd_set rfds;
801
802 *v = '1';
803
804 cred->pid = -1;
805 cred->uid = -1;
806 cred->gid = -1;
807
808 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
809 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
810 return false;
811 }
812 buf[0] = '1';
813 if (write(sock, buf, 1) != 1) {
814 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
815 return false;
816 }
817
818 msg.msg_name = NULL;
819 msg.msg_namelen = 0;
820 msg.msg_control = cmsgbuf;
821 msg.msg_controllen = sizeof(cmsgbuf);
822
823 iov.iov_base = buf;
824 iov.iov_len = sizeof(buf);
825 msg.msg_iov = &iov;
826 msg.msg_iovlen = 1;
827
828 FD_ZERO(&rfds);
829 FD_SET(sock, &rfds);
830 tv.tv_sec = 2;
831 tv.tv_usec = 0;
832 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
833 fprintf(stderr, "Failed to select for scm_cred: %s\n",
834 strerror(errno));
835 return false;
836 }
837 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
838 if (ret < 0) {
839 fprintf(stderr, "Failed to receive scm_cred: %s\n",
840 strerror(errno));
841 return false;
842 }
843
844 cmsg = CMSG_FIRSTHDR(&msg);
845
846 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
847 cmsg->cmsg_level == SOL_SOCKET &&
848 cmsg->cmsg_type == SCM_CREDENTIALS) {
849 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
850 }
851 *v = buf[0];
852
853 return true;
854 }
855
856
857 /*
858 * pid_to_ns - reads pids from a ucred over a socket, then writes the
859 * int value back over the socket. This shifts the pid from the
860 * sender's pidns into tpid's pidns.
861 */
862 static void pid_to_ns(int sock, pid_t tpid)
863 {
864 char v = '0';
865 struct ucred cred;
866
867 while (recv_creds(sock, &cred, &v)) {
868 if (v == '1')
869 exit(0);
870 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
871 exit(1);
872 }
873 exit(0);
874 }
875
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you fork will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns into tpid's pidns, then
 * forks a child which runs pid_to_ns() to actually convert pids.
 *
 * The child acks over a pipe once it has been forked; if no '1' ack
 * arrives within 1 second the child is killed and a new one is forked.
 * NOTE: there is no bound on the number of such retries.
 *
 * Never returns. Exits 0 when the converting child exited cleanly and 1
 * on any setup failure or when the child failed.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no valid ack: discard this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on a clean child exit */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
940
941 /*
942 * To read cgroup files with a particular pid, we will setns into the child
943 * pidns, open a pipe, fork a child - which will be the first to really be in
944 * the child ns - which does the cgm_get_value and writes the data to the pipe.
945 */
946 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
947 {
948 int sock[2] = {-1, -1};
949 nih_local char *tmpdata = NULL;
950 int ret;
951 pid_t qpid, cpid = -1;
952 bool answer = false;
953 char v = '0';
954 struct ucred cred;
955 struct timeval tv;
956 fd_set s;
957
958 if (!cgm_get_value(contrl, cg, file, &tmpdata))
959 return false;
960
961 /*
962 * Now we read the pids from returned data one by one, pass
963 * them into a child in the target namespace, read back the
964 * translated pids, and put them into our to-return data
965 */
966
967 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
968 perror("socketpair");
969 exit(1);
970 }
971
972 cpid = fork();
973 if (cpid == -1)
974 goto out;
975
976 if (!cpid) // child
977 pid_to_ns_wrapper(sock[1], tpid);
978
979 char *ptr = tmpdata;
980 cred.uid = 0;
981 cred.gid = 0;
982 while (sscanf(ptr, "%d\n", &qpid) == 1) {
983 cred.pid = qpid;
984 ret = send_creds(sock[0], &cred, v, true);
985
986 if (ret == SEND_CREDS_NOTSK)
987 goto next;
988 if (ret == SEND_CREDS_FAIL)
989 goto out;
990
991 // read converted results
992 FD_ZERO(&s);
993 FD_SET(sock[0], &s);
994 tv.tv_sec = 2;
995 tv.tv_usec = 0;
996 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
997 if (ret <= 0) {
998 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
999 __func__, strerror(errno));
1000 goto out;
1001 }
1002 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1003 fprintf(stderr, "%s: error reading pid from child: %s\n",
1004 __func__, strerror(errno));
1005 goto out;
1006 }
1007 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
1008 next:
1009 ptr = strchr(ptr, '\n');
1010 if (!ptr)
1011 break;
1012 ptr++;
1013 }
1014
1015 cred.pid = getpid();
1016 v = '1';
1017 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1018 // failed to ask child to exit
1019 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1020 __func__, strerror(errno));
1021 goto out;
1022 }
1023
1024 answer = true;
1025
1026 out:
1027 if (cpid != -1)
1028 wait_for_pid(cpid);
1029 if (sock[0] != -1) {
1030 close(sock[0]);
1031 close(sock[1]);
1032 }
1033 return answer;
1034 }
1035
1036 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1037 struct fuse_file_info *fi)
1038 {
1039 struct fuse_context *fc = fuse_get_context();
1040 struct file_info *f = (struct file_info *)fi->fh;
1041 nih_local struct cgm_keys *k = NULL;
1042
1043 if (f->type != LXC_TYPE_CGFILE) {
1044 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1045 return -EIO;
1046 }
1047
1048 if (offset)
1049 return 0;
1050
1051 if (!fc)
1052 return -EIO;
1053
1054 if (!f->controller)
1055 return -EINVAL;
1056
1057 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1058 nih_local char *data = NULL;
1059 int s;
1060 bool r;
1061
1062 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
1063 // should never get here
1064 return -EACCES;
1065
1066 if (strcmp(f->file, "tasks") == 0 ||
1067 strcmp(f->file, "/tasks") == 0 ||
1068 strcmp(f->file, "/cgroup.procs") == 0 ||
1069 strcmp(f->file, "cgroup.procs") == 0)
1070 // special case - we have to translate the pids
1071 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1072 else
1073 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1074
1075 if (!r)
1076 return -EINVAL;
1077
1078 if (!data)
1079 return 0;
1080 s = strlen(data);
1081 if (s > size)
1082 s = size;
1083 memcpy(buf, data, s);
1084 if (s > 0 && s < size && data[s-1] != '\n')
1085 buf[s++] = '\n';
1086
1087 return s;
1088 }
1089
1090 return -EINVAL;
1091 }
1092
1093 static void pid_from_ns(int sock, pid_t tpid)
1094 {
1095 pid_t vpid;
1096 struct ucred cred;
1097 char v;
1098 struct timeval tv;
1099 fd_set s;
1100 int ret;
1101
1102 cred.uid = 0;
1103 cred.gid = 0;
1104 while (1) {
1105 FD_ZERO(&s);
1106 FD_SET(sock, &s);
1107 tv.tv_sec = 2;
1108 tv.tv_usec = 0;
1109 ret = select(sock+1, &s, NULL, NULL, &tv);
1110 if (ret <= 0) {
1111 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1112 __func__, strerror(errno));
1113 exit(1);
1114 }
1115 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1116 fprintf(stderr, "%s: bad read from parent: %s\n",
1117 __func__, strerror(errno));
1118 exit(1);
1119 }
1120 if (vpid == -1) // done
1121 break;
1122 v = '0';
1123 cred.pid = vpid;
1124 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1125 v = '1';
1126 cred.pid = getpid();
1127 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1128 exit(1);
1129 }
1130 }
1131 exit(0);
1132 }
1133
/*
 * Enter tpid's pid namespace and fork the child that runs pid_from_ns().
 *
 * setns() only affects children forked afterwards, so the actual work is
 * done by a forked child. The child acks over a pipe once it has been
 * forked; if no '1' ack arrives within 1 second the child is killed and a
 * new one is forked.
 * NOTE: there is no bound on the number of such retries.
 *
 * Never returns. Exits 0 when the child exited cleanly and 1 on any setup
 * failure or when the child failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		if (select(cpipe[0]+1, &s, NULL, NULL, &tv) > 0 &&
		    read(cpipe[0], &v, 1) == sizeof(char) && v == '1')
			break;

		/* no valid ack: discard this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on a clean child exit */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1194
1195 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1196 {
1197 int sock[2] = {-1, -1};
1198 pid_t qpid, cpid = -1;
1199 bool answer = false, fail = false;
1200
1201 /*
1202 * write the pids to a socket, have helper in writer's pidns
1203 * call movepid for us
1204 */
1205 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1206 perror("socketpair");
1207 exit(1);
1208 }
1209
1210 cpid = fork();
1211 if (cpid == -1)
1212 goto out;
1213
1214 if (!cpid) // child
1215 pid_from_ns_wrapper(sock[1], tpid);
1216
1217 const char *ptr = buf;
1218 while (sscanf(ptr, "%d", &qpid) == 1) {
1219 struct ucred cred;
1220 char v;
1221
1222 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1223 fprintf(stderr, "%s: error writing pid to child: %s\n",
1224 __func__, strerror(errno));
1225 goto out;
1226 }
1227
1228 if (recv_creds(sock[0], &cred, &v)) {
1229 if (v == '0') {
1230 if (!cgm_move_pid(contrl, cg, cred.pid))
1231 fail = true;
1232 }
1233 }
1234
1235 ptr = strchr(ptr, '\n');
1236 if (!ptr)
1237 break;
1238 ptr++;
1239 }
1240
1241 /* All good, write the value */
1242 qpid = -1;
1243 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1244 fprintf(stderr, "Warning: failed to ask child to exit\n");
1245
1246 if (!fail)
1247 answer = true;
1248
1249 out:
1250 if (cpid != -1)
1251 wait_for_pid(cpid);
1252 if (sock[0] != -1) {
1253 close(sock[0]);
1254 close(sock[1]);
1255 }
1256 return answer;
1257 }
1258
1259 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1260 struct fuse_file_info *fi)
1261 {
1262 struct fuse_context *fc = fuse_get_context();
1263 nih_local char *localbuf = NULL;
1264 nih_local struct cgm_keys *k = NULL;
1265 struct file_info *f = (struct file_info *)fi->fh;
1266
1267 if (f->type != LXC_TYPE_CGFILE) {
1268 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1269 return -EIO;
1270 }
1271
1272 if (offset)
1273 return 0;
1274
1275 if (!fc)
1276 return -EIO;
1277
1278 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1279 localbuf[size] = '\0';
1280 memcpy(localbuf, buf, size);
1281
1282 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1283 bool r;
1284
1285 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
1286 return -EACCES;
1287
1288 if (strcmp(f->file, "tasks") == 0 ||
1289 strcmp(f->file, "/tasks") == 0 ||
1290 strcmp(f->file, "/cgroup.procs") == 0 ||
1291 strcmp(f->file, "cgroup.procs") == 0)
1292 // special case - we have to translate the pids
1293 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1294 else
1295 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1296
1297 if (!r)
1298 return -EINVAL;
1299
1300 return size;
1301 }
1302
1303 return -EINVAL;
1304 }
1305
1306 int cg_chown(const char *path, uid_t uid, gid_t gid)
1307 {
1308 struct fuse_context *fc = fuse_get_context();
1309 nih_local char * cgdir = NULL;
1310 char *fpath = NULL, *path1, *path2;
1311 nih_local struct cgm_keys *k = NULL;
1312 const char *cgroup;
1313 nih_local char *controller = NULL;
1314
1315
1316 if (!fc)
1317 return -EIO;
1318
1319 if (strcmp(path, "/cgroup") == 0)
1320 return -EINVAL;
1321
1322 controller = pick_controller_from_path(fc, path);
1323 if (!controller)
1324 return -EINVAL;
1325 cgroup = find_cgroup_in_path(path);
1326 if (!cgroup)
1327 /* this is just /cgroup/controller */
1328 return -EINVAL;
1329
1330 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1331
1332 if (!fpath) {
1333 path1 = "/";
1334 path2 = cgdir;
1335 } else {
1336 path1 = cgdir;
1337 path2 = fpath;
1338 }
1339
1340 if (is_child_cgroup(controller, path1, path2)) {
1341 // get uid, gid, from '/tasks' file and make up a mode
1342 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1343 k = get_cgroup_key(controller, cgroup, "tasks");
1344
1345 } else
1346 k = get_cgroup_key(controller, path1, path2);
1347
1348 if (!k)
1349 return -EINVAL;
1350
1351 /*
1352 * This being a fuse request, the uid and gid must be valid
1353 * in the caller's namespace. So we can just check to make
1354 * sure that the caller is root in his uid, and privileged
1355 * over the file's current owner.
1356 */
1357 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1358 return -EACCES;
1359
1360 if (!cgm_chown_file(controller, cgroup, uid, gid))
1361 return -EINVAL;
1362 return 0;
1363 }
1364
1365 int cg_chmod(const char *path, mode_t mode)
1366 {
1367 struct fuse_context *fc = fuse_get_context();
1368 nih_local char * cgdir = NULL;
1369 char *fpath = NULL, *path1, *path2;
1370 nih_local struct cgm_keys *k = NULL;
1371 const char *cgroup;
1372 nih_local char *controller = NULL;
1373
1374 if (!fc)
1375 return -EIO;
1376
1377 if (strcmp(path, "/cgroup") == 0)
1378 return -EINVAL;
1379
1380 controller = pick_controller_from_path(fc, path);
1381 if (!controller)
1382 return -EINVAL;
1383 cgroup = find_cgroup_in_path(path);
1384 if (!cgroup)
1385 /* this is just /cgroup/controller */
1386 return -EINVAL;
1387
1388 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1389
1390 if (!fpath) {
1391 path1 = "/";
1392 path2 = cgdir;
1393 } else {
1394 path1 = cgdir;
1395 path2 = fpath;
1396 }
1397
1398 if (is_child_cgroup(controller, path1, path2)) {
1399 // get uid, gid, from '/tasks' file and make up a mode
1400 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1401 k = get_cgroup_key(controller, cgroup, "tasks");
1402
1403 } else
1404 k = get_cgroup_key(controller, path1, path2);
1405
1406 if (!k)
1407 return -EINVAL;
1408
1409 /*
1410 * This being a fuse request, the uid and gid must be valid
1411 * in the caller's namespace. So we can just check to make
1412 * sure that the caller is root in his uid, and privileged
1413 * over the file's current owner.
1414 */
1415 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1416 return -EPERM;
1417
1418 if (!cgm_chmod_file(controller, cgroup, mode))
1419 return -EINVAL;
1420 return 0;
1421 }
1422
1423 int cg_mkdir(const char *path, mode_t mode)
1424 {
1425 struct fuse_context *fc = fuse_get_context();
1426 nih_local struct cgm_keys **list = NULL;
1427 char *fpath = NULL, *path1;
1428 nih_local char * cgdir = NULL;
1429 const char *cgroup;
1430 nih_local char *controller = NULL;
1431
1432 if (!fc)
1433 return -EIO;
1434
1435
1436 controller = pick_controller_from_path(fc, path);
1437 if (!controller)
1438 return -EINVAL;
1439
1440 cgroup = find_cgroup_in_path(path);
1441 if (!cgroup)
1442 return -EINVAL;
1443
1444 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1445 if (!fpath)
1446 path1 = "/";
1447 else
1448 path1 = cgdir;
1449
1450 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1451 return -EACCES;
1452
1453
1454 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1455 return -EINVAL;
1456
1457 return 0;
1458 }
1459
1460 static int cg_rmdir(const char *path)
1461 {
1462 struct fuse_context *fc = fuse_get_context();
1463 nih_local struct cgm_keys **list = NULL;
1464 char *fpath = NULL;
1465 nih_local char * cgdir = NULL;
1466 const char *cgroup;
1467 nih_local char *controller = NULL;
1468
1469 if (!fc)
1470 return -EIO;
1471
1472
1473 controller = pick_controller_from_path(fc, path);
1474 if (!controller)
1475 return -EINVAL;
1476
1477 cgroup = find_cgroup_in_path(path);
1478 if (!cgroup)
1479 return -EINVAL;
1480
1481 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1482 if (!fpath)
1483 return -EINVAL;
1484
1485 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1486 return -EACCES;
1487
1488 if (!cgm_remove(controller, cgroup))
1489 return -EINVAL;
1490
1491 return 0;
1492 }
1493
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	const size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
1500
/*
 * Scan @memstat line by line for the first line starting with
 * "total_cache" and store the number that follows it, divided by 1024,
 * in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur;
	char *nl;

	*v = 0;
	for (cur = memstat; *cur != '\0'; cur = nl + 1) {
		if (startswith(cur, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1518
/*
 * Look up one value in a blkio statistics string.
 *
 * @str is scanned line by line for the first line that starts with
 * "<major>:<minor> <iotype>"; the number following that prefix is stored
 * in *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1541
1542 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1543 {
1544 nih_local char *fnam = NULL;
1545 FILE *f;
1546 char *answer = NULL;
1547 char *line = NULL;
1548 size_t len = 0;
1549
1550 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1551 if (!(f = fopen(fnam, "r")))
1552 return false;
1553
1554 while (getline(&line, &len, f) != -1) {
1555 char *c1, *c2;
1556 if (!line[0])
1557 continue;
1558 c1 = strchr(line, ':');
1559 if (!c1)
1560 goto out;
1561 c1++;
1562 c2 = strchr(c1, ':');
1563 if (!c2)
1564 goto out;
1565 *c2 = '\0';
1566 if (strcmp(c1, contrl) != 0)
1567 continue;
1568 c2++;
1569 stripnewline(c2);
1570 answer = NIH_MUST( nih_strdup(NULL, c2) );
1571 goto out;
1572 }
1573
1574 out:
1575 fclose(f);
1576 free(line);
1577 return answer;
1578 }
1579
1580 /*
1581 * FUSE ops for /proc
1582 */
1583
1584 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1585 struct fuse_file_info *fi)
1586 {
1587 struct fuse_context *fc = fuse_get_context();
1588 struct file_info *d = (struct file_info *)fi->fh;
1589 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1590 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1591 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1592 char *line = NULL;
1593 size_t linelen = 0, total_len = 0;
1594 char *cache = d->buf;
1595 size_t cache_size = d->buflen;
1596 FILE *f;
1597
1598 if (offset){
1599 if (offset > d->size)
1600 return -EINVAL;
1601 int left = d->size - offset;
1602 total_len = left > size ? size: left;
1603 memcpy(buf, cache + offset, total_len);
1604 return total_len;
1605 }
1606
1607 if (!cg)
1608 return 0;
1609
1610 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1611 return 0;
1612 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1613 return 0;
1614 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1615 return 0;
1616 memlimit = strtoul(memlimit_str, NULL, 10);
1617 memusage = strtoul(memusage_str, NULL, 10);
1618 memlimit /= 1024;
1619 memusage /= 1024;
1620 get_mem_cached(memstat_str, &cached);
1621
1622 f = fopen("/proc/meminfo", "r");
1623 if (!f)
1624 return 0;
1625
1626 while (getline(&line, &linelen, f) != -1) {
1627 size_t l;
1628 char *printme, lbuf[100];
1629
1630 memset(lbuf, 0, 100);
1631 if (startswith(line, "MemTotal:")) {
1632 sscanf(line+14, "%lu", &hosttotal);
1633 if (hosttotal < memlimit)
1634 memlimit = hosttotal;
1635 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1636 printme = lbuf;
1637 } else if (startswith(line, "MemFree:")) {
1638 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1639 printme = lbuf;
1640 } else if (startswith(line, "MemAvailable:")) {
1641 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1642 printme = lbuf;
1643 } else if (startswith(line, "Buffers:")) {
1644 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1645 printme = lbuf;
1646 } else if (startswith(line, "Cached:")) {
1647 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1648 printme = lbuf;
1649 } else if (startswith(line, "SwapCached:")) {
1650 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1651 printme = lbuf;
1652 } else
1653 printme = line;
1654
1655 l = snprintf(cache, cache_size, "%s", printme);
1656 cache += l;
1657 cache_size -= l;
1658 total_len += l;
1659 }
1660
1661 d->size = total_len;
1662 if (total_len > size ) total_len = size;
1663 memcpy(buf, d->buf, total_len);
1664
1665 fclose(f);
1666 free(line);
1667 return total_len;
1668 }
1669
/*
 * Fetch cpuset.cpus for cgroup @cg.
 * On success the nih-allocated string is returned; NULL otherwise.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	if (cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus))
		return cpus;

	return NULL;
}
1682
/*
 * Helper for cpu_in_cpuset: advance to the next comma-separated token.
 *
 * Returns a pointer to the character following the first ',' found
 * after the first character of @c, or NULL when there is no further
 * token.  A NULL or empty @c yields NULL (the search starts at c+1, so
 * an empty string must not be scanned).
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	if (!c || !*c)
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}
1693
/*
 * Parse the token at @c as "A-B" or "A".
 * Returns sscanf()'s result: 2 when both *a and *b were filled in, 1
 * when only *a was, and 0 or EOF when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1701
/*
 * Return true when @cpu is a member of @cpuset.
 *
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited list of single cpus and inclusive ranges.
 *
 * Every token is examined: a single-cpu token that does not match no
 * longer ends the search, so cpu 2 is found in "1,2-3,4".  A token that
 * cannot be parsed at all marks the cpuset as bad and yields false.
 * A NULL or empty @cpuset contains no cpus.
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c = cpuset;

	while (c && *c) {
		int a, b, ret;
		const char *comma;

		ret = sscanf(c, "%d-%d", &a, &b);
		if (ret == 1) {
			/* single cpu, e.g. "4" */
			if (cpu == a)
				return true;
		} else if (ret == 2) {
			/* inclusive range, e.g. "2-3" */
			if (cpu >= a && cpu <= b)
				return true;
		} else {
			/* bad cpuset! */
			return false;
		}

		comma = strchr(c, ',');
		c = comma ? comma + 1 : NULL;
	}

	return false;
}
1724
/*
 * Given a "processor : N" line, report whether cpu N is part of @cpuset.
 * Lines that do not parse as a processor line yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	const int parsed = sscanf(line, "processor : %d", &cpu);

	return parsed == 1 ? cpu_in_cpuset(cpu, cpuset) : false;
}
1733
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1745
1746 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1747 struct fuse_file_info *fi)
1748 {
1749 struct fuse_context *fc = fuse_get_context();
1750 struct file_info *d = (struct file_info *)fi->fh;
1751 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1752 nih_local char *cpuset = NULL;
1753 char *line = NULL;
1754 size_t linelen = 0, total_len = 0;
1755 bool am_printing = false;
1756 int curcpu = -1;
1757 char *cache = d->buf;
1758 size_t cache_size = d->buflen;
1759 FILE *f;
1760
1761 if (offset){
1762 if (offset > d->size)
1763 return -EINVAL;
1764 int left = d->size - offset;
1765 total_len = left > size ? size: left;
1766 memcpy(buf, cache + offset, total_len);
1767 return total_len;
1768 }
1769
1770 if (!cg)
1771 return 0;
1772
1773 cpuset = get_cpuset(cg);
1774 if (!cpuset)
1775 return 0;
1776
1777 f = fopen("/proc/cpuinfo", "r");
1778 if (!f)
1779 return 0;
1780
1781 while (getline(&line, &linelen, f) != -1) {
1782 size_t l;
1783 if (is_processor_line(line)) {
1784 am_printing = cpuline_in_cpuset(line, cpuset);
1785 if (am_printing) {
1786 curcpu ++;
1787 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1788 if (l < cache_size){
1789 cache += l;
1790 cache_size -= l;
1791 total_len += l;
1792 }else{
1793 cache += cache_size;
1794 total_len += cache_size;
1795 cache_size = 0;
1796 break;
1797 }
1798 }
1799 continue;
1800 }
1801 if (am_printing) {
1802 l = snprintf(cache, cache_size, "%s", line);
1803 if (l < cache_size) {
1804 cache += l;
1805 cache_size -= l;
1806 total_len += l;
1807 } else {
1808 cache += cache_size;
1809 total_len += cache_size;
1810 cache_size = 0;
1811 break;
1812 }
1813 }
1814 }
1815
1816 d->size = total_len;
1817 if (total_len > size ) total_len = size;
1818
1819 /* read from off 0 */
1820 memcpy(buf, d->buf, total_len);
1821
1822 fclose(f);
1823 free(line);
1824 return total_len;
1825 }
1826
1827 static int proc_stat_read(char *buf, size_t size, off_t offset,
1828 struct fuse_file_info *fi)
1829 {
1830 struct fuse_context *fc = fuse_get_context();
1831 struct file_info *d = (struct file_info *)fi->fh;
1832 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1833 nih_local char *cpuset = NULL;
1834 char *line = NULL;
1835 size_t linelen = 0, total_len = 0;
1836 int curcpu = -1; /* cpu numbering starts at 0 */
1837 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1838 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1839 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1840 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1841 char cpuall[CPUALL_MAX_SIZE];
1842 /* reserve for cpu all */
1843 char *cache = d->buf + CPUALL_MAX_SIZE;
1844 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
1845 FILE *f;
1846
1847 if (offset){
1848 if (offset > d->size)
1849 return -EINVAL;
1850 int left = d->size - offset;
1851 total_len = left > size ? size: left;
1852 memcpy(buf, d->buf + offset, total_len);
1853 return total_len;
1854 }
1855
1856 if (!cg)
1857 return 0;
1858
1859 cpuset = get_cpuset(cg);
1860 if (!cpuset)
1861 return 0;
1862
1863 f = fopen("/proc/stat", "r");
1864 if (!f)
1865 return 0;
1866
1867 //skip first line
1868 if (getline(&line, &linelen, f) < 0) {
1869 fprintf(stderr, "proc_stat_read read first line failed\n");
1870 goto out;
1871 }
1872
1873 while (getline(&line, &linelen, f) != -1) {
1874 size_t l;
1875 int cpu;
1876 char cpu_char[10]; /* That's a lot of cores */
1877 char *c;
1878
1879 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1880 /* not a ^cpuN line containing a number N, just print it */
1881 l = snprintf(cache, cache_size, "%s", line);
1882 if (l < cache_size){
1883 cache += l;
1884 cache_size -= l;
1885 total_len += l;
1886 continue;
1887 }else{
1888 //no more space, break it
1889 cache += cache_size;
1890 total_len += cache_size;
1891 cache_size = 0;
1892 break;
1893 }
1894 }
1895
1896 if (sscanf(cpu_char, "%d", &cpu) != 1)
1897 continue;
1898 if (!cpu_in_cpuset(cpu, cpuset))
1899 continue;
1900 curcpu ++;
1901
1902 c = strchr(line, ' ');
1903 if (!c)
1904 continue;
1905 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
1906 cache += l;
1907 cache_size -= l;
1908 total_len += l;
1909
1910 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1911 &softirq, &steal, &guest) != 9)
1912 continue;
1913 user_sum += user;
1914 nice_sum += nice;
1915 system_sum += system;
1916 idle_sum += idle;
1917 iowait_sum += iowait;
1918 irq_sum += irq;
1919 softirq_sum += softirq;
1920 steal_sum += steal;
1921 guest_sum += guest;
1922 }
1923
1924 cache = d->buf;
1925
1926 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1927 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1928 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1929 memcpy(cache, cpuall, cpuall_len);
1930 cache += cpuall_len;
1931 }else{
1932 /* shouldn't happen */
1933 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1934 cpuall_len = 0;
1935 }
1936
1937 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1938 total_len += cpuall_len;
1939 d->size = total_len;
1940 if (total_len > size ) total_len = size;
1941
1942 memcpy(buf, d->buf, total_len);
1943 out:
1944 fclose(f);
1945 free(line);
1946 return total_len;
1947 }
1948
1949 /*
1950 * How to guess what to present for uptime?
1951 * One thing we could do would be to take the date on the caller's
1952 * memory.usage_in_bytes file, which should equal the time of creation
1953 * of his cgroup. However, a task could be in a sub-cgroup of the
1954 * container. The same problem exists if we try to look at the ages
1955 * of processes in the caller's cgroup.
1956 *
1957 * So we'll fork a task that will enter the caller's pidns, mount a
1958 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1959 *
1960 * For the second uptime #, we'll do as Stéphane had done, just copy
1961 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1962 * time. Maybe someone can come up with a good algorithm and submit a
1963 * patch. Maybe something based on cpushare info?
1964 */
1965
1966 /* return age of the reaper for $pid, taken from ctime of its procdir */
1967 static long int get_pid1_time(pid_t pid)
1968 {
1969 char fnam[100];
1970 int fd, cpipe[2], ret;
1971 struct stat sb;
1972 pid_t cpid;
1973 struct timeval tv;
1974 fd_set s;
1975 char v;
1976
1977 if (unshare(CLONE_NEWNS))
1978 return 0;
1979
1980 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1981 perror("rslave mount failed");
1982 return 0;
1983 }
1984
1985 sprintf(fnam, "/proc/%d/ns/pid", pid);
1986 fd = open(fnam, O_RDONLY);
1987 if (fd < 0) {
1988 perror("get_pid1_time open of ns/pid");
1989 return 0;
1990 }
1991 if (setns(fd, 0)) {
1992 perror("get_pid1_time setns 1");
1993 close(fd);
1994 return 0;
1995 }
1996 close(fd);
1997
1998 if (pipe(cpipe) < 0)
1999 exit(1);
2000
2001 loop:
2002 cpid = fork();
2003 if (cpid < 0)
2004 return 0;
2005
2006 if (!cpid) {
2007 char b = '1';
2008 close(cpipe[0]);
2009 if (write(cpipe[1], &b, sizeof(char)) < 0) {
2010 fprintf(stderr, "%s (child): erorr on write: %s\n",
2011 __func__, strerror(errno));
2012 }
2013 close(cpipe[1]);
2014 umount2("/proc", MNT_DETACH);
2015 if (mount("proc", "/proc", "proc", 0, NULL)) {
2016 perror("get_pid1_time mount");
2017 return 0;
2018 }
2019 ret = lstat("/proc/1", &sb);
2020 if (ret) {
2021 perror("get_pid1_time lstat");
2022 return 0;
2023 }
2024 return time(NULL) - sb.st_ctime;
2025 }
2026
2027 // give the child 1 second to be done forking and
2028 // write it's ack
2029 FD_ZERO(&s);
2030 FD_SET(cpipe[0], &s);
2031 tv.tv_sec = 1;
2032 tv.tv_usec = 0;
2033 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2034 if (ret <= 0)
2035 goto again;
2036 ret = read(cpipe[0], &v, 1);
2037 if (ret != sizeof(char) || v != '1') {
2038 goto again;
2039 }
2040
2041 wait_for_pid(cpid);
2042 exit(0);
2043
2044 again:
2045 kill(cpid, SIGKILL);
2046 wait_for_pid(cpid);
2047 goto loop;
2048 }
2049
/*
 * Return the value computed by get_pid1_time() for @qpid.
 *
 * get_pid1_time() changes namespaces and may exit(), so it is run in a
 * forked child which writes the result to a pipe.  The parent waits up
 * to one second for the result to become readable.
 *
 * Returns 0 when the pipe or fork cannot be set up, on select()
 * failure or timeout, or when the result cannot be read in full.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child: don't select() for a second or wait on pid -1 */
		perror("fork");
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* select() returns 0 on timeout; errno is not meaningful */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2095
/*
 * Return the whole-second part of the second field of /proc/uptime.
 *
 * Both fields of /proc/uptime carry a fractional part ("123.45 678.90"),
 * so each integer conversion is followed by a '.' and a skipped run of
 * digits; a plain "%ld %ld" stops at the first '.' and never reaches the
 * second field.
 *
 * Returns 0 when the file cannot be opened or parsed.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	long int age, idle;
	int ret;

	if (!f)
		return 0;
	/* suppressed conversions (%*d) are not counted in the result */
	ret = fscanf(f, "%ld.%*d %ld", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return idle;
}
2109
2110 /*
2111 * We read /proc/uptime and reuse its second field.
2112 * For the first field, we use the mtime for the reaper for
2113 * the calling pid as returned by getreaperage
2114 */
2115 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2116 struct fuse_file_info *fi)
2117 {
2118 struct fuse_context *fc = fuse_get_context();
2119 struct file_info *d = (struct file_info *)fi->fh;
2120 long int reaperage = getreaperage(fc->pid);;
2121 long int idletime = getprocidle();
2122 size_t total_len = 0;
2123
2124 if (offset){
2125 if (offset > d->size)
2126 return -EINVAL;
2127 return 0;
2128 }
2129
2130 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2131 d->size = total_len;
2132 return total_len;
2133 }
2134
2135 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2136 struct fuse_file_info *fi)
2137 {
2138 char dev_name[72];
2139 struct fuse_context *fc = fuse_get_context();
2140 struct file_info *d = (struct file_info *)fi->fh;
2141 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2142 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2143 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2144 unsigned long read = 0, write = 0;
2145 unsigned long read_merged = 0, write_merged = 0;
2146 unsigned long read_sectors = 0, write_sectors = 0;
2147 unsigned long read_ticks = 0, write_ticks = 0;
2148 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2149 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2150 char *line = NULL;
2151 size_t linelen = 0, total_len = 0;
2152 unsigned int major = 0, minor = 0;
2153 int i = 0;
2154 FILE *f;
2155
2156 if (offset){
2157 if (offset > d->size)
2158 return -EINVAL;
2159 return 0;
2160 }
2161
2162 if (!cg)
2163 return 0;
2164
2165 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2166 return 0;
2167 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2168 return 0;
2169 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2170 return 0;
2171 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2172 return 0;
2173 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2174 return 0;
2175
2176
2177 f = fopen("/proc/diskstats", "r");
2178 if (!f)
2179 return 0;
2180
2181 while (getline(&line, &linelen, f) != -1) {
2182 size_t l;
2183 char *printme, lbuf[256];
2184
2185 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2186 if(i == 3){
2187 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2188 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2189 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2190 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2191 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2192 read_sectors = read_sectors/512;
2193 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2194 write_sectors = write_sectors/512;
2195
2196 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2197 rd_svctm = rd_svctm/1000000;
2198 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2199 rd_wait = rd_wait/1000000;
2200 read_ticks = rd_svctm + rd_wait;
2201
2202 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2203 wr_svctm = wr_svctm/1000000;
2204 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2205 wr_wait = wr_wait/1000000;
2206 write_ticks = wr_svctm + wr_wait;
2207
2208 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2209 tot_ticks = tot_ticks/1000000;
2210 }else{
2211 continue;
2212 }
2213
2214 memset(lbuf, 0, 256);
2215 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2216 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2217 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2218 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2219 printme = lbuf;
2220 } else
2221 continue;
2222
2223 l = snprintf(buf, size, "%s", printme);
2224 buf += l;
2225 size -= l;
2226 total_len += l;
2227 }
2228
2229 d->size = total_len;
2230
2231 fclose(f);
2232 free(line);
2233 return total_len;
2234 }
2235
/*
 * Return the number of bytes in file @which, measured by reading it
 * line by line and adding up the line lengths.
 * Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *f;
	char *line = NULL;
	size_t cap = 0;
	ssize_t nread;
	ssize_t total = 0;

	f = fopen(which, "r");
	if (f == NULL)
		return 0;

	for (;;) {
		nread = getline(&line, &cap, f);
		if (nread == -1)
			break;
		total += nread;
	}

	fclose(f);
	free(line);

	return total;
}
2252
2253 static int proc_getattr(const char *path, struct stat *sb)
2254 {
2255 struct timespec now;
2256
2257 memset(sb, 0, sizeof(struct stat));
2258 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2259 return -EINVAL;
2260 sb->st_uid = sb->st_gid = 0;
2261 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2262 if (strcmp(path, "/proc") == 0) {
2263 sb->st_mode = S_IFDIR | 00555;
2264 sb->st_nlink = 2;
2265 return 0;
2266 }
2267 if (strcmp(path, "/proc/meminfo") == 0 ||
2268 strcmp(path, "/proc/cpuinfo") == 0 ||
2269 strcmp(path, "/proc/uptime") == 0 ||
2270 strcmp(path, "/proc/stat") == 0 ||
2271 strcmp(path, "/proc/diskstats") == 0) {
2272 sb->st_size = 0;
2273 sb->st_mode = S_IFREG | 00444;
2274 sb->st_nlink = 1;
2275 return 0;
2276 }
2277
2278 return -ENOENT;
2279 }
2280
2281 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2282 struct fuse_file_info *fi)
2283 {
2284 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2285 filler(buf, "meminfo", NULL, 0) != 0 ||
2286 filler(buf, "stat", NULL, 0) != 0 ||
2287 filler(buf, "uptime", NULL, 0) != 0 ||
2288 filler(buf, "diskstats", NULL, 0) != 0)
2289 return -EINVAL;
2290 return 0;
2291 }
2292
2293 static int proc_open(const char *path, struct fuse_file_info *fi)
2294 {
2295 int type = -1;
2296 struct file_info *info;
2297
2298 if (strcmp(path, "/proc/meminfo") == 0)
2299 type = LXC_TYPE_PROC_MEMINFO;
2300 else if (strcmp(path, "/proc/cpuinfo") == 0)
2301 type = LXC_TYPE_PROC_CPUINFO;
2302 else if (strcmp(path, "/proc/uptime") == 0)
2303 type = LXC_TYPE_PROC_UPTIME;
2304 else if (strcmp(path, "/proc/stat") == 0)
2305 type = LXC_TYPE_PROC_STAT;
2306 else if (strcmp(path, "/proc/diskstats") == 0)
2307 type = LXC_TYPE_PROC_DISKSTATS;
2308 if (type == -1)
2309 return -ENOENT;
2310
2311 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2312 memset(info, 0, sizeof(*info));
2313 info->type = type;
2314
2315 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2316 info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
2317 memset(info->buf, 0, info->buflen);
2318 /* set actual size to buffer size */
2319 info->size = info->buflen;
2320
2321 fi->fh = (unsigned long)info;
2322 return 0;
2323 }
2324
2325 static int proc_release(const char *path, struct fuse_file_info *fi)
2326 {
2327 struct file_info *f = (struct file_info *)fi->fh;
2328
2329 do_release_file_info(f);
2330 return 0;
2331 }
2332
2333 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2334 struct fuse_file_info *fi)
2335 {
2336 struct file_info *f = (struct file_info *) fi->fh;
2337
2338 switch (f->type) {
2339 case LXC_TYPE_PROC_MEMINFO:
2340 return proc_meminfo_read(buf, size, offset, fi);
2341 case LXC_TYPE_PROC_CPUINFO:
2342 return proc_cpuinfo_read(buf, size, offset, fi);
2343 case LXC_TYPE_PROC_UPTIME:
2344 return proc_uptime_read(buf, size, offset, fi);
2345 case LXC_TYPE_PROC_STAT:
2346 return proc_stat_read(buf, size, offset, fi);
2347 case LXC_TYPE_PROC_DISKSTATS:
2348 return proc_diskstats_read(buf, size, offset, fi);
2349 default:
2350 return -EINVAL;
2351 }
2352 }
2353
2354 /*
2355 * FUSE ops for /
2356 * these just delegate to the /proc and /cgroup ops as
2357 * needed
2358 */
2359
2360 static int lxcfs_getattr(const char *path, struct stat *sb)
2361 {
2362 if (strcmp(path, "/") == 0) {
2363 sb->st_mode = S_IFDIR | 00755;
2364 sb->st_nlink = 2;
2365 return 0;
2366 }
2367 if (strncmp(path, "/cgroup", 7) == 0) {
2368 return cg_getattr(path, sb);
2369 }
2370 if (strncmp(path, "/proc", 5) == 0) {
2371 return proc_getattr(path, sb);
2372 }
2373 return -EINVAL;
2374 }
2375
/*
 * opendir: "/" and "/proc" need no state; paths starting with /cgroup
 * are delegated to cg_opendir().  Returns -ENOENT for anything else.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	const bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
2388
2389 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2390 struct fuse_file_info *fi)
2391 {
2392 if (strcmp(path, "/") == 0) {
2393 if (filler(buf, "proc", NULL, 0) != 0 ||
2394 filler(buf, "cgroup", NULL, 0) != 0)
2395 return -EINVAL;
2396 return 0;
2397 }
2398 if (strncmp(path, "/cgroup", 7) == 0)
2399 return cg_readdir(path, buf, filler, offset, fi);
2400 if (strcmp(path, "/proc") == 0)
2401 return proc_readdir(path, buf, filler, offset, fi);
2402 return -EINVAL;
2403 }
2404
/*
 * releasedir: nothing to do for "/" and "/proc"; /cgroup paths are
 * delegated to cg_releasedir().  Returns -EINVAL for anything else.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	const bool is_root = strcmp(path, "/") == 0;

	if (is_root)
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
2416
/*
 * open: delegate to the /cgroup or /proc handler by path prefix.
 * Returns -EINVAL for any other path.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_open(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_open(path, fi);

	return -EINVAL;
}
2426
/*
 * read: delegate to the /cgroup or /proc handler by path prefix.
 * Returns -EINVAL for any other path.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2437
/*
 * write: only paths starting with /cgroup are writable; they are
 * delegated to cg_write().  Returns -EINVAL for any other path.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2447
/* flush: nothing is buffered here, so this always succeeds. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2452
/*
 * release: delegate to the /cgroup or /proc handler by path prefix.
 * Returns -EINVAL for any other path.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;

	if (under_cgroup)
		return cg_release(path, fi);

	if (!strncmp(path, "/proc", 5))
		return proc_release(path, fi);

	return -EINVAL;
}
2462
/* fsync: there is nothing to synchronise, so this always succeeds. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2467
/*
 * mkdir: only supported for paths starting with /cgroup, where it is
 * delegated to cg_mkdir().  Returns -EINVAL for any other path.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2475
/*
 * chown: only supported for paths starting with /cgroup, where it is
 * delegated to cg_chown().  Returns -EINVAL for any other path.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2483
/*
 * FUSE truncate handler.
 *
 * A truncate may be issued before ops->write is called (the original
 * note here cites cat as doing so). Truncation has no useful meaning
 * for cgroup files, so for any path starting with "/cgroup" the request
 * is accepted and ignored: 0 is returned and nothing is changed.
 * Paths outside the cgroup tree are rejected with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;	/* the requested size is never looked at */

	return strncmp(path, "/cgroup", 7) ? -EINVAL : 0;
}
2495
/*
 * FUSE rmdir handler.
 *
 * Requests for paths starting with "/cgroup" are forwarded to
 * cg_rmdir(); all others fail with -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_rmdir(path) : -EINVAL;
}
2502
/*
 * FUSE chmod handler.
 *
 * Mode changes are only supported for paths starting with "/cgroup",
 * where they are delegated to cg_chmod(). Other paths get -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2509
2510 const struct fuse_operations lxcfs_ops = {
2511 .getattr = lxcfs_getattr,
2512 .readlink = NULL,
2513 .getdir = NULL,
2514 .mknod = NULL,
2515 .mkdir = lxcfs_mkdir,
2516 .unlink = NULL,
2517 .rmdir = lxcfs_rmdir,
2518 .symlink = NULL,
2519 .rename = NULL,
2520 .link = NULL,
2521 .chmod = lxcfs_chmod,
2522 .chown = lxcfs_chown,
2523 .truncate = lxcfs_truncate,
2524 .utime = NULL,
2525
2526 .open = lxcfs_open,
2527 .read = lxcfs_read,
2528 .release = lxcfs_release,
2529 .write = lxcfs_write,
2530
2531 .statfs = NULL,
2532 .flush = lxcfs_flush,
2533 .fsync = lxcfs_fsync,
2534
2535 .setxattr = NULL,
2536 .getxattr = NULL,
2537 .listxattr = NULL,
2538 .removexattr = NULL,
2539
2540 .opendir = lxcfs_opendir,
2541 .readdir = lxcfs_readdir,
2542 .releasedir = lxcfs_releasedir,
2543
2544 .fsyncdir = NULL,
2545 .init = NULL,
2546 .destroy = NULL,
2547 .access = NULL,
2548 .create = NULL,
2549 .ftruncate = NULL,
2550 .fgetattr = NULL,
2551 };
2552
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1. Does not return.
 *
 * me: program name shown in the synopsis lines.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);

	exit(1);
}
2561
/*
 * Report whether a command-line word is a request for help.
 *
 * w: NUL-terminated word to examine (must not be NULL).
 *
 * Returns true when w is exactly one of "-h", "--help", "-help" or
 * "help", false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(help_words) / sizeof(help_words[0]); idx++) {
		if (strcmp(w, help_words[idx]) == 0)
			return true;
	}

	return false;
}
2571
/*
 * Remove the first occurrence of a bare argument from an argv vector.
 *
 * argcp: pointer to the argument count; decremented by one on removal.
 * argv:  NULL-terminated argument vector; searched from argv[1] onwards
 *        and modified in place.
 * which: the exact argument string to remove.
 *
 * When a match is found, all following entries (including the NULL
 * terminator) are shifted down by one slot. If there is no match,
 * neither argv nor *argcp is touched.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* locate the first entry equal to 'which', if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;

	if (!*slot)
		return;

	/* close the gap; the final copy moves the NULL terminator down */
	do {
		slot[0] = slot[1];
		slot++;
	} while (*slot);

	(*argcp)--;
}
2586
/*
 * Remove the first "<opt> <v>" pair from an argv vector.
 *
 * argcp: pointer to the argument count; decremented by two on removal.
 * argv:  NULL-terminated argument vector; searched from argv[1] onwards
 *        and modified in place.
 * opt:   option flag to look for (e.g. "-o").
 * v:     the only value accepted after that flag.
 *
 * If opt is found and is followed by exactly v, both entries are removed
 * and the remaining entries (including the NULL terminator) are shifted
 * down by two slots. If opt is found but followed by a different value,
 * the offending value is reported on stderr and the process exits with
 * status 1. An opt that is the last element of argv (no value after it)
 * is left alone. If opt is not found nothing is changed.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	/* an option needs a value after it, so stop one short of the end */
	for (i = 1; argv[i] && argv[i+1]; i++) {
		if (strcmp(argv[i], opt) != 0)
			continue;

		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually supplied, not what we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n",
				argv[i+1]);
			exit(1);
		}

		/*
		 * Shift everything after the pair down by two.  The loop
		 * condition guarantees argv[i+2] is at worst the terminator.
		 */
		for (; argv[i+1]; i++)
			argv[i] = argv[i+2];

		(*argcp) -= 2;
		return;
	}
}
2607
2608 int main(int argc, char *argv[])
2609 {
2610 int ret;
2611 struct lxcfs_state *d;
2612 /*
2613 * what we pass to fuse_main is:
2614 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2615 */
2616 #define NARGS 7
2617 char *newargv[7];
2618
2619 /* accomodate older init scripts */
2620 swallow_arg(&argc, argv, "-s");
2621 swallow_arg(&argc, argv, "-f");
2622 swallow_option(&argc, argv, "-o", "allow_other");
2623
2624 if (argc != 2 || is_help(argv[1]))
2625 usage(argv[0]);
2626
2627 d = NIH_MUST( malloc(sizeof(*d)) );
2628
2629 newargv[0] = argv[0];
2630 newargv[1] = "-s";
2631 newargv[2] = "-f";
2632 newargv[3] = "-o";
2633 newargv[4] = "allow_other,direct_io";
2634 newargv[5] = argv[1];
2635 newargv[6] = NULL;
2636
2637 if (!cgm_escape_cgroup())
2638 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2639
2640 if (!cgm_get_controllers(&d->subsystems))
2641 return -1;
2642
2643 ret = fuse_main(NARGS - 1, newargv, &lxcfs_ops, d);
2644
2645 return ret;
2646 }