]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
cg_read: add ending newline
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <linux/sched.h>
24 #include <sys/socket.h>
25 #include <sys/mount.h>
26 #include <wait.h>
27
28 #include <nih/alloc.h>
29 #include <nih/string.h>
30
31 #include "cgmanager.h"
32
/*
 * State shared by the FUSE callbacks.  It is reached through the
 * private_data pointer of the current fuse context (see LXCFS_DATA).
 */
struct lxcfs_state {
	/*
	 * NULL-terminated, nih-allocated array naming the mounted
	 * subsystems.  It is detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the fuse context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
41
/*
 * Kinds of object a struct file_info handle may describe; the value is
 * stored in file_info.type.
 */
enum {
	LXC_TYPE_CGDIR          = 0,
	LXC_TYPE_CGFILE         = 1,
	LXC_TYPE_PROC_MEMINFO   = 2,
	LXC_TYPE_PROC_CPUINFO   = 3,
	LXC_TYPE_PROC_UPTIME    = 4,
	LXC_TYPE_PROC_STAT      = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
};
51
/*
 * Per-open-handle bookkeeping.  The open/opendir callbacks allocate one
 * of these and store its address in fuse_file_info.fh; the matching
 * release callback frees it.
 */
struct file_info {
	char *controller;	/* cgroup controller name, may be NULL */
	char *cgroup;		/* cgroup path, may be NULL */
	char *file;		/* key (file) name inside the cgroup, may be NULL */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;
	int size;		/* actual data size */
};
61
/* Number of bytes of buffer space reserved for the cpuall line of /proc/stat. */
#define BUF_RESERVE_SIZE 256
64
/*
 * Duplicate str as an nih-allocated string whose nih parent is parent.
 * A NULL str is passed through as NULL.  The copy is wrapped in
 * NIH_MUST(), so callers do not check for allocation failure.
 */
static char *must_copy_string(void *parent, const char *str)
{
	char *copy = NULL;

	if (str)
		copy = NIH_MUST( nih_strdup(parent, str) );

	return copy;
}
71
/*
 * Reap the child pid, retrying when waitpid() is interrupted by a signal
 * or reports some other pid.
 *
 * Returns 0 when the child exited normally with status 0, and -1 when
 * waitpid() failed or the child terminated in any other way.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
93
/*
 * Given an open FILE * on /proc/pid/{u,g}id_map and an id that is valid
 * in the caller's namespace, return that id mapped into pid's namespace.
 *
 * The stream is rewound first, so the same FILE * can be queried
 * repeatedly.  Lines that do not parse as three unsigned numbers are
 * skipped.
 *
 * Returns the mapped id, or -1 on error.  The return type is unsigned,
 * so callers see the error value as (unsigned int)-1.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char entry[400];

	fseek(idfile, 0L, SEEK_SET);

	while (fgets(entry, sizeof(entry), idfile) != NULL) {
		unsigned int ns_base;	/* first id of the range in the idfile's namespace */
		unsigned int host_base;	/* first id of the range in the caller's namespace */
		unsigned int span;	/* number of ids in this range */

		if (sscanf(entry, "%u %u %u\n", &ns_base, &host_base, &span) != 3)
			continue;

		if (host_base + span < host_base || ns_base + span < ns_base) {
			/*
			 * The range wrapped around.  That is unexpected for
			 * a procfile, so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, span, entry);
			return -1;
		}

		/* not inside [host_base, host_base + span): try the next range */
		if (in_id < host_base || in_id - host_base >= span)
			continue;

		/*
		 * The offset is smaller than span, and ns_base + span was
		 * checked above not to wrap, so this sum cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no answer found */
	return -1;
}
137
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether
 * the calling uid is required to be root in his namespace.
 */
#define NS_ROOT_REQD true	/* caller must be root in his namespace */
#define NS_ROOT_OPT false	/* owning the victim uid is enough */
145
146 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
147 {
148 nih_local char *fpath = NULL;
149 bool answer = false;
150 uid_t nsuid;
151
152 if (victim == -1 || uid == -1)
153 return false;
154
155 /*
156 * If the request is one not requiring root in the namespace,
157 * then having the same uid suffices. (i.e. uid 1000 has write
158 * access to files owned by uid 1000
159 */
160 if (!req_ns_root && uid == victim)
161 return true;
162
163 fpath = NIH_MUST( nih_sprintf(NULL, "/proc/%d/uid_map", pid) );
164 FILE *f = fopen(fpath, "r");
165 if (!f)
166 return false;
167
168 /* if caller's not root in his namespace, reject */
169 nsuid = convert_id_to_ns(f, uid);
170 if (nsuid)
171 goto out;
172
173 /*
174 * If victim is not mapped into caller's ns, reject.
175 * XXX I'm not sure this check is needed given that fuse
176 * will be sending requests where the vfs has converted
177 */
178 nsuid = convert_id_to_ns(f, victim);
179 if (nsuid == -1)
180 goto out;
181
182 answer = true;
183
184 out:
185 fclose(f);
186 return answer;
187 }
188
/*
 * Check whether the low three permission bits (the "other" triplet) of
 * fmode grant the access mode requested by req_mode's O_ACCMODE bits.
 * Callers shift the owner or group triplet down before calling.
 *
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * and O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
208
/*
 * Return the first path component of taskcg that lies below querycg, as
 * a freshly nih-allocated string.
 *
 * taskcg must be strictly longer than querycg; otherwise a diagnostic is
 * printed and NULL is returned.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* for the root cgroup only the leading '/' has to be skipped */
	if (strcmp(querycg, "/") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = NIH_MUST( nih_strdup(NULL, taskcg + skip) );

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
227
228 /*
229 * check whether a fuse context may access a cgroup dir or file
230 *
231 * If file is not null, it is a cgroup file to check under cg.
232 * If file is null, then we are checking perms on cg itself.
233 *
234 * For files we can check the mode of the list_keys result.
235 * For cgroups, we must make assumptions based on the files under the
236 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
237 * yet.
238 */
239 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
240 {
241 nih_local struct cgm_keys **list = NULL;
242 int i;
243
244 if (!file)
245 file = "tasks";
246
247 if (*file == '/')
248 file++;
249
250 if (!cgm_list_keys(contrl, cg, &list))
251 return false;
252 for (i = 0; list[i]; i++) {
253 if (strcmp(list[i]->name, file) == 0) {
254 struct cgm_keys *k = list[i];
255 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
256 if (perms_include(k->mode >> 6, mode))
257 return true;
258 }
259 if (fc->gid == k->gid) {
260 if (perms_include(k->mode >> 3, mode))
261 return true;
262 }
263 return perms_include(k->mode, mode);
264 }
265 }
266
267 return false;
268 }
269
/* Remove one trailing '\n' from x in place, if x ends with one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;

	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
276
277 /*
278 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
279 * If caller is in /a, he may act on /a/b, but not on /b.
280 * if the answer is false and nextcg is not NULL, then *nextcg will point
281 * to a nih_alloc'd string containing the next cgroup directory under cg
282 */
283 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
284 {
285 nih_local char *fnam = NULL;
286 FILE *f;
287 bool answer = false;
288 char *line = NULL;
289 size_t len = 0;
290
291 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
292 if (!(f = fopen(fnam, "r")))
293 return false;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2, *linecmp;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 /*
312 * callers pass in '/' for root cgroup, otherwise they pass
313 * in a cgroup without leading '/'
314 */
315 linecmp = *cg == '/' ? c2 : c2+1;
316 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
317 if (nextcg)
318 *nextcg = get_next_cgroup_dir(linecmp, cg);
319 goto out;
320 }
321 answer = true;
322 goto out;
323 }
324
325 out:
326 fclose(f);
327 free(line);
328 return answer;
329 }
330
331 /*
332 * given /cgroup/freezer/a/b, return "freezer". this will be nih-allocated
333 * and needs to be nih_freed.
334 */
335 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
336 {
337 const char *p1;
338 char *ret, *slash;
339
340 if (strlen(path) < 9)
341 return NULL;
342 p1 = path+8;
343 ret = nih_strdup(NULL, p1);
344 if (!ret)
345 return ret;
346 slash = strstr(ret, "/");
347 if (slash)
348 *slash = '\0';
349
350 /* verify that it is a subsystem */
351 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
352 int i;
353 if (!list) {
354 nih_free(ret);
355 return NULL;
356 }
357 for (i = 0; list[i]; i++) {
358 if (strcmp(list[i], ret) == 0)
359 return ret;
360 }
361 nih_free(ret);
362 return NULL;
363 }
364
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into path, just past the first '/' found at or after
 * offset 8.  Returns NULL when path is shorter than 9 characters or
 * contains no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
380
381 static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
382 {
383 nih_local char **list = NULL;
384 int i;
385
386 if (!f)
387 return false;
388 if (*f == '/')
389 f++;
390
391 if (!cgm_list_children(contr, dir, &list))
392 return false;
393 for (i = 0; list[i]; i++) {
394 if (strcmp(list[i], f) == 0)
395 return true;
396 }
397
398 return false;
399 }
400
401 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
402 {
403 nih_local struct cgm_keys **list = NULL;
404 struct cgm_keys *k;
405 int i;
406
407 if (!f)
408 return NULL;
409 if (*f == '/')
410 f++;
411 if (!cgm_list_keys(contr, dir, &list))
412 return NULL;
413 for (i = 0; list[i]; i++) {
414 if (strcmp(list[i]->name, f) == 0) {
415 k = NIH_MUST( nih_alloc(NULL, (sizeof(*k))) );
416 k->name = NIH_MUST( nih_strdup(k, list[i]->name) );
417 k->uid = list[i]->uid;
418 k->gid = list[i]->gid;
419 k->mode = list[i]->mode;
420 return k;
421 }
422 }
423
424 return NULL;
425 }
426
/*
 * Split cg at its last '/'.
 *
 * *dir receives an nih-allocated copy of cg, cut off at the last '/'
 * (the whole of cg when it contains none).  *file is set to point at
 * that last '/' inside cg itself, not inside the copy, or to NULL when
 * cg contains no '/'.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	*dir = NIH_MUST( nih_strdup(NULL, cg) );

	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	/* the copy has the same layout, so cut it at the same offset */
	(*dir)[*file - cg] = '\0';
}
440
441 /*
442 * FUSE ops for /cgroup
443 */
444
445 static int cg_getattr(const char *path, struct stat *sb)
446 {
447 struct timespec now;
448 struct fuse_context *fc = fuse_get_context();
449 nih_local char * cgdir = NULL;
450 char *fpath = NULL, *path1, *path2;
451 nih_local struct cgm_keys *k = NULL;
452 const char *cgroup;
453 nih_local char *controller = NULL;
454
455
456 if (!fc)
457 return -EIO;
458
459 memset(sb, 0, sizeof(struct stat));
460
461 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
462 return -EINVAL;
463
464 sb->st_uid = sb->st_gid = 0;
465 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
466 sb->st_size = 0;
467
468 if (strcmp(path, "/cgroup") == 0) {
469 sb->st_mode = S_IFDIR | 00755;
470 sb->st_nlink = 2;
471 return 0;
472 }
473
474 controller = pick_controller_from_path(fc, path);
475 if (!controller)
476 return -EIO;
477 cgroup = find_cgroup_in_path(path);
478 if (!cgroup) {
479 /* this is just /cgroup/controller, return it as a dir */
480 sb->st_mode = S_IFDIR | 00755;
481 sb->st_nlink = 2;
482 return 0;
483 }
484
485 get_cgdir_and_path(cgroup, &cgdir, &fpath);
486
487 if (!fpath) {
488 path1 = "/";
489 path2 = cgdir;
490 } else {
491 path1 = cgdir;
492 path2 = fpath;
493 }
494
495 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
496 * Then check that caller's cgroup is under path if fpath is a child
497 * cgroup, or cgdir if fpath is a file */
498
499 if (is_child_cgroup(controller, path1, path2)) {
500 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
501 /* this is just /cgroup/controller, return it as a dir */
502 sb->st_mode = S_IFDIR | 00555;
503 sb->st_nlink = 2;
504 return 0;
505 }
506 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
507 return -EACCES;
508
509 // get uid, gid, from '/tasks' file and make up a mode
510 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
511 sb->st_mode = S_IFDIR | 00755;
512 k = get_cgroup_key(controller, cgroup, "tasks");
513 if (!k) {
514 sb->st_uid = sb->st_gid = 0;
515 } else {
516 sb->st_uid = k->uid;
517 sb->st_gid = k->gid;
518 }
519 sb->st_nlink = 2;
520 return 0;
521 }
522
523 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
524 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
525 return -ENOENT;
526 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
527 return -EACCES;
528
529 sb->st_mode = S_IFREG | k->mode;
530 sb->st_nlink = 1;
531 sb->st_uid = k->uid;
532 sb->st_gid = k->gid;
533 sb->st_size = 0;
534 return 0;
535 }
536
537 return -ENOENT;
538 }
539
540 /*
541 * TODO - cache these results in a table for use in opendir, free
542 * in releasedir
543 */
544 static int cg_opendir(const char *path, struct fuse_file_info *fi)
545 {
546 struct fuse_context *fc = fuse_get_context();
547 nih_local struct cgm_keys **list = NULL;
548 const char *cgroup;
549 struct file_info *dir_info;
550 nih_local char *controller = NULL;
551
552 if (!fc)
553 return -EIO;
554
555 if (strcmp(path, "/cgroup") == 0) {
556 cgroup = NULL;
557 controller = NULL;
558 } else {
559 // return list of keys for the controller, and list of child cgroups
560 controller = pick_controller_from_path(fc, path);
561 if (!controller)
562 return -EIO;
563
564 cgroup = find_cgroup_in_path(path);
565 if (!cgroup) {
566 /* this is just /cgroup/controller, return its contents */
567 cgroup = "/";
568 }
569 }
570
571 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
572 return -EACCES;
573
574 /* we'll free this at cg_releasedir */
575 dir_info = NIH_MUST( nih_alloc(NULL, sizeof(*dir_info)) );
576 dir_info->controller = must_copy_string(dir_info, controller);
577 dir_info->cgroup = must_copy_string(dir_info, cgroup);
578 dir_info->type = LXC_TYPE_CGDIR;
579 dir_info->buf = NULL;
580 dir_info->file = NULL;
581 dir_info->buflen = 0;
582
583 fi->fh = (unsigned long)dir_info;
584 return 0;
585 }
586
587 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
588 struct fuse_file_info *fi)
589 {
590 struct file_info *d = (struct file_info *)fi->fh;
591 nih_local struct cgm_keys **list = NULL;
592 int i;
593 nih_local char *nextcg = NULL;
594 struct fuse_context *fc = fuse_get_context();
595
596 if (d->type != LXC_TYPE_CGDIR) {
597 fprintf(stderr, "Internal error: file cache info used in readdir\n");
598 return -EIO;
599 }
600 if (!d->cgroup && !d->controller) {
601 // ls /var/lib/lxcfs/cgroup - just show list of controllers
602 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
603 int i;
604
605 if (!list)
606 return -EIO;
607
608 for (i = 0; list[i]; i++) {
609 if (filler(buf, list[i], NULL, 0) != 0) {
610 return -EIO;
611 }
612 }
613 return 0;
614 }
615
616 if (!cgm_list_keys(d->controller, d->cgroup, &list))
617 // not a valid cgroup
618 return -EINVAL;
619
620 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
621 if (nextcg) {
622 int ret;
623 ret = filler(buf, nextcg, NULL, 0);
624 if (ret != 0)
625 return -EIO;
626 }
627 return 0;
628 }
629
630 for (i = 0; list[i]; i++) {
631 if (filler(buf, list[i]->name, NULL, 0) != 0) {
632 return -EIO;
633 }
634 }
635
636 // now get the list of child cgroups
637 nih_local char **clist = NULL;
638
639 if (!cgm_list_children(d->controller, d->cgroup, &clist))
640 return 0;
641 for (i = 0; clist[i]; i++) {
642 if (filler(buf, clist[i], NULL, 0) != 0) {
643 return -EIO;
644 }
645 }
646 return 0;
647 }
648
/*
 * Free a struct file_info.  Every field that was nih_alloc()d with f as
 * its parent is freed along with it.
 */
static void do_release_file_info(struct file_info *f)
{
	nih_free(f);
}
657
658 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
659 {
660 struct file_info *d = (struct file_info *)fi->fh;
661
662 do_release_file_info(d);
663 return 0;
664 }
665
666 static int cg_open(const char *path, struct fuse_file_info *fi)
667 {
668 nih_local char *controller = NULL;
669 const char *cgroup;
670 char *fpath = NULL, *path1, *path2;
671 nih_local char * cgdir = NULL;
672 nih_local struct cgm_keys *k = NULL;
673 struct file_info *file_info;
674 struct fuse_context *fc = fuse_get_context();
675
676 if (!fc)
677 return -EIO;
678
679 controller = pick_controller_from_path(fc, path);
680 if (!controller)
681 return -EIO;
682 cgroup = find_cgroup_in_path(path);
683 if (!cgroup)
684 return -EINVAL;
685
686 get_cgdir_and_path(cgroup, &cgdir, &fpath);
687 if (!fpath) {
688 path1 = "/";
689 path2 = cgdir;
690 } else {
691 path1 = cgdir;
692 path2 = fpath;
693 }
694
695 k = get_cgroup_key(controller, path1, path2);
696 if (!k)
697 return -EINVAL;
698
699 if (!fc_may_access(fc, controller, path1, path2, fi->flags))
700 // should never get here
701 return -EACCES;
702
703 /* we'll free this at cg_release */
704 file_info = NIH_MUST( nih_alloc(NULL, sizeof(*file_info)) );
705 file_info->controller = must_copy_string(file_info, controller);
706 file_info->cgroup = must_copy_string(file_info, path1);
707 file_info->file = must_copy_string(file_info, path2);
708 file_info->type = LXC_TYPE_CGFILE;
709 file_info->buf = NULL;
710 file_info->buflen = 0;
711
712 fi->fh = (unsigned long)file_info;
713 return 0;
714 }
715
716 static int cg_release(const char *path, struct fuse_file_info *fi)
717 {
718 struct file_info *f = (struct file_info *)fi->fh;
719
720 do_release_file_info(f);
721 return 0;
722 }
723
/*
 * Receive up to len bytes from sockfd into buf, waiting at most two
 * seconds for the socket to become readable.
 *
 * Returns the result of recv(), or -1 if select() failed or timed out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
	if (ready <= 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
738
739 #define SEND_CREDS_OK 0
740 #define SEND_CREDS_NOTSK 1
741 #define SEND_CREDS_FAIL 2
742 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
743 {
744 struct msghdr msg = { 0 };
745 struct iovec iov;
746 struct cmsghdr *cmsg;
747 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
748 char buf[1];
749 buf[0] = 'p';
750
751 if (pingfirst) {
752 if (msgrecv(sock, buf, 1) != 1) {
753 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
754 __func__);
755 return SEND_CREDS_FAIL;
756 }
757 }
758
759 msg.msg_control = cmsgbuf;
760 msg.msg_controllen = sizeof(cmsgbuf);
761
762 cmsg = CMSG_FIRSTHDR(&msg);
763 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
764 cmsg->cmsg_level = SOL_SOCKET;
765 cmsg->cmsg_type = SCM_CREDENTIALS;
766 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
767
768 msg.msg_name = NULL;
769 msg.msg_namelen = 0;
770
771 buf[0] = v;
772 iov.iov_base = buf;
773 iov.iov_len = sizeof(buf);
774 msg.msg_iov = &iov;
775 msg.msg_iovlen = 1;
776
777 if (sendmsg(sock, &msg, 0) < 0) {
778 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
779 strerror(errno));
780 if (errno == 3)
781 return SEND_CREDS_NOTSK;
782 return SEND_CREDS_FAIL;
783 }
784
785 return SEND_CREDS_OK;
786 }
787
788 static bool recv_creds(int sock, struct ucred *cred, char *v)
789 {
790 struct msghdr msg = { 0 };
791 struct iovec iov;
792 struct cmsghdr *cmsg;
793 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
794 char buf[1];
795 int ret;
796 int optval = 1;
797 struct timeval tv;
798 fd_set rfds;
799
800 *v = '1';
801
802 cred->pid = -1;
803 cred->uid = -1;
804 cred->gid = -1;
805
806 if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
807 fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
808 return false;
809 }
810 buf[0] = '1';
811 if (write(sock, buf, 1) != 1) {
812 fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
813 return false;
814 }
815
816 msg.msg_name = NULL;
817 msg.msg_namelen = 0;
818 msg.msg_control = cmsgbuf;
819 msg.msg_controllen = sizeof(cmsgbuf);
820
821 iov.iov_base = buf;
822 iov.iov_len = sizeof(buf);
823 msg.msg_iov = &iov;
824 msg.msg_iovlen = 1;
825
826 FD_ZERO(&rfds);
827 FD_SET(sock, &rfds);
828 tv.tv_sec = 2;
829 tv.tv_usec = 0;
830 if (select(sock+1, &rfds, NULL, NULL, &tv) <= 0) {
831 fprintf(stderr, "Failed to select for scm_cred: %s\n",
832 strerror(errno));
833 return false;
834 }
835 ret = recvmsg(sock, &msg, MSG_DONTWAIT);
836 if (ret < 0) {
837 fprintf(stderr, "Failed to receive scm_cred: %s\n",
838 strerror(errno));
839 return false;
840 }
841
842 cmsg = CMSG_FIRSTHDR(&msg);
843
844 if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
845 cmsg->cmsg_level == SOL_SOCKET &&
846 cmsg->cmsg_type == SCM_CREDENTIALS) {
847 memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
848 }
849 *v = buf[0];
850
851 return true;
852 }
853
854
855 /*
856 * pid_to_ns - reads pids from a ucred over a socket, then writes the
857 * int value back over the socket. This shifts the pid from the
858 * sender's pidns into tpid's pidns.
859 */
860 static void pid_to_ns(int sock, pid_t tpid)
861 {
862 char v = '0';
863 struct ucred cred;
864
865 while (recv_creds(sock, &cred, &v)) {
866 if (v == '1')
867 exit(0);
868 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
869 exit(1);
870 }
871 exit(0);
872 }
873
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits 0 if the converting child exited successfully,
 * and 1 otherwise.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			/* does not return */
			pid_to_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: kill this child and start over */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 when the child exited successfully */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
938
939 /*
940 * To read cgroup files with a particular pid, we will setns into the child
941 * pidns, open a pipe, fork a child - which will be the first to really be in
942 * the child ns - which does the cgm_get_value and writes the data to the pipe.
943 */
944 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
945 {
946 int sock[2] = {-1, -1};
947 nih_local char *tmpdata = NULL;
948 int ret;
949 pid_t qpid, cpid = -1;
950 bool answer = false;
951 char v = '0';
952 struct ucred cred;
953 struct timeval tv;
954 fd_set s;
955
956 if (!cgm_get_value(contrl, cg, file, &tmpdata))
957 return false;
958
959 /*
960 * Now we read the pids from returned data one by one, pass
961 * them into a child in the target namespace, read back the
962 * translated pids, and put them into our to-return data
963 */
964
965 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
966 perror("socketpair");
967 exit(1);
968 }
969
970 cpid = fork();
971 if (cpid == -1)
972 goto out;
973
974 if (!cpid) // child
975 pid_to_ns_wrapper(sock[1], tpid);
976
977 char *ptr = tmpdata;
978 cred.uid = 0;
979 cred.gid = 0;
980 while (sscanf(ptr, "%d\n", &qpid) == 1) {
981 cred.pid = qpid;
982 ret = send_creds(sock[0], &cred, v, true);
983
984 if (ret == SEND_CREDS_NOTSK)
985 goto next;
986 if (ret == SEND_CREDS_FAIL)
987 goto out;
988
989 // read converted results
990 FD_ZERO(&s);
991 FD_SET(sock[0], &s);
992 tv.tv_sec = 2;
993 tv.tv_usec = 0;
994 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
995 if (ret <= 0) {
996 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
997 __func__, strerror(errno));
998 goto out;
999 }
1000 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1001 fprintf(stderr, "%s: error reading pid from child: %s\n",
1002 __func__, strerror(errno));
1003 goto out;
1004 }
1005 NIH_MUST( nih_strcat_sprintf(d, NULL, "%d\n", qpid) );
1006 next:
1007 ptr = strchr(ptr, '\n');
1008 if (!ptr)
1009 break;
1010 ptr++;
1011 }
1012
1013 cred.pid = getpid();
1014 v = '1';
1015 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1016 // failed to ask child to exit
1017 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1018 __func__, strerror(errno));
1019 goto out;
1020 }
1021
1022 answer = true;
1023
1024 out:
1025 if (cpid != -1)
1026 wait_for_pid(cpid);
1027 if (sock[0] != -1) {
1028 close(sock[0]);
1029 close(sock[1]);
1030 }
1031 return answer;
1032 }
1033
1034 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1035 struct fuse_file_info *fi)
1036 {
1037 struct fuse_context *fc = fuse_get_context();
1038 struct file_info *f = (struct file_info *)fi->fh;
1039 nih_local struct cgm_keys *k = NULL;
1040
1041 if (f->type != LXC_TYPE_CGFILE) {
1042 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1043 return -EIO;
1044 }
1045
1046 if (offset)
1047 return 0;
1048
1049 if (!fc)
1050 return -EIO;
1051
1052 if (!f->controller)
1053 return -EINVAL;
1054
1055 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1056 nih_local char *data = NULL;
1057 int s;
1058 bool r;
1059
1060 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY))
1061 // should never get here
1062 return -EACCES;
1063
1064 if (strcmp(f->file, "tasks") == 0 ||
1065 strcmp(f->file, "/tasks") == 0 ||
1066 strcmp(f->file, "/cgroup.procs") == 0 ||
1067 strcmp(f->file, "cgroup.procs") == 0)
1068 // special case - we have to translate the pids
1069 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1070 else
1071 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1072
1073 if (!r)
1074 return -EINVAL;
1075
1076 if (!data)
1077 return 0;
1078 s = strlen(data);
1079 if (s > size)
1080 s = size;
1081 memcpy(buf, data, s);
1082 if (s > 0 && s < size && data[s-1] != '\n')
1083 buf[s++] = '\n';
1084
1085 return s;
1086 }
1087
1088 return -EINVAL;
1089 }
1090
1091 static void pid_from_ns(int sock, pid_t tpid)
1092 {
1093 pid_t vpid;
1094 struct ucred cred;
1095 char v;
1096 struct timeval tv;
1097 fd_set s;
1098 int ret;
1099
1100 cred.uid = 0;
1101 cred.gid = 0;
1102 while (1) {
1103 FD_ZERO(&s);
1104 FD_SET(sock, &s);
1105 tv.tv_sec = 2;
1106 tv.tv_usec = 0;
1107 ret = select(sock+1, &s, NULL, NULL, &tv);
1108 if (ret <= 0) {
1109 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1110 __func__, strerror(errno));
1111 exit(1);
1112 }
1113 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1114 fprintf(stderr, "%s: bad read from parent: %s\n",
1115 __func__, strerror(errno));
1116 exit(1);
1117 }
1118 if (vpid == -1) // done
1119 break;
1120 v = '0';
1121 cred.pid = vpid;
1122 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1123 v = '1';
1124 cred.pid = getpid();
1125 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1126 exit(1);
1127 }
1128 }
1129 exit(0);
1130 }
1131
/*
 * Enter the pid namespace of tpid and fork a child to run
 * pid_from_ns() on sock.  The fork is needed because only children
 * forked after the setns are in the target pidns.
 *
 * The child acknowledges its start by writing '1' to a pipe.  If no ack
 * arrives within one second the child is killed and a new one is forked.
 *
 * Never returns.  Exits 0 if the child exited successfully, and 1
 * otherwise.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		exit(1);
	if (setns(newnsfd, 0) < 0)
		exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			/* does not return */
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: kill this child and start over */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 when the child exited successfully */
	if (wait_for_pid(cpid) != 0)
		exit(1);
	exit(0);
}
1192
1193 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1194 {
1195 int sock[2] = {-1, -1};
1196 pid_t qpid, cpid = -1;
1197 bool answer = false, fail = false;
1198
1199 /*
1200 * write the pids to a socket, have helper in writer's pidns
1201 * call movepid for us
1202 */
1203 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1204 perror("socketpair");
1205 exit(1);
1206 }
1207
1208 cpid = fork();
1209 if (cpid == -1)
1210 goto out;
1211
1212 if (!cpid) // child
1213 pid_from_ns_wrapper(sock[1], tpid);
1214
1215 const char *ptr = buf;
1216 while (sscanf(ptr, "%d", &qpid) == 1) {
1217 struct ucred cred;
1218 char v;
1219
1220 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1221 fprintf(stderr, "%s: error writing pid to child: %s\n",
1222 __func__, strerror(errno));
1223 goto out;
1224 }
1225
1226 if (recv_creds(sock[0], &cred, &v)) {
1227 if (v == '0') {
1228 if (!cgm_move_pid(contrl, cg, cred.pid))
1229 fail = true;
1230 }
1231 }
1232
1233 ptr = strchr(ptr, '\n');
1234 if (!ptr)
1235 break;
1236 ptr++;
1237 }
1238
1239 /* All good, write the value */
1240 qpid = -1;
1241 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1242 fprintf(stderr, "Warning: failed to ask child to exit\n");
1243
1244 if (!fail)
1245 answer = true;
1246
1247 out:
1248 if (cpid != -1)
1249 wait_for_pid(cpid);
1250 if (sock[0] != -1) {
1251 close(sock[0]);
1252 close(sock[1]);
1253 }
1254 return answer;
1255 }
1256
1257 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1258 struct fuse_file_info *fi)
1259 {
1260 struct fuse_context *fc = fuse_get_context();
1261 nih_local char *localbuf = NULL;
1262 nih_local struct cgm_keys *k = NULL;
1263 struct file_info *f = (struct file_info *)fi->fh;
1264
1265 if (f->type != LXC_TYPE_CGFILE) {
1266 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1267 return -EIO;
1268 }
1269
1270 if (offset)
1271 return 0;
1272
1273 if (!fc)
1274 return -EIO;
1275
1276 localbuf = NIH_MUST( nih_alloc(NULL, size+1) );
1277 localbuf[size] = '\0';
1278 memcpy(localbuf, buf, size);
1279
1280 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) != NULL) {
1281 bool r;
1282
1283 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY))
1284 return -EACCES;
1285
1286 if (strcmp(f->file, "tasks") == 0 ||
1287 strcmp(f->file, "/tasks") == 0 ||
1288 strcmp(f->file, "/cgroup.procs") == 0 ||
1289 strcmp(f->file, "cgroup.procs") == 0)
1290 // special case - we have to translate the pids
1291 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1292 else
1293 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1294
1295 if (!r)
1296 return -EINVAL;
1297
1298 return size;
1299 }
1300
1301 return -EINVAL;
1302 }
1303
1304 int cg_chown(const char *path, uid_t uid, gid_t gid)
1305 {
1306 struct fuse_context *fc = fuse_get_context();
1307 nih_local char * cgdir = NULL;
1308 char *fpath = NULL, *path1, *path2;
1309 nih_local struct cgm_keys *k = NULL;
1310 const char *cgroup;
1311 nih_local char *controller = NULL;
1312
1313
1314 if (!fc)
1315 return -EIO;
1316
1317 if (strcmp(path, "/cgroup") == 0)
1318 return -EINVAL;
1319
1320 controller = pick_controller_from_path(fc, path);
1321 if (!controller)
1322 return -EINVAL;
1323 cgroup = find_cgroup_in_path(path);
1324 if (!cgroup)
1325 /* this is just /cgroup/controller */
1326 return -EINVAL;
1327
1328 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1329
1330 if (!fpath) {
1331 path1 = "/";
1332 path2 = cgdir;
1333 } else {
1334 path1 = cgdir;
1335 path2 = fpath;
1336 }
1337
1338 if (is_child_cgroup(controller, path1, path2)) {
1339 // get uid, gid, from '/tasks' file and make up a mode
1340 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1341 k = get_cgroup_key(controller, cgroup, "tasks");
1342
1343 } else
1344 k = get_cgroup_key(controller, path1, path2);
1345
1346 if (!k)
1347 return -EINVAL;
1348
1349 /*
1350 * This being a fuse request, the uid and gid must be valid
1351 * in the caller's namespace. So we can just check to make
1352 * sure that the caller is root in his uid, and privileged
1353 * over the file's current owner.
1354 */
1355 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD))
1356 return -EACCES;
1357
1358 if (!cgm_chown_file(controller, cgroup, uid, gid))
1359 return -EINVAL;
1360 return 0;
1361 }
1362
1363 int cg_chmod(const char *path, mode_t mode)
1364 {
1365 struct fuse_context *fc = fuse_get_context();
1366 nih_local char * cgdir = NULL;
1367 char *fpath = NULL, *path1, *path2;
1368 nih_local struct cgm_keys *k = NULL;
1369 const char *cgroup;
1370 nih_local char *controller = NULL;
1371
1372 if (!fc)
1373 return -EIO;
1374
1375 if (strcmp(path, "/cgroup") == 0)
1376 return -EINVAL;
1377
1378 controller = pick_controller_from_path(fc, path);
1379 if (!controller)
1380 return -EINVAL;
1381 cgroup = find_cgroup_in_path(path);
1382 if (!cgroup)
1383 /* this is just /cgroup/controller */
1384 return -EINVAL;
1385
1386 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1387
1388 if (!fpath) {
1389 path1 = "/";
1390 path2 = cgdir;
1391 } else {
1392 path1 = cgdir;
1393 path2 = fpath;
1394 }
1395
1396 if (is_child_cgroup(controller, path1, path2)) {
1397 // get uid, gid, from '/tasks' file and make up a mode
1398 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1399 k = get_cgroup_key(controller, cgroup, "tasks");
1400
1401 } else
1402 k = get_cgroup_key(controller, path1, path2);
1403
1404 if (!k)
1405 return -EINVAL;
1406
1407 /*
1408 * This being a fuse request, the uid and gid must be valid
1409 * in the caller's namespace. So we can just check to make
1410 * sure that the caller is root in his uid, and privileged
1411 * over the file's current owner.
1412 */
1413 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT))
1414 return -EPERM;
1415
1416 if (!cgm_chmod_file(controller, cgroup, mode))
1417 return -EINVAL;
1418 return 0;
1419 }
1420
1421 int cg_mkdir(const char *path, mode_t mode)
1422 {
1423 struct fuse_context *fc = fuse_get_context();
1424 nih_local struct cgm_keys **list = NULL;
1425 char *fpath = NULL, *path1;
1426 nih_local char * cgdir = NULL;
1427 const char *cgroup;
1428 nih_local char *controller = NULL;
1429
1430 if (!fc)
1431 return -EIO;
1432
1433
1434 controller = pick_controller_from_path(fc, path);
1435 if (!controller)
1436 return -EINVAL;
1437
1438 cgroup = find_cgroup_in_path(path);
1439 if (!cgroup)
1440 return -EINVAL;
1441
1442 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1443 if (!fpath)
1444 path1 = "/";
1445 else
1446 path1 = cgdir;
1447
1448 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR))
1449 return -EACCES;
1450
1451
1452 if (!cgm_create(controller, cgroup, fc->uid, fc->gid))
1453 return -EINVAL;
1454
1455 return 0;
1456 }
1457
1458 static int cg_rmdir(const char *path)
1459 {
1460 struct fuse_context *fc = fuse_get_context();
1461 nih_local struct cgm_keys **list = NULL;
1462 char *fpath = NULL;
1463 nih_local char * cgdir = NULL;
1464 const char *cgroup;
1465 nih_local char *controller = NULL;
1466
1467 if (!fc)
1468 return -EIO;
1469
1470
1471 controller = pick_controller_from_path(fc, path);
1472 if (!controller)
1473 return -EINVAL;
1474
1475 cgroup = find_cgroup_in_path(path);
1476 if (!cgroup)
1477 return -EINVAL;
1478
1479 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1480 if (!fpath)
1481 return -EINVAL;
1482
1483 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY))
1484 return -EACCES;
1485
1486 if (!cgm_remove(controller, cgroup))
1487 return -EINVAL;
1488
1489 return 0;
1490 }
1491
/* Return true when line begins with the bytes of pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1498
/*
 * Scan the newline-separated text in memstat for the first line that
 * begins with "total_cache", parse the number that follows and store it,
 * divided by 1024, in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cursor = memstat;

	*v = 0;
	while (*cursor != '\0') {
		char *nl;

		if (startswith(cursor, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cursor + 11, "%lu", v);
			*v /= 1024;
			break;
		}

		nl = strchr(cursor, '\n');
		if (nl == NULL)
			break;
		cursor = nl + 1;
	}
}
1516
/*
 * Look through the newline-separated text in str for the first line that
 * begins with "<major>:<minor> <iotype>" and store the number following
 * that prefix in *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cursor = str;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cursor != '\0') {
		char *nl;

		if (startswith(cursor, key)) {
			sscanf(cursor + keylen, "%lu", v);
			break;
		}

		nl = strchr(cursor, '\n');
		if (nl == NULL)
			break;
		cursor = nl + 1;
	}
}
1539
1540 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1541 {
1542 nih_local char *fnam = NULL;
1543 FILE *f;
1544 char *answer = NULL;
1545 char *line = NULL;
1546 size_t len = 0;
1547
1548 fnam = NIH_MUST( nih_sprintf(NULL, "/proc/%d/cgroup", pid) );
1549 if (!(f = fopen(fnam, "r")))
1550 return false;
1551
1552 while (getline(&line, &len, f) != -1) {
1553 char *c1, *c2;
1554 if (!line[0])
1555 continue;
1556 c1 = strchr(line, ':');
1557 if (!c1)
1558 goto out;
1559 c1++;
1560 c2 = strchr(c1, ':');
1561 if (!c2)
1562 goto out;
1563 *c2 = '\0';
1564 if (strcmp(c1, contrl) != 0)
1565 continue;
1566 c2++;
1567 stripnewline(c2);
1568 answer = NIH_MUST( nih_strdup(NULL, c2) );
1569 goto out;
1570 }
1571
1572 out:
1573 fclose(f);
1574 free(line);
1575 return answer;
1576 }
1577
1578 /*
1579 * FUSE ops for /proc
1580 */
1581
1582 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1583 struct fuse_file_info *fi)
1584 {
1585 struct fuse_context *fc = fuse_get_context();
1586 struct file_info *d = (struct file_info *)fi->fh;
1587 nih_local char *cg = get_pid_cgroup(fc->pid, "memory");
1588 nih_local char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1589 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1590 char *line = NULL;
1591 size_t linelen = 0, total_len = 0;
1592 char *cache = d->buf;
1593 size_t cache_size = d->buflen;
1594 FILE *f;
1595
1596 if (offset){
1597 if (offset > d->size)
1598 return -EINVAL;
1599 int left = d->size - offset;
1600 total_len = left > size ? size: left;
1601 memcpy(buf, cache + offset, total_len);
1602 return total_len;
1603 }
1604
1605 if (!cg)
1606 return 0;
1607
1608 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1609 return 0;
1610 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1611 return 0;
1612 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1613 return 0;
1614 memlimit = strtoul(memlimit_str, NULL, 10);
1615 memusage = strtoul(memusage_str, NULL, 10);
1616 memlimit /= 1024;
1617 memusage /= 1024;
1618 get_mem_cached(memstat_str, &cached);
1619
1620 f = fopen("/proc/meminfo", "r");
1621 if (!f)
1622 return 0;
1623
1624 while (getline(&line, &linelen, f) != -1) {
1625 size_t l;
1626 char *printme, lbuf[100];
1627
1628 memset(lbuf, 0, 100);
1629 if (startswith(line, "MemTotal:")) {
1630 sscanf(line+14, "%lu", &hosttotal);
1631 if (hosttotal < memlimit)
1632 memlimit = hosttotal;
1633 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1634 printme = lbuf;
1635 } else if (startswith(line, "MemFree:")) {
1636 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1637 printme = lbuf;
1638 } else if (startswith(line, "MemAvailable:")) {
1639 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1640 printme = lbuf;
1641 } else if (startswith(line, "Buffers:")) {
1642 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1643 printme = lbuf;
1644 } else if (startswith(line, "Cached:")) {
1645 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1646 printme = lbuf;
1647 } else if (startswith(line, "SwapCached:")) {
1648 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1649 printme = lbuf;
1650 } else
1651 printme = line;
1652
1653 l = snprintf(cache, cache_size, "%s", printme);
1654 cache += l;
1655 cache_size -= l;
1656 total_len += l;
1657 }
1658
1659 d->size = total_len;
1660 if (total_len > size ) total_len = size;
1661 memcpy(buf, d->buf, total_len);
1662
1663 fclose(f);
1664 free(line);
1665 return total_len;
1666 }
1667
/*
 * Fetch the "cpuset.cpus" value of cgroup cg from the cpuset controller.
 * The string comes from cgm_get_value(); NULL is returned on failure.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus = NULL;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1680
/*
 * Helper functions for cpu_in_cpuset
 */

/*
 * Given c pointing at a token of a comma-delimited cpuset string, return
 * a pointer to the token after the next ',' found beyond the first
 * character, or NULL when there is none.  A NULL or empty c yields NULL
 * (the search starts at c+1, which must not step past the terminator).
 */
char *cpuset_nexttok(const char *c)
{
	char *r;

	if (!c || !*c)
		return NULL;

	r = strchr(c+1, ',');
	if (r)
		return r+1;
	return NULL;
}
1691
/*
 * Parse the leading "A-B" (or just "A") of c into *a and *b.
 * Returns sscanf's result: 2 for a range, 1 for a single number,
 * 0 or EOF when nothing could be parsed.
 */
int cpuset_getrange(const char *c, int *a, int *b)
{
	return sscanf(c, "%d-%d", a, b);
}
1699
/*
 * cpusets are in format "1,2-3,4"
 * iow, comma-delimited entries, each a single cpu or a range.
 *
 * Returns true when cpu is named by any entry of cpuset.  An entry that
 * is a single, non-matching cpu must not end the search: the remaining
 * entries still have to be examined.  Scanning stops with false at the
 * first entry that does not start with a number (bad cpuset).
 *
 * The token parsing ("%d-%d") and the advance to the character after the
 * next ',' are done inline, mirroring cpuset_getrange()/cpuset_nexttok().
 */
static bool cpu_in_cpuset(int cpu, const char *cpuset)
{
	const char *c = cpuset;

	while (c) {
		int a, b, ret;
		const char *comma;

		ret = sscanf(c, "%d-%d", &a, &b);
		if (ret < 1) // bad cpuset!
			return false;
		if (ret == 1 && cpu == a)
			return true;
		if (ret == 2 && cpu >= a && cpu <= b)
			return true;

		/* ret >= 1 means *c is not the terminator, so c+1 is valid */
		comma = strchr(c + 1, ',');
		c = comma ? comma + 1 : NULL;
	}

	return false;
}
1722
/*
 * Extract the cpu number from a "processor : N" line and report whether
 * that cpu belongs to cpuset.  Lines of any other shape yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;
	bool parsed = sscanf(line, "processor : %d", &cpu) == 1;

	return parsed && cpu_in_cpuset(cpu, cpuset);
}
1731
/*
 * Check whether this is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. the line starts with "processor", a colon and a
 * number.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1743
1744 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1745 struct fuse_file_info *fi)
1746 {
1747 struct fuse_context *fc = fuse_get_context();
1748 struct file_info *d = (struct file_info *)fi->fh;
1749 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1750 nih_local char *cpuset = NULL;
1751 char *line = NULL;
1752 size_t linelen = 0, total_len = 0;
1753 bool am_printing = false;
1754 int curcpu = -1;
1755 char *cache = d->buf;
1756 size_t cache_size = d->buflen;
1757 FILE *f;
1758
1759 if (offset){
1760 if (offset > d->size)
1761 return -EINVAL;
1762 int left = d->size - offset;
1763 total_len = left > size ? size: left;
1764 memcpy(buf, cache + offset, total_len);
1765 return total_len;
1766 }
1767
1768 if (!cg)
1769 return 0;
1770
1771 cpuset = get_cpuset(cg);
1772 if (!cpuset)
1773 return 0;
1774
1775 f = fopen("/proc/cpuinfo", "r");
1776 if (!f)
1777 return 0;
1778
1779 while (getline(&line, &linelen, f) != -1) {
1780 size_t l;
1781 if (is_processor_line(line)) {
1782 am_printing = cpuline_in_cpuset(line, cpuset);
1783 if (am_printing) {
1784 curcpu ++;
1785 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
1786 if (l < cache_size){
1787 cache += l;
1788 cache_size -= l;
1789 total_len += l;
1790 }else{
1791 cache += cache_size;
1792 total_len += cache_size;
1793 cache_size = 0;
1794 break;
1795 }
1796 }
1797 continue;
1798 }
1799 if (am_printing) {
1800 l = snprintf(cache, cache_size, "%s", line);
1801 if (l < cache_size) {
1802 cache += l;
1803 cache_size -= l;
1804 total_len += l;
1805 } else {
1806 cache += cache_size;
1807 total_len += cache_size;
1808 cache_size = 0;
1809 break;
1810 }
1811 }
1812 }
1813
1814 d->size = total_len;
1815 if (total_len > size ) total_len = size;
1816
1817 /* read from off 0 */
1818 memcpy(buf, d->buf, total_len);
1819
1820 fclose(f);
1821 free(line);
1822 return total_len;
1823 }
1824
1825 static int proc_stat_read(char *buf, size_t size, off_t offset,
1826 struct fuse_file_info *fi)
1827 {
1828 struct fuse_context *fc = fuse_get_context();
1829 struct file_info *d = (struct file_info *)fi->fh;
1830 nih_local char *cg = get_pid_cgroup(fc->pid, "cpuset");
1831 nih_local char *cpuset = NULL;
1832 char *line = NULL;
1833 size_t linelen = 0, total_len = 0;
1834 int curcpu = -1; /* cpu numbering starts at 0 */
1835 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
1836 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
1837 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
1838 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
1839 char cpuall[CPUALL_MAX_SIZE];
1840 /* reserve for cpu all */
1841 char *cache = d->buf + CPUALL_MAX_SIZE;
1842 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
1843 FILE *f;
1844
1845 if (offset){
1846 if (offset > d->size)
1847 return -EINVAL;
1848 int left = d->size - offset;
1849 total_len = left > size ? size: left;
1850 memcpy(buf, d->buf + offset, total_len);
1851 return total_len;
1852 }
1853
1854 if (!cg)
1855 return 0;
1856
1857 cpuset = get_cpuset(cg);
1858 if (!cpuset)
1859 return 0;
1860
1861 f = fopen("/proc/stat", "r");
1862 if (!f)
1863 return 0;
1864
1865 //skip first line
1866 if (getline(&line, &linelen, f) < 0) {
1867 fprintf(stderr, "proc_stat_read read first line failed\n");
1868 goto out;
1869 }
1870
1871 while (getline(&line, &linelen, f) != -1) {
1872 size_t l;
1873 int cpu;
1874 char cpu_char[10]; /* That's a lot of cores */
1875 char *c;
1876
1877 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
1878 /* not a ^cpuN line containing a number N, just print it */
1879 l = snprintf(cache, cache_size, "%s", line);
1880 if (l < cache_size){
1881 cache += l;
1882 cache_size -= l;
1883 total_len += l;
1884 continue;
1885 }else{
1886 //no more space, break it
1887 cache += cache_size;
1888 total_len += cache_size;
1889 cache_size = 0;
1890 break;
1891 }
1892 }
1893
1894 if (sscanf(cpu_char, "%d", &cpu) != 1)
1895 continue;
1896 if (!cpu_in_cpuset(cpu, cpuset))
1897 continue;
1898 curcpu ++;
1899
1900 c = strchr(line, ' ');
1901 if (!c)
1902 continue;
1903 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
1904 cache += l;
1905 cache_size -= l;
1906 total_len += l;
1907
1908 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
1909 &softirq, &steal, &guest) != 9)
1910 continue;
1911 user_sum += user;
1912 nice_sum += nice;
1913 system_sum += system;
1914 idle_sum += idle;
1915 iowait_sum += iowait;
1916 irq_sum += irq;
1917 softirq_sum += softirq;
1918 steal_sum += steal;
1919 guest_sum += guest;
1920 }
1921
1922 cache = d->buf;
1923
1924 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
1925 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
1926 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
1927 memcpy(cache, cpuall, cpuall_len);
1928 cache += cpuall_len;
1929 }else{
1930 /* shouldn't happen */
1931 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
1932 cpuall_len = 0;
1933 }
1934
1935 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
1936 total_len += cpuall_len;
1937 d->size = total_len;
1938 if (total_len > size ) total_len = size;
1939
1940 memcpy(buf, d->buf, total_len);
1941 out:
1942 fclose(f);
1943 free(line);
1944 return total_len;
1945 }
1946
1947 /*
1948 * How to guess what to present for uptime?
1949 * One thing we could do would be to take the date on the caller's
1950 * memory.usage_in_bytes file, which should equal the time of creation
1951 * of his cgroup. However, a task could be in a sub-cgroup of the
1952 * container. The same problem exists if we try to look at the ages
1953 * of processes in the caller's cgroup.
1954 *
1955 * So we'll fork a task that will enter the caller's pidns, mount a
1956 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
1957 *
1958 * For the second uptime #, we'll do as Stéphane had done, just copy
1959 * the number from /proc/uptime. Not sure how to best emulate 'idle'
1960 * time. Maybe someone can come up with a good algorithm and submit a
1961 * patch. Maybe something based on cpushare info?
1962 */
1963
1964 /* return age of the reaper for $pid, taken from ctime of its procdir */
1965 static long int get_pid1_time(pid_t pid)
1966 {
1967 char fnam[100];
1968 int fd, cpipe[2], ret;
1969 struct stat sb;
1970 pid_t cpid;
1971 struct timeval tv;
1972 fd_set s;
1973 char v;
1974
1975 if (unshare(CLONE_NEWNS))
1976 return 0;
1977
1978 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
1979 perror("rslave mount failed");
1980 return 0;
1981 }
1982
1983 sprintf(fnam, "/proc/%d/ns/pid", pid);
1984 fd = open(fnam, O_RDONLY);
1985 if (fd < 0) {
1986 perror("get_pid1_time open of ns/pid");
1987 return 0;
1988 }
1989 if (setns(fd, 0)) {
1990 perror("get_pid1_time setns 1");
1991 close(fd);
1992 return 0;
1993 }
1994 close(fd);
1995
1996 if (pipe(cpipe) < 0)
1997 exit(1);
1998
1999 loop:
2000 cpid = fork();
2001 if (cpid < 0)
2002 return 0;
2003
2004 if (!cpid) {
2005 char b = '1';
2006 close(cpipe[0]);
2007 if (write(cpipe[1], &b, sizeof(char)) < 0) {
2008 fprintf(stderr, "%s (child): erorr on write: %s\n",
2009 __func__, strerror(errno));
2010 }
2011 close(cpipe[1]);
2012 umount2("/proc", MNT_DETACH);
2013 if (mount("proc", "/proc", "proc", 0, NULL)) {
2014 perror("get_pid1_time mount");
2015 return 0;
2016 }
2017 ret = lstat("/proc/1", &sb);
2018 if (ret) {
2019 perror("get_pid1_time lstat");
2020 return 0;
2021 }
2022 return time(NULL) - sb.st_ctime;
2023 }
2024
2025 // give the child 1 second to be done forking and
2026 // write it's ack
2027 FD_ZERO(&s);
2028 FD_SET(cpipe[0], &s);
2029 tv.tv_sec = 1;
2030 tv.tv_usec = 0;
2031 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2032 if (ret <= 0)
2033 goto again;
2034 ret = read(cpipe[0], &v, 1);
2035 if (ret != sizeof(char) || v != '1') {
2036 goto again;
2037 }
2038
2039 wait_for_pid(cpid);
2040 exit(0);
2041
2042 again:
2043 kill(cpid, SIGKILL);
2044 wait_for_pid(cpid);
2045 goto loop;
2046 }
2047
/*
 * Return the value get_pid1_time(qpid) computes, obtained from a forked
 * helper process over a pipe.
 *
 * The helper calls get_pid1_time() and writes the result to the pipe;
 * the parent waits up to one second for data, reads it and reaps the
 * helper.  Returns 0 when the pipe or fork cannot be set up, when no
 * data arrives in time, or when the read fails.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child to wait for; just release the pipe */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* select() returns 0 on timeout; errno is not meaningful here */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2093
/*
 * Return the second field of /proc/uptime, truncated to whole units, or
 * 0 when the file cannot be opened or parsed.
 *
 * Both fields are written with a decimal fraction ("12345.67 54321.89"),
 * so they have to be scanned as floating point: an integer conversion
 * stops at the '.' of the first field and the second field then never
 * matches.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2107
2108 /*
2109 * We read /proc/uptime and reuse its second field.
2110 * For the first field, we use the mtime for the reaper for
2111 * the calling pid as returned by getreaperage
2112 */
2113 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2114 struct fuse_file_info *fi)
2115 {
2116 struct fuse_context *fc = fuse_get_context();
2117 struct file_info *d = (struct file_info *)fi->fh;
2118 long int reaperage = getreaperage(fc->pid);;
2119 long int idletime = getprocidle();
2120 size_t total_len = 0;
2121
2122 if (offset){
2123 if (offset > d->size)
2124 return -EINVAL;
2125 return 0;
2126 }
2127
2128 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2129 d->size = total_len;
2130 return total_len;
2131 }
2132
2133 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2134 struct fuse_file_info *fi)
2135 {
2136 char dev_name[72];
2137 struct fuse_context *fc = fuse_get_context();
2138 struct file_info *d = (struct file_info *)fi->fh;
2139 nih_local char *cg = get_pid_cgroup(fc->pid, "blkio");
2140 nih_local char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2141 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2142 unsigned long read = 0, write = 0;
2143 unsigned long read_merged = 0, write_merged = 0;
2144 unsigned long read_sectors = 0, write_sectors = 0;
2145 unsigned long read_ticks = 0, write_ticks = 0;
2146 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2147 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2148 char *line = NULL;
2149 size_t linelen = 0, total_len = 0;
2150 unsigned int major = 0, minor = 0;
2151 int i = 0;
2152 FILE *f;
2153
2154 if (offset){
2155 if (offset > d->size)
2156 return -EINVAL;
2157 return 0;
2158 }
2159
2160 if (!cg)
2161 return 0;
2162
2163 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2164 return 0;
2165 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2166 return 0;
2167 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2168 return 0;
2169 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2170 return 0;
2171 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2172 return 0;
2173
2174
2175 f = fopen("/proc/diskstats", "r");
2176 if (!f)
2177 return 0;
2178
2179 while (getline(&line, &linelen, f) != -1) {
2180 size_t l;
2181 char *printme, lbuf[256];
2182
2183 i = sscanf(line, "%u %u %s", &major, &minor, dev_name);
2184 if(i == 3){
2185 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2186 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2187 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2188 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2189 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2190 read_sectors = read_sectors/512;
2191 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2192 write_sectors = write_sectors/512;
2193
2194 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2195 rd_svctm = rd_svctm/1000000;
2196 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2197 rd_wait = rd_wait/1000000;
2198 read_ticks = rd_svctm + rd_wait;
2199
2200 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2201 wr_svctm = wr_svctm/1000000;
2202 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2203 wr_wait = wr_wait/1000000;
2204 write_ticks = wr_svctm + wr_wait;
2205
2206 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2207 tot_ticks = tot_ticks/1000000;
2208 }else{
2209 continue;
2210 }
2211
2212 memset(lbuf, 0, 256);
2213 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2214 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2215 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2216 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2217 printme = lbuf;
2218 } else
2219 continue;
2220
2221 l = snprintf(buf, size, "%s", printme);
2222 buf += l;
2223 size -= l;
2224 total_len += l;
2225 }
2226
2227 d->size = total_len;
2228
2229 fclose(f);
2230 free(line);
2231 return total_len;
2232 }
2233
/*
 * Return the total length of all lines in the file named which, as read
 * with getline(), or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;

	if (fp == NULL)
		return 0;

	for (;;) {
		ssize_t n = getline(&linebuf, &cap, fp);

		if (n == -1)
			break;
		total += n;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2250
/*
 * FUSE getattr for /proc and the files emulated beneath it.
 *
 * *sb is zeroed, all three timestamps are set to the current time and
 * owner/group to 0.  "/proc" is reported as a 0555 directory with two
 * links; each emulated file as a 0444 regular file of size 0 with one
 * link.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read, and
 * -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated) / sizeof(emulated[0]); i++) {
		if (strcmp(path, emulated[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2278
2279 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2280 struct fuse_file_info *fi)
2281 {
2282 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2283 filler(buf, "meminfo", NULL, 0) != 0 ||
2284 filler(buf, "stat", NULL, 0) != 0 ||
2285 filler(buf, "uptime", NULL, 0) != 0 ||
2286 filler(buf, "diskstats", NULL, 0) != 0)
2287 return -EINVAL;
2288 return 0;
2289 }
2290
2291 static int proc_open(const char *path, struct fuse_file_info *fi)
2292 {
2293 int type = -1;
2294 struct file_info *info;
2295
2296 if (strcmp(path, "/proc/meminfo") == 0)
2297 type = LXC_TYPE_PROC_MEMINFO;
2298 else if (strcmp(path, "/proc/cpuinfo") == 0)
2299 type = LXC_TYPE_PROC_CPUINFO;
2300 else if (strcmp(path, "/proc/uptime") == 0)
2301 type = LXC_TYPE_PROC_UPTIME;
2302 else if (strcmp(path, "/proc/stat") == 0)
2303 type = LXC_TYPE_PROC_STAT;
2304 else if (strcmp(path, "/proc/diskstats") == 0)
2305 type = LXC_TYPE_PROC_DISKSTATS;
2306 if (type == -1)
2307 return -ENOENT;
2308
2309 info = NIH_MUST( nih_alloc(NULL, sizeof(*info)) );
2310 memset(info, 0, sizeof(*info));
2311 info->type = type;
2312
2313 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2314 info->buf = NIH_MUST( nih_alloc(info, info->buflen) );
2315 memset(info->buf, 0, info->buflen);
2316 /* set actual size to buffer size */
2317 info->size = info->buflen;
2318
2319 fi->fh = (unsigned long)info;
2320 return 0;
2321 }
2322
2323 static int proc_release(const char *path, struct fuse_file_info *fi)
2324 {
2325 struct file_info *f = (struct file_info *)fi->fh;
2326
2327 do_release_file_info(f);
2328 return 0;
2329 }
2330
2331 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2332 struct fuse_file_info *fi)
2333 {
2334 struct file_info *f = (struct file_info *) fi->fh;
2335
2336 switch (f->type) {
2337 case LXC_TYPE_PROC_MEMINFO:
2338 return proc_meminfo_read(buf, size, offset, fi);
2339 case LXC_TYPE_PROC_CPUINFO:
2340 return proc_cpuinfo_read(buf, size, offset, fi);
2341 case LXC_TYPE_PROC_UPTIME:
2342 return proc_uptime_read(buf, size, offset, fi);
2343 case LXC_TYPE_PROC_STAT:
2344 return proc_stat_read(buf, size, offset, fi);
2345 case LXC_TYPE_PROC_DISKSTATS:
2346 return proc_diskstats_read(buf, size, offset, fi);
2347 default:
2348 return -EINVAL;
2349 }
2350 }
2351
2352 /*
2353 * FUSE ops for /
2354 * these just delegate to the /proc and /cgroup ops as
2355 * needed
2356 */
2357
2358 static int lxcfs_getattr(const char *path, struct stat *sb)
2359 {
2360 if (strcmp(path, "/") == 0) {
2361 sb->st_mode = S_IFDIR | 00755;
2362 sb->st_nlink = 2;
2363 return 0;
2364 }
2365 if (strncmp(path, "/cgroup", 7) == 0) {
2366 return cg_getattr(path, sb);
2367 }
2368 if (strncmp(path, "/proc", 5) == 0) {
2369 return proc_getattr(path, sb);
2370 }
2371 return -EINVAL;
2372 }
2373
/*
 * FUSE opendir: "/" and "/proc" need no per-handle state; anything
 * beginning with "/cgroup" is delegated to cg_opendir().  Other paths
 * yield -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -ENOENT;
}
2386
2387 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2388 struct fuse_file_info *fi)
2389 {
2390 if (strcmp(path, "/") == 0) {
2391 if (filler(buf, "proc", NULL, 0) != 0 ||
2392 filler(buf, "cgroup", NULL, 0) != 0)
2393 return -EINVAL;
2394 return 0;
2395 }
2396 if (strncmp(path, "/cgroup", 7) == 0)
2397 return cg_readdir(path, buf, filler, offset, fi);
2398 if (strcmp(path, "/proc") == 0)
2399 return proc_readdir(path, buf, filler, offset, fi);
2400 return -EINVAL;
2401 }
2402
/*
 * FUSE releasedir: nothing to release for "/" and "/proc"; paths
 * beginning with "/cgroup" are delegated to cg_releasedir().  Other
 * paths yield -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -EINVAL;
}
2414
/*
 * FUSE open: delegate to cg_open() or proc_open() according to the path
 * prefix; -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_open(path, fi);
	if (under_proc)
		return proc_open(path, fi);

	return -EINVAL;
}
2424
/*
 * FUSE read: delegate to cg_read() or proc_read() according to the path
 * prefix; -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (under_proc)
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2435
/*
 * FUSE write: only paths beginning with "/cgroup" are writable, via
 * cg_write(); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
	     struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2445
/* FUSE flush: there is nothing to flush, so always report success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2450
/*
 * FUSE release: delegate to cg_release() or proc_release() according to
 * the path prefix; -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_release(path, fi);
	if (under_proc)
		return proc_release(path, fi);

	return -EINVAL;
}
2460
/* FUSE fsync: there is nothing to sync, so always report success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2465
/*
 * FUSE mkdir: directories can only be created under "/cgroup", via
 * cg_mkdir(); everything else yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2473
/*
 * FUSE chown: ownership can only be changed under "/cgroup", via
 * cg_chown(); everything else yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2481
/*
 * FUSE truncate.
 *
 * cat first does a truncate before doing ops->write.  This doesn't
 * really make sense for cgroups, so for paths beginning with "/cgroup"
 * we report success without doing anything.  Every other path yields
 * -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
2493
/*
 * FUSE rmdir handler. Only paths starting with "/cgroup" are handled
 * (by cg_rmdir()); any other path returns -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	int rc = -EINVAL;

	if (!strncmp(path, "/cgroup", 7))
		rc = cg_rmdir(path);

	return rc;
}
2500
/*
 * FUSE chmod handler. Forwards to cg_chmod() when the path starts with
 * "/cgroup"; otherwise returns -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2507
2508 const struct fuse_operations lxcfs_ops = {
2509 .getattr = lxcfs_getattr,
2510 .readlink = NULL,
2511 .getdir = NULL,
2512 .mknod = NULL,
2513 .mkdir = lxcfs_mkdir,
2514 .unlink = NULL,
2515 .rmdir = lxcfs_rmdir,
2516 .symlink = NULL,
2517 .rename = NULL,
2518 .link = NULL,
2519 .chmod = lxcfs_chmod,
2520 .chown = lxcfs_chown,
2521 .truncate = lxcfs_truncate,
2522 .utime = NULL,
2523
2524 .open = lxcfs_open,
2525 .read = lxcfs_read,
2526 .release = lxcfs_release,
2527 .write = lxcfs_write,
2528
2529 .statfs = NULL,
2530 .flush = lxcfs_flush,
2531 .fsync = lxcfs_fsync,
2532
2533 .setxattr = NULL,
2534 .getxattr = NULL,
2535 .listxattr = NULL,
2536 .removexattr = NULL,
2537
2538 .opendir = lxcfs_opendir,
2539 .readdir = lxcfs_readdir,
2540 .releasedir = lxcfs_releasedir,
2541
2542 .fsyncdir = NULL,
2543 .init = NULL,
2544 .destroy = NULL,
2545 .access = NULL,
2546 .create = NULL,
2547 .ftruncate = NULL,
2548 .fgetattr = NULL,
2549 };
2550
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1. @me is the program name shown in the synopsis.
 */
static void usage(const char *me)
{
	fputs("Usage:\n\n", stderr);
	fprintf(stderr, "%s mountpoint\n%s -h\n", me, me);
	exit(1);
}
2559
/*
 * Return true when @w is one of the recognised help requests
 * ("-h", "--help", "-help" or "help"), false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h", "--help", "-help", "help",
	};
	size_t k;

	for (k = 0; k < sizeof(help_words) / sizeof(help_words[0]); k++) {
		if (strcmp(w, help_words[k]) == 0)
			return true;
	}

	return false;
}
2569
/*
 * Remove the first occurrence of @which from argv[1..].
 *
 * Later entries, including the terminating NULL, are shifted down by one
 * slot and *argcp is decremented. argv[0] is never examined. If @which is
 * not present, neither argv nor *argcp is modified.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* locate the first matching entry, if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;

	if (!*slot)
		return;

	/* close the gap; the NULL terminator is copied down last */
	for (; *slot; slot++)
		slot[0] = slot[1];

	--*argcp;
}
2584
/*
 * Remove the first "@opt @v" pair from argv[1..].
 *
 * When an entry equal to @opt is followed by @v, both entries are removed:
 * the remaining arguments and the terminating NULL are shifted down by two
 * slots and *argcp is reduced by 2. A trailing @opt with no value after it
 * is left untouched.
 *
 * If @opt is followed by anything other than @v, the offending value is
 * reported on stderr and the process exits with status 1.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	/* an option needs a value, so stop once no argv[i+1] is left */
	for (i = 1; argv[i] && argv[i+1]; i++) {
		if (strcmp(argv[i], opt) != 0)
			continue;

		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually found, not what we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n",
				argv[i+1]);
			exit(1);
		}

		/*
		 * Shift everything after the pair down by two. The loop
		 * condition reads slots not yet overwritten, and the last
		 * copy moves the NULL terminator into place.
		 */
		for (; argv[i+1]; i++)
			argv[i] = argv[i+2];

		(*argcp) -= 2;
		return;
	}
}
2605
2606 int main(int argc, char *argv[])
2607 {
2608 int ret;
2609 struct lxcfs_state *d;
2610 /*
2611 * what we pass to fuse_main is:
2612 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2613 */
2614 #define NARGS 7
2615 char *newargv[7];
2616
2617 /* accomodate older init scripts */
2618 swallow_arg(&argc, argv, "-s");
2619 swallow_arg(&argc, argv, "-f");
2620 swallow_option(&argc, argv, "-o", "allow_other");
2621
2622 if (argc != 2 || is_help(argv[1]))
2623 usage(argv[0]);
2624
2625 d = NIH_MUST( malloc(sizeof(*d)) );
2626
2627 newargv[0] = argv[0];
2628 newargv[1] = "-s";
2629 newargv[2] = "-f";
2630 newargv[3] = "-o";
2631 newargv[4] = "allow_other,direct_io";
2632 newargv[5] = argv[1];
2633 newargv[6] = NULL;
2634
2635 if (!cgm_escape_cgroup())
2636 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2637
2638 if (!cgm_get_controllers(&d->subsystems))
2639 return -1;
2640
2641 ret = fuse_main(NARGS - 1, newargv, &lxcfs_ops, d);
2642
2643 return ret;
2644 }