]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
don't let idletime be > reaperage
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014,2015 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <linux/sched.h>
24 #include <sys/socket.h>
25 #include <sys/mount.h>
26 #include <wait.h>
27
28 #include "cgmanager.h"
29 #include "config.h" // for VERSION
30
/* State that lxcfs reaches through the FUSE context's private_data. */
struct lxcfs_state {
	/*
	 * NULL-terminated array naming the mounted subsystems.
	 * It is detected at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state attached to the FUSE context of the current request. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
39
/* Values stored in file_info.type to say what an open handle refers to. */
enum {
	LXC_TYPE_CGDIR = 0,		/* set by cg_opendir */
	LXC_TYPE_CGFILE = 1,		/* set by cg_open */
	LXC_TYPE_PROC_MEMINFO = 2,
	LXC_TYPE_PROC_CPUINFO = 3,
	LXC_TYPE_PROC_UPTIME = 4,
	LXC_TYPE_PROC_STAT = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
};
49
/*
 * Per-open-handle bookkeeping, stored in fuse_file_info->fh by the open
 * and opendir handlers and released by do_release_file_info(), which
 * free()s controller, cgroup, file and buf.
 */
struct file_info {
	char *controller;	/* heap copy of the controller name, or NULL */
	char *cgroup;		/* heap copy of the cgroup path, or NULL */
	char *file;		/* heap copy of the file name, or NULL */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;
	int size;		/* actual data size */
	int cached;
};
60
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing the buffer as needed.
 *
 * src: a pointer to a char* in which to append the pid.  *src may be
 *      NULL, in which case a fresh buffer is allocated.  *src is updated
 *      whenever the buffer is (re)allocated.
 * sz:  the number of characters printed so far, minus trailing \0.
 *      Updated to the new length.
 * asz: the allocated size so far.  Updated when the buffer grows.
 * pid: the pid to append.
 *
 * Allocation is retried until it succeeds, so the function never fails.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];
	char *d = *src;
	size_t tmplen;

	snprintf(tmp, sizeof(tmp), "%d\n", (int)pid);
	tmplen = strlen(tmp);

	if (!d) {
		do {
			d = malloc(BUF_RESERVE_SIZE);
		} while (!d);
		*src = d;
		*asz = BUF_RESERVE_SIZE;
	} else if (tmplen + *sz + 1 >= *asz) {
		/*
		 * Compare the sizes, not the addresses of the counters.
		 * Keep the old pointer while retrying: a failed realloc()
		 * leaves the original buffer untouched, and overwriting d
		 * with NULL would lose the data collected so far.
		 */
		char *grown;

		do {
			grown = realloc(d, *asz + BUF_RESERVE_SIZE);
		} while (!grown);
		d = grown;
		*src = d;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy(d + *sz, tmp, tmplen);
	*sz += tmplen;
	d[*sz] = '\0';
}
95
/*
 * Return a strdup()ed copy of str, retrying the allocation until it
 * succeeds.  A NULL str yields NULL.  parent is not used.
 */
static char *must_copy_string(void *parent, const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	for (;;) {
		copy = strdup(str);
		if (copy != NULL)
			return copy;
	}
}
107
/*
 * Wait for child pid to terminate, restarting waitpid() when it is
 * interrupted by a signal.
 * Returns 0 if the child exited normally with status 0, -1 otherwise
 * (including when waitpid() itself fails).
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int ret = waitpid(pid, &status, 0);

		if (ret == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (ret == pid)
			break;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
125
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id that is valid
 * in the caller's namespace, return that id as mapped into pid's
 * namespace.
 *
 * The file is rewound and read line by line; lines that do not parse as
 * three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as an unsigned int) when no range
 * contains in_id or when a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];
	unsigned int ns_base,	// first id of the range in the idfile's namespace
		     host_base,	// first id of the range in the caller's namespace
		     range;	// number of ids in the range

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile) != NULL) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around.  Unexpected for a
			 * procfile, so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base+range and neither range
		 * wraps, so ns_base+(in_id-host_base) < ns_base+range
		 * cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
169
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether
 * the calling uid is required to be root in his namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the buffers in which /proc/<pid>/... paths are built. */
#define PROCLEN 100
179
180 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
181 {
182 char fpath[PROCLEN];
183 int ret;
184 bool answer = false;
185 uid_t nsuid;
186
187 if (victim == -1 || uid == -1)
188 return false;
189
190 /*
191 * If the request is one not requiring root in the namespace,
192 * then having the same uid suffices. (i.e. uid 1000 has write
193 * access to files owned by uid 1000
194 */
195 if (!req_ns_root && uid == victim)
196 return true;
197
198 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
199 if (ret < 0 || ret >= PROCLEN)
200 return false;
201 FILE *f = fopen(fpath, "r");
202 if (!f)
203 return false;
204
205 /* if caller's not root in his namespace, reject */
206 nsuid = convert_id_to_ns(f, uid);
207 if (nsuid)
208 goto out;
209
210 /*
211 * If victim is not mapped into caller's ns, reject.
212 * XXX I'm not sure this check is needed given that fuse
213 * will be sending requests where the vfs has converted
214 */
215 nsuid = convert_id_to_ns(f, victim);
216 if (nsuid == -1)
217 goto out;
218
219 answer = true;
220
221 out:
222 fclose(f);
223 return answer;
224 }
225
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the O_ACCMODE part of req_mode.  Callers shift the owner
 * or group bits down into the "other" position before calling.
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t need;

	if (access == O_RDONLY)
		need = S_IROTH;
	else if (access == O_WRONLY)
		need = S_IWOTH;
	else if (access == O_RDWR)
		need = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & need) == need;
}
245
/*
 * Return a newly allocated copy of the first path component of taskcg
 * that follows querycg.  The caller frees the result.
 *
 * taskcg must be longer than querycg; otherwise a message is printed and
 * NULL is returned.  NULL is also returned if strdup() fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* skip querycg plus one separator; for "/" skip only the slash */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (next == NULL)
		return NULL;

	slash = strchr(next, '/');
	if (slash != NULL)
		*slash = '\0';
	return next;
}
266
/* Remove one trailing '\n' from x, in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
273
274 static char *get_pid_cgroup(pid_t pid, const char *contrl)
275 {
276 char fnam[PROCLEN];
277 FILE *f;
278 char *answer = NULL;
279 char *line = NULL;
280 size_t len = 0;
281 int ret;
282
283 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
284 if (ret < 0 || ret >= PROCLEN)
285 return NULL;
286 if (!(f = fopen(fnam, "r")))
287 return NULL;
288
289 while (getline(&line, &len, f) != -1) {
290 char *c1, *c2;
291 if (!line[0])
292 continue;
293 c1 = strchr(line, ':');
294 if (!c1)
295 goto out;
296 c1++;
297 c2 = strchr(c1, ':');
298 if (!c2)
299 goto out;
300 *c2 = '\0';
301 if (strcmp(c1, contrl) != 0)
302 continue;
303 c2++;
304 stripnewline(c2);
305 do {
306 answer = strdup(c2);
307 } while (!answer);
308 break;
309 }
310
311 out:
312 fclose(f);
313 free(line);
314 return answer;
315 }
316
317 /*
318 * check whether a fuse context may access a cgroup dir or file
319 *
320 * If file is not null, it is a cgroup file to check under cg.
321 * If file is null, then we are checking perms on cg itself.
322 *
323 * For files we can check the mode of the list_keys result.
324 * For cgroups, we must make assumptions based on the files under the
325 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
326 * yet.
327 */
328 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
329 {
330 struct cgm_keys **list = NULL;
331 bool ret = false;
332 int i;
333
334 if (!file)
335 file = "tasks";
336
337 if (*file == '/')
338 file++;
339
340 if (!cgm_list_keys(contrl, cg, &list))
341 return false;
342 for (i = 0; list[i]; i++) {
343 if (strcmp(list[i]->name, file) == 0) {
344 struct cgm_keys *k = list[i];
345 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
346 if (perms_include(k->mode >> 6, mode)) {
347 ret = true;
348 goto out;
349 }
350 }
351 if (fc->gid == k->gid) {
352 if (perms_include(k->mode >> 3, mode)) {
353 ret = true;
354 goto out;
355 }
356 }
357 ret = perms_include(k->mode, mode);
358 goto out;
359 }
360 }
361
362 out:
363 free_keys(list);
364 return ret;
365 }
366
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path cg, in
 * place.  If the whole string is "/init.scope" it becomes "/".  Strings
 * that do not end in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths before doing pointer arithmetic: computing
	 * cg + cg_len - scope_len for a shorter string would form a
	 * pointer before the start of the buffer, which is undefined.
	 */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		*(point + 1) = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
381
382 /*
383 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
384 * If caller is in /a, he may act on /a/b, but not on /b.
385 * if the answer is false and nextcg is not NULL, then *nextcg will point
386 * to a string containing the next cgroup directory under cg, which must be
387 * freed by the caller.
388 */
389 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
390 {
391 char fnam[PROCLEN];
392 FILE *f;
393 bool answer = false;
394 char *line = NULL;
395 size_t len = 0;
396 int ret;
397
398 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
399 if (ret < 0 || ret >= PROCLEN)
400 return false;
401 if (!(f = fopen(fnam, "r")))
402 return false;
403
404 while (getline(&line, &len, f) != -1) {
405 char *c1, *c2, *linecmp;
406 if (!line[0])
407 continue;
408 c1 = strchr(line, ':');
409 if (!c1)
410 goto out;
411 c1++;
412 c2 = strchr(c1, ':');
413 if (!c2)
414 goto out;
415 *c2 = '\0';
416 if (strcmp(c1, contrl) != 0)
417 continue;
418 c2++;
419 stripnewline(c2);
420 prune_init_slice(c2);
421 /*
422 * callers pass in '/' for root cgroup, otherwise they pass
423 * in a cgroup without leading '/'
424 */
425 linecmp = *cg == '/' ? c2 : c2+1;
426 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
427 if (nextcg)
428 *nextcg = get_next_cgroup_dir(linecmp, cg);
429 goto out;
430 }
431 answer = true;
432 goto out;
433 }
434
435 out:
436 fclose(f);
437 free(line);
438 return answer;
439 }
440
441 /*
442 * given /cgroup/freezer/a/b, return "freezer".
443 * the returned char* should NOT be freed.
444 */
445 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
446 {
447 const char *p1;
448 char *contr, *slash;
449
450 if (strlen(path) < 9)
451 return NULL;
452 if (*(path+7) != '/')
453 return NULL;
454 p1 = path+8;
455 contr = strdupa(p1);
456 if (!contr)
457 return NULL;
458 slash = strstr(contr, "/");
459 if (slash)
460 *slash = '\0';
461
462 /* verify that it is a subsystem */
463 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
464 int i;
465 if (!list)
466 return NULL;
467 for (i = 0; list[i]; i++) {
468 if (strcmp(list[i], contr) == 0)
469 return list[i];
470 }
471 return NULL;
472 }
473
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into path just past the first '/' found at or after
 * offset 8, or NULL if path is shorter than 9 characters or has no such
 * '/'.  Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
489
/*
 * Report whether f (one leading '/' ignored) is among the children that
 * cgm_list_children() returns for cgroup dir of controller contr.
 * Returns false if f is NULL or the children cannot be listed.
 */
static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
{
	char **children;
	bool found = false;
	int i;

	if (f == NULL)
		return false;
	if (*f == '/')
		f++;

	if (!cgm_list_children(contr, dir, &children))
		return false;

	/* compare and release each entry in a single pass */
	for (i = 0; children[i]; i++) {
		if (!found && strcmp(children[i], f) == 0)
			found = true;
		free(children[i]);
	}
	free(children);
	return found;
}
516
517 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
518 {
519 struct cgm_keys **list = NULL;
520 struct cgm_keys *k = NULL;
521 int i;
522
523 if (!f)
524 return NULL;
525 if (*f == '/')
526 f++;
527 if (!cgm_list_keys(contr, dir, &list))
528 return NULL;
529 for (i = 0; list[i]; i++) {
530 if (strcmp(list[i]->name, f) == 0) {
531 int j;
532 // free all the keys we are not returning
533 k = list[i];
534 for (j = 0; list[j]; j++) {
535 if (i != j)
536 free_key(list[j]);
537 }
538 free(list);
539 return k;
540 }
541 }
542
543 free_keys(list);
544 return NULL;
545 }
546
/*
 * Split cg at its last '/'.
 *
 * *dir receives a newly allocated copy of cg cut off at the last '/'
 * (the whole string if there is none); it should be freed.
 * *file receives a pointer into cg itself at the last '/', or NULL if
 * cg contains no '/'; it should not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy;

	do {
		copy = strdup(cg);
	} while (copy == NULL);
	*dir = copy;

	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	/* the last '/' sits at the same offset in the copy */
	copy[*file - cg] = '\0';
}
565
566 /*
567 * FUSE ops for /cgroup
568 */
569
570 static int cg_getattr(const char *path, struct stat *sb)
571 {
572 struct timespec now;
573 struct fuse_context *fc = fuse_get_context();
574 char * cgdir = NULL;
575 char *fpath = NULL, *path1, *path2;
576 struct cgm_keys *k = NULL;
577 const char *cgroup;
578 const char *controller = NULL;
579 int ret = -ENOENT;
580
581
582 if (!fc)
583 return -EIO;
584
585 memset(sb, 0, sizeof(struct stat));
586
587 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
588 return -EINVAL;
589
590 sb->st_uid = sb->st_gid = 0;
591 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
592 sb->st_size = 0;
593
594 if (strcmp(path, "/cgroup") == 0) {
595 sb->st_mode = S_IFDIR | 00755;
596 sb->st_nlink = 2;
597 return 0;
598 }
599
600 controller = pick_controller_from_path(fc, path);
601 if (!controller)
602 return -EIO;
603 cgroup = find_cgroup_in_path(path);
604 if (!cgroup) {
605 /* this is just /cgroup/controller, return it as a dir */
606 sb->st_mode = S_IFDIR | 00755;
607 sb->st_nlink = 2;
608 return 0;
609 }
610
611 get_cgdir_and_path(cgroup, &cgdir, &fpath);
612
613 if (!fpath) {
614 path1 = "/";
615 path2 = cgdir;
616 } else {
617 path1 = cgdir;
618 path2 = fpath;
619 }
620
621 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
622 * Then check that caller's cgroup is under path if fpath is a child
623 * cgroup, or cgdir if fpath is a file */
624
625 if (is_child_cgroup(controller, path1, path2)) {
626 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
627 /* this is just /cgroup/controller, return it as a dir */
628 sb->st_mode = S_IFDIR | 00555;
629 sb->st_nlink = 2;
630 ret = 0;
631 goto out;
632 }
633 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
634 ret = -EACCES;
635 goto out;
636 }
637
638 // get uid, gid, from '/tasks' file and make up a mode
639 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
640 sb->st_mode = S_IFDIR | 00755;
641 k = get_cgroup_key(controller, cgroup, "tasks");
642 if (!k) {
643 sb->st_uid = sb->st_gid = 0;
644 } else {
645 sb->st_uid = k->uid;
646 sb->st_gid = k->gid;
647 }
648 free_key(k);
649 sb->st_nlink = 2;
650 ret = 0;
651 goto out;
652 }
653
654 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
655 sb->st_mode = S_IFREG | k->mode;
656 sb->st_nlink = 1;
657 sb->st_uid = k->uid;
658 sb->st_gid = k->gid;
659 sb->st_size = 0;
660 free_key(k);
661 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
662 ret = -ENOENT;
663 goto out;
664 }
665 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
666 ret = -EACCES;
667 goto out;
668 }
669
670 ret = 0;
671 }
672
673 out:
674 free(cgdir);
675 return ret;
676 }
677
678 static int cg_opendir(const char *path, struct fuse_file_info *fi)
679 {
680 struct fuse_context *fc = fuse_get_context();
681 const char *cgroup;
682 struct file_info *dir_info;
683 char *controller = NULL;
684
685 if (!fc)
686 return -EIO;
687
688 if (strcmp(path, "/cgroup") == 0) {
689 cgroup = NULL;
690 controller = NULL;
691 } else {
692 // return list of keys for the controller, and list of child cgroups
693 controller = pick_controller_from_path(fc, path);
694 if (!controller)
695 return -EIO;
696
697 cgroup = find_cgroup_in_path(path);
698 if (!cgroup) {
699 /* this is just /cgroup/controller, return its contents */
700 cgroup = "/";
701 }
702 }
703
704 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
705 return -EACCES;
706 }
707
708 /* we'll free this at cg_releasedir */
709 dir_info = malloc(sizeof(*dir_info));
710 if (!dir_info)
711 return -ENOMEM;
712 dir_info->controller = must_copy_string(dir_info, controller);
713 dir_info->cgroup = must_copy_string(dir_info, cgroup);
714 dir_info->type = LXC_TYPE_CGDIR;
715 dir_info->buf = NULL;
716 dir_info->file = NULL;
717 dir_info->buflen = 0;
718
719 fi->fh = (unsigned long)dir_info;
720 return 0;
721 }
722
723 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
724 struct fuse_file_info *fi)
725 {
726 struct file_info *d = (struct file_info *)fi->fh;
727 struct cgm_keys **list = NULL;
728 int i, ret;
729 char *nextcg = NULL;
730 struct fuse_context *fc = fuse_get_context();
731 char **clist = NULL;
732
733 if (d->type != LXC_TYPE_CGDIR) {
734 fprintf(stderr, "Internal error: file cache info used in readdir\n");
735 return -EIO;
736 }
737 if (!d->cgroup && !d->controller) {
738 // ls /var/lib/lxcfs/cgroup - just show list of controllers
739 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
740 int i;
741
742 if (!list)
743 return -EIO;
744
745 for (i = 0; list[i]; i++) {
746 if (filler(buf, list[i], NULL, 0) != 0) {
747 return -EIO;
748 }
749 }
750 return 0;
751 }
752
753 if (!cgm_list_keys(d->controller, d->cgroup, &list)) {
754 // not a valid cgroup
755 ret = -EINVAL;
756 goto out;
757 }
758
759 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
760 if (nextcg) {
761 int ret;
762 ret = filler(buf, nextcg, NULL, 0);
763 free(nextcg);
764 if (ret != 0) {
765 ret = -EIO;
766 goto out;
767 }
768 }
769 ret = 0;
770 goto out;
771 }
772
773 for (i = 0; list[i]; i++) {
774 if (filler(buf, list[i]->name, NULL, 0) != 0) {
775 ret = -EIO;
776 goto out;
777 }
778 }
779
780 // now get the list of child cgroups
781
782 if (!cgm_list_children(d->controller, d->cgroup, &clist)) {
783 ret = 0;
784 goto out;
785 }
786 for (i = 0; clist[i]; i++) {
787 if (filler(buf, clist[i], NULL, 0) != 0) {
788 ret = -EIO;
789 goto out;
790 }
791 }
792 ret = 0;
793
794 out:
795 free_keys(list);
796 if (clist) {
797 for (i = 0; clist[i]; i++)
798 free(clist[i]);
799 free(clist);
800 }
801 return ret;
802 }
803
804 static void do_release_file_info(struct file_info *f)
805 {
806 if (!f)
807 return;
808 free(f->controller);
809 free(f->cgroup);
810 free(f->file);
811 free(f->buf);
812 free(f);
813 }
814
815 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
816 {
817 struct file_info *d = (struct file_info *)fi->fh;
818
819 do_release_file_info(d);
820 return 0;
821 }
822
823 static int cg_open(const char *path, struct fuse_file_info *fi)
824 {
825 const char *cgroup;
826 char *fpath = NULL, *path1, *path2, * cgdir = NULL, *controller;
827 struct cgm_keys *k = NULL;
828 struct file_info *file_info;
829 struct fuse_context *fc = fuse_get_context();
830 int ret;
831
832 if (!fc)
833 return -EIO;
834
835 controller = pick_controller_from_path(fc, path);
836 if (!controller)
837 return -EIO;
838 cgroup = find_cgroup_in_path(path);
839 if (!cgroup)
840 return -EINVAL;
841
842 get_cgdir_and_path(cgroup, &cgdir, &fpath);
843 if (!fpath) {
844 path1 = "/";
845 path2 = cgdir;
846 } else {
847 path1 = cgdir;
848 path2 = fpath;
849 }
850
851 k = get_cgroup_key(controller, path1, path2);
852 if (!k) {
853 ret = -EINVAL;
854 goto out;
855 }
856 free_key(k);
857
858 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
859 // should never get here
860 ret = -EACCES;
861 goto out;
862 }
863
864 /* we'll free this at cg_release */
865 file_info = malloc(sizeof(*file_info));
866 if (!file_info) {
867 ret = -ENOMEM;
868 goto out;
869 }
870 file_info->controller = must_copy_string(file_info, controller);
871 file_info->cgroup = must_copy_string(file_info, path1);
872 file_info->file = must_copy_string(file_info, path2);
873 file_info->type = LXC_TYPE_CGFILE;
874 file_info->buf = NULL;
875 file_info->buflen = 0;
876
877 fi->fh = (unsigned long)file_info;
878 ret = 0;
879
880 out:
881 free(cgdir);
882 return ret;
883 }
884
885 static int cg_release(const char *path, struct fuse_file_info *fi)
886 {
887 struct file_info *f = (struct file_info *)fi->fh;
888
889 do_release_file_info(f);
890 return 0;
891 }
892
/*
 * Receive up to len bytes from sockfd into buf, waiting at most two
 * seconds for the socket to become readable.
 * Returns -1 if select() fails or times out, otherwise whatever the
 * non-blocking recv() returns.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
	if (ready <= 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
907
/* Return values of send_creds(). */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/*
 * Send the single byte v over sock, with *cred attached as an
 * SCM_CREDENTIALS control message.
 *
 * If pingfirst is true, first wait (via msgrecv) for one byte from the
 * peer and fail if it does not arrive.
 *
 * Returns SEND_CREDS_OK on success, SEND_CREDS_NOTSK if sendmsg() fails
 * with ESRCH (presumably because cred->pid names no existing task -
 * confirm against unix(7)), and SEND_CREDS_FAIL on any other failure.
 */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	buf[0] = 'p';

	if (pingfirst) {
		if (msgrecv(sock, buf, 1) != 1) {
			fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
				__func__);
			return SEND_CREDS_FAIL;
		}
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDENTIALS;
	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));

	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	buf[0] = v;
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (sendmsg(sock, &msg, 0) < 0) {
		/* fprintf() may overwrite errno, so capture it first */
		int saved_errno = errno;

		fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
			strerror(saved_errno));
		if (saved_errno == ESRCH)
			return SEND_CREDS_NOTSK;
		return SEND_CREDS_FAIL;
	}

	return SEND_CREDS_OK;
}
956
/*
 * Receive one byte plus SCM_CREDENTIALS from the peer on sock.
 *
 * SO_PASSCRED is enabled on sock and a single '1' byte is written to
 * the peer (the ping that send_creds can wait for); then up to two
 * seconds are allowed for the reply.
 *
 * *v starts as '1' and is set to the received byte on success.
 * *cred starts with pid, uid and gid all -1 and is overwritten only if
 * the first control message is an SCM_CREDENTIALS message of the
 * expected size.
 * Returns false if any socket call fails or the wait times out, true
 * otherwise.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1] = { '1' };
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cmsgbuf,
		.msg_controllen = sizeof(cmsgbuf),
	};
	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
	struct cmsghdr *cmsg;
	fd_set rfds;
	int optval = 1;
	int ret;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	FD_ZERO(&rfds);
	FD_SET(sock, &rfds);
	if (select(sock + 1, &rfds, NULL, NULL, &tv) <= 0) {
		fprintf(stderr, "Failed to select for scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1022
1023
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Never returns.  The process exits 0 when the byte received with the
 * credentials is '1' or when recv_creds() fails, and exits 1 if a pid
 * cannot be written back.  tpid is not used here.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1042
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The forked child acknowledges by writing '1' to a pipe before it
 * starts converting.  If no acknowledgement arrives within one second
 * the child is killed and a new one is forked; this repeats without
 * limit.  Never returns: the process ends through _exit().
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int cpipe[2];
	int newnsfd = -1;
	int ret;
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (cpid == 0) {
			char b = '1';

			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0] + 1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/*
	 * NOTE(review): wait_for_pid() returns 0 when the child exited
	 * cleanly, so this exits 1 in that case and 0 otherwise, which
	 * looks inverted.  Confirm before relying on this exit status.
	 */
	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
1109
1110 /*
1111 * To read cgroup files with a particular pid, we will setns into the child
1112 * pidns, open a pipe, fork a child - which will be the first to really be in
1113 * the child ns - which does the cgm_get_value and writes the data to the pipe.
1114 */
1115 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1116 {
1117 int sock[2] = {-1, -1};
1118 char *tmpdata = NULL;
1119 int ret;
1120 pid_t qpid, cpid = -1;
1121 bool answer = false;
1122 char v = '0';
1123 struct ucred cred;
1124 struct timeval tv;
1125 size_t sz = 0, asz = 0;
1126 fd_set s;
1127
1128 if (!cgm_get_value(contrl, cg, file, &tmpdata))
1129 return false;
1130
1131 /*
1132 * Now we read the pids from returned data one by one, pass
1133 * them into a child in the target namespace, read back the
1134 * translated pids, and put them into our to-return data
1135 */
1136
1137 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1138 perror("socketpair");
1139 free(tmpdata);
1140 return false;
1141 }
1142
1143 cpid = fork();
1144 if (cpid == -1)
1145 goto out;
1146
1147 if (!cpid) // child
1148 pid_to_ns_wrapper(sock[1], tpid);
1149
1150 char *ptr = tmpdata;
1151 cred.uid = 0;
1152 cred.gid = 0;
1153 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1154 cred.pid = qpid;
1155 ret = send_creds(sock[0], &cred, v, true);
1156
1157 if (ret == SEND_CREDS_NOTSK)
1158 goto next;
1159 if (ret == SEND_CREDS_FAIL)
1160 goto out;
1161
1162 // read converted results
1163 FD_ZERO(&s);
1164 FD_SET(sock[0], &s);
1165 tv.tv_sec = 2;
1166 tv.tv_usec = 0;
1167 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1168 if (ret <= 0) {
1169 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1170 __func__, strerror(errno));
1171 goto out;
1172 }
1173 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1174 fprintf(stderr, "%s: error reading pid from child: %s\n",
1175 __func__, strerror(errno));
1176 goto out;
1177 }
1178 must_strcat_pid(d, &sz, &asz, qpid);
1179 next:
1180 ptr = strchr(ptr, '\n');
1181 if (!ptr)
1182 break;
1183 ptr++;
1184 }
1185
1186 cred.pid = getpid();
1187 v = '1';
1188 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1189 // failed to ask child to exit
1190 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1191 __func__, strerror(errno));
1192 goto out;
1193 }
1194
1195 answer = true;
1196
1197 out:
1198 free(tmpdata);
1199 if (cpid != -1)
1200 wait_for_pid(cpid);
1201 if (sock[0] != -1) {
1202 close(sock[0]);
1203 close(sock[1]);
1204 }
1205 return answer;
1206 }
1207
1208 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1209 struct fuse_file_info *fi)
1210 {
1211 struct fuse_context *fc = fuse_get_context();
1212 struct file_info *f = (struct file_info *)fi->fh;
1213 struct cgm_keys *k = NULL;
1214 char *data = NULL;
1215 int ret, s;
1216 bool r;
1217
1218 if (f->type != LXC_TYPE_CGFILE) {
1219 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1220 return -EIO;
1221 }
1222
1223 if (offset)
1224 return 0;
1225
1226 if (!fc)
1227 return -EIO;
1228
1229 if (!f->controller)
1230 return -EINVAL;
1231
1232 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1233 return -EINVAL;
1234 }
1235 free_key(k);
1236
1237
1238 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1239 ret = -EACCES;
1240 goto out;
1241 }
1242
1243 if (strcmp(f->file, "tasks") == 0 ||
1244 strcmp(f->file, "/tasks") == 0 ||
1245 strcmp(f->file, "/cgroup.procs") == 0 ||
1246 strcmp(f->file, "cgroup.procs") == 0)
1247 // special case - we have to translate the pids
1248 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1249 else
1250 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1251
1252 if (!r) {
1253 ret = -EINVAL;
1254 goto out;
1255 }
1256
1257 if (!data) {
1258 ret = 0;
1259 goto out;
1260 }
1261 s = strlen(data);
1262 if (s > size)
1263 s = size;
1264 memcpy(buf, data, s);
1265 if (s > 0 && s < size && data[s-1] != '\n')
1266 buf[s++] = '\n';
1267
1268 ret = s;
1269
1270 out:
1271 free(data);
1272 return ret;
1273 }
1274
1275 static void pid_from_ns(int sock, pid_t tpid)
1276 {
1277 pid_t vpid;
1278 struct ucred cred;
1279 char v;
1280 struct timeval tv;
1281 fd_set s;
1282 int ret;
1283
1284 cred.uid = 0;
1285 cred.gid = 0;
1286 while (1) {
1287 FD_ZERO(&s);
1288 FD_SET(sock, &s);
1289 tv.tv_sec = 2;
1290 tv.tv_usec = 0;
1291 ret = select(sock+1, &s, NULL, NULL, &tv);
1292 if (ret <= 0) {
1293 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1294 __func__, strerror(errno));
1295 _exit(1);
1296 }
1297 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1298 fprintf(stderr, "%s: bad read from parent: %s\n",
1299 __func__, strerror(errno));
1300 _exit(1);
1301 }
1302 if (vpid == -1) // done
1303 break;
1304 v = '0';
1305 cred.pid = vpid;
1306 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1307 v = '1';
1308 cred.pid = getpid();
1309 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1310 _exit(1);
1311 }
1312 }
1313 _exit(0);
1314 }
1315
/*
 * Enter the pid namespace of tpid with setns(), then fork a child,
 * which runs pid_from_ns() on sock.
 *
 * The forked child acknowledges by writing '1' to a pipe before it
 * starts.  If no acknowledgement arrives within one second the child is
 * killed and a new one is forked; this repeats without limit.
 * Never returns: the process ends through _exit().
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int cpipe[2];
	int newnsfd = -1;
	int ret;
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (cpid == 0) {
			char b = '1';

			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0] + 1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/*
	 * NOTE(review): wait_for_pid() returns 0 when the child exited
	 * cleanly, so this exits 1 in that case and 0 otherwise, which
	 * looks inverted.  Confirm before relying on this exit status.
	 */
	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
1378
1379 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1380 {
1381 int sock[2] = {-1, -1};
1382 pid_t qpid, cpid = -1;
1383 bool answer = false, fail = false;
1384
1385 /*
1386 * write the pids to a socket, have helper in writer's pidns
1387 * call movepid for us
1388 */
1389 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1390 perror("socketpair");
1391 exit(1);
1392 }
1393
1394 cpid = fork();
1395 if (cpid == -1)
1396 goto out;
1397
1398 if (!cpid) // child
1399 pid_from_ns_wrapper(sock[1], tpid);
1400
1401 const char *ptr = buf;
1402 while (sscanf(ptr, "%d", &qpid) == 1) {
1403 struct ucred cred;
1404 char v;
1405
1406 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1407 fprintf(stderr, "%s: error writing pid to child: %s\n",
1408 __func__, strerror(errno));
1409 goto out;
1410 }
1411
1412 if (recv_creds(sock[0], &cred, &v)) {
1413 if (v == '0') {
1414 if (!cgm_move_pid(contrl, cg, cred.pid))
1415 fail = true;
1416 }
1417 }
1418
1419 ptr = strchr(ptr, '\n');
1420 if (!ptr)
1421 break;
1422 ptr++;
1423 }
1424
1425 /* All good, write the value */
1426 qpid = -1;
1427 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1428 fprintf(stderr, "Warning: failed to ask child to exit\n");
1429
1430 if (!fail)
1431 answer = true;
1432
1433 out:
1434 if (cpid != -1)
1435 wait_for_pid(cpid);
1436 if (sock[0] != -1) {
1437 close(sock[0]);
1438 close(sock[1]);
1439 }
1440 return answer;
1441 }
1442
1443 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1444 struct fuse_file_info *fi)
1445 {
1446 struct fuse_context *fc = fuse_get_context();
1447 char *localbuf = NULL;
1448 struct cgm_keys *k = NULL;
1449 struct file_info *f = (struct file_info *)fi->fh;
1450 bool r;
1451
1452 if (f->type != LXC_TYPE_CGFILE) {
1453 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1454 return -EIO;
1455 }
1456
1457 if (offset)
1458 return 0;
1459
1460 if (!fc)
1461 return -EIO;
1462
1463 localbuf = alloca(size+1);
1464 localbuf[size] = '\0';
1465 memcpy(localbuf, buf, size);
1466
1467 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1468 size = -EINVAL;
1469 goto out;
1470 }
1471
1472 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1473 size = -EACCES;
1474 goto out;
1475 }
1476
1477 if (strcmp(f->file, "tasks") == 0 ||
1478 strcmp(f->file, "/tasks") == 0 ||
1479 strcmp(f->file, "/cgroup.procs") == 0 ||
1480 strcmp(f->file, "cgroup.procs") == 0)
1481 // special case - we have to translate the pids
1482 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1483 else
1484 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1485
1486 if (!r)
1487 size = -EINVAL;
1488
1489 out:
1490 free_key(k);
1491 return size;
1492 }
1493
1494 int cg_chown(const char *path, uid_t uid, gid_t gid)
1495 {
1496 struct fuse_context *fc = fuse_get_context();
1497 char *cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1498 struct cgm_keys *k = NULL;
1499 const char *cgroup;
1500 int ret;
1501
1502 if (!fc)
1503 return -EIO;
1504
1505 if (strcmp(path, "/cgroup") == 0)
1506 return -EINVAL;
1507
1508 controller = pick_controller_from_path(fc, path);
1509 if (!controller)
1510 return -EINVAL;
1511 cgroup = find_cgroup_in_path(path);
1512 if (!cgroup)
1513 /* this is just /cgroup/controller */
1514 return -EINVAL;
1515
1516 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1517
1518 if (!fpath) {
1519 path1 = "/";
1520 path2 = cgdir;
1521 } else {
1522 path1 = cgdir;
1523 path2 = fpath;
1524 }
1525
1526 if (is_child_cgroup(controller, path1, path2)) {
1527 // get uid, gid, from '/tasks' file and make up a mode
1528 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1529 k = get_cgroup_key(controller, cgroup, "tasks");
1530
1531 } else
1532 k = get_cgroup_key(controller, path1, path2);
1533
1534 if (!k) {
1535 ret = -EINVAL;
1536 goto out;
1537 }
1538
1539 /*
1540 * This being a fuse request, the uid and gid must be valid
1541 * in the caller's namespace. So we can just check to make
1542 * sure that the caller is root in his uid, and privileged
1543 * over the file's current owner.
1544 */
1545 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1546 ret = -EACCES;
1547 goto out;
1548 }
1549
1550 if (!cgm_chown_file(controller, cgroup, uid, gid)) {
1551 ret = -EINVAL;
1552 goto out;
1553 }
1554
1555 ret = 0;
1556
1557 out:
1558 free_key(k);
1559 free(cgdir);
1560
1561 return ret;
1562 }
1563
1564 int cg_chmod(const char *path, mode_t mode)
1565 {
1566 struct fuse_context *fc = fuse_get_context();
1567 char * cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1568 struct cgm_keys *k = NULL;
1569 const char *cgroup;
1570 int ret;
1571
1572 if (!fc)
1573 return -EIO;
1574
1575 if (strcmp(path, "/cgroup") == 0)
1576 return -EINVAL;
1577
1578 controller = pick_controller_from_path(fc, path);
1579 if (!controller)
1580 return -EINVAL;
1581 cgroup = find_cgroup_in_path(path);
1582 if (!cgroup)
1583 /* this is just /cgroup/controller */
1584 return -EINVAL;
1585
1586 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1587
1588 if (!fpath) {
1589 path1 = "/";
1590 path2 = cgdir;
1591 } else {
1592 path1 = cgdir;
1593 path2 = fpath;
1594 }
1595
1596 if (is_child_cgroup(controller, path1, path2)) {
1597 // get uid, gid, from '/tasks' file and make up a mode
1598 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1599 k = get_cgroup_key(controller, cgroup, "tasks");
1600
1601 } else
1602 k = get_cgroup_key(controller, path1, path2);
1603
1604 if (!k) {
1605 ret = -EINVAL;
1606 goto out;
1607 }
1608
1609 /*
1610 * This being a fuse request, the uid and gid must be valid
1611 * in the caller's namespace. So we can just check to make
1612 * sure that the caller is root in his uid, and privileged
1613 * over the file's current owner.
1614 */
1615 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1616 ret = -EPERM;
1617 goto out;
1618 }
1619
1620 if (!cgm_chmod_file(controller, cgroup, mode)) {
1621 ret = -EINVAL;
1622 goto out;
1623 }
1624
1625 ret = 0;
1626 out:
1627 free_key(k);
1628 free(cgdir);
1629 return ret;
1630 }
1631
1632 #define LXCFS_MKDIR_PATH LIBEXECDIR "/lxcfs/lxcfs_mkdir"
1633
1634 int cg_mkdir(const char *path, mode_t mode)
1635 {
1636 struct fuse_context *fc = fuse_get_context();
1637 char *fpath = NULL, *path1, *cgdir = NULL, *controller;
1638 const char *cgroup;
1639 int ret;
1640
1641 if (!fc)
1642 return -EIO;
1643
1644
1645 controller = pick_controller_from_path(fc, path);
1646 if (!controller)
1647 return -EINVAL;
1648
1649 cgroup = find_cgroup_in_path(path);
1650 if (!cgroup)
1651 return -EINVAL;
1652
1653 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1654 if (!fpath)
1655 path1 = "/";
1656 else
1657 path1 = cgdir;
1658
1659 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1660 ret = -EACCES;
1661 goto out;
1662 }
1663 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
1664 ret = -EACCES;
1665 goto out;
1666 }
1667
1668 if (fc->uid == 0 && fc->gid == 0) {
1669 if (!cgm_create(controller, cgroup)) {
1670 ret = -EINVAL;
1671 goto out;
1672 }
1673 } else {
1674 /*
1675 * exec a helper so as to get a clean dbus connection
1676 * 17 for lxcfs_mkdir, and spaces and newline and \0. 50 for two ints.
1677 * 50 for two ints
1678 */
1679 size_t len = strlen(cgroup) + strlen(controller) + 17 + 50;
1680 char *cmd = alloca(len);
1681 ret = snprintf(cmd, len, "%s %d %d %s %s\n", LXCFS_MKDIR_PATH,
1682 fc->uid, fc->gid, controller, cgroup);
1683 if (ret < 0 || ret >= len) {
1684 ret = -EINVAL;
1685 goto out;
1686 }
1687 ret = system(cmd);
1688 if (ret != 0)
1689 goto out;
1690 }
1691
1692 ret = 0;
1693
1694 out:
1695 free(cgdir);
1696 return ret;
1697 }
1698
1699 static int cg_rmdir(const char *path)
1700 {
1701 struct fuse_context *fc = fuse_get_context();
1702 char *fpath = NULL, *cgdir = NULL, *controller;
1703 const char *cgroup;
1704 int ret;
1705
1706 if (!fc)
1707 return -EIO;
1708
1709 controller = pick_controller_from_path(fc, path);
1710 if (!controller)
1711 return -EINVAL;
1712
1713 cgroup = find_cgroup_in_path(path);
1714 if (!cgroup)
1715 return -EINVAL;
1716
1717 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1718 if (!fpath) {
1719 ret = -EINVAL;
1720 goto out;
1721 }
1722
1723 fprintf(stderr, "rmdir: verifying access to %s:%s (req path %s)\n",
1724 controller, cgdir, path);
1725 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1726 ret = -EACCES;
1727 goto out;
1728 }
1729 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
1730 ret = -EACCES;
1731 goto out;
1732 }
1733
1734 if (!cgm_remove(controller, cgroup)) {
1735 ret = -EINVAL;
1736 goto out;
1737 }
1738
1739 ret = 0;
1740
1741 out:
1742 free(cgdir);
1743 return ret;
1744 }
1745
/* Return true when line begins with pref (an empty pref always matches). */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1752
/*
 * Scan memstat line by line for the first line starting with
 * "total_cache" and store the number following that key, divided by 1024,
 * in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur != '\0') {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
		cur = nl + 1;
	}
}
1770
/*
 * Look in str (one record per line) for the first line starting with
 * "<major>:<minor> <iotype>" and store the number following that key in
 * *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur != '\0'; cur = nl + 1) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
1793
1794 static int read_file(const char *path, char *buf, size_t size,
1795 struct file_info *d)
1796 {
1797 size_t linelen = 0, total_len = 0, rv = 0;
1798 char *line = NULL;
1799 char *cache = d->buf;
1800 size_t cache_size = d->buflen;
1801 FILE *f = fopen(path, "r");
1802 if (!f)
1803 return 0;
1804
1805 while (getline(&line, &linelen, f) != -1) {
1806 size_t l = snprintf(cache, cache_size, "%s", line);
1807 if (l < 0) {
1808 perror("Error writing to cache");
1809 rv = 0;
1810 goto err;
1811 }
1812 if (l >= cache_size) {
1813 fprintf(stderr, "Internal error: truncated write to cache\n");
1814 rv = 0;
1815 goto err;
1816 }
1817 if (l < cache_size) {
1818 cache += l;
1819 cache_size -= l;
1820 total_len += l;
1821 } else {
1822 cache += cache_size;
1823 total_len += cache_size;
1824 cache_size = 0;
1825 break;
1826 }
1827 }
1828
1829 d->size = total_len;
1830 if (total_len > size ) total_len = size;
1831
1832 /* read from off 0 */
1833 memcpy(buf, d->buf, total_len);
1834 rv = total_len;
1835 err:
1836 fclose(f);
1837 free(line);
1838 return rv;
1839 }
1840
1841 /*
1842 * FUSE ops for /proc
1843 */
1844
1845 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1846 struct fuse_file_info *fi)
1847 {
1848 struct fuse_context *fc = fuse_get_context();
1849 struct file_info *d = (struct file_info *)fi->fh;
1850 char *cg;
1851 char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1852 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1853 char *line = NULL;
1854 size_t linelen = 0, total_len = 0, rv = 0;
1855 char *cache = d->buf;
1856 size_t cache_size = d->buflen;
1857 FILE *f = NULL;
1858
1859 if (offset){
1860 if (offset > d->size)
1861 return -EINVAL;
1862 if (!d->cached)
1863 return 0;
1864 int left = d->size - offset;
1865 total_len = left > size ? size: left;
1866 memcpy(buf, cache + offset, total_len);
1867 return total_len;
1868 }
1869
1870 cg = get_pid_cgroup(fc->pid, "memory");
1871 if (!cg)
1872 return read_file("/proc/meminfo", buf, size, d);
1873
1874 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1875 goto err;
1876 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1877 goto err;
1878 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1879 goto err;
1880 memlimit = strtoul(memlimit_str, NULL, 10);
1881 memusage = strtoul(memusage_str, NULL, 10);
1882 memlimit /= 1024;
1883 memusage /= 1024;
1884 get_mem_cached(memstat_str, &cached);
1885
1886 f = fopen("/proc/meminfo", "r");
1887 if (!f)
1888 goto err;
1889
1890 while (getline(&line, &linelen, f) != -1) {
1891 size_t l;
1892 char *printme, lbuf[100];
1893
1894 memset(lbuf, 0, 100);
1895 if (startswith(line, "MemTotal:")) {
1896 sscanf(line+14, "%lu", &hosttotal);
1897 if (hosttotal < memlimit)
1898 memlimit = hosttotal;
1899 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1900 printme = lbuf;
1901 } else if (startswith(line, "MemFree:")) {
1902 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1903 printme = lbuf;
1904 } else if (startswith(line, "MemAvailable:")) {
1905 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1906 printme = lbuf;
1907 } else if (startswith(line, "Buffers:")) {
1908 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1909 printme = lbuf;
1910 } else if (startswith(line, "Cached:")) {
1911 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1912 printme = lbuf;
1913 } else if (startswith(line, "SwapCached:")) {
1914 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1915 printme = lbuf;
1916 } else
1917 printme = line;
1918
1919 l = snprintf(cache, cache_size, "%s", printme);
1920 if (l < 0) {
1921 perror("Error writing to cache");
1922 rv = 0;
1923 goto err;
1924
1925 }
1926 if (l >= cache_size) {
1927 fprintf(stderr, "Internal error: truncated write to cache\n");
1928 rv = 0;
1929 goto err;
1930 }
1931
1932 cache += l;
1933 cache_size -= l;
1934 total_len += l;
1935 }
1936
1937 d->cached = 1;
1938 d->size = total_len;
1939 if (total_len > size ) total_len = size;
1940 memcpy(buf, d->buf, total_len);
1941
1942 rv = total_len;
1943 err:
1944 if (f)
1945 fclose(f);
1946 free(line);
1947 free(cg);
1948 free(memlimit_str);
1949 free(memusage_str);
1950 free(memstat_str);
1951 return rv;
1952 }
1953
/*
 * Fetch the "cpuset.cpus" value of cgroup cg in the cpuset controller.
 * Returns a newly allocated string which the caller must free, or NULL
 * when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1966
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true when line is a "processor : N" line and cpu N is accepted
 * by cpu_in_cpuset() for the given cpuset string.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int n;

	return sscanf(line, "processor : %d", &n) == 1 && cpu_in_cpuset(n, cpuset);
}
1977
/*
 * Return true when line has the form "processor : N" with a decimal N,
 * i.e. when it opens a new per-cpu section of /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int n;

	return sscanf(line, "processor : %d", &n) == 1;
}
1989
1990 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1991 struct fuse_file_info *fi)
1992 {
1993 struct fuse_context *fc = fuse_get_context();
1994 struct file_info *d = (struct file_info *)fi->fh;
1995 char *cg;
1996 char *cpuset = NULL;
1997 char *line = NULL;
1998 size_t linelen = 0, total_len = 0, rv = 0;
1999 bool am_printing = false;
2000 int curcpu = -1;
2001 char *cache = d->buf;
2002 size_t cache_size = d->buflen;
2003 FILE *f = NULL;
2004
2005 if (offset){
2006 if (offset > d->size)
2007 return -EINVAL;
2008 if (!d->cached)
2009 return 0;
2010 int left = d->size - offset;
2011 total_len = left > size ? size: left;
2012 memcpy(buf, cache + offset, total_len);
2013 return total_len;
2014 }
2015
2016 cg = get_pid_cgroup(fc->pid, "cpuset");
2017 if (!cg)
2018 return read_file("proc/cpuinfo", buf, size, d);
2019
2020 cpuset = get_cpuset(cg);
2021 if (!cpuset)
2022 goto err;
2023
2024 f = fopen("/proc/cpuinfo", "r");
2025 if (!f)
2026 goto err;
2027
2028 while (getline(&line, &linelen, f) != -1) {
2029 size_t l;
2030 if (is_processor_line(line)) {
2031 am_printing = cpuline_in_cpuset(line, cpuset);
2032 if (am_printing) {
2033 curcpu ++;
2034 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2035 if (l < 0) {
2036 perror("Error writing to cache");
2037 rv = 0;
2038 goto err;
2039 }
2040 if (l >= cache_size) {
2041 fprintf(stderr, "Internal error: truncated write to cache\n");
2042 rv = 0;
2043 goto err;
2044 }
2045 if (l < cache_size){
2046 cache += l;
2047 cache_size -= l;
2048 total_len += l;
2049 }else{
2050 cache += cache_size;
2051 total_len += cache_size;
2052 cache_size = 0;
2053 break;
2054 }
2055 }
2056 continue;
2057 }
2058 if (am_printing) {
2059 l = snprintf(cache, cache_size, "%s", line);
2060 if (l < 0) {
2061 perror("Error writing to cache");
2062 rv = 0;
2063 goto err;
2064 }
2065 if (l >= cache_size) {
2066 fprintf(stderr, "Internal error: truncated write to cache\n");
2067 rv = 0;
2068 goto err;
2069 }
2070 if (l < cache_size) {
2071 cache += l;
2072 cache_size -= l;
2073 total_len += l;
2074 } else {
2075 cache += cache_size;
2076 total_len += cache_size;
2077 cache_size = 0;
2078 break;
2079 }
2080 }
2081 }
2082
2083 d->cached = 1;
2084 d->size = total_len;
2085 if (total_len > size ) total_len = size;
2086
2087 /* read from off 0 */
2088 memcpy(buf, d->buf, total_len);
2089 rv = total_len;
2090 err:
2091 if (f)
2092 fclose(f);
2093 free(line);
2094 free(cpuset);
2095 free(cg);
2096 return rv;
2097 }
2098
2099 static int proc_stat_read(char *buf, size_t size, off_t offset,
2100 struct fuse_file_info *fi)
2101 {
2102 struct fuse_context *fc = fuse_get_context();
2103 struct file_info *d = (struct file_info *)fi->fh;
2104 char *cg;
2105 char *cpuset = NULL;
2106 char *line = NULL;
2107 size_t linelen = 0, total_len = 0, rv = 0;
2108 int curcpu = -1; /* cpu numbering starts at 0 */
2109 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2110 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2111 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2112 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2113 char cpuall[CPUALL_MAX_SIZE];
2114 /* reserve for cpu all */
2115 char *cache = d->buf + CPUALL_MAX_SIZE;
2116 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2117 FILE *f = NULL;
2118
2119 if (offset){
2120 if (offset > d->size)
2121 return -EINVAL;
2122 if (!d->cached)
2123 return 0;
2124 int left = d->size - offset;
2125 total_len = left > size ? size: left;
2126 memcpy(buf, d->buf + offset, total_len);
2127 return total_len;
2128 }
2129
2130 cg = get_pid_cgroup(fc->pid, "cpuset");
2131 if (!cg)
2132 return read_file("/proc/stat", buf, size, d);
2133
2134 cpuset = get_cpuset(cg);
2135 if (!cpuset)
2136 goto err;
2137
2138 f = fopen("/proc/stat", "r");
2139 if (!f)
2140 goto err;
2141
2142 //skip first line
2143 if (getline(&line, &linelen, f) < 0) {
2144 fprintf(stderr, "proc_stat_read read first line failed\n");
2145 goto err;
2146 }
2147
2148 while (getline(&line, &linelen, f) != -1) {
2149 size_t l;
2150 int cpu;
2151 char cpu_char[10]; /* That's a lot of cores */
2152 char *c;
2153
2154 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2155 /* not a ^cpuN line containing a number N, just print it */
2156 l = snprintf(cache, cache_size, "%s", line);
2157 if (l < 0) {
2158 perror("Error writing to cache");
2159 rv = 0;
2160 goto err;
2161 }
2162 if (l >= cache_size) {
2163 fprintf(stderr, "Internal error: truncated write to cache\n");
2164 rv = 0;
2165 goto err;
2166 }
2167 if (l < cache_size) {
2168 cache += l;
2169 cache_size -= l;
2170 total_len += l;
2171 continue;
2172 } else {
2173 //no more space, break it
2174 cache += cache_size;
2175 total_len += cache_size;
2176 cache_size = 0;
2177 break;
2178 }
2179 }
2180
2181 if (sscanf(cpu_char, "%d", &cpu) != 1)
2182 continue;
2183 if (!cpu_in_cpuset(cpu, cpuset))
2184 continue;
2185 curcpu ++;
2186
2187 c = strchr(line, ' ');
2188 if (!c)
2189 continue;
2190 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2191 if (l < 0) {
2192 perror("Error writing to cache");
2193 rv = 0;
2194 goto err;
2195
2196 }
2197 if (l >= cache_size) {
2198 fprintf(stderr, "Internal error: truncated write to cache\n");
2199 rv = 0;
2200 goto err;
2201 }
2202
2203 cache += l;
2204 cache_size -= l;
2205 total_len += l;
2206
2207 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2208 &softirq, &steal, &guest) != 9)
2209 continue;
2210 user_sum += user;
2211 nice_sum += nice;
2212 system_sum += system;
2213 idle_sum += idle;
2214 iowait_sum += iowait;
2215 irq_sum += irq;
2216 softirq_sum += softirq;
2217 steal_sum += steal;
2218 guest_sum += guest;
2219 }
2220
2221 cache = d->buf;
2222
2223 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2224 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2225 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2226 memcpy(cache, cpuall, cpuall_len);
2227 cache += cpuall_len;
2228 } else{
2229 /* shouldn't happen */
2230 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2231 cpuall_len = 0;
2232 }
2233
2234 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2235 total_len += cpuall_len;
2236 d->cached = 1;
2237 d->size = total_len;
2238 if (total_len > size ) total_len = size;
2239
2240 memcpy(buf, d->buf, total_len);
2241 rv = total_len;
2242
2243 err:
2244 if (f)
2245 fclose(f);
2246 free(line);
2247 free(cpuset);
2248 free(cg);
2249 return rv;
2250 }
2251
2252 /*
2253 * How to guess what to present for uptime?
2254 * One thing we could do would be to take the date on the caller's
2255 * memory.usage_in_bytes file, which should equal the time of creation
2256 * of his cgroup. However, a task could be in a sub-cgroup of the
2257 * container. The same problem exists if we try to look at the ages
2258 * of processes in the caller's cgroup.
2259 *
2260 * So we'll fork a task that will enter the caller's pidns, mount a
2261 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
2262 *
2263 * For the second uptime #, we'll do as Stéphane had done, just copy
2264 * the number from /proc/uptime. Not sure how to best emulate 'idle'
2265 * time. Maybe someone can come up with a good algorithm and submit a
2266 * patch. Maybe something based on cpushare info?
2267 */
2268
2269 /* return age of the reaper for $pid, taken from ctime of its procdir */
2270 static long int get_pid1_time(pid_t pid)
2271 {
2272 char fnam[100];
2273 int fd, cpipe[2], ret;
2274 struct stat sb;
2275 pid_t cpid;
2276 struct timeval tv;
2277 fd_set s;
2278 char v;
2279
2280 if (unshare(CLONE_NEWNS))
2281 return 0;
2282
2283 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
2284 perror("rslave mount failed");
2285 return 0;
2286 }
2287
2288 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
2289 if (ret < 0 || ret >= sizeof(fnam))
2290 return 0;
2291
2292 fd = open(fnam, O_RDONLY);
2293 if (fd < 0) {
2294 perror("get_pid1_time open of ns/pid");
2295 return 0;
2296 }
2297 if (setns(fd, 0)) {
2298 perror("get_pid1_time setns 1");
2299 close(fd);
2300 return 0;
2301 }
2302 close(fd);
2303
2304 if (pipe(cpipe) < 0)
2305 exit(1);
2306
2307 loop:
2308 cpid = fork();
2309 if (cpid < 0)
2310 return 0;
2311
2312 if (!cpid) {
2313 char b = '1';
2314 close(cpipe[0]);
2315 if (write(cpipe[1], &b, sizeof(char)) < 0) {
2316 fprintf(stderr, "%s (child): erorr on write: %s\n",
2317 __func__, strerror(errno));
2318 }
2319 close(cpipe[1]);
2320 umount2("/proc", MNT_DETACH);
2321 if (mount("proc", "/proc", "proc", 0, NULL)) {
2322 perror("get_pid1_time mount");
2323 return 0;
2324 }
2325 ret = lstat("/proc/1", &sb);
2326 if (ret) {
2327 perror("get_pid1_time lstat");
2328 return 0;
2329 }
2330 return time(NULL) - sb.st_ctime;
2331 }
2332
2333 // give the child 1 second to be done forking and
2334 // write it's ack
2335 FD_ZERO(&s);
2336 FD_SET(cpipe[0], &s);
2337 tv.tv_sec = 1;
2338 tv.tv_usec = 0;
2339 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2340 if (ret <= 0)
2341 goto again;
2342 ret = read(cpipe[0], &v, 1);
2343 if (ret != sizeof(char) || v != '1') {
2344 goto again;
2345 }
2346
2347 wait_for_pid(cpid);
2348 _exit(0);
2349
2350 again:
2351 kill(cpid, SIGKILL);
2352 wait_for_pid(cpid);
2353 goto loop;
2354 }
2355
/*
 * Return the age in seconds of the reaper of qpid's pid namespace, or 0
 * when it cannot be determined.
 *
 * The work is done by get_pid1_time() in a forked child which writes the
 * result into a pipe; the parent waits up to one second for it.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* nobody will ever write to the pipe: give up right away */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		_exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		/* a timeout is not an error: errno is not meaningful here */
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2401
/*
 * Return the whole-second part of the second field of /proc/uptime, or 0
 * when the file cannot be opened or does not parse as
 * "<n>.<n> <n>.<n>".
 */
static unsigned long int getprocidle(void)
{
	unsigned long int up_sec, up_frac, idle_sec, idle_frac;
	FILE *uptime = fopen("/proc/uptime", "r");
	int fields;

	if (uptime == NULL)
		return 0;

	fields = fscanf(uptime, "%lu.%02lu %lu.%02lu",
			&up_sec, &up_frac, &idle_sec, &idle_frac);
	fclose(uptime);

	return fields == 4 ? idle_sec : 0;
}
2417
2418 /*
2419 * We read /proc/uptime and reuse its second field.
2420 * For the first field, we use the mtime for the reaper for
2421 * the calling pid as returned by getreaperage
2422 */
2423 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2424 struct fuse_file_info *fi)
2425 {
2426 struct fuse_context *fc = fuse_get_context();
2427 struct file_info *d = (struct file_info *)fi->fh;
2428 long int reaperage = getreaperage(fc->pid);;
2429 unsigned long int idletime = getprocidle();
2430 char *cache = d->buf;
2431 size_t total_len = 0;
2432
2433 if (offset){
2434 if (offset > d->size)
2435 return -EINVAL;
2436 if (!d->cached)
2437 return 0;
2438 int left = d->size - offset;
2439 total_len = left > size ? size: left;
2440 memcpy(buf, cache + offset, total_len);
2441 return total_len;
2442 }
2443
2444 if (idletime > reaperage)
2445 idletime = reaperage;
2446
2447 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
2448 if (total_len < 0){
2449 perror("Error writing to cache");
2450 return 0;
2451 }
2452
2453 d->size = (int)total_len;
2454 d->cached = 1;
2455
2456 if (total_len > size) total_len = size;
2457
2458 memcpy(buf, d->buf, total_len);
2459 return total_len;
2460 }
2461
2462 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2463 struct fuse_file_info *fi)
2464 {
2465 char dev_name[72];
2466 struct fuse_context *fc = fuse_get_context();
2467 struct file_info *d = (struct file_info *)fi->fh;
2468 char *cg;
2469 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2470 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2471 unsigned long read = 0, write = 0;
2472 unsigned long read_merged = 0, write_merged = 0;
2473 unsigned long read_sectors = 0, write_sectors = 0;
2474 unsigned long read_ticks = 0, write_ticks = 0;
2475 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2476 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2477 char *cache = d->buf;
2478 size_t cache_size = d->buflen;
2479 char *line = NULL;
2480 size_t linelen = 0, total_len = 0, rv = 0;
2481 unsigned int major = 0, minor = 0;
2482 int i = 0;
2483 FILE *f = NULL;
2484
2485 if (offset){
2486 if (offset > d->size)
2487 return -EINVAL;
2488 if (!d->cached)
2489 return 0;
2490 int left = d->size - offset;
2491 total_len = left > size ? size: left;
2492 memcpy(buf, cache + offset, total_len);
2493 return total_len;
2494 }
2495
2496 cg = get_pid_cgroup(fc->pid, "blkio");
2497 if (!cg)
2498 return read_file("/proc/diskstats", buf, size, d);
2499
2500 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2501 goto err;
2502 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2503 goto err;
2504 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2505 goto err;
2506 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2507 goto err;
2508 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2509 goto err;
2510
2511
2512 f = fopen("/proc/diskstats", "r");
2513 if (!f)
2514 goto err;
2515
2516 while (getline(&line, &linelen, f) != -1) {
2517 size_t l;
2518 char *printme, lbuf[256];
2519
2520 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2521 if(i == 3){
2522 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2523 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2524 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2525 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2526 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2527 read_sectors = read_sectors/512;
2528 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2529 write_sectors = write_sectors/512;
2530
2531 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2532 rd_svctm = rd_svctm/1000000;
2533 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2534 rd_wait = rd_wait/1000000;
2535 read_ticks = rd_svctm + rd_wait;
2536
2537 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2538 wr_svctm = wr_svctm/1000000;
2539 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2540 wr_wait = wr_wait/1000000;
2541 write_ticks = wr_svctm + wr_wait;
2542
2543 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2544 tot_ticks = tot_ticks/1000000;
2545 }else{
2546 continue;
2547 }
2548
2549 memset(lbuf, 0, 256);
2550 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2551 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2552 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2553 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2554 printme = lbuf;
2555 } else
2556 continue;
2557
2558 l = snprintf(cache, cache_size, "%s", printme);
2559 if (l < 0) {
2560 perror("Error writing to fuse buf");
2561 rv = 0;
2562 goto err;
2563 }
2564 if (l >= cache_size) {
2565 fprintf(stderr, "Internal error: truncated write to cache\n");
2566 rv = 0;
2567 goto err;
2568 }
2569 cache += l;
2570 cache_size -= l;
2571 total_len += l;
2572 }
2573
2574 d->cached = 1;
2575 d->size = total_len;
2576 if (total_len > size ) total_len = size;
2577 memcpy(buf, d->buf, total_len);
2578
2579 rv = total_len;
2580 err:
2581 free(cg);
2582 if (f)
2583 fclose(f);
2584 free(line);
2585 free(io_serviced_str);
2586 free(io_merged_str);
2587 free(io_service_bytes_str);
2588 free(io_wait_time_str);
2589 free(io_service_time_str);
2590 return rv;
2591 }
2592
/*
 * Count how many bytes can be read from the file named by @which.
 *
 * The file is read line by line with getline() and the lengths of all
 * lines (delimiters included) are added up.
 *
 * Returns the byte count, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	/* getline() owns linebuf; it may have been allocated even on EOF */
	free(linebuf);
	fclose(fp);

	return total;
}
2609
/*
 * getattr for the /proc subtree.
 *
 * @sb is zeroed, then owner (root) and all three timestamps (current
 * CLOCK_REALTIME) are filled in.  "/proc" is reported as a 0555
 * directory, the five emulated files as 0444 regular files of size 0.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and
 * -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec stamp;
	size_t idx;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &stamp) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_ctim = stamp;
	sb->st_mtim = stamp;
	sb->st_atim = stamp;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2637
2638 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2639 struct fuse_file_info *fi)
2640 {
2641 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2642 filler(buf, "meminfo", NULL, 0) != 0 ||
2643 filler(buf, "stat", NULL, 0) != 0 ||
2644 filler(buf, "uptime", NULL, 0) != 0 ||
2645 filler(buf, "diskstats", NULL, 0) != 0)
2646 return -EINVAL;
2647 return 0;
2648 }
2649
2650 static int proc_open(const char *path, struct fuse_file_info *fi)
2651 {
2652 int type = -1;
2653 struct file_info *info;
2654
2655 if (strcmp(path, "/proc/meminfo") == 0)
2656 type = LXC_TYPE_PROC_MEMINFO;
2657 else if (strcmp(path, "/proc/cpuinfo") == 0)
2658 type = LXC_TYPE_PROC_CPUINFO;
2659 else if (strcmp(path, "/proc/uptime") == 0)
2660 type = LXC_TYPE_PROC_UPTIME;
2661 else if (strcmp(path, "/proc/stat") == 0)
2662 type = LXC_TYPE_PROC_STAT;
2663 else if (strcmp(path, "/proc/diskstats") == 0)
2664 type = LXC_TYPE_PROC_DISKSTATS;
2665 if (type == -1)
2666 return -ENOENT;
2667
2668 info = malloc(sizeof(*info));
2669 if (!info)
2670 return -ENOMEM;
2671
2672 memset(info, 0, sizeof(*info));
2673 info->type = type;
2674
2675 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2676 do {
2677 info->buf = malloc(info->buflen);
2678 } while (!info->buf);
2679 memset(info->buf, 0, info->buflen);
2680 /* set actual size to buffer size */
2681 info->size = info->buflen;
2682
2683 fi->fh = (unsigned long)info;
2684 return 0;
2685 }
2686
2687 static int proc_release(const char *path, struct fuse_file_info *fi)
2688 {
2689 struct file_info *f = (struct file_info *)fi->fh;
2690
2691 do_release_file_info(f);
2692 return 0;
2693 }
2694
2695 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2696 struct fuse_file_info *fi)
2697 {
2698 struct file_info *f = (struct file_info *) fi->fh;
2699
2700 switch (f->type) {
2701 case LXC_TYPE_PROC_MEMINFO:
2702 return proc_meminfo_read(buf, size, offset, fi);
2703 case LXC_TYPE_PROC_CPUINFO:
2704 return proc_cpuinfo_read(buf, size, offset, fi);
2705 case LXC_TYPE_PROC_UPTIME:
2706 return proc_uptime_read(buf, size, offset, fi);
2707 case LXC_TYPE_PROC_STAT:
2708 return proc_stat_read(buf, size, offset, fi);
2709 case LXC_TYPE_PROC_DISKSTATS:
2710 return proc_diskstats_read(buf, size, offset, fi);
2711 default:
2712 return -EINVAL;
2713 }
2714 }
2715
2716 /*
2717 * FUSE ops for /
2718 * these just delegate to the /proc and /cgroup ops as
2719 * needed
2720 */
2721
2722 static int lxcfs_getattr(const char *path, struct stat *sb)
2723 {
2724 if (strcmp(path, "/") == 0) {
2725 sb->st_mode = S_IFDIR | 00755;
2726 sb->st_nlink = 2;
2727 return 0;
2728 }
2729 if (strncmp(path, "/cgroup", 7) == 0) {
2730 return cg_getattr(path, sb);
2731 }
2732 if (strncmp(path, "/proc", 5) == 0) {
2733 return proc_getattr(path, sb);
2734 }
2735 return -EINVAL;
2736 }
2737
/*
 * opendir for the whole mount.  "/" and "/proc" need no per-open
 * state; "/cgroup..." is delegated to cg_opendir().  Any other path
 * yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return -ENOENT;
}
2750
2751 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2752 struct fuse_file_info *fi)
2753 {
2754 if (strcmp(path, "/") == 0) {
2755 if (filler(buf, "proc", NULL, 0) != 0 ||
2756 filler(buf, "cgroup", NULL, 0) != 0)
2757 return -EINVAL;
2758 return 0;
2759 }
2760 if (strncmp(path, "/cgroup", 7) == 0)
2761 return cg_readdir(path, buf, filler, offset, fi);
2762 if (strcmp(path, "/proc") == 0)
2763 return proc_readdir(path, buf, filler, offset, fi);
2764 return -EINVAL;
2765 }
2766
/*
 * releasedir for the whole mount.  Nothing to do for "/" and "/proc";
 * "/cgroup..." is delegated to cg_releasedir().  Any other path
 * yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return -EINVAL;
}
2778
/*
 * open for the whole mount: delegate to cg_open() or proc_open()
 * depending on the path prefix, -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	int rv = -EINVAL;

	if (!strncmp(path, "/cgroup", 7))
		rv = cg_open(path, fi);
	else if (!strncmp(path, "/proc", 5))
		rv = proc_open(path, fi);

	return rv;
}
2788
/*
 * read for the whole mount: delegate to cg_read() or proc_read()
 * depending on the path prefix, -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	int rv = -EINVAL;

	if (!strncmp(path, "/cgroup", 7))
		rv = cg_read(path, buf, size, offset, fi);
	else if (!strncmp(path, "/proc", 5))
		rv = proc_read(path, buf, size, offset, fi);

	return rv;
}
2799
/*
 * write for the whole mount.  Only paths under "/cgroup" are passed on
 * (to cg_write()); every other path yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2809
/* flush: nothing is done here; success is reported unconditionally. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2814
/*
 * release for the whole mount: delegate to cg_release() or
 * proc_release() depending on the path prefix, -EINVAL otherwise.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	int rv = -EINVAL;

	if (!strncmp(path, "/cgroup", 7))
		rv = cg_release(path, fi);
	else if (!strncmp(path, "/proc", 5))
		rv = proc_release(path, fi);

	return rv;
}
2824
/* fsync: nothing is done here; success is reported unconditionally. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2829
/*
 * mkdir: only paths under "/cgroup" are passed on (to cg_mkdir());
 * every other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
2837
/*
 * chown: only paths under "/cgroup" are passed on (to cg_chown());
 * every other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
2845
/*
 * cat issues a truncate before it gets to ops->write.  Truncating has
 * no sensible meaning for cgroup files, so for anything under
 * "/cgroup" we simply claim success without touching anything.
 * Every other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) ? -EINVAL : 0;
}
2857
/*
 * rmdir: only paths under "/cgroup" are passed on (to cg_rmdir());
 * every other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
2864
/*
 * chmod: only paths under "/cgroup" are passed on (to cg_chmod());
 * every other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
2871
2872 const struct fuse_operations lxcfs_ops = {
2873 .getattr = lxcfs_getattr,
2874 .readlink = NULL,
2875 .getdir = NULL,
2876 .mknod = NULL,
2877 .mkdir = lxcfs_mkdir,
2878 .unlink = NULL,
2879 .rmdir = lxcfs_rmdir,
2880 .symlink = NULL,
2881 .rename = NULL,
2882 .link = NULL,
2883 .chmod = lxcfs_chmod,
2884 .chown = lxcfs_chown,
2885 .truncate = lxcfs_truncate,
2886 .utime = NULL,
2887
2888 .open = lxcfs_open,
2889 .read = lxcfs_read,
2890 .release = lxcfs_release,
2891 .write = lxcfs_write,
2892
2893 .statfs = NULL,
2894 .flush = lxcfs_flush,
2895 .fsync = lxcfs_fsync,
2896
2897 .setxattr = NULL,
2898 .getxattr = NULL,
2899 .listxattr = NULL,
2900 .removexattr = NULL,
2901
2902 .opendir = lxcfs_opendir,
2903 .readdir = lxcfs_readdir,
2904 .releasedir = lxcfs_releasedir,
2905
2906 .fsyncdir = NULL,
2907 .init = NULL,
2908 .destroy = NULL,
2909 .access = NULL,
2910 .create = NULL,
2911 .ftruncate = NULL,
2912 .fgetattr = NULL,
2913 };
2914
/*
 * Print the command-line synopsis for program name @me on stderr and
 * terminate the process with exit status 1.  Does not return.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);

	exit(1);
}
2923
/*
 * Return true if @w is one of the accepted spellings of the help
 * request ("-h", "--help", "-help", "help"), false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(spellings) / sizeof(spellings[0]); idx++) {
		if (!strcmp(w, spellings[idx]))
			return true;
	}

	return false;
}
2933
/*
 * Remove the first occurrence of @which from the NULL-terminated
 * @argv (searching from index 1), closing the gap by moving every
 * later entry, terminator included, one slot down.  *@argcp is
 * decremented when an entry was removed; nothing changes otherwise.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	int pos = 1;

	/* find the first matching word, if any */
	while (argv[pos] && strcmp(argv[pos], which) != 0)
		pos++;
	if (!argv[pos])
		return;

	/* slide the tail (and finally the NULL terminator) down by one */
	do {
		argv[pos] = argv[pos + 1];
		pos++;
	} while (argv[pos]);

	--*argcp;
}
2948
/*
 * Remove the first "@opt @v" pair from the NULL-terminated @argv
 * (searching from index 1) and decrement *@argcp by two.
 *
 * If @opt is found but is followed by a value other than @v, the
 * offending value is reported on stderr and the process exits with
 * status 1.  The message used to print @v, i.e. the one value that is
 * expected, rather than the value actually found.
 *
 * Nothing is changed when @opt does not occur with a following word.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		/* the last word has no value after it, so it cannot be @opt */
		if (!argv[i+1])
			break;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* move everything after the pair, terminator included, down by two */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2969
2970 int main(int argc, char *argv[])
2971 {
2972 int ret = -1;
2973 struct lxcfs_state *d = NULL;
2974 /*
2975 * what we pass to fuse_main is:
2976 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2977 */
2978 int nargs = 5, cnt = 0;
2979 char *newargv[6];
2980
2981 /* accomodate older init scripts */
2982 swallow_arg(&argc, argv, "-s");
2983 swallow_arg(&argc, argv, "-f");
2984 swallow_option(&argc, argv, "-o", "allow_other");
2985
2986 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
2987 fprintf(stderr, "%s\n", VERSION);
2988 exit(0);
2989 }
2990 if (argc != 2 || is_help(argv[1]))
2991 usage(argv[0]);
2992
2993 do {
2994 d = malloc(sizeof(*d));
2995 } while (!d);
2996
2997 newargv[cnt++] = argv[0];
2998 newargv[cnt++] = "-f";
2999 newargv[cnt++] = "-o";
3000 newargv[cnt++] = "allow_other,direct_io";
3001 newargv[cnt++] = argv[1];
3002 newargv[cnt++] = NULL;
3003
3004 if (!cgm_escape_cgroup())
3005 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
3006
3007 if (!cgm_get_controllers(&d->subsystems))
3008 goto out;
3009
3010 ret = fuse_main(nargs, newargv, &lxcfs_ops, d);
3011 cgm_dbus_disconnect();
3012
3013 out:
3014 free(d);
3015 return ret;
3016 }