]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
267814c7fed9866d9d6e20b44f7825fd0d79c6cc
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014,2015 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /* XXX TODO - in debian/control, drop libcgmanager-dev,
10 and add libdbus-glib-1-dev, and libglib2.0-dev to build-deps.
11 make sure lxcfs_mkdir is added to the installed files */
12 #define FUSE_USE_VERSION 26
13
14 #include <stdio.h>
15 #include <dirent.h>
16 #include <fcntl.h>
17 #include <fuse.h>
18 #include <unistd.h>
19 #include <errno.h>
20 #include <stdbool.h>
21 #include <time.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <libgen.h>
25 #include <sched.h>
26 #include <linux/sched.h>
27 #include <sys/socket.h>
28 #include <sys/mount.h>
29 #include <wait.h>
30
31 #include "cgmanager.h"
32 #include "config.h" // for VERSION
33
/* Per-mount lxcfs state; FUSE callbacks read it through LXCFS_DATA. */
struct lxcfs_state {
	/*
	 * NULL-terminated array holding the names of the mounted
	 * subsystems.  It is detected once, at startup.
	 */
	char **subsystems;
};

/* The lxcfs_state stored as private_data of the current FUSE context. */
#define LXCFS_DATA ((struct lxcfs_state *) fuse_get_context()->private_data)
42
/* Values stored in file_info.type to say what an open handle refers to. */
enum {
	LXC_TYPE_CGDIR,          /* a directory under /cgroup */
	LXC_TYPE_CGFILE,         /* a file under /cgroup */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
52
/*
 * Per-open-handle bookkeeping.  A pointer to one of these is stored in
 * fuse_file_info.fh by the open/opendir callbacks and released again by
 * do_release_file_info().
 */
struct file_info {
	char *controller;	/* heap copy of the controller name, or NULL */
	char *cgroup;		/* heap copy of the cgroup path, or NULL */
	char *file;		/* heap copy of the file (key) name, or NULL */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;
	int size;		/* actual data size */
};
62
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing the buffer when needed.
 *
 * src: a pointer to a char* in which to append the pid.  If *src is NULL a
 *      fresh buffer of BUF_RESERVE_SIZE bytes is allocated.
 * sz:  the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 *
 * Allocations are retried until they succeed, so this function does not
 * fail.  On return *src, *sz and *asz describe the (possibly moved) buffer,
 * which is NUL-terminated.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char *d = *src;
	char tmp[30];
	size_t tmplen;

	snprintf(tmp, sizeof(tmp), "%d\n", (int)pid);
	tmplen = strlen(tmp);

	if (!d) {
		do {
			d = malloc(BUF_RESERVE_SIZE);
		} while (!d);
		*src = d;
		*asz = BUF_RESERVE_SIZE;
	} else if (tmplen + *sz + 1 >= *asz) {
		/*
		 * Compare the sizes themselves, not the addresses of the
		 * counters.  Keep the old pointer until realloc succeeds so
		 * a failed attempt neither leaks nor loses existing data.
		 */
		char *grown;

		do {
			grown = realloc(d, *asz + BUF_RESERVE_SIZE);
		} while (!grown);
		d = grown;
		*src = d;
		*asz += BUF_RESERVE_SIZE;
	}
	memcpy(d + *sz, tmp, tmplen);
	*sz += tmplen;
	d[*sz] = '\0';
}
97
/*
 * Return a heap copy of str, retrying strdup() until it succeeds.
 * A NULL str yields NULL.  parent is not used.
 * The caller owns (and must free) the returned string.
 */
static char *must_copy_string(void *parent, const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	for (;;) {
		copy = strdup(str);
		if (copy != NULL)
			return copy;
	}
}
109
/*
 * Reap child pid, restarting waitpid() when it is interrupted (EINTR) or
 * reports a different pid.
 *
 * Returns 0 if the child exited normally with status 0, -1 if waitpid()
 * failed or the child did not exit cleanly.
 *
 * TODO - return value should denote whether child exited with failure
 * so callers can return errors.  Esp read/write of tasks and cgroup.procs
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
131
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as seen inside pid's namespace.
 *
 * The file is rewound first and scanned line by line; lines that do not
 * parse as three unsigned numbers are skipped.
 *
 * Returns the mapped id, or -1 (as unsigned int) when no range contains
 * in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];
	unsigned int ns_base,	// first id of the range in the idfile's namespace
		     host_base,	// first id of the range in the caller's namespace
		     range;	// number of ids in the range

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wraps around.  That is unexpected for a
			 * proc file, so give up.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	// no range matched
	return -1;
}
175
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether uid (a user of process pid) is privileged over victim.
 *
 * When req_ns_root is false, uid == victim is sufficient (i.e. uid 1000
 * has write access to files owned by uid 1000).  Otherwise uid must map to
 * 0 in /proc/<pid>/uid_map and victim must be mapped there too.
 *
 * Returns false if either id is -1 or the uid_map cannot be opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	FILE *f;
	uid_t mapped;
	bool answer = false;
	int ret;

	if (victim == -1 || uid == -1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;

	f = fopen(fpath, "r");
	if (!f)
		return false;

	/* the caller must be root in his own namespace ... */
	mapped = convert_id_to_ns(f, uid);
	if (mapped == 0) {
		/*
		 * ... and the victim must be mapped into that namespace.
		 * XXX I'm not sure this check is needed given that fuse
		 * will be sending requests where the vfs has converted
		 */
		mapped = convert_id_to_ns(f, victim);
		answer = (mapped != (uid_t)-1);
	}

	fclose(f);
	return answer;
}
231
/*
 * Check whether the "other" permission bits of fmode grant the access
 * requested by the O_ACCMODE part of req_mode (O_RDONLY, O_WRONLY or
 * O_RDWR).  Callers shift fmode right to test owner or group bits.
 * Any other access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t wanted;
	mode_t access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		wanted = S_IROTH;
	else if (access == O_WRONLY)
		wanted = S_IWOTH;
	else if (access == O_RDWR)
		wanted = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & wanted) == wanted;
}
251
/*
 * Return a heap copy of the first path component of taskcg that follows
 * the prefix querycg ("/" stands for the root cgroup).
 *
 * taskcg must be strictly longer than querycg; otherwise a message is
 * printed and NULL is returned.  NULL is also returned if strdup() fails.
 * The caller frees the result.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* step over the query prefix and the '/' that follows it */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (next == NULL)
		return NULL;

	slash = strchr(next, '/');
	if (slash != NULL)
		*slash = '\0';
	return next;
}
272
/* Remove one trailing '\n' from x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
279
280 static char *get_pid_cgroup(pid_t pid, const char *contrl)
281 {
282 char fnam[PROCLEN];
283 FILE *f;
284 char *answer = NULL;
285 char *line = NULL;
286 size_t len = 0;
287 int ret;
288
289 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
290 if (ret < 0 || ret >= PROCLEN)
291 return NULL;
292 if (!(f = fopen(fnam, "r")))
293 return NULL;
294
295 while (getline(&line, &len, f) != -1) {
296 char *c1, *c2;
297 if (!line[0])
298 continue;
299 c1 = strchr(line, ':');
300 if (!c1)
301 goto out;
302 c1++;
303 c2 = strchr(c1, ':');
304 if (!c2)
305 goto out;
306 *c2 = '\0';
307 if (strcmp(c1, contrl) != 0)
308 continue;
309 c2++;
310 stripnewline(c2);
311 do {
312 answer = strdup(c2);
313 } while (!answer);
314 break;
315 }
316
317 out:
318 fclose(f);
319 free(line);
320 return answer;
321 }
322
323 /*
324 * check whether a fuse context may access a cgroup dir or file
325 *
326 * If file is not null, it is a cgroup file to check under cg.
327 * If file is null, then we are checking perms on cg itself.
328 *
329 * For files we can check the mode of the list_keys result.
330 * For cgroups, we must make assumptions based on the files under the
331 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
332 * yet.
333 */
334 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
335 {
336 struct cgm_keys **list = NULL;
337 bool ret = false;
338 int i;
339
340 if (!file)
341 file = "tasks";
342
343 if (*file == '/')
344 file++;
345
346 if (!cgm_list_keys(contrl, cg, &list))
347 return false;
348 for (i = 0; list[i]; i++) {
349 if (strcmp(list[i]->name, file) == 0) {
350 struct cgm_keys *k = list[i];
351 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
352 if (perms_include(k->mode >> 6, mode)) {
353 ret = true;
354 goto out;
355 }
356 }
357 if (fc->gid == k->gid) {
358 if (perms_include(k->mode >> 3, mode)) {
359 ret = true;
360 goto out;
361 }
362 }
363 ret = perms_include(k->mode, mode);
364 goto out;
365 }
366 }
367
368 out:
369 free_keys(list);
370 return ret;
371 }
372
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path cg, in
 * place.  A path that is exactly "/init.scope" becomes "/".  Paths that
 * do not end in INITSCOPE, including ones shorter than it, are left
 * untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths rather than pointers: forming cg - n for a short
	 * string would be undefined behaviour.
	 */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		*(point + 1) = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
387
388 /*
389 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
390 * If caller is in /a, he may act on /a/b, but not on /b.
391 * if the answer is false and nextcg is not NULL, then *nextcg will point
392 * to a string containing the next cgroup directory under cg, which must be
393 * freed by the caller.
394 */
395 static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
396 {
397 char fnam[PROCLEN];
398 FILE *f;
399 bool answer = false;
400 char *line = NULL;
401 size_t len = 0;
402 int ret;
403
404 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
405 if (ret < 0 || ret >= PROCLEN)
406 return false;
407 if (!(f = fopen(fnam, "r")))
408 return false;
409
410 while (getline(&line, &len, f) != -1) {
411 char *c1, *c2, *linecmp;
412 if (!line[0])
413 continue;
414 c1 = strchr(line, ':');
415 if (!c1)
416 goto out;
417 c1++;
418 c2 = strchr(c1, ':');
419 if (!c2)
420 goto out;
421 *c2 = '\0';
422 if (strcmp(c1, contrl) != 0)
423 continue;
424 c2++;
425 stripnewline(c2);
426 prune_init_slice(c2);
427 /*
428 * callers pass in '/' for root cgroup, otherwise they pass
429 * in a cgroup without leading '/'
430 */
431 linecmp = *cg == '/' ? c2 : c2+1;
432 if (strncmp(linecmp, cg, strlen(linecmp)) != 0) {
433 if (nextcg)
434 *nextcg = get_next_cgroup_dir(linecmp, cg);
435 goto out;
436 }
437 answer = true;
438 goto out;
439 }
440
441 out:
442 fclose(f);
443 free(line);
444 return answer;
445 }
446
447 /*
448 * given /cgroup/freezer/a/b, return "freezer".
449 * the returned char* should NOT be freed.
450 */
451 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
452 {
453 const char *p1;
454 char *contr, *slash;
455
456 if (strlen(path) < 9)
457 return NULL;
458 if (*(path+7) != '/')
459 return NULL;
460 p1 = path+8;
461 contr = strdupa(p1);
462 if (!contr)
463 return NULL;
464 slash = strstr(contr, "/");
465 if (slash)
466 *slash = '\0';
467
468 /* verify that it is a subsystem */
469 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
470 int i;
471 if (!list)
472 return NULL;
473 for (i = 0; list[i]; i++) {
474 if (strcmp(list[i], contr) == 0)
475 return list[i];
476 }
477 return NULL;
478 }
479
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 *
 * Returns a pointer into path just past the '/' that ends the controller
 * name, or NULL if path is shorter than 9 characters or has no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
495
/*
 * Report whether f (a leading '/' is skipped) is one of the child
 * cgroups that cgm_list_children() returns for dir under controller
 * contr.  Returns false if f is NULL or the children cannot be listed.
 */
static bool is_child_cgroup(const char *contr, const char *dir, const char *f)
{
	char **list;
	bool found = false;
	int i;

	if (!f)
		return false;
	if (*f == '/')
		f++;

	if (!cgm_list_children(contr, dir, &list))
		return false;

	/* compare and release each entry in a single pass */
	for (i = 0; list[i]; i++) {
		if (!found && strcmp(list[i], f) == 0)
			found = true;
		free(list[i]);
	}
	free(list);
	return found;
}
522
523 static struct cgm_keys *get_cgroup_key(const char *contr, const char *dir, const char *f)
524 {
525 struct cgm_keys **list = NULL;
526 struct cgm_keys *k = NULL;
527 int i;
528
529 if (!f)
530 return NULL;
531 if (*f == '/')
532 f++;
533 if (!cgm_list_keys(contr, dir, &list))
534 return NULL;
535 for (i = 0; list[i]; i++) {
536 if (strcmp(list[i]->name, f) == 0) {
537 int j;
538 // free all the keys we are not returning
539 k = list[i];
540 for (j = 0; list[j]; j++) {
541 if (i != j)
542 free_key(list[j]);
543 }
544 free(list);
545 return k;
546 }
547 }
548
549 free_keys(list);
550 return NULL;
551 }
552
/*
 * Split cg at its last '/'.
 *
 * *dir receives a heap copy of cg truncated at that '/' (the whole of cg
 * if it has none); dir should be freed.  *file points into cg itself at
 * the last '/', or is NULL when cg contains no '/'; file must not be
 * freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	const char *slash;
	char *copy;

	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	slash = strrchr(cg, '/');
	*file = (char *)slash;
	if (slash)
		copy[slash - cg] = '\0';
}
571
572 /*
573 * FUSE ops for /cgroup
574 */
575
576 static int cg_getattr(const char *path, struct stat *sb)
577 {
578 struct timespec now;
579 struct fuse_context *fc = fuse_get_context();
580 char * cgdir = NULL;
581 char *fpath = NULL, *path1, *path2;
582 struct cgm_keys *k = NULL;
583 const char *cgroup;
584 const char *controller = NULL;
585 int ret = -ENOENT;
586
587
588 if (!fc)
589 return -EIO;
590
591 memset(sb, 0, sizeof(struct stat));
592
593 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
594 return -EINVAL;
595
596 sb->st_uid = sb->st_gid = 0;
597 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
598 sb->st_size = 0;
599
600 if (strcmp(path, "/cgroup") == 0) {
601 sb->st_mode = S_IFDIR | 00755;
602 sb->st_nlink = 2;
603 return 0;
604 }
605
606 controller = pick_controller_from_path(fc, path);
607 if (!controller)
608 return -EIO;
609 cgroup = find_cgroup_in_path(path);
610 if (!cgroup) {
611 /* this is just /cgroup/controller, return it as a dir */
612 sb->st_mode = S_IFDIR | 00755;
613 sb->st_nlink = 2;
614 return 0;
615 }
616
617 get_cgdir_and_path(cgroup, &cgdir, &fpath);
618
619 if (!fpath) {
620 path1 = "/";
621 path2 = cgdir;
622 } else {
623 path1 = cgdir;
624 path2 = fpath;
625 }
626
627 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
628 * Then check that caller's cgroup is under path if fpath is a child
629 * cgroup, or cgdir if fpath is a file */
630
631 if (is_child_cgroup(controller, path1, path2)) {
632 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
633 /* this is just /cgroup/controller, return it as a dir */
634 sb->st_mode = S_IFDIR | 00555;
635 sb->st_nlink = 2;
636 ret = 0;
637 goto out;
638 }
639 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
640 ret = -EACCES;
641 goto out;
642 }
643
644 // get uid, gid, from '/tasks' file and make up a mode
645 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
646 sb->st_mode = S_IFDIR | 00755;
647 k = get_cgroup_key(controller, cgroup, "tasks");
648 if (!k) {
649 sb->st_uid = sb->st_gid = 0;
650 } else {
651 sb->st_uid = k->uid;
652 sb->st_gid = k->gid;
653 }
654 free_key(k);
655 sb->st_nlink = 2;
656 ret = 0;
657 goto out;
658 }
659
660 if ((k = get_cgroup_key(controller, path1, path2)) != NULL) {
661 sb->st_mode = S_IFREG | k->mode;
662 sb->st_nlink = 1;
663 sb->st_uid = k->uid;
664 sb->st_gid = k->gid;
665 sb->st_size = 0;
666 free_key(k);
667 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL))
668 return -ENOENT;
669 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY))
670 return -EACCES;
671
672 ret = 0;
673 }
674
675 out:
676 free(cgdir);
677 return ret;
678 }
679
680 /*
681 * TODO - cache these results in a table for use in opendir, free
682 * in releasedir
683 */
684 static int cg_opendir(const char *path, struct fuse_file_info *fi)
685 {
686 struct fuse_context *fc = fuse_get_context();
687 const char *cgroup;
688 struct file_info *dir_info;
689 char *controller = NULL;
690
691 if (!fc)
692 return -EIO;
693
694 if (strcmp(path, "/cgroup") == 0) {
695 cgroup = NULL;
696 controller = NULL;
697 } else {
698 // return list of keys for the controller, and list of child cgroups
699 controller = pick_controller_from_path(fc, path);
700 if (!controller)
701 return -EIO;
702
703 cgroup = find_cgroup_in_path(path);
704 if (!cgroup) {
705 /* this is just /cgroup/controller, return its contents */
706 cgroup = "/";
707 }
708 }
709
710 if (cgroup && !fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
711 return -EACCES;
712 }
713
714 /* we'll free this at cg_releasedir */
715 dir_info = malloc(sizeof(*dir_info));
716 if (!dir_info)
717 return -ENOMEM;
718 dir_info->controller = must_copy_string(dir_info, controller);
719 dir_info->cgroup = must_copy_string(dir_info, cgroup);
720 dir_info->type = LXC_TYPE_CGDIR;
721 dir_info->buf = NULL;
722 dir_info->file = NULL;
723 dir_info->buflen = 0;
724
725 fi->fh = (unsigned long)dir_info;
726 return 0;
727 }
728
729 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
730 struct fuse_file_info *fi)
731 {
732 struct file_info *d = (struct file_info *)fi->fh;
733 struct cgm_keys **list = NULL;
734 int i, ret;
735 char *nextcg = NULL;
736 struct fuse_context *fc = fuse_get_context();
737 char **clist = NULL;
738
739 if (d->type != LXC_TYPE_CGDIR) {
740 fprintf(stderr, "Internal error: file cache info used in readdir\n");
741 return -EIO;
742 }
743 if (!d->cgroup && !d->controller) {
744 // ls /var/lib/lxcfs/cgroup - just show list of controllers
745 char **list = LXCFS_DATA ? LXCFS_DATA->subsystems : NULL;
746 int i;
747
748 if (!list)
749 return -EIO;
750
751 for (i = 0; list[i]; i++) {
752 if (filler(buf, list[i], NULL, 0) != 0) {
753 return -EIO;
754 }
755 }
756 return 0;
757 }
758
759 if (!cgm_list_keys(d->controller, d->cgroup, &list)) {
760 // not a valid cgroup
761 ret = -EINVAL;
762 goto out;
763 }
764
765 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
766 if (nextcg) {
767 int ret;
768 ret = filler(buf, nextcg, NULL, 0);
769 free(nextcg);
770 if (ret != 0) {
771 ret = -EIO;
772 goto out;
773 }
774 }
775 ret = 0;
776 goto out;
777 }
778
779 for (i = 0; list[i]; i++) {
780 if (filler(buf, list[i]->name, NULL, 0) != 0) {
781 ret = -EIO;
782 goto out;
783 }
784 }
785
786 // now get the list of child cgroups
787
788 if (!cgm_list_children(d->controller, d->cgroup, &clist)) {
789 ret = 0;
790 goto out;
791 }
792 for (i = 0; clist[i]; i++) {
793 if (filler(buf, clist[i], NULL, 0) != 0) {
794 ret = -EIO;
795 goto out;
796 }
797 }
798 ret = 0;
799
800 out:
801 free_keys(list);
802 if (clist) {
803 for (i = 0; clist[i]; i++)
804 free(clist[i]);
805 free(clist);
806 }
807 return ret;
808 }
809
810 static void do_release_file_info(struct file_info *f)
811 {
812 if (!f)
813 return;
814 free(f->controller);
815 free(f->cgroup);
816 free(f->file);
817 free(f->buf);
818 free(f);
819 }
820
821 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
822 {
823 struct file_info *d = (struct file_info *)fi->fh;
824
825 do_release_file_info(d);
826 return 0;
827 }
828
829 static int cg_open(const char *path, struct fuse_file_info *fi)
830 {
831 const char *cgroup;
832 char *fpath = NULL, *path1, *path2, * cgdir = NULL, *controller;
833 struct cgm_keys *k = NULL;
834 struct file_info *file_info;
835 struct fuse_context *fc = fuse_get_context();
836 int ret;
837
838 if (!fc)
839 return -EIO;
840
841 controller = pick_controller_from_path(fc, path);
842 if (!controller)
843 return -EIO;
844 cgroup = find_cgroup_in_path(path);
845 if (!cgroup)
846 return -EINVAL;
847
848 get_cgdir_and_path(cgroup, &cgdir, &fpath);
849 if (!fpath) {
850 path1 = "/";
851 path2 = cgdir;
852 } else {
853 path1 = cgdir;
854 path2 = fpath;
855 }
856
857 k = get_cgroup_key(controller, path1, path2);
858 if (!k) {
859 ret = -EINVAL;
860 goto out;
861 }
862 free_key(k);
863
864 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
865 // should never get here
866 ret = -EACCES;
867 goto out;
868 }
869
870 /* we'll free this at cg_release */
871 file_info = malloc(sizeof(*file_info));
872 if (!file_info) {
873 ret = -ENOMEM;
874 goto out;
875 }
876 file_info->controller = must_copy_string(file_info, controller);
877 file_info->cgroup = must_copy_string(file_info, path1);
878 file_info->file = must_copy_string(file_info, path2);
879 file_info->type = LXC_TYPE_CGFILE;
880 file_info->buf = NULL;
881 file_info->buflen = 0;
882
883 fi->fh = (unsigned long)file_info;
884 ret = 0;
885
886 out:
887 free(cgdir);
888 return ret;
889 }
890
891 static int cg_release(const char *path, struct fuse_file_info *fi)
892 {
893 struct file_info *f = (struct file_info *)fi->fh;
894
895 do_release_file_info(f);
896 return 0;
897 }
898
/*
 * Receive up to len bytes from sockfd into buf, waiting at most two
 * seconds for the socket to become readable.
 *
 * Returns the result of recv(), or -1 if select() fails or times out.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
	fd_set readable;
	int ready;

	FD_ZERO(&readable);
	FD_SET(sockfd, &readable);

	ready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
	if (ready <= 0)
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
913
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/*
 * Send the one-byte payload v over sock, with *cred attached as an
 * SCM_CREDENTIALS control message.
 *
 * If pingfirst is true, one byte is first read from the peer (see
 * msgrecv) before anything is sent.
 *
 * Returns SEND_CREDS_OK on success, SEND_CREDS_NOTSK if sendmsg() failed
 * with ESRCH, and SEND_CREDS_FAIL for any other failure, including a
 * missing ping.  After a sendmsg() failure errno holds sendmsg()'s error.
 */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	buf[0] = 'p';

	if (pingfirst) {
		if (msgrecv(sock, buf, 1) != 1) {
			fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
				__func__);
			return SEND_CREDS_FAIL;
		}
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDENTIALS;
	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));

	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	buf[0] = v;
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (sendmsg(sock, &msg, 0) < 0) {
		/* fprintf/strerror may clobber errno, so capture it first */
		int saved_errno = errno;

		fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
			strerror(saved_errno));
		errno = saved_errno;
		if (saved_errno == ESRCH)
			return SEND_CREDS_NOTSK;
		return SEND_CREDS_FAIL;
	}

	return SEND_CREDS_OK;
}
962
/*
 * Receive one byte plus SCM_CREDENTIALS from sock.
 *
 * SO_PASSCRED is enabled on sock and a '1' byte is written to the peer
 * first; the function then waits up to two seconds for a message.
 *
 * On success returns true with *v set to the received byte.  *cred holds
 * the received credentials, or stays {-1, -1, -1} if the message carried
 * no matching control data.  On any failure returns false, with *v left
 * at '1' and *cred at -1.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	struct timeval tv;
	fd_set rfds;
	int optval = 1;

	/* defaults reported when nothing usable arrives */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* tell the peer we are ready */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	FD_ZERO(&rfds);
	FD_SET(sock, &rfds);
	tv.tv_sec = 2;
	tv.tv_usec = 0;
	if (select(sock + 1, &rfds, NULL, NULL, &tv) <= 0) {
		fprintf(stderr, "Failed to select for scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1028
1029
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Never returns.  The process exits with 0 once recv_creds() fails or a
 * message with payload '1' arrives, and with 1 if writing a pid back
 * fails.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			break;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1048
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The forked child acknowledges its start by writing '1' to a pipe.  If
 * no valid ack arrives within one second the child is killed and a new
 * one is forked.
 *
 * Never returns: exits with 0 if the converting child exited cleanly and
 * with 1 on any setup failure or if the child failed.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_to_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1115
1116 /*
1117 * To read cgroup files with a particular pid, we will setns into the child
1118 * pidns, open a pipe, fork a child - which will be the first to really be in
1119 * the child ns - which does the cgm_get_value and writes the data to the pipe.
1120 */
1121 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1122 {
1123 int sock[2] = {-1, -1};
1124 char *tmpdata = NULL;
1125 int ret;
1126 pid_t qpid, cpid = -1;
1127 bool answer = false;
1128 char v = '0';
1129 struct ucred cred;
1130 struct timeval tv;
1131 size_t sz = 0, asz = 0;
1132 fd_set s;
1133
1134 if (!cgm_get_value(contrl, cg, file, &tmpdata))
1135 return false;
1136
1137 /*
1138 * Now we read the pids from returned data one by one, pass
1139 * them into a child in the target namespace, read back the
1140 * translated pids, and put them into our to-return data
1141 */
1142
1143 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1144 perror("socketpair");
1145 free(tmpdata);
1146 return false;
1147 }
1148
1149 cpid = fork();
1150 if (cpid == -1)
1151 goto out;
1152
1153 if (!cpid) // child
1154 pid_to_ns_wrapper(sock[1], tpid);
1155
1156 char *ptr = tmpdata;
1157 cred.uid = 0;
1158 cred.gid = 0;
1159 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1160 cred.pid = qpid;
1161 ret = send_creds(sock[0], &cred, v, true);
1162
1163 if (ret == SEND_CREDS_NOTSK)
1164 goto next;
1165 if (ret == SEND_CREDS_FAIL)
1166 goto out;
1167
1168 // read converted results
1169 FD_ZERO(&s);
1170 FD_SET(sock[0], &s);
1171 tv.tv_sec = 2;
1172 tv.tv_usec = 0;
1173 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1174 if (ret <= 0) {
1175 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1176 __func__, strerror(errno));
1177 goto out;
1178 }
1179 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1180 fprintf(stderr, "%s: error reading pid from child: %s\n",
1181 __func__, strerror(errno));
1182 goto out;
1183 }
1184 must_strcat_pid(d, &sz, &asz, qpid);
1185 next:
1186 ptr = strchr(ptr, '\n');
1187 if (!ptr)
1188 break;
1189 ptr++;
1190 }
1191
1192 cred.pid = getpid();
1193 v = '1';
1194 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1195 // failed to ask child to exit
1196 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1197 __func__, strerror(errno));
1198 goto out;
1199 }
1200
1201 answer = true;
1202
1203 out:
1204 free(tmpdata);
1205 if (cpid != -1)
1206 wait_for_pid(cpid);
1207 if (sock[0] != -1) {
1208 close(sock[0]);
1209 close(sock[1]);
1210 }
1211 return answer;
1212 }
1213
1214 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1215 struct fuse_file_info *fi)
1216 {
1217 struct fuse_context *fc = fuse_get_context();
1218 struct file_info *f = (struct file_info *)fi->fh;
1219 struct cgm_keys *k = NULL;
1220 char *data = NULL;
1221 int ret, s;
1222 bool r;
1223
1224 if (f->type != LXC_TYPE_CGFILE) {
1225 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1226 return -EIO;
1227 }
1228
1229 if (offset)
1230 return 0;
1231
1232 if (!fc)
1233 return -EIO;
1234
1235 if (!f->controller)
1236 return -EINVAL;
1237
1238 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1239 return -EINVAL;
1240 }
1241 free_key(k);
1242
1243
1244 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1245 ret = -EACCES;
1246 goto out;
1247 }
1248
1249 if (strcmp(f->file, "tasks") == 0 ||
1250 strcmp(f->file, "/tasks") == 0 ||
1251 strcmp(f->file, "/cgroup.procs") == 0 ||
1252 strcmp(f->file, "cgroup.procs") == 0)
1253 // special case - we have to translate the pids
1254 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1255 else
1256 r = cgm_get_value(f->controller, f->cgroup, f->file, &data);
1257
1258 if (!r) {
1259 ret = -EINVAL;
1260 goto out;
1261 }
1262
1263 if (!data) {
1264 ret = 0;
1265 goto out;
1266 }
1267 s = strlen(data);
1268 if (s > size)
1269 s = size;
1270 memcpy(buf, data, s);
1271 if (s > 0 && s < size && data[s-1] != '\n')
1272 buf[s++] = '\n';
1273
1274 ret = s;
1275
1276 out:
1277 free(data);
1278 return ret;
1279 }
1280
1281 static void pid_from_ns(int sock, pid_t tpid)
1282 {
1283 pid_t vpid;
1284 struct ucred cred;
1285 char v;
1286 struct timeval tv;
1287 fd_set s;
1288 int ret;
1289
1290 cred.uid = 0;
1291 cred.gid = 0;
1292 while (1) {
1293 FD_ZERO(&s);
1294 FD_SET(sock, &s);
1295 tv.tv_sec = 2;
1296 tv.tv_usec = 0;
1297 ret = select(sock+1, &s, NULL, NULL, &tv);
1298 if (ret <= 0) {
1299 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1300 __func__, strerror(errno));
1301 _exit(1);
1302 }
1303 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1304 fprintf(stderr, "%s: bad read from parent: %s\n",
1305 __func__, strerror(errno));
1306 _exit(1);
1307 }
1308 if (vpid == -1) // done
1309 break;
1310 v = '0';
1311 cred.pid = vpid;
1312 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1313 v = '1';
1314 cred.pid = getpid();
1315 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1316 _exit(1);
1317 }
1318 }
1319 _exit(0);
1320 }
1321
/*
 * Enter the pid namespace of tpid, then fork a child to run
 * pid_from_ns() on sock.  The fork is needed because only children
 * forked after setns() are in the target pidns.
 *
 * The forked child acknowledges its start by writing '1' to a pipe.  If
 * no valid ack arrives within one second the child is killed and a new
 * one is forked.
 *
 * Never returns: exits with 0 if the child exited cleanly and with 1 on
 * any setup failure or if the child failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/* wait_for_pid() returns 0 on success, so non-zero means failure */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1384
1385 static bool do_write_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, const char *buf)
1386 {
1387 int sock[2] = {-1, -1};
1388 pid_t qpid, cpid = -1;
1389 bool answer = false, fail = false;
1390
1391 /*
1392 * write the pids to a socket, have helper in writer's pidns
1393 * call movepid for us
1394 */
1395 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1396 perror("socketpair");
1397 exit(1);
1398 }
1399
1400 cpid = fork();
1401 if (cpid == -1)
1402 goto out;
1403
1404 if (!cpid) // child
1405 pid_from_ns_wrapper(sock[1], tpid);
1406
1407 const char *ptr = buf;
1408 while (sscanf(ptr, "%d", &qpid) == 1) {
1409 struct ucred cred;
1410 char v;
1411
1412 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1413 fprintf(stderr, "%s: error writing pid to child: %s\n",
1414 __func__, strerror(errno));
1415 goto out;
1416 }
1417
1418 if (recv_creds(sock[0], &cred, &v)) {
1419 if (v == '0') {
1420 if (!cgm_move_pid(contrl, cg, cred.pid))
1421 fail = true;
1422 }
1423 }
1424
1425 ptr = strchr(ptr, '\n');
1426 if (!ptr)
1427 break;
1428 ptr++;
1429 }
1430
1431 /* All good, write the value */
1432 qpid = -1;
1433 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1434 fprintf(stderr, "Warning: failed to ask child to exit\n");
1435
1436 if (!fail)
1437 answer = true;
1438
1439 out:
1440 if (cpid != -1)
1441 wait_for_pid(cpid);
1442 if (sock[0] != -1) {
1443 close(sock[0]);
1444 close(sock[1]);
1445 }
1446 return answer;
1447 }
1448
1449 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1450 struct fuse_file_info *fi)
1451 {
1452 struct fuse_context *fc = fuse_get_context();
1453 char *localbuf = NULL;
1454 struct cgm_keys *k = NULL;
1455 struct file_info *f = (struct file_info *)fi->fh;
1456 bool r;
1457
1458 if (f->type != LXC_TYPE_CGFILE) {
1459 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1460 return -EIO;
1461 }
1462
1463 if (offset)
1464 return 0;
1465
1466 if (!fc)
1467 return -EIO;
1468
1469 localbuf = alloca(size+1);
1470 localbuf[size] = '\0';
1471 memcpy(localbuf, buf, size);
1472
1473 if ((k = get_cgroup_key(f->controller, f->cgroup, f->file)) == NULL) {
1474 size = -EINVAL;
1475 goto out;
1476 }
1477
1478 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1479 size = -EACCES;
1480 goto out;
1481 }
1482
1483 if (strcmp(f->file, "tasks") == 0 ||
1484 strcmp(f->file, "/tasks") == 0 ||
1485 strcmp(f->file, "/cgroup.procs") == 0 ||
1486 strcmp(f->file, "cgroup.procs") == 0)
1487 // special case - we have to translate the pids
1488 r = do_write_pids(fc->pid, f->controller, f->cgroup, f->file, localbuf);
1489 else
1490 r = cgm_set_value(f->controller, f->cgroup, f->file, localbuf);
1491
1492 if (!r)
1493 size = -EINVAL;
1494
1495 out:
1496 free_key(k);
1497 return size;
1498 }
1499
1500 int cg_chown(const char *path, uid_t uid, gid_t gid)
1501 {
1502 struct fuse_context *fc = fuse_get_context();
1503 char *cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1504 struct cgm_keys *k = NULL;
1505 const char *cgroup;
1506 int ret;
1507
1508 if (!fc)
1509 return -EIO;
1510
1511 if (strcmp(path, "/cgroup") == 0)
1512 return -EINVAL;
1513
1514 controller = pick_controller_from_path(fc, path);
1515 if (!controller)
1516 return -EINVAL;
1517 cgroup = find_cgroup_in_path(path);
1518 if (!cgroup)
1519 /* this is just /cgroup/controller */
1520 return -EINVAL;
1521
1522 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1523
1524 if (!fpath) {
1525 path1 = "/";
1526 path2 = cgdir;
1527 } else {
1528 path1 = cgdir;
1529 path2 = fpath;
1530 }
1531
1532 if (is_child_cgroup(controller, path1, path2)) {
1533 // get uid, gid, from '/tasks' file and make up a mode
1534 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1535 k = get_cgroup_key(controller, cgroup, "tasks");
1536
1537 } else
1538 k = get_cgroup_key(controller, path1, path2);
1539
1540 if (!k) {
1541 ret = -EINVAL;
1542 goto out;
1543 }
1544
1545 /*
1546 * This being a fuse request, the uid and gid must be valid
1547 * in the caller's namespace. So we can just check to make
1548 * sure that the caller is root in his uid, and privileged
1549 * over the file's current owner.
1550 */
1551 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1552 ret = -EACCES;
1553 goto out;
1554 }
1555
1556 if (!cgm_chown_file(controller, cgroup, uid, gid)) {
1557 ret = -EINVAL;
1558 goto out;
1559 }
1560
1561 ret = 0;
1562
1563 out:
1564 free_key(k);
1565 free(cgdir);
1566
1567 return ret;
1568 }
1569
1570 int cg_chmod(const char *path, mode_t mode)
1571 {
1572 struct fuse_context *fc = fuse_get_context();
1573 char * cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1574 struct cgm_keys *k = NULL;
1575 const char *cgroup;
1576 int ret;
1577
1578 if (!fc)
1579 return -EIO;
1580
1581 if (strcmp(path, "/cgroup") == 0)
1582 return -EINVAL;
1583
1584 controller = pick_controller_from_path(fc, path);
1585 if (!controller)
1586 return -EINVAL;
1587 cgroup = find_cgroup_in_path(path);
1588 if (!cgroup)
1589 /* this is just /cgroup/controller */
1590 return -EINVAL;
1591
1592 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1593
1594 if (!fpath) {
1595 path1 = "/";
1596 path2 = cgdir;
1597 } else {
1598 path1 = cgdir;
1599 path2 = fpath;
1600 }
1601
1602 if (is_child_cgroup(controller, path1, path2)) {
1603 // get uid, gid, from '/tasks' file and make up a mode
1604 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1605 k = get_cgroup_key(controller, cgroup, "tasks");
1606
1607 } else
1608 k = get_cgroup_key(controller, path1, path2);
1609
1610 if (!k) {
1611 ret = -EINVAL;
1612 goto out;
1613 }
1614
1615 /*
1616 * This being a fuse request, the uid and gid must be valid
1617 * in the caller's namespace. So we can just check to make
1618 * sure that the caller is root in his uid, and privileged
1619 * over the file's current owner.
1620 */
1621 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1622 ret = -EPERM;
1623 goto out;
1624 }
1625
1626 if (!cgm_chmod_file(controller, cgroup, mode)) {
1627 ret = -EINVAL;
1628 goto out;
1629 }
1630
1631 ret = 0;
1632 out:
1633 free_key(k);
1634 free(cgdir);
1635 return ret;
1636 }
1637
1638 int cg_mkdir(const char *path, mode_t mode)
1639 {
1640 struct fuse_context *fc = fuse_get_context();
1641 char *fpath = NULL, *path1, *cgdir = NULL, *controller;
1642 const char *cgroup;
1643 int ret;
1644
1645 if (!fc)
1646 return -EIO;
1647
1648
1649 controller = pick_controller_from_path(fc, path);
1650 if (!controller)
1651 return -EINVAL;
1652
1653 cgroup = find_cgroup_in_path(path);
1654 if (!cgroup)
1655 return -EINVAL;
1656
1657 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1658 if (!fpath)
1659 path1 = "/";
1660 else
1661 path1 = cgdir;
1662
1663 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1664 ret = -EACCES;
1665 goto out;
1666 }
1667 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
1668 ret = -EACCES;
1669 goto out;
1670 }
1671
1672 if (fc->uid == 0 && fc->gid == 0) {
1673 if (!cgm_create(controller, cgroup)) {
1674 ret = -EINVAL;
1675 goto out;
1676 }
1677 } else {
1678 /*
1679 * exec a helerp so as to get a clean dbus connection
1680 * 17 for lxcfs_mkdir, and spaces and newline and \0. 50 for two ints.
1681 * 50 for two ints
1682 */
1683 size_t len = strlen(cgroup) + strlen(controller) + 17 + 50;
1684 char *cmd = alloca(len);
1685 ret = snprintf(cmd, len, "lxcfs_mkdir %d %d %s %s\n",
1686 fc->uid, fc->gid, controller, cgroup);
1687 if (ret < 0 || ret >= len) {
1688 ret = -EINVAL;
1689 goto out;
1690 }
1691 ret = system(cmd);
1692 if (ret != 0)
1693 goto out;
1694 }
1695
1696 ret = 0;
1697
1698 out:
1699 free(cgdir);
1700 return ret;
1701 }
1702
1703 static int cg_rmdir(const char *path)
1704 {
1705 struct fuse_context *fc = fuse_get_context();
1706 char *fpath = NULL, *cgdir = NULL, *controller;
1707 const char *cgroup;
1708 int ret;
1709
1710 if (!fc)
1711 return -EIO;
1712
1713 controller = pick_controller_from_path(fc, path);
1714 if (!controller)
1715 return -EINVAL;
1716
1717 cgroup = find_cgroup_in_path(path);
1718 if (!cgroup)
1719 return -EINVAL;
1720
1721 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1722 if (!fpath) {
1723 ret = -EINVAL;
1724 goto out;
1725 }
1726
1727 fprintf(stderr, "rmdir: verifying access to %s:%s (req path %s)\n",
1728 controller, cgdir, path);
1729 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1730 ret = -EACCES;
1731 goto out;
1732 }
1733 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
1734 ret = -EACCES;
1735 goto out;
1736 }
1737
1738 if (!cgm_remove(controller, cgroup)) {
1739 ret = -EINVAL;
1740 goto out;
1741 }
1742
1743 ret = 0;
1744
1745 out:
1746 free(cgdir);
1747 return ret;
1748 }
1749
/* Return true when line begins with the string pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1756
/*
 * Scan the newline-separated text in memstat for the first line starting
 * with "total_cache", parse the unsigned number that follows and store it,
 * divided by 1024, in *v.  *v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur;

	*v = 0;
	for (cur = memstat; *cur; ) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1774
/*
 * Scan the newline-separated text in str for the first line starting with
 * "<major>:<minor> <iotype>" and store the unsigned number that follows in
 * *v.  *v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
1797
1798 static int read_file(const char *path, char *buf, size_t size,
1799 struct file_info *d)
1800 {
1801 size_t linelen = 0, total_len = 0, rv = 0;
1802 char *line = NULL;
1803 char *cache = d->buf;
1804 size_t cache_size = d->buflen;
1805 FILE *f = fopen(path, "r");
1806 if (!f)
1807 return 0;
1808
1809 while (getline(&line, &linelen, f) != -1) {
1810 size_t l = snprintf(cache, cache_size, "%s", line);
1811 if (l < 0) {
1812 perror("Error writing to cache");
1813 rv = 0;
1814 goto err;
1815 }
1816 if (l >= cache_size) {
1817 fprintf(stderr, "Internal error: truncated write to cache\n");
1818 rv = 0;
1819 goto err;
1820 }
1821 if (l < cache_size) {
1822 cache += l;
1823 cache_size -= l;
1824 total_len += l;
1825 } else {
1826 cache += cache_size;
1827 total_len += cache_size;
1828 cache_size = 0;
1829 break;
1830 }
1831 }
1832
1833 d->size = total_len;
1834 if (total_len > size ) total_len = size;
1835
1836 /* read from off 0 */
1837 memcpy(buf, d->buf, total_len);
1838 rv = total_len;
1839 err:
1840 fclose(f);
1841 free(line);
1842 return rv;
1843 }
1844
1845 /*
1846 * FUSE ops for /proc
1847 */
1848
1849 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1850 struct fuse_file_info *fi)
1851 {
1852 struct fuse_context *fc = fuse_get_context();
1853 struct file_info *d = (struct file_info *)fi->fh;
1854 char *cg;
1855 char *memlimit_str = NULL, *memusage_str = NULL, *memstat_str = NULL;
1856 unsigned long memlimit = 0, memusage = 0, cached = 0, hosttotal = 0;
1857 char *line = NULL;
1858 size_t linelen = 0, total_len = 0, rv = 0;
1859 char *cache = d->buf;
1860 size_t cache_size = d->buflen;
1861 FILE *f = NULL;
1862
1863 if (offset){
1864 if (offset > d->size)
1865 return -EINVAL;
1866 int left = d->size - offset;
1867 total_len = left > size ? size: left;
1868 memcpy(buf, cache + offset, total_len);
1869 return total_len;
1870 }
1871
1872 cg = get_pid_cgroup(fc->pid, "memory");
1873 if (!cg)
1874 return read_file("/proc/meminfo", buf, size, d);
1875
1876 if (!cgm_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
1877 goto err;
1878 if (!cgm_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1879 goto err;
1880 if (!cgm_get_value("memory", cg, "memory.stat", &memstat_str))
1881 goto err;
1882 memlimit = strtoul(memlimit_str, NULL, 10);
1883 memusage = strtoul(memusage_str, NULL, 10);
1884 memlimit /= 1024;
1885 memusage /= 1024;
1886 get_mem_cached(memstat_str, &cached);
1887
1888 f = fopen("/proc/meminfo", "r");
1889 if (!f)
1890 goto err;
1891
1892 while (getline(&line, &linelen, f) != -1) {
1893 size_t l;
1894 char *printme, lbuf[100];
1895
1896 memset(lbuf, 0, 100);
1897 if (startswith(line, "MemTotal:")) {
1898 sscanf(line+14, "%lu", &hosttotal);
1899 if (hosttotal < memlimit)
1900 memlimit = hosttotal;
1901 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1902 printme = lbuf;
1903 } else if (startswith(line, "MemFree:")) {
1904 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1905 printme = lbuf;
1906 } else if (startswith(line, "MemAvailable:")) {
1907 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1908 printme = lbuf;
1909 } else if (startswith(line, "Buffers:")) {
1910 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
1911 printme = lbuf;
1912 } else if (startswith(line, "Cached:")) {
1913 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
1914 printme = lbuf;
1915 } else if (startswith(line, "SwapCached:")) {
1916 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
1917 printme = lbuf;
1918 } else
1919 printme = line;
1920
1921 l = snprintf(cache, cache_size, "%s", printme);
1922 if (l < 0) {
1923 perror("Error writing to cache");
1924 rv = 0;
1925 goto err;
1926
1927 }
1928 if (l >= cache_size) {
1929 fprintf(stderr, "Internal error: truncated write to cache\n");
1930 rv = 0;
1931 goto err;
1932 }
1933
1934 cache += l;
1935 cache_size -= l;
1936 total_len += l;
1937 }
1938
1939 d->size = total_len;
1940 if (total_len > size ) total_len = size;
1941 memcpy(buf, d->buf, total_len);
1942
1943 rv = total_len;
1944 err:
1945 if (f)
1946 fclose(f);
1947 free(line);
1948 free(cg);
1949 free(memlimit_str);
1950 free(memusage_str);
1951 free(memstat_str);
1952 return rv;
1953 }
1954
/*
 * Look up cpuset.cpus for cgroup cg with cgm_get_value().
 * On success the newly allocated string is returned and must be freed by
 * the caller; on failure NULL is returned.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgm_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
1967
1968 bool cpu_in_cpuset(int cpu, const char *cpuset);
1969
/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * cpu_in_cpuset() accepts it for cpuset.  A line that does not parse
 * yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
1978
/*
 * Return true when line is a "processor : N" line of /proc/cpuinfo,
 * i.e. when it parses as the word "processor", a colon and an integer.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
1990
1991 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
1992 struct fuse_file_info *fi)
1993 {
1994 struct fuse_context *fc = fuse_get_context();
1995 struct file_info *d = (struct file_info *)fi->fh;
1996 char *cg;
1997 char *cpuset = NULL;
1998 char *line = NULL;
1999 size_t linelen = 0, total_len = 0, rv = 0;
2000 bool am_printing = false;
2001 int curcpu = -1;
2002 char *cache = d->buf;
2003 size_t cache_size = d->buflen;
2004 FILE *f = NULL;
2005
2006 if (offset){
2007 if (offset > d->size)
2008 return -EINVAL;
2009 int left = d->size - offset;
2010 total_len = left > size ? size: left;
2011 memcpy(buf, cache + offset, total_len);
2012 return total_len;
2013 }
2014
2015 cg = get_pid_cgroup(fc->pid, "cpuset");
2016 if (!cg)
2017 return read_file("proc/cpuinfo", buf, size, d);
2018
2019 cpuset = get_cpuset(cg);
2020 if (!cpuset)
2021 goto err;
2022
2023 f = fopen("/proc/cpuinfo", "r");
2024 if (!f)
2025 goto err;
2026
2027 while (getline(&line, &linelen, f) != -1) {
2028 size_t l;
2029 if (is_processor_line(line)) {
2030 am_printing = cpuline_in_cpuset(line, cpuset);
2031 if (am_printing) {
2032 curcpu ++;
2033 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2034 if (l < 0) {
2035 perror("Error writing to cache");
2036 rv = 0;
2037 goto err;
2038 }
2039 if (l >= cache_size) {
2040 fprintf(stderr, "Internal error: truncated write to cache\n");
2041 rv = 0;
2042 goto err;
2043 }
2044 if (l < cache_size){
2045 cache += l;
2046 cache_size -= l;
2047 total_len += l;
2048 }else{
2049 cache += cache_size;
2050 total_len += cache_size;
2051 cache_size = 0;
2052 break;
2053 }
2054 }
2055 continue;
2056 }
2057 if (am_printing) {
2058 l = snprintf(cache, cache_size, "%s", line);
2059 if (l < 0) {
2060 perror("Error writing to cache");
2061 rv = 0;
2062 goto err;
2063 }
2064 if (l >= cache_size) {
2065 fprintf(stderr, "Internal error: truncated write to cache\n");
2066 rv = 0;
2067 goto err;
2068 }
2069 if (l < cache_size) {
2070 cache += l;
2071 cache_size -= l;
2072 total_len += l;
2073 } else {
2074 cache += cache_size;
2075 total_len += cache_size;
2076 cache_size = 0;
2077 break;
2078 }
2079 }
2080 }
2081
2082 d->size = total_len;
2083 if (total_len > size ) total_len = size;
2084
2085 /* read from off 0 */
2086 memcpy(buf, d->buf, total_len);
2087 rv = total_len;
2088 err:
2089 if (f)
2090 fclose(f);
2091 free(line);
2092 free(cpuset);
2093 free(cg);
2094 return rv;
2095 }
2096
2097 static int proc_stat_read(char *buf, size_t size, off_t offset,
2098 struct fuse_file_info *fi)
2099 {
2100 struct fuse_context *fc = fuse_get_context();
2101 struct file_info *d = (struct file_info *)fi->fh;
2102 char *cg;
2103 char *cpuset = NULL;
2104 char *line = NULL;
2105 size_t linelen = 0, total_len = 0, rv = 0;
2106 int curcpu = -1; /* cpu numbering starts at 0 */
2107 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2108 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2109 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2110 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2111 char cpuall[CPUALL_MAX_SIZE];
2112 /* reserve for cpu all */
2113 char *cache = d->buf + CPUALL_MAX_SIZE;
2114 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2115 FILE *f = NULL;
2116
2117 if (offset){
2118 if (offset > d->size)
2119 return -EINVAL;
2120 int left = d->size - offset;
2121 total_len = left > size ? size: left;
2122 memcpy(buf, d->buf + offset, total_len);
2123 return total_len;
2124 }
2125
2126 cg = get_pid_cgroup(fc->pid, "cpuset");
2127 if (!cg)
2128 return read_file("/proc/stat", buf, size, d);
2129
2130 cpuset = get_cpuset(cg);
2131 if (!cpuset)
2132 goto err;
2133
2134 f = fopen("/proc/stat", "r");
2135 if (!f)
2136 goto err;
2137
2138 //skip first line
2139 if (getline(&line, &linelen, f) < 0) {
2140 fprintf(stderr, "proc_stat_read read first line failed\n");
2141 goto err;
2142 }
2143
2144 while (getline(&line, &linelen, f) != -1) {
2145 size_t l;
2146 int cpu;
2147 char cpu_char[10]; /* That's a lot of cores */
2148 char *c;
2149
2150 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2151 /* not a ^cpuN line containing a number N, just print it */
2152 l = snprintf(cache, cache_size, "%s", line);
2153 if (l < 0) {
2154 perror("Error writing to cache");
2155 rv = 0;
2156 goto err;
2157 }
2158 if (l >= cache_size) {
2159 fprintf(stderr, "Internal error: truncated write to cache\n");
2160 rv = 0;
2161 goto err;
2162 }
2163 if (l < cache_size) {
2164 cache += l;
2165 cache_size -= l;
2166 total_len += l;
2167 continue;
2168 } else {
2169 //no more space, break it
2170 cache += cache_size;
2171 total_len += cache_size;
2172 cache_size = 0;
2173 break;
2174 }
2175 }
2176
2177 if (sscanf(cpu_char, "%d", &cpu) != 1)
2178 continue;
2179 if (!cpu_in_cpuset(cpu, cpuset))
2180 continue;
2181 curcpu ++;
2182
2183 c = strchr(line, ' ');
2184 if (!c)
2185 continue;
2186 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2187 if (l < 0) {
2188 perror("Error writing to cache");
2189 rv = 0;
2190 goto err;
2191
2192 }
2193 if (l >= cache_size) {
2194 fprintf(stderr, "Internal error: truncated write to cache\n");
2195 rv = 0;
2196 goto err;
2197 }
2198
2199 cache += l;
2200 cache_size -= l;
2201 total_len += l;
2202
2203 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2204 &softirq, &steal, &guest) != 9)
2205 continue;
2206 user_sum += user;
2207 nice_sum += nice;
2208 system_sum += system;
2209 idle_sum += idle;
2210 iowait_sum += iowait;
2211 irq_sum += irq;
2212 softirq_sum += softirq;
2213 steal_sum += steal;
2214 guest_sum += guest;
2215 }
2216
2217 cache = d->buf;
2218
2219 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2220 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2221 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2222 memcpy(cache, cpuall, cpuall_len);
2223 cache += cpuall_len;
2224 } else{
2225 /* shouldn't happen */
2226 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2227 cpuall_len = 0;
2228 }
2229
2230 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2231 total_len += cpuall_len;
2232 d->size = total_len;
2233 if (total_len > size ) total_len = size;
2234
2235 memcpy(buf, d->buf, total_len);
2236 rv = total_len;
2237
2238 err:
2239 if (f)
2240 fclose(f);
2241 free(line);
2242 free(cpuset);
2243 free(cg);
2244 return rv;
2245 }
2246
2247 /*
2248 * How to guess what to present for uptime?
2249 * One thing we could do would be to take the date on the caller's
2250 * memory.usage_in_bytes file, which should equal the time of creation
2251 * of his cgroup. However, a task could be in a sub-cgroup of the
2252 * container. The same problem exists if we try to look at the ages
2253 * of processes in the caller's cgroup.
2254 *
2255 * So we'll fork a task that will enter the caller's pidns, mount a
2256 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
2257 *
2258 * For the second uptime #, we'll do as Stéphane had done, just copy
2259 * the number from /proc/uptime. Not sure how to best emulate 'idle'
2260 * time. Maybe someone can come up with a good algorithm and submit a
2261 * patch. Maybe something based on cpushare info?
2262 */
2263
2264 /* return age of the reaper for $pid, taken from ctime of its procdir */
2265 static long int get_pid1_time(pid_t pid)
2266 {
2267 char fnam[100];
2268 int fd, cpipe[2], ret;
2269 struct stat sb;
2270 pid_t cpid;
2271 struct timeval tv;
2272 fd_set s;
2273 char v;
2274
2275 if (unshare(CLONE_NEWNS))
2276 return 0;
2277
2278 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
2279 perror("rslave mount failed");
2280 return 0;
2281 }
2282
2283 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
2284 if (ret < 0 || ret >= sizeof(fnam))
2285 return 0;
2286
2287 fd = open(fnam, O_RDONLY);
2288 if (fd < 0) {
2289 perror("get_pid1_time open of ns/pid");
2290 return 0;
2291 }
2292 if (setns(fd, 0)) {
2293 perror("get_pid1_time setns 1");
2294 close(fd);
2295 return 0;
2296 }
2297 close(fd);
2298
2299 if (pipe(cpipe) < 0)
2300 exit(1);
2301
2302 loop:
2303 cpid = fork();
2304 if (cpid < 0)
2305 return 0;
2306
2307 if (!cpid) {
2308 char b = '1';
2309 close(cpipe[0]);
2310 if (write(cpipe[1], &b, sizeof(char)) < 0) {
2311 fprintf(stderr, "%s (child): erorr on write: %s\n",
2312 __func__, strerror(errno));
2313 }
2314 close(cpipe[1]);
2315 umount2("/proc", MNT_DETACH);
2316 if (mount("proc", "/proc", "proc", 0, NULL)) {
2317 perror("get_pid1_time mount");
2318 return 0;
2319 }
2320 ret = lstat("/proc/1", &sb);
2321 if (ret) {
2322 perror("get_pid1_time lstat");
2323 return 0;
2324 }
2325 return time(NULL) - sb.st_ctime;
2326 }
2327
2328 // give the child 1 second to be done forking and
2329 // write it's ack
2330 FD_ZERO(&s);
2331 FD_SET(cpipe[0], &s);
2332 tv.tv_sec = 1;
2333 tv.tv_usec = 0;
2334 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2335 if (ret <= 0)
2336 goto again;
2337 ret = read(cpipe[0], &v, 1);
2338 if (ret != sizeof(char) || v != '1') {
2339 goto again;
2340 }
2341
2342 wait_for_pid(cpid);
2343 _exit(0);
2344
2345 again:
2346 kill(cpid, SIGKILL);
2347 wait_for_pid(cpid);
2348 goto loop;
2349 }
2350
/*
 * Return the value computed by get_pid1_time(qpid), obtained from a
 * forked child over a pipe, or 0 on failure.
 *
 * The child runs get_pid1_time() and writes the result to the pipe.  The
 * parent waits up to one second for the value to become readable, then
 * reaps the child with wait_for_pid().  Failure to create the pipe or to
 * fork, a select error or timeout, and a short read all yield 0.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		/* no child to wait for: just release the pipe */
		close(mypipe[0]);
		close(mypipe[1]);
		return 0;
	}

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		_exit(0);
	}

	close(mypipe[1]);
	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2396
/*
 * Return the second field of /proc/uptime (the idle time), truncated to
 * whole seconds.  Returns 0 when the file cannot be opened or parsed.
 *
 * Both fields of /proc/uptime are decimal fractions, so they are read as
 * doubles; an integer conversion would stop at the first '.' and leave
 * the second field unmatched.
 */
static long int getprocidle(void)
{
	FILE *f = fopen("/proc/uptime", "r");
	double age, idle;
	int ret;

	if (!f)
		return 0;
	ret = fscanf(f, "%lf %lf", &age, &idle);
	fclose(f);
	if (ret != 2)
		return 0;
	return (long int)idle;
}
2410
2411 /*
2412 * We read /proc/uptime and reuse its second field.
2413 * For the first field, we use the mtime for the reaper for
2414 * the calling pid as returned by getreaperage
2415 */
2416 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2417 struct fuse_file_info *fi)
2418 {
2419 struct fuse_context *fc = fuse_get_context();
2420 struct file_info *d = (struct file_info *)fi->fh;
2421 long int reaperage = getreaperage(fc->pid);;
2422 long int idletime = getprocidle();
2423 size_t total_len = 0;
2424
2425 if (offset){
2426 if (offset > d->size)
2427 return -EINVAL;
2428 return 0;
2429 }
2430
2431 total_len = snprintf(buf, size, "%ld %ld\n", reaperage, idletime);
2432 if (total_len < 0){
2433 perror("Error writing to cache");
2434 return 0;
2435 }
2436 if (total_len >= size){
2437 d->size = size;
2438 return size;
2439 }
2440
2441 d->size = total_len;
2442 return total_len;
2443 }
2444
2445 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2446 struct fuse_file_info *fi)
2447 {
2448 char dev_name[72];
2449 struct fuse_context *fc = fuse_get_context();
2450 struct file_info *d = (struct file_info *)fi->fh;
2451 char *cg;
2452 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2453 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2454 unsigned long read = 0, write = 0;
2455 unsigned long read_merged = 0, write_merged = 0;
2456 unsigned long read_sectors = 0, write_sectors = 0;
2457 unsigned long read_ticks = 0, write_ticks = 0;
2458 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2459 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2460 char *line = NULL;
2461 size_t linelen = 0, total_len = 0, rv = 0;
2462 unsigned int major = 0, minor = 0;
2463 int i = 0;
2464 FILE *f = NULL;
2465
2466 if (offset){
2467 if (offset > d->size)
2468 return -EINVAL;
2469 return 0;
2470 }
2471
2472 cg = get_pid_cgroup(fc->pid, "blkio");
2473 if (!cg)
2474 return read_file("/proc/diskstats", buf, size, d);
2475
2476 if (!cgm_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2477 goto err;
2478 if (!cgm_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2479 goto err;
2480 if (!cgm_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2481 goto err;
2482 if (!cgm_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2483 goto err;
2484 if (!cgm_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2485 goto err;
2486
2487
2488 f = fopen("/proc/diskstats", "r");
2489 if (!f)
2490 goto err;
2491
2492 while (getline(&line, &linelen, f) != -1) {
2493 size_t l;
2494 char *printme, lbuf[256];
2495
2496 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2497 if(i == 3){
2498 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2499 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2500 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2501 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2502 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2503 read_sectors = read_sectors/512;
2504 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2505 write_sectors = write_sectors/512;
2506
2507 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2508 rd_svctm = rd_svctm/1000000;
2509 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2510 rd_wait = rd_wait/1000000;
2511 read_ticks = rd_svctm + rd_wait;
2512
2513 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2514 wr_svctm = wr_svctm/1000000;
2515 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2516 wr_wait = wr_wait/1000000;
2517 write_ticks = wr_svctm + wr_wait;
2518
2519 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2520 tot_ticks = tot_ticks/1000000;
2521 }else{
2522 continue;
2523 }
2524
2525 memset(lbuf, 0, 256);
2526 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2527 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2528 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2529 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2530 printme = lbuf;
2531 } else
2532 continue;
2533
2534 l = snprintf(buf, size, "%s", printme);
2535 if (l < 0) {
2536 perror("Error writing to fuse buf");
2537 rv = 0;
2538 goto err;
2539 }
2540 if (l >= size) {
2541 fprintf(stderr, "Internal error: truncated write to cache\n");
2542 rv = 0;
2543 goto err;
2544 }
2545 buf += l;
2546 size -= l;
2547 total_len += l;
2548 }
2549
2550 d->size = total_len;
2551 rv = total_len;
2552 err:
2553 free(cg);
2554 if (f)
2555 fclose(f);
2556 free(line);
2557 free(io_serviced_str);
2558 free(io_merged_str);
2559 free(io_service_bytes_str);
2560 free(io_wait_time_str);
2561 free(io_service_time_str);
2562 return rv;
2563 }
2564
/*
 * Return the total number of bytes obtained by reading the file named
 * which line by line, or 0 when it cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;

	if (!fp)
		return 0;

	for (;;) {
		n = getline(&linebuf, &cap, fp);
		if (n == -1)
			break;
		total += n;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2581
/*
 * getattr for the /proc subtree.
 *
 * path: path relative to the lxcfs mountpoint.
 * sb:   stat buffer to fill; it is zeroed first.
 *
 * "/proc" is reported as a 0555 directory and each emulated proc file as
 * a 0444 regular file of size 0.  All entries are owned by uid/gid 0 and
 * carry the current time as atime/mtime/ctime.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT
 * for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const proc_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec ts;
	size_t idx;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = ts;
	sb->st_mtim = ts;
	sb->st_ctim = ts;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(proc_files) / sizeof(proc_files[0]); idx++) {
		if (strcmp(path, proc_files[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2609
2610 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2611 struct fuse_file_info *fi)
2612 {
2613 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2614 filler(buf, "meminfo", NULL, 0) != 0 ||
2615 filler(buf, "stat", NULL, 0) != 0 ||
2616 filler(buf, "uptime", NULL, 0) != 0 ||
2617 filler(buf, "diskstats", NULL, 0) != 0)
2618 return -EINVAL;
2619 return 0;
2620 }
2621
2622 static int proc_open(const char *path, struct fuse_file_info *fi)
2623 {
2624 int type = -1;
2625 struct file_info *info;
2626
2627 if (strcmp(path, "/proc/meminfo") == 0)
2628 type = LXC_TYPE_PROC_MEMINFO;
2629 else if (strcmp(path, "/proc/cpuinfo") == 0)
2630 type = LXC_TYPE_PROC_CPUINFO;
2631 else if (strcmp(path, "/proc/uptime") == 0)
2632 type = LXC_TYPE_PROC_UPTIME;
2633 else if (strcmp(path, "/proc/stat") == 0)
2634 type = LXC_TYPE_PROC_STAT;
2635 else if (strcmp(path, "/proc/diskstats") == 0)
2636 type = LXC_TYPE_PROC_DISKSTATS;
2637 if (type == -1)
2638 return -ENOENT;
2639
2640 info = malloc(sizeof(*info));
2641 if (!info)
2642 return -ENOMEM;
2643
2644 memset(info, 0, sizeof(*info));
2645 info->type = type;
2646
2647 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2648 do {
2649 info->buf = malloc(info->buflen);
2650 } while (!info->buf);
2651 memset(info->buf, 0, info->buflen);
2652 /* set actual size to buffer size */
2653 info->size = info->buflen;
2654
2655 fi->fh = (unsigned long)info;
2656 return 0;
2657 }
2658
2659 static int proc_release(const char *path, struct fuse_file_info *fi)
2660 {
2661 struct file_info *f = (struct file_info *)fi->fh;
2662
2663 do_release_file_info(f);
2664 return 0;
2665 }
2666
2667 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2668 struct fuse_file_info *fi)
2669 {
2670 struct file_info *f = (struct file_info *) fi->fh;
2671
2672 switch (f->type) {
2673 case LXC_TYPE_PROC_MEMINFO:
2674 return proc_meminfo_read(buf, size, offset, fi);
2675 case LXC_TYPE_PROC_CPUINFO:
2676 return proc_cpuinfo_read(buf, size, offset, fi);
2677 case LXC_TYPE_PROC_UPTIME:
2678 return proc_uptime_read(buf, size, offset, fi);
2679 case LXC_TYPE_PROC_STAT:
2680 return proc_stat_read(buf, size, offset, fi);
2681 case LXC_TYPE_PROC_DISKSTATS:
2682 return proc_diskstats_read(buf, size, offset, fi);
2683 default:
2684 return -EINVAL;
2685 }
2686 }
2687
2688 /*
2689 * FUSE ops for /
2690 * these just delegate to the /proc and /cgroup ops as
2691 * needed
2692 */
2693
2694 static int lxcfs_getattr(const char *path, struct stat *sb)
2695 {
2696 if (strcmp(path, "/") == 0) {
2697 sb->st_mode = S_IFDIR | 00755;
2698 sb->st_nlink = 2;
2699 return 0;
2700 }
2701 if (strncmp(path, "/cgroup", 7) == 0) {
2702 return cg_getattr(path, sb);
2703 }
2704 if (strncmp(path, "/proc", 5) == 0) {
2705 return proc_getattr(path, sb);
2706 }
2707 return -EINVAL;
2708 }
2709
/*
 * opendir for the lxcfs root.
 *
 * "/" and "/proc" need no per-open state and succeed immediately; paths
 * starting with "/cgroup" are delegated to cg_opendir().  Anything else
 * yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return -ENOENT;
}
2722
2723 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2724 struct fuse_file_info *fi)
2725 {
2726 if (strcmp(path, "/") == 0) {
2727 if (filler(buf, "proc", NULL, 0) != 0 ||
2728 filler(buf, "cgroup", NULL, 0) != 0)
2729 return -EINVAL;
2730 return 0;
2731 }
2732 if (strncmp(path, "/cgroup", 7) == 0)
2733 return cg_readdir(path, buf, filler, offset, fi);
2734 if (strcmp(path, "/proc") == 0)
2735 return proc_readdir(path, buf, filler, offset, fi);
2736 return -EINVAL;
2737 }
2738
/*
 * releasedir for the lxcfs root.
 *
 * "/" and "/proc" have nothing to release and return 0; paths starting
 * with "/cgroup" are delegated to cg_releasedir().  Anything else yields
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return -EINVAL;
}
2750
/*
 * open: route to cg_open() for paths starting with "/cgroup" and to
 * proc_open() for paths starting with "/proc"; -EINVAL otherwise.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_open(path, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_open(path, fi);
	}

	return -EINVAL;
}
2760
/*
 * read: route to cg_read() for paths starting with "/cgroup" and to
 * proc_read() for paths starting with "/proc"; -EINVAL otherwise.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_read(path, buf, size, offset, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_read(path, buf, size, offset, fi);
	}

	return -EINVAL;
}
2771
/*
 * write: only paths starting with "/cgroup" can be written; they are
 * passed to cg_write().  Every other path yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
2781
/* flush: nothing to do for any path; always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
2786
/*
 * release: route to cg_release() for paths starting with "/cgroup" and to
 * proc_release() for paths starting with "/proc"; -EINVAL otherwise.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	if (!strncmp(path, "/cgroup", 7)) {
		return cg_release(path, fi);
	} else if (!strncmp(path, "/proc", 5)) {
		return proc_release(path, fi);
	}

	return -EINVAL;
}
2796
/* fsync: nothing to do for any path; always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
2801
/*
 * mkdir: directories can only be created below "/cgroup", via cg_mkdir().
 * Every other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_mkdir(path, mode) : -EINVAL;
}
2809
/*
 * chown: ownership can only be changed below "/cgroup", via cg_chown().
 * Every other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_chown(path, uid, gid) : -EINVAL;
}
2817
/*
 * truncate: a writer such as cat truncates the file before it calls
 * ops->write.  Truncation has no real meaning for cgroup files, so for
 * paths starting with "/cgroup" this reports success without doing
 * anything.  Every other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) != 0 ? -EINVAL : 0;
}
2829
/*
 * rmdir: directories can only be removed below "/cgroup", via cg_rmdir().
 * Every other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_rmdir(path) : -EINVAL;
}
2836
/*
 * chmod: modes can only be changed below "/cgroup", via cg_chmod().
 * Every other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	return strncmp(path, "/cgroup", 7) == 0 ? cg_chmod(path, mode) : -EINVAL;
}
2843
2844 const struct fuse_operations lxcfs_ops = {
2845 .getattr = lxcfs_getattr,
2846 .readlink = NULL,
2847 .getdir = NULL,
2848 .mknod = NULL,
2849 .mkdir = lxcfs_mkdir,
2850 .unlink = NULL,
2851 .rmdir = lxcfs_rmdir,
2852 .symlink = NULL,
2853 .rename = NULL,
2854 .link = NULL,
2855 .chmod = lxcfs_chmod,
2856 .chown = lxcfs_chown,
2857 .truncate = lxcfs_truncate,
2858 .utime = NULL,
2859
2860 .open = lxcfs_open,
2861 .read = lxcfs_read,
2862 .release = lxcfs_release,
2863 .write = lxcfs_write,
2864
2865 .statfs = NULL,
2866 .flush = lxcfs_flush,
2867 .fsync = lxcfs_fsync,
2868
2869 .setxattr = NULL,
2870 .getxattr = NULL,
2871 .listxattr = NULL,
2872 .removexattr = NULL,
2873
2874 .opendir = lxcfs_opendir,
2875 .readdir = lxcfs_readdir,
2876 .releasedir = lxcfs_releasedir,
2877
2878 .fsyncdir = NULL,
2879 .init = NULL,
2880 .destroy = NULL,
2881 .access = NULL,
2882 .create = NULL,
2883 .ftruncate = NULL,
2884 .fgetattr = NULL,
2885 };
2886
/*
 * Print the command-line synopsis to stderr and exit with status 1.
 *
 * me: program name to show in the synopsis.
 */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);

	exit(1);
}
2895
/*
 * Return true if @w is one of the recognised spellings of the help
 * option ("-h", "--help", "-help" or "help"), false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const help_words[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(help_words) / sizeof(help_words[0]); i++) {
		if (strcmp(w, help_words[i]) == 0)
			return true;
	}

	return false;
}
2905
/*
 * Remove the first argument equal to @which from a NULL-terminated
 * argument vector.
 *
 * argcp: argument count; decremented when an argument is removed.
 * argv:  NULL-terminated argument vector; argv[0] is never examined.
 * which: the argument to drop.
 *
 * Later arguments (and the terminating NULL) are shifted down by one
 * slot.  Nothing changes if @which does not occur.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	int pos = 1;

	/* find the first match, if any */
	while (argv[pos] && strcmp(argv[pos], which) != 0)
		pos++;
	if (!argv[pos])
		return;

	/* close the gap, carrying the NULL terminator along */
	do {
		argv[pos] = argv[pos + 1];
		pos++;
	} while (argv[pos]);

	(*argcp)--;
}
2920
/*
 * Remove the first "@opt <value>" pair from a NULL-terminated argument
 * vector.
 *
 * argcp: argument count; reduced by two when a pair is removed.
 * argv:  NULL-terminated argument vector; argv[0] is never examined.
 * opt:   the option to look for (e.g. "-o").
 * v:     the only value accepted for that option.
 *
 * An @opt that is the last argument (no value follows) is left alone.
 * If @opt is followed by anything other than @v, the offending value is
 * reported on stderr and the process exits with status 1.  Otherwise the
 * arguments after the pair (and the terminating NULL) are shifted down by
 * two slots.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	/* a match needs a following value, so stop once none is left */
	for (i = 1; argv[i] && argv[i+1]; i++) {
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* name the value that was found, not the one expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+2] is in bounds because argv[i+1] is non-NULL */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
2941
2942 int main(int argc, char *argv[])
2943 {
2944 int ret = -1;
2945 struct lxcfs_state *d = NULL;
2946 /*
2947 * what we pass to fuse_main is:
2948 * argv[0] -s -f -o allow_other,directio argv[1] NULL
2949 */
2950 int nargs = 5, cnt = 0;
2951 char *newargv[6];
2952
2953 /* accomodate older init scripts */
2954 swallow_arg(&argc, argv, "-s");
2955 swallow_arg(&argc, argv, "-f");
2956 swallow_option(&argc, argv, "-o", "allow_other");
2957
2958 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
2959 fprintf(stderr, "%s\n", VERSION);
2960 exit(0);
2961 }
2962 if (argc != 2 || is_help(argv[1]))
2963 usage(argv[0]);
2964
2965 do {
2966 d = malloc(sizeof(*d));
2967 } while (!d);
2968
2969 newargv[cnt++] = argv[0];
2970 newargv[cnt++] = "-f";
2971 newargv[cnt++] = "-o";
2972 newargv[cnt++] = "allow_other,direct_io";
2973 newargv[cnt++] = argv[1];
2974 newargv[cnt++] = NULL;
2975
2976 if (!cgm_escape_cgroup())
2977 fprintf(stderr, "WARNING: failed to escape to root cgroup\n");
2978
2979 if (!cgm_get_controllers(&d->subsystems))
2980 goto out;
2981
2982 ret = fuse_main(nargs, newargv, &lxcfs_ops, d);
2983 cgm_dbus_disconnect();
2984
2985 out:
2986 free(d);
2987 return ret;
2988 }