]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
meminfo: fix report swap usage
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014,2015 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 /*
10 * TODO XXX
11 * sanitize paths for '..', cgmanager's not doing that for us any more
12 * does fuse help us?
13 * Surely there are more paths we'll need to sanitize - look back through
14 * cgmanager's sources.
15 */
16
17 #define FUSE_USE_VERSION 26
18
19 #include <stdio.h>
20 #include <dirent.h>
21 #include <fcntl.h>
22 #include <fuse.h>
23 #include <unistd.h>
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <time.h>
27 #include <string.h>
28 #include <stdlib.h>
29 #include <libgen.h>
30 #include <sched.h>
31 #include <linux/sched.h>
32 #include <sys/socket.h>
33 #include <sys/mount.h>
34 #include <wait.h>
35
36 #ifdef FORTRAVIS
37 #define GLIB_DISABLE_DEPRECATION_WARNINGS
38 #include <glib-object.h>
39 #endif
40
41 #include "cgfs.h"
42 #include "config.h" // for VERSION
43
/*
 * Kind of object a struct file_info handle describes.
 * cg_opendir() tags its handles LXC_TYPE_CGDIR and cg_open() tags its
 * handles LXC_TYPE_CGFILE; the PROC_* values presumably identify the
 * emulated /proc files (their users are not visible in this part of
 * the file).
 */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
53
/*
 * Per-open-handle state, stored in fuse_file_info->fh by the open/opendir
 * handlers and released by do_release_file_info(), which frees the
 * controller, cgroup, file and buf pointers and then the struct itself.
 */
struct file_info {
	char *controller;	/* controller name, or NULL */
	char *cgroup;		/* cgroup path, or NULL */
	char *file;		/* file (key) name, or NULL for directories */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;		/* presumably the allocated size of buf */
	int size;		/* actual data size */
	int cached;		/* NOTE(review): meaning not visible here */
};
64
/* reserve buffer size, for cpuall in /proc/stat */
#define BUF_RESERVE_SIZE 256

/*
 * Append "<pid>\n" to the heap string *src, growing the buffer when needed.
 *
 * src: pointer to the char* to append to; *src may be NULL, in which case
 *      a fresh buffer of BUF_RESERVE_SIZE bytes is allocated.
 * sz:  number of characters stored so far, not counting the trailing \0;
 *      updated on return.
 * asz: allocated size of *src so far; updated on return.
 * pid: the pid to append.
 *
 * Allocation is retried until it succeeds, so this never fails.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char tmp[30];
	char *d = *src;
	size_t tmplen;

	/* "%d\n" of an int always fits in 30 bytes */
	tmplen = (size_t)snprintf(tmp, sizeof(tmp), "%d\n", (int)pid);

	/*
	 * Compare the values, not the pointers: the old check did pointer
	 * arithmetic on sz/asz and so did not test the real fill level.
	 */
	if (!d || tmplen + *sz + 1 >= *asz) {
		size_t newsz = (d ? *asz : 0) + BUF_RESERVE_SIZE;
		char *grown;

		/* keep d intact across failed attempts so nothing is leaked */
		do {
			grown = realloc(d, newsz);
		} while (!grown);
		d = grown;
		*src = d;
		*asz = newsz;
	}

	memcpy(d + *sz, tmp, tmplen);
	*sz += tmplen;
	d[*sz] = '\0';
}
99
/*
 * Reap child @pid, retrying when waitpid() is interrupted by a signal.
 * Returns 0 only if the child exited normally with status 0, -1 on any
 * waitpid() error or any other kind of termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	for (;;) {
		int reaped = waitpid(pid, &status, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}
		if (reaped == pid)
			break;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
117
/*
 * Translate @in_id through an id map.
 *
 * @idfile is an open FILE * on a /proc/pid/{u,g}id_map style file; it is
 * rewound before reading. Each line holds three unsigned numbers: the
 * base id inside the namespace, the base id in the caller's namespace,
 * and the length of the range. Lines that do not parse are skipped.
 *
 * Returns the id @in_id maps to inside the namespace, or -1 (as an
 * unsigned int) if no range covers it or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];
	unsigned int ns_base, host_base, range;

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile) != NULL) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		/*
		 * A range that wraps around is unexpected in a proc file,
		 * so give up rather than guess.
		 */
		if (host_base + range < host_base || ns_base + range < ns_base) {
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return ns_base + (in_id - host_base);
	}

	/* no range matched */
	return -1;
}
161
/*
 * For is_privileged_over(): whether the calling uid must be root in its
 * own user namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, has
 * privilege over @victim.
 *
 * Returns false if either id is -1. When @req_ns_root is false, identical
 * ids are sufficient. Otherwise @uid must map to 0 in /proc/@pid/uid_map
 * and @victim must be mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	FILE *f;
	bool answer;
	int ret;

	if (victim == -1 || uid == -1)
		return false;

	/* e.g. uid 1000 has access to files owned by uid 1000 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;

	f = fopen(fpath, "r");
	if (!f)
		return false;

	/*
	 * The caller must be root in its namespace, and the victim must be
	 * mapped into that namespace. (The original author was unsure the
	 * second check is needed, since the vfs already converts ids.)
	 */
	answer = convert_id_to_ns(f, uid) == 0 &&
		 convert_id_to_ns(f, victim) != -1;

	fclose(f);
	return answer;
}
217
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode requested in @req_mode (an open(2)-style O_RDONLY / O_WRONLY /
 * O_RDWR value). Callers shift the owner or group bits down into the
 * "other" position before calling. Any unrecognised access mode is refused.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
237
238
/*
 * Return the path component of @taskcg that follows the prefix @querycg.
 *
 * E.g. taskcg "a/b/c/d" with querycg "a/b" gives "c", and taskcg "/a/b"
 * with querycg "/" gives "a". @taskcg must be strictly longer than
 * @querycg; otherwise a message is printed and NULL is returned.
 *
 * The result is a fresh heap string the caller must free, or NULL on
 * bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* skip the query prefix plus the '/' separating it from the rest */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* keep only the first component */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
264
/* Remove a single trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
271
272 static char *get_pid_cgroup(pid_t pid, const char *contrl)
273 {
274 char fnam[PROCLEN];
275 FILE *f;
276 char *answer = NULL;
277 char *line = NULL;
278 size_t len = 0;
279 int ret;
280 const char *h = find_mounted_controller(contrl);
281 if (!h)
282 return NULL;
283
284 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
285 if (ret < 0 || ret >= PROCLEN)
286 return NULL;
287 if (!(f = fopen(fnam, "r")))
288 return NULL;
289
290 while (getline(&line, &len, f) != -1) {
291 char *c1, *c2;
292 if (!line[0])
293 continue;
294 c1 = strchr(line, ':');
295 if (!c1)
296 goto out;
297 c1++;
298 c2 = strchr(c1, ':');
299 if (!c2)
300 goto out;
301 *c2 = '\0';
302 if (strcmp(c1, h) != 0)
303 continue;
304 c2++;
305 stripnewline(c2);
306 do {
307 answer = strdup(c2);
308 } while (!answer);
309 break;
310 }
311
312 out:
313 fclose(f);
314 free(line);
315 return answer;
316 }
317
318 /*
319 * check whether a fuse context may access a cgroup dir or file
320 *
321 * If file is not null, it is a cgroup file to check under cg.
322 * If file is null, then we are checking perms on cg itself.
323 *
324 * For files we can check the mode of the list_keys result.
325 * For cgroups, we must make assumptions based on the files under the
326 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
327 * yet.
328 */
329 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
330 {
331 struct cgfs_files *k = NULL;
332 bool ret = false;
333
334 if (!file)
335 file = "tasks";
336
337 if (*file == '/')
338 file++;
339
340 k = cgfs_get_key(contrl, cg, file);
341 if (!k)
342 return false;
343
344 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
345 if (perms_include(k->mode >> 6, mode)) {
346 ret = true;
347 goto out;
348 }
349 }
350 if (fc->gid == k->gid) {
351 if (perms_include(k->mode >> 3, mode)) {
352 ret = true;
353 goto out;
354 }
355 }
356 ret = perms_include(k->mode, mode);
357
358 out:
359 free_key(k);
360 return ret;
361 }
362
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg, in
 * place. If the whole path is "/init.scope" it becomes "/". Paths that
 * do not end in INITSCOPE, including ones shorter than it, are left
 * untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths before doing pointer arithmetic: computing
	 * cg + cg_len - scope_len for a short string would form a pointer
	 * before the start of the buffer, which is undefined behaviour.
	 */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		point[1] = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
377
/*
 * If caller is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If caller is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when the caller's own cgroup (for controller @contrl,
 * with a trailing init.scope pruned) is a string prefix of @cg.
 *
 * When the answer is false because the prefix test failed and @nextcg is
 * not NULL, *nextcg is set to the result of get_next_cgroup_dir(): the
 * next directory under @cg on the way to the caller's cgroup, to be
 * freed by the caller, or NULL. *nextcg is left untouched when the
 * caller's cgroup cannot be determined.
 *
 * NOTE(review): the test is a plain strncmp() prefix match with no check
 * that the prefix ends on a '/' boundary, so a caller in "a/b" also
 * matches "a/bc" - confirm whether that is intended.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool answer;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass "/" for the root cgroup and otherwise a cgroup
	 * without leading '/', so drop our own leading '/' to match.
	 */
	cmp = (*cg == '/') ? taskcg : taskcg + 1;

	answer = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return answer;
}
412
/*
 * If caller is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true when @cg (given without leading '/', or "/" for the root)
 * is the caller's own cgroup, one of its parents, or one of its
 * descendants, for controller @contrl. The root "/" is always visible,
 * and a caller that itself lives in the root cgroup can see everything.
 * Returns false when the caller's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	/* drop the leading '/' so both paths use the same form */
	task_cg = c2[0] ? c2 + 1 : c2;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cgroup, so every cgroup is a
		 * descendant. The comparisons below cannot express this:
		 * they look for the '/' separator, which for the root is
		 * the very '/' that was just chopped off.
		 */
		answer = true;
		goto out;
	}
	if (strcmp(cg, task_cg) == 0) {
		answer = true;
		goto out;
	}
	if (target_len < task_len) {
		/* looking up a parent dir */
		if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/')
			answer = true;
		goto out;
	}
	if (target_len > task_len) {
		/* looking up a child dir */
		if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/')
			answer = true;
		goto out;
	}

out:
	free(c2);
	return answer;
}
454
455 /*
456 * given /cgroup/freezer/a/b, return "freezer".
457 * the returned char* should NOT be freed.
458 */
459 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
460 {
461 const char *p1;
462 char *contr, *slash;
463
464 if (strlen(path) < 9)
465 return NULL;
466 if (*(path+7) != '/')
467 return NULL;
468 p1 = path+8;
469 contr = strdupa(p1);
470 if (!contr)
471 return NULL;
472 slash = strstr(contr, "/");
473 if (slash)
474 *slash = '\0';
475
476 int i;
477 for (i = 0; i < num_hierarchies; i++) {
478 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
479 return hierarchies[i];
480 }
481 return NULL;
482 }
483
/*
 * Find the start of the cgroup part in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the '/' that ends the
 * controller name, or NULL if the path is shorter than 9 characters or
 * has nothing after the controller. The returned value may include
 * files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	/* offset 8 is where the controller name begins */
	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
499
/*
 * Split @cg at its last '/'.
 *
 * *dir receives a heap copy of @cg cut off at the last '/', or the
 * whole string if there is none; it should be freed by the caller.
 * *file points into @cg at that last '/', so it includes the slash, or
 * is NULL if @cg has no '/'; it should not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **file)
{
	char *copy;

	/* retry until the copy succeeds */
	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	*file = strrchr(cg, '/');
	if (*file == NULL)
		return;

	/* cut the copy at the same offset */
	copy[*file - cg] = '\0';
}
518
519 /*
520 * FUSE ops for /cgroup
521 */
522
523 static int cg_getattr(const char *path, struct stat *sb)
524 {
525 struct timespec now;
526 struct fuse_context *fc = fuse_get_context();
527 char * cgdir = NULL;
528 char *fpath = NULL, *path1, *path2;
529 struct cgfs_files *k = NULL;
530 const char *cgroup;
531 const char *controller = NULL;
532 int ret = -ENOENT;
533
534
535 if (!fc)
536 return -EIO;
537
538 memset(sb, 0, sizeof(struct stat));
539
540 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
541 return -EINVAL;
542
543 sb->st_uid = sb->st_gid = 0;
544 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
545 sb->st_size = 0;
546
547 if (strcmp(path, "/cgroup") == 0) {
548 sb->st_mode = S_IFDIR | 00755;
549 sb->st_nlink = 2;
550 return 0;
551 }
552
553 controller = pick_controller_from_path(fc, path);
554 if (!controller)
555 return -EIO;
556 cgroup = find_cgroup_in_path(path);
557 if (!cgroup) {
558 /* this is just /cgroup/controller, return it as a dir */
559 sb->st_mode = S_IFDIR | 00755;
560 sb->st_nlink = 2;
561 return 0;
562 }
563
564 get_cgdir_and_path(cgroup, &cgdir, &fpath);
565
566 if (!fpath) {
567 path1 = "/";
568 path2 = cgdir;
569 } else {
570 path1 = cgdir;
571 path2 = fpath;
572 }
573
574 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
575 * Then check that caller's cgroup is under path if fpath is a child
576 * cgroup, or cgdir if fpath is a file */
577
578 if (is_child_cgroup(controller, path1, path2)) {
579 if (!caller_may_see_dir(fc->pid, controller, cgroup)) {
580 ret = -ENOENT;
581 goto out;
582 }
583 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
584 /* this is just /cgroup/controller, return it as a dir */
585 sb->st_mode = S_IFDIR | 00555;
586 sb->st_nlink = 2;
587 ret = 0;
588 goto out;
589 }
590 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
591 ret = -EACCES;
592 goto out;
593 }
594
595 // get uid, gid, from '/tasks' file and make up a mode
596 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
597 sb->st_mode = S_IFDIR | 00755;
598 k = cgfs_get_key(controller, cgroup, "tasks");
599 if (!k) {
600 sb->st_uid = sb->st_gid = 0;
601 } else {
602 sb->st_uid = k->uid;
603 sb->st_gid = k->gid;
604 }
605 free_key(k);
606 sb->st_nlink = 2;
607 ret = 0;
608 goto out;
609 }
610
611 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
612 sb->st_mode = S_IFREG | k->mode;
613 sb->st_nlink = 1;
614 sb->st_uid = k->uid;
615 sb->st_gid = k->gid;
616 sb->st_size = 0;
617 free_key(k);
618 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
619 ret = -ENOENT;
620 goto out;
621 }
622 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
623 ret = -EACCES;
624 goto out;
625 }
626
627 ret = 0;
628 }
629
630 out:
631 free(cgdir);
632 return ret;
633 }
634
635 static int cg_opendir(const char *path, struct fuse_file_info *fi)
636 {
637 struct fuse_context *fc = fuse_get_context();
638 const char *cgroup;
639 struct file_info *dir_info;
640 char *controller = NULL;
641
642 if (!fc)
643 return -EIO;
644
645 if (strcmp(path, "/cgroup") == 0) {
646 cgroup = NULL;
647 controller = NULL;
648 } else {
649 // return list of keys for the controller, and list of child cgroups
650 controller = pick_controller_from_path(fc, path);
651 if (!controller)
652 return -EIO;
653
654 cgroup = find_cgroup_in_path(path);
655 if (!cgroup) {
656 /* this is just /cgroup/controller, return its contents */
657 cgroup = "/";
658 }
659 }
660
661 if (cgroup) {
662 if (!caller_may_see_dir(fc->pid, controller, cgroup))
663 return -ENOENT;
664 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
665 return -EACCES;
666 }
667
668 /* we'll free this at cg_releasedir */
669 dir_info = malloc(sizeof(*dir_info));
670 if (!dir_info)
671 return -ENOMEM;
672 dir_info->controller = must_copy_string(controller);
673 dir_info->cgroup = must_copy_string(cgroup);
674 dir_info->type = LXC_TYPE_CGDIR;
675 dir_info->buf = NULL;
676 dir_info->file = NULL;
677 dir_info->buflen = 0;
678
679 fi->fh = (unsigned long)dir_info;
680 return 0;
681 }
682
683 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
684 struct fuse_file_info *fi)
685 {
686 struct file_info *d = (struct file_info *)fi->fh;
687 struct cgfs_files **list = NULL;
688 int i, ret;
689 char *nextcg = NULL;
690 struct fuse_context *fc = fuse_get_context();
691 char **clist = NULL;
692
693 if (d->type != LXC_TYPE_CGDIR) {
694 fprintf(stderr, "Internal error: file cache info used in readdir\n");
695 return -EIO;
696 }
697 if (!d->cgroup && !d->controller) {
698 // ls /var/lib/lxcfs/cgroup - just show list of controllers
699 int i;
700
701 for (i = 0; i < num_hierarchies; i++) {
702 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
703 return -EIO;
704 }
705 }
706 return 0;
707 }
708
709 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
710 // not a valid cgroup
711 ret = -EINVAL;
712 goto out;
713 }
714
715 if (!caller_is_in_ancestor(fc->pid, d->controller, d->cgroup, &nextcg)) {
716 if (nextcg) {
717 int ret;
718 ret = filler(buf, nextcg, NULL, 0);
719 free(nextcg);
720 if (ret != 0) {
721 ret = -EIO;
722 goto out;
723 }
724 }
725 ret = 0;
726 goto out;
727 }
728
729 for (i = 0; list[i]; i++) {
730 if (filler(buf, list[i]->name, NULL, 0) != 0) {
731 ret = -EIO;
732 goto out;
733 }
734 }
735
736 // now get the list of child cgroups
737
738 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
739 ret = 0;
740 goto out;
741 }
742 for (i = 0; clist[i]; i++) {
743 if (filler(buf, clist[i], NULL, 0) != 0) {
744 ret = -EIO;
745 goto out;
746 }
747 }
748 ret = 0;
749
750 out:
751 free_keys(list);
752 if (clist) {
753 for (i = 0; clist[i]; i++)
754 free(clist[i]);
755 free(clist);
756 }
757 return ret;
758 }
759
760 static void do_release_file_info(struct file_info *f)
761 {
762 if (!f)
763 return;
764 free(f->controller);
765 free(f->cgroup);
766 free(f->file);
767 free(f->buf);
768 free(f);
769 }
770
771 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
772 {
773 struct file_info *d = (struct file_info *)fi->fh;
774
775 do_release_file_info(d);
776 return 0;
777 }
778
779 static int cg_open(const char *path, struct fuse_file_info *fi)
780 {
781 const char *cgroup;
782 char *fpath = NULL, *path1, *path2, * cgdir = NULL, *controller;
783 struct cgfs_files *k = NULL;
784 struct file_info *file_info;
785 struct fuse_context *fc = fuse_get_context();
786 int ret;
787
788 if (!fc)
789 return -EIO;
790
791 controller = pick_controller_from_path(fc, path);
792 if (!controller)
793 return -EIO;
794 cgroup = find_cgroup_in_path(path);
795 if (!cgroup)
796 return -EINVAL;
797
798 get_cgdir_and_path(cgroup, &cgdir, &fpath);
799 if (!fpath) {
800 path1 = "/";
801 path2 = cgdir;
802 } else {
803 path1 = cgdir;
804 path2 = fpath;
805 }
806
807 k = cgfs_get_key(controller, path1, path2);
808 if (!k) {
809 ret = -EINVAL;
810 goto out;
811 }
812 free_key(k);
813
814 if (!caller_may_see_dir(fc->pid, controller, path1)) {
815 ret = -ENOENT;
816 goto out;
817 }
818 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
819 // should never get here
820 ret = -EACCES;
821 goto out;
822 }
823
824 /* we'll free this at cg_release */
825 file_info = malloc(sizeof(*file_info));
826 if (!file_info) {
827 ret = -ENOMEM;
828 goto out;
829 }
830 file_info->controller = must_copy_string(controller);
831 file_info->cgroup = must_copy_string(path1);
832 file_info->file = must_copy_string(path2);
833 file_info->type = LXC_TYPE_CGFILE;
834 file_info->buf = NULL;
835 file_info->buflen = 0;
836
837 fi->fh = (unsigned long)file_info;
838 ret = 0;
839
840 out:
841 free(cgdir);
842 return ret;
843 }
844
845 static int cg_release(const char *path, struct fuse_file_info *fi)
846 {
847 struct file_info *f = (struct file_info *)fi->fh;
848
849 do_release_file_info(f);
850 return 0;
851 }
852
853 static int msgrecv(int sockfd, void *buf, size_t len)
854 {
855 struct timeval tv;
856 fd_set rfds;
857
858 FD_ZERO(&rfds);
859 FD_SET(sockfd, &rfds);
860 tv.tv_sec = 2;
861 tv.tv_usec = 0;
862
863 if (select(sockfd+1, &rfds, NULL, NULL, &tv) <= 0)
864 return -1;
865 return recv(sockfd, buf, len, MSG_DONTWAIT);
866 }
867
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/*
 * Send the one-byte payload @v over @sock with @cred attached as an
 * SCM_CREDENTIALS control message.
 *
 * If @pingfirst is true, first wait (via msgrecv) for a one-byte message
 * from the peer before sending.
 *
 * Returns SEND_CREDS_OK on success, SEND_CREDS_NOTSK if sendmsg() failed
 * with ESRCH, and SEND_CREDS_FAIL for a missing ping or any other
 * sendmsg() error.
 */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	buf[0] = 'p';

	if (pingfirst) {
		if (msgrecv(sock, buf, 1) != 1) {
			fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
				  __func__);
			return SEND_CREDS_FAIL;
		}
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_CREDENTIALS;
	memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));

	msg.msg_name = NULL;
	msg.msg_namelen = 0;

	buf[0] = v;
	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (sendmsg(sock, &msg, 0) < 0) {
		/*
		 * Save errno before logging: fprintf() and strerror() are
		 * allowed to change it, and the check below depends on it.
		 */
		int saved_errno = errno;

		fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
			  strerror(saved_errno));
		if (saved_errno == ESRCH)
			return SEND_CREDS_NOTSK;
		return SEND_CREDS_FAIL;
	}

	return SEND_CREDS_OK;
}
916
/*
 * Receive one credential message on @sock.
 *
 * Enables SO_PASSCRED, writes a one-byte '1' to the peer (the ping that
 * send_creds() can wait for), then waits up to two seconds for a message.
 * If the message carries an SCM_CREDENTIALS control message of the
 * expected size, it is copied to *cred; otherwise *cred keeps pid, uid
 * and gid set to -1. *v receives the one-byte payload.
 *
 * Returns true if a message was received, false on any socket error or
 * timeout (in which case *v is '1').
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char payload[1];
	struct timeval timeout;
	fd_set readable;
	int passcred = 1;
	int ret;

	/* defaults reported when nothing usable arrives */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &passcred, sizeof(passcred)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	payload[0] = '1';
	if (write(sock, payload, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = payload;
	iov.iov_len = sizeof(payload);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	timeout.tv_sec = 2;
	timeout.tv_usec = 0;
	FD_ZERO(&readable);
	FD_SET(sock, &readable);
	if (select(sock + 1, &readable, NULL, NULL, &timeout) <= 0) {
		fprintf(stderr, "Failed to select for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = payload[0];
	return true;
}
982
983
/*
 * pid_to_ns - read pids from ucreds received over @sock and write each
 * pid value back over the socket as a raw pid_t. This shifts the pid
 * from the sender's pidns into the pidns this process runs in.
 *
 * Never returns: exits 0 when the peer sends the payload '1' or when
 * receiving fails, and exits 1 if a write back is short. @tpid is not
 * used.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1002
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you fork will be in the target
 * pidns. So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * @sock is handed to the child for pid_to_ns(); @tpid identifies the
 * process whose pid namespace is entered.
 *
 * Never returns. Exits 0 if the child acknowledged its start within one
 * second and later exited with status 0; exits 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	struct timeval tv;
	fd_set s;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	cpid = fork();
	if (cpid < 0)
		_exit(1);

	if (!cpid) {
		char b = '1';
		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	/*
	 * Parent: drop our copy of the write end, so that a child that dies
	 * before acknowledging shows up as EOF instead of a full timeout.
	 */
	close(cpipe[1]);

	// give the child 1 second to be done forking and
	// write its ack
	FD_ZERO(&s);
	FD_SET(cpipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
	if (ret <= 0)
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	/*
	 * wait_for_pid() returns 0 on success and -1 on failure; the old
	 * "!wait_for_pid()" test had the exit status inverted.
	 */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1063
1064 /*
1065 * To read cgroup files with a particular pid, we will setns into the child
1066 * pidns, open a pipe, fork a child - which will be the first to really be in
1067 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1068 */
1069 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1070 {
1071 int sock[2] = {-1, -1};
1072 char *tmpdata = NULL;
1073 int ret;
1074 pid_t qpid, cpid = -1;
1075 bool answer = false;
1076 char v = '0';
1077 struct ucred cred;
1078 struct timeval tv;
1079 size_t sz = 0, asz = 0;
1080 fd_set s;
1081
1082 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
1083 return false;
1084
1085 /*
1086 * Now we read the pids from returned data one by one, pass
1087 * them into a child in the target namespace, read back the
1088 * translated pids, and put them into our to-return data
1089 */
1090
1091 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1092 perror("socketpair");
1093 free(tmpdata);
1094 return false;
1095 }
1096
1097 cpid = fork();
1098 if (cpid == -1)
1099 goto out;
1100
1101 if (!cpid) // child - exits when done
1102 pid_to_ns_wrapper(sock[1], tpid);
1103
1104 char *ptr = tmpdata;
1105 cred.uid = 0;
1106 cred.gid = 0;
1107 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1108 cred.pid = qpid;
1109 ret = send_creds(sock[0], &cred, v, true);
1110
1111 if (ret == SEND_CREDS_NOTSK)
1112 goto next;
1113 if (ret == SEND_CREDS_FAIL)
1114 goto out;
1115
1116 // read converted results
1117 FD_ZERO(&s);
1118 FD_SET(sock[0], &s);
1119 tv.tv_sec = 2;
1120 tv.tv_usec = 0;
1121 ret = select(sock[0]+1, &s, NULL, NULL, &tv);
1122 if (ret <= 0) {
1123 fprintf(stderr, "%s: select error waiting for pid from child: %s\n",
1124 __func__, strerror(errno));
1125 goto out;
1126 }
1127 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1128 fprintf(stderr, "%s: error reading pid from child: %s\n",
1129 __func__, strerror(errno));
1130 goto out;
1131 }
1132 must_strcat_pid(d, &sz, &asz, qpid);
1133 next:
1134 ptr = strchr(ptr, '\n');
1135 if (!ptr)
1136 break;
1137 ptr++;
1138 }
1139
1140 cred.pid = getpid();
1141 v = '1';
1142 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1143 // failed to ask child to exit
1144 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1145 __func__, strerror(errno));
1146 goto out;
1147 }
1148
1149 answer = true;
1150
1151 out:
1152 free(tmpdata);
1153 if (cpid != -1)
1154 wait_for_pid(cpid);
1155 if (sock[0] != -1) {
1156 close(sock[0]);
1157 close(sock[1]);
1158 }
1159 return answer;
1160 }
1161
1162 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1163 struct fuse_file_info *fi)
1164 {
1165 struct fuse_context *fc = fuse_get_context();
1166 struct file_info *f = (struct file_info *)fi->fh;
1167 struct cgfs_files *k = NULL;
1168 char *data = NULL;
1169 int ret, s;
1170 bool r;
1171
1172 if (f->type != LXC_TYPE_CGFILE) {
1173 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1174 return -EIO;
1175 }
1176
1177 if (offset)
1178 return 0;
1179
1180 if (!fc)
1181 return -EIO;
1182
1183 if (!f->controller)
1184 return -EINVAL;
1185
1186 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1187 return -EINVAL;
1188 }
1189 free_key(k);
1190
1191
1192 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1193 ret = -EACCES;
1194 goto out;
1195 }
1196
1197 if (strcmp(f->file, "tasks") == 0 ||
1198 strcmp(f->file, "/tasks") == 0 ||
1199 strcmp(f->file, "/cgroup.procs") == 0 ||
1200 strcmp(f->file, "cgroup.procs") == 0)
1201 // special case - we have to translate the pids
1202 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1203 else
1204 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
1205
1206 if (!r) {
1207 ret = -EINVAL;
1208 goto out;
1209 }
1210
1211 if (!data) {
1212 ret = 0;
1213 goto out;
1214 }
1215 s = strlen(data);
1216 if (s > size)
1217 s = size;
1218 memcpy(buf, data, s);
1219 if (s > 0 && s < size && data[s-1] != '\n')
1220 buf[s++] = '\n';
1221
1222 ret = s;
1223
1224 out:
1225 free(data);
1226 return ret;
1227 }
1228
1229 static void pid_from_ns(int sock, pid_t tpid)
1230 {
1231 pid_t vpid;
1232 struct ucred cred;
1233 char v;
1234 struct timeval tv;
1235 fd_set s;
1236 int ret;
1237
1238 cred.uid = 0;
1239 cred.gid = 0;
1240 while (1) {
1241 FD_ZERO(&s);
1242 FD_SET(sock, &s);
1243 tv.tv_sec = 2;
1244 tv.tv_usec = 0;
1245 ret = select(sock+1, &s, NULL, NULL, &tv);
1246 if (ret <= 0) {
1247 fprintf(stderr, "%s: bad select before read from parent: %s\n",
1248 __func__, strerror(errno));
1249 _exit(1);
1250 }
1251 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1252 fprintf(stderr, "%s: bad read from parent: %s\n",
1253 __func__, strerror(errno));
1254 _exit(1);
1255 }
1256 if (vpid == -1) // done
1257 break;
1258 v = '0';
1259 cred.pid = vpid;
1260 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1261 v = '1';
1262 cred.pid = getpid();
1263 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1264 _exit(1);
1265 }
1266 }
1267 _exit(0);
1268 }
1269
/*
 * Enter the pid namespace of @tpid and fork a child that runs
 * pid_from_ns() on @sock. Only a forked child is really inside the
 * target pidns, so the fork is required.
 *
 * The child acknowledges its start by writing '1' to a pipe. If no
 * valid acknowledgement arrives within one second, the child is killed
 * and reaped and a new one is forked.
 *
 * Never returns. Exits 0 if the acknowledged child later exits with
 * status 0, and 1 if it fails or any setup step fails.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	fd_set s;
	struct timeval tv;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (!cpid) {
			char b = '1';
			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);	/* does not return */
		}

		// give the child 1 second to be done forking and
		// write its ack
		FD_ZERO(&s);
		FD_SET(cpipe[0], &s);
		tv.tv_sec = 1;
		tv.tv_usec = 0;
		ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
		if (ret > 0) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: discard this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	/*
	 * wait_for_pid() returns 0 on success and -1 on failure; the old
	 * "!wait_for_pid()" test had the exit status inverted.
	 */
	if (wait_for_pid(cpid) != 0)
		_exit(1);
	_exit(0);
}
1332
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * using /proc/@pid/uid_map.
 *
 * On success stores the mapped uid in *answer and returns true. Returns
 * false if the map cannot be opened (*answer untouched) or if @uid is
 * not mapped (*answer is then -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char path[400];
	FILE *f;

	sprintf(path, "/proc/%d/uid_map", pid);
	f = fopen(path, "r");
	if (f == NULL)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	return *answer != -1;
}
1354
/*
 * get_pid_creds: read the first value of the "Uid:" and "Gid:" lines of
 * /proc/@pid/status into *@uid and *@gid.
 * Both outputs are preset to -1 and keep that value if the file cannot be
 * opened or the corresponding line is missing; parsing stops at the first
 * malformed Uid/Gid line.
 * (XXX should we use euid here?)
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char buf[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	sprintf(buf, "/proc/%d/status", pid);
	status = fopen(buf, "r");
	if (!status) {
		fprintf(stderr, "Error opening %s: %s\n", buf, strerror(errno));
		return;
	}

	/* the path buffer is reused as the line buffer from here on */
	while (fgets(buf, 400, status)) {
		if (!strncmp(buf, "Uid:", 4)) {
			uid_t parsed_uid;

			if (sscanf(buf + 4, "%u", &parsed_uid) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
			continue;
		}
		if (!strncmp(buf, "Gid:", 4)) {
			gid_t parsed_gid;

			if (sscanf(buf + 4, "%u", &parsed_gid) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}
1393
/*
 * May the requestor @r (with host uid @r_uid) move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . @r_uid is 0,
 *   . @v is owned by @r_uid (per get_pid_creds), or
 *   . @r_uid maps to 0 in @r's uid_map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t victim_uid, mapped;
	gid_t victim_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &victim_uid, &victim_gid);
	if (victim_uid == r_uid)
		return true;

	/* requestor must be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped into that namespace */
	return hostuid_to_ns(victim_uid, r, &mapped);
}
1419
1420 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
1421 const char *file, const char *buf)
1422 {
1423 int sock[2] = {-1, -1};
1424 pid_t qpid, cpid = -1;
1425 FILE *pids_file = NULL;
1426 bool answer = false, fail = false;
1427
1428 pids_file = open_pids_file(contrl, cg);
1429 if (!pids_file)
1430 return false;
1431
1432 /*
1433 * write the pids to a socket, have helper in writer's pidns
1434 * call movepid for us
1435 */
1436 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1437 perror("socketpair");
1438 goto out;
1439 }
1440
1441 cpid = fork();
1442 if (cpid == -1)
1443 goto out;
1444
1445 if (!cpid) { // child
1446 fclose(pids_file);
1447 pid_from_ns_wrapper(sock[1], tpid);
1448 }
1449
1450 const char *ptr = buf;
1451 while (sscanf(ptr, "%d", &qpid) == 1) {
1452 struct ucred cred;
1453 char v;
1454
1455 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1456 fprintf(stderr, "%s: error writing pid to child: %s\n",
1457 __func__, strerror(errno));
1458 goto out;
1459 }
1460
1461 if (recv_creds(sock[0], &cred, &v)) {
1462 if (v == '0') {
1463 if (!may_move_pid(tpid, tuid, cred.pid)) {
1464 fail = true;
1465 break;
1466 }
1467 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
1468 fail = true;
1469 }
1470 }
1471
1472 ptr = strchr(ptr, '\n');
1473 if (!ptr)
1474 break;
1475 ptr++;
1476 }
1477
1478 /* All good, write the value */
1479 qpid = -1;
1480 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1481 fprintf(stderr, "Warning: failed to ask child to exit\n");
1482
1483 if (!fail)
1484 answer = true;
1485
1486 out:
1487 if (cpid != -1)
1488 wait_for_pid(cpid);
1489 if (sock[0] != -1) {
1490 close(sock[0]);
1491 close(sock[1]);
1492 }
1493 if (pids_file) {
1494 if (fclose(pids_file) != 0)
1495 answer = false;
1496 }
1497 return answer;
1498 }
1499
1500 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1501 struct fuse_file_info *fi)
1502 {
1503 struct fuse_context *fc = fuse_get_context();
1504 char *localbuf = NULL;
1505 struct cgfs_files *k = NULL;
1506 struct file_info *f = (struct file_info *)fi->fh;
1507 bool r;
1508
1509 if (f->type != LXC_TYPE_CGFILE) {
1510 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1511 return -EIO;
1512 }
1513
1514 if (offset)
1515 return 0;
1516
1517 if (!fc)
1518 return -EIO;
1519
1520 localbuf = alloca(size+1);
1521 localbuf[size] = '\0';
1522 memcpy(localbuf, buf, size);
1523
1524 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1525 size = -EINVAL;
1526 goto out;
1527 }
1528
1529 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1530 size = -EACCES;
1531 goto out;
1532 }
1533
1534 if (strcmp(f->file, "tasks") == 0 ||
1535 strcmp(f->file, "/tasks") == 0 ||
1536 strcmp(f->file, "/cgroup.procs") == 0 ||
1537 strcmp(f->file, "cgroup.procs") == 0)
1538 // special case - we have to translate the pids
1539 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1540 else
1541 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1542
1543 if (!r)
1544 size = -EINVAL;
1545
1546 out:
1547 free_key(k);
1548 return size;
1549 }
1550
1551 int cg_chown(const char *path, uid_t uid, gid_t gid)
1552 {
1553 struct fuse_context *fc = fuse_get_context();
1554 char *cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1555 struct cgfs_files *k = NULL;
1556 const char *cgroup;
1557 int ret;
1558
1559 if (!fc)
1560 return -EIO;
1561
1562 if (strcmp(path, "/cgroup") == 0)
1563 return -EINVAL;
1564
1565 controller = pick_controller_from_path(fc, path);
1566 if (!controller)
1567 return -EINVAL;
1568 cgroup = find_cgroup_in_path(path);
1569 if (!cgroup)
1570 /* this is just /cgroup/controller */
1571 return -EINVAL;
1572
1573 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1574
1575 if (!fpath) {
1576 path1 = "/";
1577 path2 = cgdir;
1578 } else {
1579 path1 = cgdir;
1580 path2 = fpath;
1581 }
1582
1583 if (is_child_cgroup(controller, path1, path2)) {
1584 // get uid, gid, from '/tasks' file and make up a mode
1585 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1586 k = cgfs_get_key(controller, cgroup, "tasks");
1587
1588 } else
1589 k = cgfs_get_key(controller, path1, path2);
1590
1591 if (!k) {
1592 ret = -EINVAL;
1593 goto out;
1594 }
1595
1596 /*
1597 * This being a fuse request, the uid and gid must be valid
1598 * in the caller's namespace. So we can just check to make
1599 * sure that the caller is root in his uid, and privileged
1600 * over the file's current owner.
1601 */
1602 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1603 ret = -EACCES;
1604 goto out;
1605 }
1606
1607 ret = cgfs_chown_file(controller, cgroup, uid, gid);
1608
1609 out:
1610 free_key(k);
1611 free(cgdir);
1612
1613 return ret;
1614 }
1615
1616 int cg_chmod(const char *path, mode_t mode)
1617 {
1618 struct fuse_context *fc = fuse_get_context();
1619 char * cgdir = NULL, *fpath = NULL, *path1, *path2, *controller;
1620 struct cgfs_files *k = NULL;
1621 const char *cgroup;
1622 int ret;
1623
1624 if (!fc)
1625 return -EIO;
1626
1627 if (strcmp(path, "/cgroup") == 0)
1628 return -EINVAL;
1629
1630 controller = pick_controller_from_path(fc, path);
1631 if (!controller)
1632 return -EINVAL;
1633 cgroup = find_cgroup_in_path(path);
1634 if (!cgroup)
1635 /* this is just /cgroup/controller */
1636 return -EINVAL;
1637
1638 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1639
1640 if (!fpath) {
1641 path1 = "/";
1642 path2 = cgdir;
1643 } else {
1644 path1 = cgdir;
1645 path2 = fpath;
1646 }
1647
1648 if (is_child_cgroup(controller, path1, path2)) {
1649 // get uid, gid, from '/tasks' file and make up a mode
1650 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1651 k = cgfs_get_key(controller, cgroup, "tasks");
1652
1653 } else
1654 k = cgfs_get_key(controller, path1, path2);
1655
1656 if (!k) {
1657 ret = -EINVAL;
1658 goto out;
1659 }
1660
1661 /*
1662 * This being a fuse request, the uid and gid must be valid
1663 * in the caller's namespace. So we can just check to make
1664 * sure that the caller is root in his uid, and privileged
1665 * over the file's current owner.
1666 */
1667 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1668 ret = -EPERM;
1669 goto out;
1670 }
1671
1672 if (!cgfs_chmod_file(controller, cgroup, mode)) {
1673 ret = -EINVAL;
1674 goto out;
1675 }
1676
1677 ret = 0;
1678 out:
1679 free_key(k);
1680 free(cgdir);
1681 return ret;
1682 }
1683
1684 int cg_mkdir(const char *path, mode_t mode)
1685 {
1686 struct fuse_context *fc = fuse_get_context();
1687 char *fpath = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
1688 const char *cgroup;
1689 int ret;
1690
1691 if (!fc)
1692 return -EIO;
1693
1694
1695 controller = pick_controller_from_path(fc, path);
1696 if (!controller)
1697 return -EINVAL;
1698
1699 cgroup = find_cgroup_in_path(path);
1700 if (!cgroup)
1701 return -EINVAL;
1702
1703 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1704 if (!fpath)
1705 path1 = "/";
1706 else
1707 path1 = cgdir;
1708
1709 if (!caller_is_in_ancestor(fc->pid, controller, path1, &next)) {
1710 if (fpath && strcmp(next, fpath) == 0)
1711 ret = -EEXIST;
1712 else
1713 ret = -ENOENT;
1714 goto out;
1715 }
1716
1717 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
1718 ret = -EACCES;
1719 goto out;
1720 }
1721 if (!caller_is_in_ancestor(fc->pid, controller, path1, NULL)) {
1722 ret = -EACCES;
1723 goto out;
1724 }
1725
1726 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
1727
1728 out:
1729 free(cgdir);
1730 free(next);
1731 return ret;
1732 }
1733
1734 static int cg_rmdir(const char *path)
1735 {
1736 struct fuse_context *fc = fuse_get_context();
1737 char *fpath = NULL, *cgdir = NULL, *controller, *next = NULL;
1738 const char *cgroup;
1739 int ret;
1740
1741 if (!fc)
1742 return -EIO;
1743
1744 controller = pick_controller_from_path(fc, path);
1745 if (!controller)
1746 return -EINVAL;
1747
1748 cgroup = find_cgroup_in_path(path);
1749 if (!cgroup)
1750 return -EINVAL;
1751
1752 get_cgdir_and_path(cgroup, &cgdir, &fpath);
1753 if (!fpath) {
1754 ret = -EINVAL;
1755 goto out;
1756 }
1757
1758 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, &next)) {
1759 if (!fpath || strcmp(next, fpath) == 0)
1760 ret = -EBUSY;
1761 else
1762 ret = -ENOENT;
1763 goto out;
1764 }
1765
1766 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
1767 ret = -EACCES;
1768 goto out;
1769 }
1770 if (!caller_is_in_ancestor(fc->pid, controller, cgroup, NULL)) {
1771 ret = -EACCES;
1772 goto out;
1773 }
1774
1775 if (!cgfs_remove(controller, cgroup)) {
1776 ret = -EINVAL;
1777 goto out;
1778 }
1779
1780 ret = 0;
1781
1782 out:
1783 free(cgdir);
1784 free(next);
1785 return ret;
1786 }
1787
/* Return true if @line begins with @pref (always true for an empty @pref). */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
1794
/*
 * Scan the newline-separated text @memstat for the first line starting with
 * "total_cache", parse the unsigned number following that prefix and store
 * it divided by 1024 in *@v.  *@v is 0 if no such line exists or no number
 * could be parsed.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur;

	*v = 0;
	/* cur++ steps past the '\n' located at the end of each iteration */
	for (cur = memstat; *cur; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}
		cur = strchr(cur, '\n');
		if (!cur)
			return;
	}
}
1812
/*
 * Scan the newline-separated text @str for the first line starting with
 * "<major>:<minor> <iotype>" and store the unsigned number following that
 * prefix in *@v.  *@v is 0 if no such line exists or no number could be
 * parsed.  The lookup key is limited to 31 characters.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	/* cur++ steps past the '\n' located at the end of each iteration */
	for (cur = str; *cur; cur++) {
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		cur = strchr(cur, '\n');
		if (!cur)
			return;
	}
}
1835
1836 static int read_file(const char *path, char *buf, size_t size,
1837 struct file_info *d)
1838 {
1839 size_t linelen = 0, total_len = 0, rv = 0;
1840 char *line = NULL;
1841 char *cache = d->buf;
1842 size_t cache_size = d->buflen;
1843 FILE *f = fopen(path, "r");
1844 if (!f)
1845 return 0;
1846
1847 while (getline(&line, &linelen, f) != -1) {
1848 size_t l = snprintf(cache, cache_size, "%s", line);
1849 if (l < 0) {
1850 perror("Error writing to cache");
1851 rv = 0;
1852 goto err;
1853 }
1854 if (l >= cache_size) {
1855 fprintf(stderr, "Internal error: truncated write to cache\n");
1856 rv = 0;
1857 goto err;
1858 }
1859 if (l < cache_size) {
1860 cache += l;
1861 cache_size -= l;
1862 total_len += l;
1863 } else {
1864 cache += cache_size;
1865 total_len += cache_size;
1866 cache_size = 0;
1867 break;
1868 }
1869 }
1870
1871 d->size = total_len;
1872 if (total_len > size ) total_len = size;
1873
1874 /* read from off 0 */
1875 memcpy(buf, d->buf, total_len);
1876 rv = total_len;
1877 err:
1878 fclose(f);
1879 free(line);
1880 return rv;
1881 }
1882
1883 /*
1884 * FUSE ops for /proc
1885 */
1886
/*
 * Return memory.limit_in_bytes of @cgroup parsed as a base-10 unsigned
 * long, or (unsigned long)-1 if the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *raw = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);

	free(raw);
	return limit;
}
1899
/*
 * Return the smallest memory limit found on @cgroup and each of its
 * ancestors up to and including "/".  Ancestors whose limit cannot be read
 * (get_memlimit() returned -1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	/* dirname() may modify its argument, so walk a stack copy */
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk);

	while (strcmp(walk, "/") != 0) {
		unsigned long here;

		walk = dirname(walk);
		here = get_memlimit(walk);
		if (here != -1 && here < lowest)
			lowest = here;
	}

	return lowest;
}
1916
1917 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
1918 struct fuse_file_info *fi)
1919 {
1920 struct fuse_context *fc = fuse_get_context();
1921 struct file_info *d = (struct file_info *)fi->fh;
1922 char *cg;
1923 char *memusage_str = NULL, *memstat_str = NULL,
1924 *memswlimit_str = NULL, *memswusage_str = NULL;
1925 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
1926 cached = 0, hosttotal = 0;
1927 char *line = NULL;
1928 size_t linelen = 0, total_len = 0, rv = 0;
1929 char *cache = d->buf;
1930 size_t cache_size = d->buflen;
1931 FILE *f = NULL;
1932
1933 if (offset){
1934 if (offset > d->size)
1935 return -EINVAL;
1936 if (!d->cached)
1937 return 0;
1938 int left = d->size - offset;
1939 total_len = left > size ? size: left;
1940 memcpy(buf, cache + offset, total_len);
1941 return total_len;
1942 }
1943
1944 cg = get_pid_cgroup(fc->pid, "memory");
1945 if (!cg)
1946 return read_file("/proc/meminfo", buf, size, d);
1947
1948 memlimit = get_min_memlimit(cg);
1949 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
1950 goto err;
1951 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
1952 goto err;
1953
1954 memusage = strtoul(memusage_str, NULL, 10);
1955 memlimit /= 1024;
1956 memusage /= 1024;
1957
1958 // Following values are allowed to fail, because swapaccount might be turned
1959 // off for current kernel
1960 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
1961 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
1962 {
1963 memswlimit = strtoul(memswlimit_str, NULL, 10);
1964 memswusage = strtoul(memswusage_str, NULL, 10);
1965 memswlimit /= 1024;
1966 memswusage /= 1024;
1967 if (memswlimit >= memlimit)
1968 memswlimit = 0;
1969 if (memswusage_str >= memlimit)
1970 memswusage = 0;
1971
1972 }
1973
1974 get_mem_cached(memstat_str, &cached);
1975
1976 f = fopen("/proc/meminfo", "r");
1977 if (!f)
1978 goto err;
1979
1980 while (getline(&line, &linelen, f) != -1) {
1981 size_t l;
1982 char *printme, lbuf[100];
1983
1984 memset(lbuf, 0, 100);
1985 if (startswith(line, "MemTotal:")) {
1986 sscanf(line+14, "%lu", &hosttotal);
1987 if (hosttotal < memlimit)
1988 memlimit = hosttotal;
1989 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
1990 printme = lbuf;
1991 } else if (startswith(line, "MemFree:")) {
1992 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
1993 printme = lbuf;
1994 } else if (startswith(line, "MemAvailable:")) {
1995 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
1996 printme = lbuf;
1997 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
1998 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
1999 printme = lbuf;
2000 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2001 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2002 (memswlimit - memlimit) - (memswusage - memusage));
2003 printme = lbuf;
2004 } else if (startswith(line, "Buffers:")) {
2005 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2006 printme = lbuf;
2007 } else if (startswith(line, "Cached:")) {
2008 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2009 printme = lbuf;
2010 } else if (startswith(line, "SwapCached:")) {
2011 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2012 printme = lbuf;
2013 } else
2014 printme = line;
2015
2016 l = snprintf(cache, cache_size, "%s", printme);
2017 if (l < 0) {
2018 perror("Error writing to cache");
2019 rv = 0;
2020 goto err;
2021
2022 }
2023 if (l >= cache_size) {
2024 fprintf(stderr, "Internal error: truncated write to cache\n");
2025 rv = 0;
2026 goto err;
2027 }
2028
2029 cache += l;
2030 cache_size -= l;
2031 total_len += l;
2032 }
2033
2034 d->cached = 1;
2035 d->size = total_len;
2036 if (total_len > size ) total_len = size;
2037 memcpy(buf, d->buf, total_len);
2038
2039 rv = total_len;
2040 err:
2041 if (f)
2042 fclose(f);
2043 free(line);
2044 free(cg);
2045 free(memusage_str);
2046 free(memswlimit_str);
2047 free(memswusage_str);
2048 free(memstat_str);
2049 return rv;
2050 }
2051
/*
 * Fetch cpuset.cpus of cgroup @cg via cgfs_get_value().
 * Returns the string it produced, which the caller must free, or NULL if
 * the value could not be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2064
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse the cpu number out of a "processor : N" line and report whether
 * cpu_in_cpuset() accepts it for @cpuset.  Lines that do not parse yield
 * false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
2075
/*
 * Return true if @line starts with "processor", optional whitespace, ':',
 * optional whitespace and a decimal number, i.e. it is a "^processor" line
 * of /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
2087
2088 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2089 struct fuse_file_info *fi)
2090 {
2091 struct fuse_context *fc = fuse_get_context();
2092 struct file_info *d = (struct file_info *)fi->fh;
2093 char *cg;
2094 char *cpuset = NULL;
2095 char *line = NULL;
2096 size_t linelen = 0, total_len = 0, rv = 0;
2097 bool am_printing = false;
2098 int curcpu = -1;
2099 char *cache = d->buf;
2100 size_t cache_size = d->buflen;
2101 FILE *f = NULL;
2102
2103 if (offset){
2104 if (offset > d->size)
2105 return -EINVAL;
2106 if (!d->cached)
2107 return 0;
2108 int left = d->size - offset;
2109 total_len = left > size ? size: left;
2110 memcpy(buf, cache + offset, total_len);
2111 return total_len;
2112 }
2113
2114 cg = get_pid_cgroup(fc->pid, "cpuset");
2115 if (!cg)
2116 return read_file("proc/cpuinfo", buf, size, d);
2117
2118 cpuset = get_cpuset(cg);
2119 if (!cpuset)
2120 goto err;
2121
2122 f = fopen("/proc/cpuinfo", "r");
2123 if (!f)
2124 goto err;
2125
2126 while (getline(&line, &linelen, f) != -1) {
2127 size_t l;
2128 if (is_processor_line(line)) {
2129 am_printing = cpuline_in_cpuset(line, cpuset);
2130 if (am_printing) {
2131 curcpu ++;
2132 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2133 if (l < 0) {
2134 perror("Error writing to cache");
2135 rv = 0;
2136 goto err;
2137 }
2138 if (l >= cache_size) {
2139 fprintf(stderr, "Internal error: truncated write to cache\n");
2140 rv = 0;
2141 goto err;
2142 }
2143 if (l < cache_size){
2144 cache += l;
2145 cache_size -= l;
2146 total_len += l;
2147 }else{
2148 cache += cache_size;
2149 total_len += cache_size;
2150 cache_size = 0;
2151 break;
2152 }
2153 }
2154 continue;
2155 }
2156 if (am_printing) {
2157 l = snprintf(cache, cache_size, "%s", line);
2158 if (l < 0) {
2159 perror("Error writing to cache");
2160 rv = 0;
2161 goto err;
2162 }
2163 if (l >= cache_size) {
2164 fprintf(stderr, "Internal error: truncated write to cache\n");
2165 rv = 0;
2166 goto err;
2167 }
2168 if (l < cache_size) {
2169 cache += l;
2170 cache_size -= l;
2171 total_len += l;
2172 } else {
2173 cache += cache_size;
2174 total_len += cache_size;
2175 cache_size = 0;
2176 break;
2177 }
2178 }
2179 }
2180
2181 d->cached = 1;
2182 d->size = total_len;
2183 if (total_len > size ) total_len = size;
2184
2185 /* read from off 0 */
2186 memcpy(buf, d->buf, total_len);
2187 rv = total_len;
2188 err:
2189 if (f)
2190 fclose(f);
2191 free(line);
2192 free(cpuset);
2193 free(cg);
2194 return rv;
2195 }
2196
2197 static int proc_stat_read(char *buf, size_t size, off_t offset,
2198 struct fuse_file_info *fi)
2199 {
2200 struct fuse_context *fc = fuse_get_context();
2201 struct file_info *d = (struct file_info *)fi->fh;
2202 char *cg;
2203 char *cpuset = NULL;
2204 char *line = NULL;
2205 size_t linelen = 0, total_len = 0, rv = 0;
2206 int curcpu = -1; /* cpu numbering starts at 0 */
2207 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2208 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2209 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2210 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2211 char cpuall[CPUALL_MAX_SIZE];
2212 /* reserve for cpu all */
2213 char *cache = d->buf + CPUALL_MAX_SIZE;
2214 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2215 FILE *f = NULL;
2216
2217 if (offset){
2218 if (offset > d->size)
2219 return -EINVAL;
2220 if (!d->cached)
2221 return 0;
2222 int left = d->size - offset;
2223 total_len = left > size ? size: left;
2224 memcpy(buf, d->buf + offset, total_len);
2225 return total_len;
2226 }
2227
2228 cg = get_pid_cgroup(fc->pid, "cpuset");
2229 if (!cg)
2230 return read_file("/proc/stat", buf, size, d);
2231
2232 cpuset = get_cpuset(cg);
2233 if (!cpuset)
2234 goto err;
2235
2236 f = fopen("/proc/stat", "r");
2237 if (!f)
2238 goto err;
2239
2240 //skip first line
2241 if (getline(&line, &linelen, f) < 0) {
2242 fprintf(stderr, "proc_stat_read read first line failed\n");
2243 goto err;
2244 }
2245
2246 while (getline(&line, &linelen, f) != -1) {
2247 size_t l;
2248 int cpu;
2249 char cpu_char[10]; /* That's a lot of cores */
2250 char *c;
2251
2252 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2253 /* not a ^cpuN line containing a number N, just print it */
2254 l = snprintf(cache, cache_size, "%s", line);
2255 if (l < 0) {
2256 perror("Error writing to cache");
2257 rv = 0;
2258 goto err;
2259 }
2260 if (l >= cache_size) {
2261 fprintf(stderr, "Internal error: truncated write to cache\n");
2262 rv = 0;
2263 goto err;
2264 }
2265 if (l < cache_size) {
2266 cache += l;
2267 cache_size -= l;
2268 total_len += l;
2269 continue;
2270 } else {
2271 //no more space, break it
2272 cache += cache_size;
2273 total_len += cache_size;
2274 cache_size = 0;
2275 break;
2276 }
2277 }
2278
2279 if (sscanf(cpu_char, "%d", &cpu) != 1)
2280 continue;
2281 if (!cpu_in_cpuset(cpu, cpuset))
2282 continue;
2283 curcpu ++;
2284
2285 c = strchr(line, ' ');
2286 if (!c)
2287 continue;
2288 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2289 if (l < 0) {
2290 perror("Error writing to cache");
2291 rv = 0;
2292 goto err;
2293
2294 }
2295 if (l >= cache_size) {
2296 fprintf(stderr, "Internal error: truncated write to cache\n");
2297 rv = 0;
2298 goto err;
2299 }
2300
2301 cache += l;
2302 cache_size -= l;
2303 total_len += l;
2304
2305 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2306 &softirq, &steal, &guest) != 9)
2307 continue;
2308 user_sum += user;
2309 nice_sum += nice;
2310 system_sum += system;
2311 idle_sum += idle;
2312 iowait_sum += iowait;
2313 irq_sum += irq;
2314 softirq_sum += softirq;
2315 steal_sum += steal;
2316 guest_sum += guest;
2317 }
2318
2319 cache = d->buf;
2320
2321 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2322 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2323 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2324 memcpy(cache, cpuall, cpuall_len);
2325 cache += cpuall_len;
2326 } else{
2327 /* shouldn't happen */
2328 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2329 cpuall_len = 0;
2330 }
2331
2332 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2333 total_len += cpuall_len;
2334 d->cached = 1;
2335 d->size = total_len;
2336 if (total_len > size ) total_len = size;
2337
2338 memcpy(buf, d->buf, total_len);
2339 rv = total_len;
2340
2341 err:
2342 if (f)
2343 fclose(f);
2344 free(line);
2345 free(cpuset);
2346 free(cg);
2347 return rv;
2348 }
2349
2350 /*
2351 * How to guess what to present for uptime?
2352 * One thing we could do would be to take the date on the caller's
2353 * memory.usage_in_bytes file, which should equal the time of creation
2354 * of his cgroup. However, a task could be in a sub-cgroup of the
2355 * container. The same problem exists if we try to look at the ages
2356 * of processes in the caller's cgroup.
2357 *
2358 * So we'll fork a task that will enter the caller's pidns, mount a
2359 * fresh procfs, get the age of /proc/1, and pass that back over a pipe.
2360 *
2361 * For the second uptime #, we'll do as Stéphane had done, just copy
2362 * the number from /proc/uptime. Not sure how to best emulate 'idle'
2363 * time. Maybe someone can come up with a good algorithm and submit a
2364 * patch. Maybe something based on cpushare info?
2365 */
2366
2367 /* return age of the reaper for $pid, taken from ctime of its procdir */
2368 static long int get_pid1_time(pid_t pid)
2369 {
2370 char fnam[100];
2371 int fd, cpipe[2], ret;
2372 struct stat sb;
2373 pid_t cpid;
2374 struct timeval tv;
2375 fd_set s;
2376 long int v;
2377
2378 if (unshare(CLONE_NEWNS))
2379 return 0;
2380
2381 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL)) {
2382 perror("rslave mount failed");
2383 return 0;
2384 }
2385
2386 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", pid);
2387 if (ret < 0 || ret >= sizeof(fnam))
2388 return 0;
2389
2390 fd = open(fnam, O_RDONLY);
2391 if (fd < 0) {
2392 perror("get_pid1_time open of ns/pid");
2393 return 0;
2394 }
2395 if (setns(fd, 0)) {
2396 perror("get_pid1_time setns 1");
2397 close(fd);
2398 return 0;
2399 }
2400 close(fd);
2401
2402 if (pipe(cpipe) < 0)
2403 return(0);
2404
2405 cpid = fork();
2406 if (cpid < 0) {
2407 close(cpipe[0]);
2408 close(cpipe[1]);
2409 return 0;
2410 }
2411
2412 if (!cpid) {
2413 close(cpipe[0]);
2414 umount2("/proc", MNT_DETACH);
2415 if (mount("proc", "/proc", "proc", 0, NULL)) {
2416 perror("get_pid1_time mount");
2417 _exit(1);
2418 }
2419 ret = lstat("/proc/1", &sb);
2420 if (ret) {
2421 perror("get_pid1_time lstat");
2422 _exit(1);
2423 }
2424 long int retval = time(NULL) - sb.st_ctime;
2425 if (write(cpipe[1], &retval, sizeof(retval)) < 0) {
2426 fprintf(stderr, "%s (child): erorr on write: %s\n",
2427 __func__, strerror(errno));
2428 }
2429 close(cpipe[1]);
2430 _exit(0);
2431 }
2432 close(cpipe[1]);
2433
2434 // give the child 1 second to be done forking and
2435 // write its ack
2436 FD_ZERO(&s);
2437 FD_SET(cpipe[0], &s);
2438 tv.tv_sec = 1;
2439 tv.tv_usec = 0;
2440 ret = select(cpipe[0]+1, &s, NULL, NULL, &tv);
2441 if (ret <= 0)
2442 goto fail;
2443 ret = read(cpipe[0], &v, 1);
2444 if (ret != sizeof(char) || v != '1') {
2445 goto fail;
2446 }
2447
2448 wait_for_pid(cpid);
2449
2450 close(cpipe[0]);
2451 return v;
2452
2453 fail:
2454 kill(cpid, SIGKILL);
2455 wait_for_pid(cpid);
2456 close(cpipe[0]);
2457 return 0;
2458 }
2459
/*
 * Return the value get_pid1_time(@qpid) computes, obtained by running it
 * in a forked child and reading the result from a pipe.  The child gets
 * one second to answer.  Returns 0 on any failure or timeout.
 */
static long int getreaperage(pid_t qpid)
{
	int pid, mypipe[2], ret;
	struct timeval tv;
	fd_set s;
	long int mtime, answer = 0;

	if (pipe(mypipe)) {
		return 0;
	}

	pid = fork();

	if (!pid) { // child
		mtime = get_pid1_time(qpid);
		if (write(mypipe[1], &mtime, sizeof(mtime)) != sizeof(mtime))
			fprintf(stderr, "Warning: bad write from getreaperage\n");
		_exit(0);
	}

	close(mypipe[1]);

	if (pid < 0)
		goto out;

	FD_ZERO(&s);
	FD_SET(mypipe[0], &s);
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	ret = select(mypipe[0]+1, &s, NULL, NULL, &tv);
	/* only a negative result is an error; 0 means the timeout expired */
	if (ret < 0) {
		perror("select");
		goto out;
	}
	if (!ret) {
		fprintf(stderr, "timed out\n");
		goto out;
	}
	if (read(mypipe[0], &mtime, sizeof(mtime)) != sizeof(mtime)) {
		perror("read");
		goto out;
	}
	answer = mtime;

out:
	/* nothing to reap if fork() failed */
	if (pid > 0)
		wait_for_pid(pid);
	close(mypipe[0]);
	return answer;
}
2509
2510 /*
2511 * fork a task which switches to @task's namespace and writes '1'.
2512 * over a unix sock so we can read the task's reaper's pid in our
2513 * namespace
2514 */
2515 void write_task_init_pid_exit(int sock, pid_t target)
2516 {
2517 struct ucred cred;
2518 char fnam[100];
2519 pid_t pid;
2520 char v;
2521 int fd, ret;
2522
2523 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
2524 if (ret < 0 || ret >= sizeof(fnam))
2525 _exit(1);
2526
2527 fd = open(fnam, O_RDONLY);
2528 if (fd < 0) {
2529 perror("write_task_init_pid_exit open of ns/pid");
2530 _exit(1);
2531 }
2532 if (setns(fd, 0)) {
2533 perror("write_task_init_pid_exit setns 1");
2534 close(fd);
2535 _exit(1);
2536 }
2537 pid = fork();
2538 if (pid < 0)
2539 _exit(1);
2540 if (pid != 0) {
2541 wait_for_pid(pid);
2542 _exit(0);
2543 }
2544
2545 /* we are the child */
2546 cred.uid = 0;
2547 cred.gid = 0;
2548 cred.pid = 1;
2549 v = '1';
2550 send_creds(sock, &cred, v, true);
2551 _exit(0);
2552 }
2553
2554 static pid_t get_task_reaper_pid(pid_t task)
2555 {
2556 int sock[2];
2557 pid_t pid;
2558 pid_t ret = -1;
2559 char v = '0';
2560 struct ucred cred;
2561
2562 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2563 perror("socketpair");
2564 return -1;
2565 }
2566
2567 pid = fork();
2568 if (pid < 0)
2569 goto out;
2570 if (!pid) {
2571 close(sock[1]);
2572 write_task_init_pid_exit(sock[0], task);
2573 }
2574
2575 if (!recv_creds(sock[1], &cred, &v))
2576 goto out;
2577 ret = cred.pid;
2578
2579 out:
2580 close(sock[0]);
2581 close(sock[1]);
2582 wait_for_pid(pid);
2583 return ret;
2584 }
2585
/*
 * Return the cpu time, in whole seconds, charged to the cpuacct cgroup of
 * the reaper of @task's pid namespace, or 0 if it cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t init = get_task_reaper_pid(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;

	if (init == -1)
		return 0;

	/*
	 * Look at the reaper's cgroup rather than the task's own: the task
	 * may sit in a sub-cgroup of the container.
	 */
	cgroup = get_pid_cgroup(init, "cpuacct");
	if (!cgroup)
		goto out;
	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;
	usage = strtoul(usage_str, NULL, 10);
	/* cpuacct.usage is reported in nanoseconds */
	usage /= 1000000000;

out:
	free(cgroup);
	free(usage_str);
	return usage;
}
2608
2609 /*
2610 * We read /proc/uptime and reuse its second field.
2611 * For the first field, we use the mtime for the reaper for
2612 * the calling pid as returned by getreaperage
2613 */
2614 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2615 struct fuse_file_info *fi)
2616 {
2617 struct fuse_context *fc = fuse_get_context();
2618 struct file_info *d = (struct file_info *)fi->fh;
2619 long int reaperage = getreaperage(fc->pid);
2620 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
2621 char *cache = d->buf;
2622 size_t total_len = 0;
2623
2624 if (offset){
2625 if (offset > d->size)
2626 return -EINVAL;
2627 if (!d->cached)
2628 return 0;
2629 int left = d->size - offset;
2630 total_len = left > size ? size: left;
2631 memcpy(buf, cache + offset, total_len);
2632 return total_len;
2633 }
2634
2635 idletime = reaperage - busytime;
2636 if (idletime > reaperage)
2637 idletime = reaperage;
2638
2639 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
2640 if (total_len < 0){
2641 perror("Error writing to cache");
2642 return 0;
2643 }
2644
2645 d->size = (int)total_len;
2646 d->cached = 1;
2647
2648 if (total_len > size) total_len = size;
2649
2650 memcpy(buf, d->buf, total_len);
2651 return total_len;
2652 }
2653
2654 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2655 struct fuse_file_info *fi)
2656 {
2657 char dev_name[72];
2658 struct fuse_context *fc = fuse_get_context();
2659 struct file_info *d = (struct file_info *)fi->fh;
2660 char *cg;
2661 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2662 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2663 unsigned long read = 0, write = 0;
2664 unsigned long read_merged = 0, write_merged = 0;
2665 unsigned long read_sectors = 0, write_sectors = 0;
2666 unsigned long read_ticks = 0, write_ticks = 0;
2667 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2668 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2669 char *cache = d->buf;
2670 size_t cache_size = d->buflen;
2671 char *line = NULL;
2672 size_t linelen = 0, total_len = 0, rv = 0;
2673 unsigned int major = 0, minor = 0;
2674 int i = 0;
2675 FILE *f = NULL;
2676
2677 if (offset){
2678 if (offset > d->size)
2679 return -EINVAL;
2680 if (!d->cached)
2681 return 0;
2682 int left = d->size - offset;
2683 total_len = left > size ? size: left;
2684 memcpy(buf, cache + offset, total_len);
2685 return total_len;
2686 }
2687
2688 cg = get_pid_cgroup(fc->pid, "blkio");
2689 if (!cg)
2690 return read_file("/proc/diskstats", buf, size, d);
2691
2692 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2693 goto err;
2694 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2695 goto err;
2696 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2697 goto err;
2698 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2699 goto err;
2700 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2701 goto err;
2702
2703
2704 f = fopen("/proc/diskstats", "r");
2705 if (!f)
2706 goto err;
2707
2708 while (getline(&line, &linelen, f) != -1) {
2709 size_t l;
2710 char *printme, lbuf[256];
2711
2712 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2713 if(i == 3){
2714 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2715 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2716 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2717 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2718 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2719 read_sectors = read_sectors/512;
2720 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2721 write_sectors = write_sectors/512;
2722
2723 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2724 rd_svctm = rd_svctm/1000000;
2725 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2726 rd_wait = rd_wait/1000000;
2727 read_ticks = rd_svctm + rd_wait;
2728
2729 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2730 wr_svctm = wr_svctm/1000000;
2731 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2732 wr_wait = wr_wait/1000000;
2733 write_ticks = wr_svctm + wr_wait;
2734
2735 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2736 tot_ticks = tot_ticks/1000000;
2737 }else{
2738 continue;
2739 }
2740
2741 memset(lbuf, 0, 256);
2742 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2743 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2744 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2745 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2746 printme = lbuf;
2747 } else
2748 continue;
2749
2750 l = snprintf(cache, cache_size, "%s", printme);
2751 if (l < 0) {
2752 perror("Error writing to fuse buf");
2753 rv = 0;
2754 goto err;
2755 }
2756 if (l >= cache_size) {
2757 fprintf(stderr, "Internal error: truncated write to cache\n");
2758 rv = 0;
2759 goto err;
2760 }
2761 cache += l;
2762 cache_size -= l;
2763 total_len += l;
2764 }
2765
2766 d->cached = 1;
2767 d->size = total_len;
2768 if (total_len > size ) total_len = size;
2769 memcpy(buf, d->buf, total_len);
2770
2771 rv = total_len;
2772 err:
2773 free(cg);
2774 if (f)
2775 fclose(f);
2776 free(line);
2777 free(io_serviced_str);
2778 free(io_merged_str);
2779 free(io_service_bytes_str);
2780 free(io_wait_time_str);
2781 free(io_service_time_str);
2782 return rv;
2783 }
2784
/*
 * Return the number of bytes in the file @which, obtained by reading it
 * line by line and summing the line lengths. Returns 0 if the file cannot
 * be opened.
 */
static off_t get_procfile_size(const char *which)
{
	ssize_t total = 0;
	ssize_t got;
	size_t cap = 0;
	char *linebuf = NULL;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		got = getline(&linebuf, &cap, fp);
		if (got == -1)
			break;
		total += got;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2801
/*
 * getattr for the /proc subtree.
 *
 * @sb is zeroed, owned by uid/gid 0 and stamped with the current realtime
 * clock. "/proc" is reported as a 0555 directory with two links; each of
 * the emulated files is reported as an empty 0444 regular file.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, or -ENOENT
 * for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec ts;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_ctim = ts;
	sb->st_mtim = ts;
	sb->st_atim = ts;

	if (!strcmp(path, "/proc")) {
		sb->st_nlink = 2;
		sb->st_mode = S_IFDIR | 00555;
		return 0;
	}

	for (idx = 0; idx < sizeof(emulated) / sizeof(emulated[0]); idx++) {
		if (strcmp(path, emulated[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_nlink = 1;
		sb->st_mode = S_IFREG | 00444;
		return 0;
	}

	return -ENOENT;
}
2829
2830 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2831 struct fuse_file_info *fi)
2832 {
2833 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2834 filler(buf, "meminfo", NULL, 0) != 0 ||
2835 filler(buf, "stat", NULL, 0) != 0 ||
2836 filler(buf, "uptime", NULL, 0) != 0 ||
2837 filler(buf, "diskstats", NULL, 0) != 0)
2838 return -EINVAL;
2839 return 0;
2840 }
2841
2842 static int proc_open(const char *path, struct fuse_file_info *fi)
2843 {
2844 int type = -1;
2845 struct file_info *info;
2846
2847 if (strcmp(path, "/proc/meminfo") == 0)
2848 type = LXC_TYPE_PROC_MEMINFO;
2849 else if (strcmp(path, "/proc/cpuinfo") == 0)
2850 type = LXC_TYPE_PROC_CPUINFO;
2851 else if (strcmp(path, "/proc/uptime") == 0)
2852 type = LXC_TYPE_PROC_UPTIME;
2853 else if (strcmp(path, "/proc/stat") == 0)
2854 type = LXC_TYPE_PROC_STAT;
2855 else if (strcmp(path, "/proc/diskstats") == 0)
2856 type = LXC_TYPE_PROC_DISKSTATS;
2857 if (type == -1)
2858 return -ENOENT;
2859
2860 info = malloc(sizeof(*info));
2861 if (!info)
2862 return -ENOMEM;
2863
2864 memset(info, 0, sizeof(*info));
2865 info->type = type;
2866
2867 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2868 do {
2869 info->buf = malloc(info->buflen);
2870 } while (!info->buf);
2871 memset(info->buf, 0, info->buflen);
2872 /* set actual size to buffer size */
2873 info->size = info->buflen;
2874
2875 fi->fh = (unsigned long)info;
2876 return 0;
2877 }
2878
2879 static int proc_release(const char *path, struct fuse_file_info *fi)
2880 {
2881 struct file_info *f = (struct file_info *)fi->fh;
2882
2883 do_release_file_info(f);
2884 return 0;
2885 }
2886
2887 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
2888 struct fuse_file_info *fi)
2889 {
2890 struct file_info *f = (struct file_info *) fi->fh;
2891
2892 switch (f->type) {
2893 case LXC_TYPE_PROC_MEMINFO:
2894 return proc_meminfo_read(buf, size, offset, fi);
2895 case LXC_TYPE_PROC_CPUINFO:
2896 return proc_cpuinfo_read(buf, size, offset, fi);
2897 case LXC_TYPE_PROC_UPTIME:
2898 return proc_uptime_read(buf, size, offset, fi);
2899 case LXC_TYPE_PROC_STAT:
2900 return proc_stat_read(buf, size, offset, fi);
2901 case LXC_TYPE_PROC_DISKSTATS:
2902 return proc_diskstats_read(buf, size, offset, fi);
2903 default:
2904 return -EINVAL;
2905 }
2906 }
2907
2908 /*
2909 * FUSE ops for /
2910 * these just delegate to the /proc and /cgroup ops as
2911 * needed
2912 */
2913
/*
 * Top-level getattr: the root is reported as a 0755 directory with two
 * links; paths starting with "/cgroup" or "/proc" are delegated to the
 * matching handler. Any other path yields -EINVAL.
 */
static int lxcfs_getattr(const char *path, struct stat *sb)
{
	if (!strcmp(path, "/")) {
		sb->st_nlink = 2;
		sb->st_mode = S_IFDIR | 00755;
		return 0;
	}

	if (!strncmp(path, "/cgroup", 7))
		return cg_getattr(path, sb);

	if (!strncmp(path, "/proc", 5))
		return proc_getattr(path, sb);

	return -EINVAL;
}
2929
/*
 * Top-level opendir: "/" and "/proc" need no state and succeed right away;
 * paths starting with "/cgroup" are delegated to cg_opendir(). Any other
 * path yields -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -ENOENT;
}
2942
2943 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2944 struct fuse_file_info *fi)
2945 {
2946 if (strcmp(path, "/") == 0) {
2947 if (filler(buf, "proc", NULL, 0) != 0 ||
2948 filler(buf, "cgroup", NULL, 0) != 0)
2949 return -EINVAL;
2950 return 0;
2951 }
2952 if (strncmp(path, "/cgroup", 7) == 0)
2953 return cg_readdir(path, buf, filler, offset, fi);
2954 if (strcmp(path, "/proc") == 0)
2955 return proc_readdir(path, buf, filler, offset, fi);
2956 return -EINVAL;
2957 }
2958
/*
 * Top-level releasedir: nothing to release for "/" and "/proc"; paths
 * starting with "/cgroup" are delegated to cg_releasedir(). Any other
 * path yields -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return strcmp(path, "/proc") == 0 ? 0 : -EINVAL;
}
2970
/*
 * Top-level open: delegates to cg_open() or proc_open() depending on the
 * path prefix; -EINVAL for anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_open(path, fi);
	if (under_proc)
		return proc_open(path, fi);

	return -EINVAL;
}
2980
/*
 * Top-level read: delegates to cg_read() or proc_read() depending on the
 * path prefix; -EINVAL for anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (under_proc)
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
2991
/*
 * Top-level write: only paths starting with "/cgroup" are writable and are
 * delegated to cg_write(); everything else yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
3001
/* flush is a no-op for every path: always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
3006
/*
 * Top-level release: delegates to cg_release() or proc_release()
 * depending on the path prefix; -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_release(path, fi);
	if (under_proc)
		return proc_release(path, fi);

	return -EINVAL;
}
3016
/* fsync is a no-op for every path: always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
3021
/*
 * Top-level mkdir: only supported below "/cgroup", where it is delegated
 * to cg_mkdir(); -EINVAL otherwise.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
3029
/*
 * Top-level chown: only supported below "/cgroup", where it is delegated
 * to cg_chown(); -EINVAL otherwise.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
3037
/*
 * A truncate may be issued on a file before it is written through
 * ops->write. Truncating makes no real sense for cgroup files, so for
 * anything below "/cgroup" we report success without doing anything.
 * Every other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
3049
/*
 * Top-level rmdir: only supported below "/cgroup", where it is delegated
 * to cg_rmdir(); -EINVAL otherwise.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
3056
/*
 * Top-level chmod: only supported below "/cgroup", where it is delegated
 * to cg_chmod(); -EINVAL otherwise.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
3063
3064 const struct fuse_operations lxcfs_ops = {
3065 .getattr = lxcfs_getattr,
3066 .readlink = NULL,
3067 .getdir = NULL,
3068 .mknod = NULL,
3069 .mkdir = lxcfs_mkdir,
3070 .unlink = NULL,
3071 .rmdir = lxcfs_rmdir,
3072 .symlink = NULL,
3073 .rename = NULL,
3074 .link = NULL,
3075 .chmod = lxcfs_chmod,
3076 .chown = lxcfs_chown,
3077 .truncate = lxcfs_truncate,
3078 .utime = NULL,
3079
3080 .open = lxcfs_open,
3081 .read = lxcfs_read,
3082 .release = lxcfs_release,
3083 .write = lxcfs_write,
3084
3085 .statfs = NULL,
3086 .flush = lxcfs_flush,
3087 .fsync = lxcfs_fsync,
3088
3089 .setxattr = NULL,
3090 .getxattr = NULL,
3091 .listxattr = NULL,
3092 .removexattr = NULL,
3093
3094 .opendir = lxcfs_opendir,
3095 .readdir = lxcfs_readdir,
3096 .releasedir = lxcfs_releasedir,
3097
3098 .fsyncdir = NULL,
3099 .init = NULL,
3100 .destroy = NULL,
3101 .access = NULL,
3102 .create = NULL,
3103 .ftruncate = NULL,
3104 .fgetattr = NULL,
3105 };
3106
/*
 * Print the command-line synopsis for program name @me to stderr and
 * terminate the process with exit status 1.
 */
static void usage(const char *me)
{
	FILE *out = stderr;

	fprintf(out, "Usage:\n");
	fprintf(out, "\n");
	fprintf(out, "%s mountpoint\n", me);
	fprintf(out, "%s -h\n", me);

	exit(1);
}
3115
/*
 * Return true if @w is one of the accepted spellings of the help flag
 * ("-h", "--help", "-help" or "help"), false otherwise.
 */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(spellings) / sizeof(spellings[0]); idx++) {
		if (!strcmp(w, spellings[idx]))
			return true;
	}

	return false;
}
3125
/*
 * Remove the first occurrence of the argument @which from the
 * NULL-terminated vector @argv (argv[0] is never considered), shifting the
 * following entries down by one and decrementing *@argcp. Does nothing if
 * @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* locate the first matching argument, if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap; the terminating NULL is moved down as well */
	for (; *slot; slot++)
		*slot = slot[1];

	--*argcp;
}
3140
/*
 * Remove the first "@opt <value>" pair from the NULL-terminated vector
 * @argv (argv[0] is never considered), shifting the following entries down
 * by two and decrementing *@argcp by two.
 *
 * The only value accepted for @opt is @v: if @opt is followed by anything
 * else, the offending value is reported on stderr and the process exits
 * with status 1. A trailing @opt with no value after it is left untouched.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i] && argv[i+1]; i++) {
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually passed, not what we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* close the gap; the terminating NULL is moved down as well */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3161
3162 int main(int argc, char *argv[])
3163 {
3164 int ret = -1;
3165 /*
3166 * what we pass to fuse_main is:
3167 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3168 */
3169 int nargs = 5, cnt = 0;
3170 char *newargv[6];
3171
3172 #ifdef FORTRAVIS
3173 /* for travis which runs on 12.04 */
3174 if (glib_check_version (2, 36, 0) != NULL)
3175 g_type_init ();
3176 #endif
3177
3178 /* accomodate older init scripts */
3179 swallow_arg(&argc, argv, "-s");
3180 swallow_arg(&argc, argv, "-f");
3181 swallow_option(&argc, argv, "-o", "allow_other");
3182
3183 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3184 fprintf(stderr, "%s\n", VERSION);
3185 exit(0);
3186 }
3187 if (argc != 2 || is_help(argv[1]))
3188 usage(argv[0]);
3189
3190 newargv[cnt++] = argv[0];
3191 newargv[cnt++] = "-f";
3192 newargv[cnt++] = "-o";
3193 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
3194 newargv[cnt++] = argv[1];
3195 newargv[cnt++] = NULL;
3196
3197 if (!cgfs_setup_controllers())
3198 goto out;
3199
3200 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
3201
3202 out:
3203 return ret;
3204 }