]> git.proxmox.com Git - mirror_lxcfs.git/blame_incremental - lxcfs.c
prune unused init pid store entries
[mirror_lxcfs.git] / lxcfs.c
... / ...
CommitLineData
1/* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9#define FUSE_USE_VERSION 26
10
11#include <stdio.h>
12#include <dirent.h>
13#include <fcntl.h>
14#include <fuse.h>
15#include <unistd.h>
16#include <errno.h>
17#include <stdbool.h>
18#include <time.h>
19#include <string.h>
20#include <stdlib.h>
21#include <libgen.h>
22#include <sched.h>
23#include <pthread.h>
24#include <linux/sched.h>
25#include <sys/socket.h>
26#include <sys/mount.h>
27#include <sys/epoll.h>
28#include <wait.h>
29
30#ifdef FORTRAVIS
31#define GLIB_DISABLE_DEPRECATION_WARNINGS
32#include <glib-object.h>
33#endif
34
35#include "cgfs.h"
36#include "config.h" // for VERSION
37
/*
 * Kinds of object a struct file_info can describe; the value is stored
 * in file_info.type.
 */
enum {
	LXC_TYPE_CGDIR          = 0,
	LXC_TYPE_CGFILE         = 1,
	LXC_TYPE_PROC_MEMINFO   = 2,
	LXC_TYPE_PROC_CPUINFO   = 3,
	LXC_TYPE_PROC_UPTIME    = 4,
	LXC_TYPE_PROC_STAT      = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
};
47
/*
 * Per-open-handle state.  An instance is allocated when a file or
 * directory is opened, stashed in fuse_file_info.fh and freed again on
 * release.
 */
struct file_info {
	char *controller;	/* cgroup controller name */
	char *cgroup;		/* cgroup path within the controller */
	char *file;		/* key (file) name, NULL for directories */
	int type;		/* one of the LXC_TYPE_* constants */
	char *buf;		/* unused as of yet */
	int buflen;
	int size;		/* actual data size */
	int cached;
};

/* Reserve buffer size, for cpuall in /proc/stat. */
#define BUF_RESERVE_SIZE 256
61
62/*
63 * A table caching which pid is init for a pid namespace.
64 * When looking up which pid is init for $qpid, we first
65 * 1. Stat /proc/$qpid/ns/pid.
66 * 2. Check whether the ino_t is in our store.
67 * a. if not, fork a child in qpid's ns to send us
68 * ucred.pid = 1, and read the initpid. Cache
69 * initpid and creation time for /proc/initpid
70 * in a new store entry.
71 * b. if so, verify that /proc/initpid still matches
72 * what we have saved. If not, clear the store
73 * entry and go back to a. If so, return the
74 * cached initpid.
75 */
76/* TODO - turn this into a hashtable */
77/* TODO - periodically purge the hashtable? */
/* One cached "pid namespace -> init pid" mapping; entries form a singly linked list. */
struct pidns_init_store {
	ino_t ino;			/* inode number for /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* the time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry, NULL at the tail */
	long int lastcheck;		/* time(NULL) when the entry was stored or last validated */
};

/* Head of the cache list; the functions that touch it require store_lock. */
struct pidns_init_store *pidns_inits;
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Lock @l.  A locking failure is treated as fatal: the error is printed
 * to stderr and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
97
/*
 * Unlock @l.  An unlocking failure is treated as fatal: the error is
 * printed to stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
107
108static void store_lock(void)
109{
110 lock_mutex(&pidns_store_mutex);
111}
112
113static void store_unlock(void)
114{
115 unlock_mutex(&pidns_store_mutex);
116}
117
118/* Must be called under store_lock */
119static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
120{
121 struct stat initsb;
122 char fnam[100];
123
124 snprintf(fnam, 100, "/proc/%d", e->initpid);
125 if (stat(fnam, &initsb) < 0)
126 return false;
127#if DEBUG
128 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
129 e->ctime, initsb.st_ctime, e->initpid);
130#endif
131 if (e->ctime != initsb.st_ctime)
132 return false;
133 return true;
134}
135
136/* Must be called under store_lock */
137static void remove_initpid(struct pidns_init_store *e)
138{
139 struct pidns_init_store *tmp;
140
141#if DEBUG
142 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
143#endif
144 if (pidns_inits == e) {
145 pidns_inits = e->next;
146 free(e);
147 return;
148 }
149
150 tmp = pidns_inits;
151 while (tmp) {
152 if (tmp->next == e) {
153 tmp->next = e->next;
154 free(e);
155 return;
156 }
157 tmp = tmp->next;
158 }
159}
160
161#define PURGE_SECS 5
162/* Must be called under store_lock */
163static void prune_initpid_store(void)
164{
165 static long int last_prune = 0;
166 struct pidns_init_store *e, *prev, *delme;
167 long int now, threshold;
168
169 if (!last_prune) {
170 last_prune = time(NULL);
171 return;
172 }
173 now = time(NULL);
174 if (now < last_prune + PURGE_SECS)
175 return;
176#if DEBUG
177 fprintf(stderr, "pruning\n");
178#endif
179 last_prune = now;
180 threshold = now - 2 * PURGE_SECS;
181
182 for (prev = NULL, e = pidns_inits; e; ) {
183 if (e->lastcheck < threshold) {
184#if DEBUG
185 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
186#endif
187 delme = e;
188 if (prev)
189 prev->next = e->next;
190 else
191 pidns_inits = e->next;
192 e = e->next;
193 free(delme);
194 } else {
195 prev = e;
196 e = e->next;
197 }
198 }
199}
200
201/* Must be called under store_lock */
202static void save_initpid(struct stat *sb, pid_t pid)
203{
204 struct pidns_init_store *e;
205 char fpath[100];
206 struct stat procsb;
207
208#if DEBUG
209 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
210#endif
211 snprintf(fpath, 100, "/proc/%d", pid);
212 if (stat(fpath, &procsb) < 0)
213 return;
214 do {
215 e = malloc(sizeof(*e));
216 } while (!e);
217 e->ino = sb->st_ino;
218 e->initpid = pid;
219 e->ctime = procsb.st_ctime;
220 e->next = pidns_inits;
221 e->lastcheck = time(NULL);
222 pidns_inits = e;
223}
224
225/*
226 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
227 * entry for the inode number and creation time. Verify that the init pid
228 * is still valid. If not, remove it. Return the entry if valid, NULL
229 * otherwise.
230 * Must be called under store_lock
231 */
232static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
233{
234 struct pidns_init_store *e = pidns_inits;
235 while (e) {
236 if (e->ino == sb->st_ino) {
237 if (initpid_still_valid(e, sb)) {
238 e->lastcheck = time(NULL);
239 return e;
240 }
241 remove_initpid(e);
242 return NULL;
243 }
244 e = e->next;
245 }
246
247 return NULL;
248}
249
/* Result codes returned by send_creds(). */
#define SEND_CREDS_OK    0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL  2

/* Helpers defined further down in this file but used before that point. */
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static bool recv_creds(int sock, struct ucred *cred, char *v);
256
257/*
258 * fork a task which switches to @task's namespace and writes '1'.
259 * over a unix sock so we can read the task's reaper's pid in our
260 * namespace
261 */
262static void write_task_init_pid_exit(int sock, pid_t target)
263{
264 struct ucred cred;
265 char fnam[100];
266 pid_t pid;
267 char v;
268 int fd, ret;
269
270 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
271 if (ret < 0 || ret >= sizeof(fnam))
272 _exit(1);
273
274 fd = open(fnam, O_RDONLY);
275 if (fd < 0) {
276 perror("write_task_init_pid_exit open of ns/pid");
277 _exit(1);
278 }
279 if (setns(fd, 0)) {
280 perror("write_task_init_pid_exit setns 1");
281 close(fd);
282 _exit(1);
283 }
284 pid = fork();
285 if (pid < 0)
286 _exit(1);
287 if (pid != 0) {
288 if (!wait_for_pid(pid))
289 _exit(1);
290 _exit(0);
291 }
292
293 /* we are the child */
294 cred.uid = 0;
295 cred.gid = 0;
296 cred.pid = 1;
297 v = '1';
298 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
299 _exit(1);
300 _exit(0);
301}
302
/*
 * Determine, in our own pid namespace, the pid of the init process of
 * @task's pid namespace.  A helper is forked which enters that namespace
 * and sends credentials with pid = 1 back over a socketpair; the pid
 * found in the received credentials is the answer.
 * Returns that pid, or -1 on failure.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	int sock[2];
	pid_t helper;
	pid_t initpid = -1;
	char v = '0';

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* parent (or fork failure, in which case there is nothing to receive) */
	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return initpid;
}
336
337static pid_t lookup_initpid_in_store(pid_t qpid)
338{
339 pid_t answer = 0;
340 struct stat sb;
341 struct pidns_init_store *e;
342 char fnam[100];
343
344 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
345 store_lock();
346 if (stat(fnam, &sb) < 0)
347 goto out;
348 e = lookup_verify_initpid(&sb);
349 if (e) {
350 answer = e->initpid;
351 goto out;
352 }
353 answer = get_init_pid_for_task(qpid);
354 if (answer > 0)
355 save_initpid(&sb, answer);
356
357out:
358 /* we prune at end in case we are returning
359 * the value we were about to return */
360 prune_initpid_store();
361 store_unlock();
362 return answer;
363}
364
/*
 * Wait for child @pid to terminate, restarting waitpid() when it is
 * interrupted by a signal.
 * Returns 0 if the child exited normally with status 0, and -1
 * otherwise (including pid <= 0 and waitpid() failure).
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
385
386
387/*
388 * append pid to *src.
389 * src: a pointer to a char* in which ot append the pid.
390 * sz: the number of characters printed so far, minus trailing \0.
391 * asz: the allocated size so far
392 * pid: the pid to append
393 */
394static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
395{
396 char tmp[30];
397
398 int tmplen = sprintf(tmp, "%d\n", (int)pid);
399
400 if (!*src || tmplen + *sz + 1 >= *asz) {
401 char *tmp;
402 do {
403 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
404 } while (!tmp);
405 *src = tmp;
406 *asz += BUF_RESERVE_SIZE;
407 }
408 memcpy((*src) +*sz , tmp, tmplen);
409 *sz += tmplen;
410 (*src)[*sz] = '\0';
411}
412
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open /proc/pid/{u,g}id_map.
 * The file is rewound and scanned for "nsid hostid count" lines; lines
 * that do not parse are skipped.
 * Returns the mapped id, or -1 (as unsigned int) if no range covers
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,	/* first id of the range in the idfile's namespace */
		     host_base,	/* first id of the range in the caller's namespace */
		     range;	/* number of ids in the range */
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * end wraps, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no answer found */
	return -1;
}
456
/*
 * For is_privileged_over(): whether the calling uid is required to be
 * root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, as seen through the uid map of process @pid, has
 * privilege over resources owned by @victim.
 *
 * With NS_ROOT_OPT an identical uid is sufficient.  Otherwise @uid must
 * map to 0 in /proc/<pid>/uid_map and @victim must be mapped there too.
 * Returns false for uid/victim == -1 and when the map cannot be opened.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mapfile[PROCLEN];
	bool allowed;
	FILE *map;
	int len;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If root in the namespace is not required, having the same uid
	 * suffices (i.e. uid 1000 has write access to files owned by
	 * uid 1000).
	 */
	if (!req_ns_root && uid == victim)
		return true;

	len = snprintf(mapfile, PROCLEN, "/proc/%d/uid_map", pid);
	if (len < 0 || len >= PROCLEN)
		return false;
	map = fopen(mapfile, "r");
	if (!map)
		return false;

	/*
	 * The caller must be root in its namespace, and the victim must be
	 * mapped into that namespace.  The second lookup only happens when
	 * the first one succeeded.
	 * XXX the victim check may be unnecessary given that fuse sends
	 * requests where the vfs has already converted ids.
	 */
	allowed = convert_id_to_ns(map, uid) == 0 &&
		  convert_id_to_ns(map, victim) != -1;

	fclose(map);
	return allowed;
}
512
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the open(2) access mode in @req_mode.  Callers shift the
 * owner/group bits down into the "other" position before calling.
 * Returns false for an access mode that is none of O_RDONLY, O_WRONLY
 * or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
532
533
/*
 * Return the first path component of @taskcg below @querycg.
 *
 * taskcg is a/b/c/d/e
 * querycg is a/b/c
 * we return 'd'
 *
 * When querycg is "/", taskcg is expected to start with '/' and the
 * first component after it is returned.  @taskcg must be longer than
 * @querycg; otherwise an error is printed and NULL is returned.
 * The result is newly allocated and must be freed by the caller; NULL
 * is also returned if the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* skip the query prefix plus the '/' that follows it */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
559
/* Remove a single trailing '\n' from @x, in place, if there is one. */
static void stripnewline(char *x)
{
	char *last;

	if (*x == '\0')
		return;

	last = x + strlen(x) - 1;
	if (*last == '\n')
		*last = '\0';
}
566
567static char *get_pid_cgroup(pid_t pid, const char *contrl)
568{
569 char fnam[PROCLEN];
570 FILE *f;
571 char *answer = NULL;
572 char *line = NULL;
573 size_t len = 0;
574 int ret;
575 const char *h = find_mounted_controller(contrl);
576 if (!h)
577 return NULL;
578
579 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
580 if (ret < 0 || ret >= PROCLEN)
581 return NULL;
582 if (!(f = fopen(fnam, "r")))
583 return NULL;
584
585 while (getline(&line, &len, f) != -1) {
586 char *c1, *c2;
587 if (!line[0])
588 continue;
589 c1 = strchr(line, ':');
590 if (!c1)
591 goto out;
592 c1++;
593 c2 = strchr(c1, ':');
594 if (!c2)
595 goto out;
596 *c2 = '\0';
597 if (strcmp(c1, h) != 0)
598 continue;
599 c2++;
600 stripnewline(c2);
601 do {
602 answer = strdup(c2);
603 } while (!answer);
604 break;
605 }
606
607out:
608 fclose(f);
609 free(line);
610 return answer;
611}
612
613/*
614 * check whether a fuse context may access a cgroup dir or file
615 *
616 * If file is not null, it is a cgroup file to check under cg.
617 * If file is null, then we are checking perms on cg itself.
618 *
619 * For files we can check the mode of the list_keys result.
620 * For cgroups, we must make assumptions based on the files under the
621 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
622 * yet.
623 */
624static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
625{
626 struct cgfs_files *k = NULL;
627 bool ret = false;
628
629 k = cgfs_get_key(contrl, cg, file);
630 if (!k)
631 return false;
632
633 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
634 if (perms_include(k->mode >> 6, mode)) {
635 ret = true;
636 goto out;
637 }
638 }
639 if (fc->gid == k->gid) {
640 if (perms_include(k->mode >> 3, mode)) {
641 ret = true;
642 goto out;
643 }
644 }
645 ret = perms_include(k->mode, mode);
646
647out:
648 free_key(k);
649 return ret;
650}
651
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg, in
 * place.  A path that consists of nothing but "/init.scope" becomes "/".
 * Paths that do not end in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/*
	 * Compare lengths rather than pointers: computing cg + cg_len -
	 * scope_len for a short string would form a pointer before the
	 * start of the buffer, which is undefined behaviour.
	 */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		*(point + 1) = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
666
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true when @pid's cgroup (with a trailing /init.scope removed)
 * is a string prefix of @cg.  If the answer is false and @nextcg is not
 * NULL, *nextcg is set to the next cgroup directory under @cg on the
 * way to the caller's cgroup; it must be freed by the caller and may be
 * NULL.  Returns false if the caller's cgroup cannot be determined.
 *
 * NOTE(review): the comparison is a plain strncmp() prefix test and does
 * not check for a path-component boundary after the prefix.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool inside;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * callers pass in '/' for root cgroup, otherwise they pass
	 * in a cgroup without leading '/'
	 */
	cmp = (*cg == '/') ? taskcg : taskcg + 1;

	inside = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!inside && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return inside;
}
701
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is "/" or a cgroup path without leading '/'.  The root is always
 * visible, and a task in the root cgroup sees everything.  Otherwise
 * @cg is visible when it equals the task's cgroup, is one of its
 * parents, or lies below it.  Returns false if the task's cgroup cannot
 * be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw;
	const char *task_cg;
	size_t target_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0)
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	task_cg = raw + 1;	/* drop the leading '/' */
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, it can see everything.  The
		 * comparisons below cannot handle this case because they
		 * test for a '/', and the only one was just chopped off.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		visible = false;
	}

	free(raw);
	return visible;
}
752
753/*
754 * given /cgroup/freezer/a/b, return "freezer".
755 * the returned char* should NOT be freed.
756 */
757static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
758{
759 const char *p1;
760 char *contr, *slash;
761
762 if (strlen(path) < 9)
763 return NULL;
764 if (*(path+7) != '/')
765 return NULL;
766 p1 = path+8;
767 contr = strdupa(p1);
768 if (!contr)
769 return NULL;
770 slash = strstr(contr, "/");
771 if (slash)
772 *slash = '\0';
773
774 int i;
775 for (i = 0; i < num_hierarchies; i++) {
776 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
777 return hierarchies[i];
778 }
779 return NULL;
780}
781
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Returns a pointer into @path just past the '/' that follows the
 * controller name, or NULL if the path is shorter than 9 characters or
 * has nothing after the controller.  Note that the returned value may
 * include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strstr(path + 8, "/");
	return slash ? slash + 1 : NULL;
}
797
/*
 * Split the last path element from the path in @cg.
 * @dir receives a newly allocated copy of @cg truncated at its last '/'
 * (the whole string if there is no '/'); it should be freed by the
 * caller.  @last is set to the last '/' inside @cg itself, or to NULL
 * if @cg contains none, and must not be freed.
 * Allocation is retried until it succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;
	char *cut;

	while (!copy)
		copy = strdup(cg);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (!*last)
		return;

	cut = strrchr(copy, '/');
	*cut = '\0';
}
817
818/*
819 * FUSE ops for /cgroup
820 */
821
822static int cg_getattr(const char *path, struct stat *sb)
823{
824 struct timespec now;
825 struct fuse_context *fc = fuse_get_context();
826 char * cgdir = NULL;
827 char *last = NULL, *path1, *path2;
828 struct cgfs_files *k = NULL;
829 const char *cgroup;
830 const char *controller = NULL;
831 int ret = -ENOENT;
832
833
834 if (!fc)
835 return -EIO;
836
837 memset(sb, 0, sizeof(struct stat));
838
839 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
840 return -EINVAL;
841
842 sb->st_uid = sb->st_gid = 0;
843 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
844 sb->st_size = 0;
845
846 if (strcmp(path, "/cgroup") == 0) {
847 sb->st_mode = S_IFDIR | 00755;
848 sb->st_nlink = 2;
849 return 0;
850 }
851
852 controller = pick_controller_from_path(fc, path);
853 if (!controller)
854 return -EIO;
855 cgroup = find_cgroup_in_path(path);
856 if (!cgroup) {
857 /* this is just /cgroup/controller, return it as a dir */
858 sb->st_mode = S_IFDIR | 00755;
859 sb->st_nlink = 2;
860 return 0;
861 }
862
863 get_cgdir_and_path(cgroup, &cgdir, &last);
864
865 if (!last) {
866 path1 = "/";
867 path2 = cgdir;
868 } else {
869 path1 = cgdir;
870 path2 = last;
871 }
872
873 pid_t initpid = lookup_initpid_in_store(fc->pid);
874 if (initpid <= 0)
875 initpid = fc->pid;
876 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
877 * Then check that caller's cgroup is under path if last is a child
878 * cgroup, or cgdir if last is a file */
879
880 if (is_child_cgroup(controller, path1, path2)) {
881 if (!caller_may_see_dir(initpid, controller, cgroup)) {
882 ret = -ENOENT;
883 goto out;
884 }
885 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
886 /* this is just /cgroup/controller, return it as a dir */
887 sb->st_mode = S_IFDIR | 00555;
888 sb->st_nlink = 2;
889 ret = 0;
890 goto out;
891 }
892 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
893 ret = -EACCES;
894 goto out;
895 }
896
897 // get uid, gid, from '/tasks' file and make up a mode
898 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
899 sb->st_mode = S_IFDIR | 00755;
900 k = cgfs_get_key(controller, cgroup, NULL);
901 if (!k) {
902 sb->st_uid = sb->st_gid = 0;
903 } else {
904 sb->st_uid = k->uid;
905 sb->st_gid = k->gid;
906 }
907 free_key(k);
908 sb->st_nlink = 2;
909 ret = 0;
910 goto out;
911 }
912
913 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
914 sb->st_mode = S_IFREG | k->mode;
915 sb->st_nlink = 1;
916 sb->st_uid = k->uid;
917 sb->st_gid = k->gid;
918 sb->st_size = 0;
919 free_key(k);
920 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
921 ret = -ENOENT;
922 goto out;
923 }
924 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
925 ret = -EACCES;
926 goto out;
927 }
928
929 ret = 0;
930 }
931
932out:
933 free(cgdir);
934 return ret;
935}
936
937static int cg_opendir(const char *path, struct fuse_file_info *fi)
938{
939 struct fuse_context *fc = fuse_get_context();
940 const char *cgroup;
941 struct file_info *dir_info;
942 char *controller = NULL;
943
944 if (!fc)
945 return -EIO;
946
947 if (strcmp(path, "/cgroup") == 0) {
948 cgroup = NULL;
949 controller = NULL;
950 } else {
951 // return list of keys for the controller, and list of child cgroups
952 controller = pick_controller_from_path(fc, path);
953 if (!controller)
954 return -EIO;
955
956 cgroup = find_cgroup_in_path(path);
957 if (!cgroup) {
958 /* this is just /cgroup/controller, return its contents */
959 cgroup = "/";
960 }
961 }
962
963 pid_t initpid = lookup_initpid_in_store(fc->pid);
964 if (initpid <= 0)
965 initpid = fc->pid;
966 if (cgroup) {
967 if (!caller_may_see_dir(initpid, controller, cgroup))
968 return -ENOENT;
969 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
970 return -EACCES;
971 }
972
973 /* we'll free this at cg_releasedir */
974 dir_info = malloc(sizeof(*dir_info));
975 if (!dir_info)
976 return -ENOMEM;
977 dir_info->controller = must_copy_string(controller);
978 dir_info->cgroup = must_copy_string(cgroup);
979 dir_info->type = LXC_TYPE_CGDIR;
980 dir_info->buf = NULL;
981 dir_info->file = NULL;
982 dir_info->buflen = 0;
983
984 fi->fh = (unsigned long)dir_info;
985 return 0;
986}
987
988static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
989 struct fuse_file_info *fi)
990{
991 struct file_info *d = (struct file_info *)fi->fh;
992 struct cgfs_files **list = NULL;
993 int i, ret;
994 char *nextcg = NULL;
995 struct fuse_context *fc = fuse_get_context();
996 char **clist = NULL;
997
998 if (d->type != LXC_TYPE_CGDIR) {
999 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1000 return -EIO;
1001 }
1002 if (!d->cgroup && !d->controller) {
1003 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1004 int i;
1005
1006 for (i = 0; i < num_hierarchies; i++) {
1007 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1008 return -EIO;
1009 }
1010 }
1011 return 0;
1012 }
1013
1014 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1015 // not a valid cgroup
1016 ret = -EINVAL;
1017 goto out;
1018 }
1019
1020 pid_t initpid = lookup_initpid_in_store(fc->pid);
1021 if (initpid <= 0)
1022 initpid = fc->pid;
1023 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1024 if (nextcg) {
1025 int ret;
1026 ret = filler(buf, nextcg, NULL, 0);
1027 free(nextcg);
1028 if (ret != 0) {
1029 ret = -EIO;
1030 goto out;
1031 }
1032 }
1033 ret = 0;
1034 goto out;
1035 }
1036
1037 for (i = 0; list[i]; i++) {
1038 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1039 ret = -EIO;
1040 goto out;
1041 }
1042 }
1043
1044 // now get the list of child cgroups
1045
1046 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1047 ret = 0;
1048 goto out;
1049 }
1050 for (i = 0; clist[i]; i++) {
1051 if (filler(buf, clist[i], NULL, 0) != 0) {
1052 ret = -EIO;
1053 goto out;
1054 }
1055 }
1056 ret = 0;
1057
1058out:
1059 free_keys(list);
1060 if (clist) {
1061 for (i = 0; clist[i]; i++)
1062 free(clist[i]);
1063 free(clist);
1064 }
1065 return ret;
1066}
1067
1068static void do_release_file_info(struct file_info *f)
1069{
1070 if (!f)
1071 return;
1072 free(f->controller);
1073 free(f->cgroup);
1074 free(f->file);
1075 free(f->buf);
1076 free(f);
1077}
1078
1079static int cg_releasedir(const char *path, struct fuse_file_info *fi)
1080{
1081 struct file_info *d = (struct file_info *)fi->fh;
1082
1083 do_release_file_info(d);
1084 return 0;
1085}
1086
1087static int cg_open(const char *path, struct fuse_file_info *fi)
1088{
1089 const char *cgroup;
1090 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1091 struct cgfs_files *k = NULL;
1092 struct file_info *file_info;
1093 struct fuse_context *fc = fuse_get_context();
1094 int ret;
1095
1096 if (!fc)
1097 return -EIO;
1098
1099 controller = pick_controller_from_path(fc, path);
1100 if (!controller)
1101 return -EIO;
1102 cgroup = find_cgroup_in_path(path);
1103 if (!cgroup)
1104 return -EINVAL;
1105
1106 get_cgdir_and_path(cgroup, &cgdir, &last);
1107 if (!last) {
1108 path1 = "/";
1109 path2 = cgdir;
1110 } else {
1111 path1 = cgdir;
1112 path2 = last;
1113 }
1114
1115 k = cgfs_get_key(controller, path1, path2);
1116 if (!k) {
1117 ret = -EINVAL;
1118 goto out;
1119 }
1120 free_key(k);
1121
1122 pid_t initpid = lookup_initpid_in_store(fc->pid);
1123 if (initpid <= 0)
1124 initpid = fc->pid;
1125 if (!caller_may_see_dir(initpid, controller, path1)) {
1126 ret = -ENOENT;
1127 goto out;
1128 }
1129 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1130 // should never get here
1131 ret = -EACCES;
1132 goto out;
1133 }
1134
1135 /* we'll free this at cg_release */
1136 file_info = malloc(sizeof(*file_info));
1137 if (!file_info) {
1138 ret = -ENOMEM;
1139 goto out;
1140 }
1141 file_info->controller = must_copy_string(controller);
1142 file_info->cgroup = must_copy_string(path1);
1143 file_info->file = must_copy_string(path2);
1144 file_info->type = LXC_TYPE_CGFILE;
1145 file_info->buf = NULL;
1146 file_info->buflen = 0;
1147
1148 fi->fh = (unsigned long)file_info;
1149 ret = 0;
1150
1151out:
1152 free(cgdir);
1153 return ret;
1154}
1155
1156static int cg_release(const char *path, struct fuse_file_info *fi)
1157{
1158 struct file_info *f = (struct file_info *)fi->fh;
1159
1160 do_release_file_info(f);
1161 return 0;
1162}
1163
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, for at most
 * about @timeout seconds.  The wait is restarted with the remaining
 * time when epoll_wait() is interrupted by a signal.
 * Returns true if an event was reported.  Returns false on timeout or
 * error; errno is set to 0 when the deadline had already passed and
 * otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, nready, now, starttime, remaining, saved_errno;

	starttime = time(NULL);
	if (starttime < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	for (;;) {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		remaining = (starttime + timeout) - now;
		if (remaining < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}

		nready = epoll_wait(epfd, &ev, 1, 1000 * remaining + 1);
		if (nready < 0 && errno == EINTR)
			continue;
		break;
	}

	/* close() may change errno, so remember what epoll_wait() left */
	saved_errno = errno;
	close(epfd);

	if (nready <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1211
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about
 * two seconds for data to arrive.
 * Returns the result of recv(), or -1 if nothing arrived in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
1218
1219static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1220{
1221 struct msghdr msg = { 0 };
1222 struct iovec iov;
1223 struct cmsghdr *cmsg;
1224 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1225 char buf[1];
1226 buf[0] = 'p';
1227
1228 if (pingfirst) {
1229 if (msgrecv(sock, buf, 1) != 1) {
1230 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1231 __func__);
1232 return SEND_CREDS_FAIL;
1233 }
1234 }
1235
1236 msg.msg_control = cmsgbuf;
1237 msg.msg_controllen = sizeof(cmsgbuf);
1238
1239 cmsg = CMSG_FIRSTHDR(&msg);
1240 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1241 cmsg->cmsg_level = SOL_SOCKET;
1242 cmsg->cmsg_type = SCM_CREDENTIALS;
1243 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1244
1245 msg.msg_name = NULL;
1246 msg.msg_namelen = 0;
1247
1248 buf[0] = v;
1249 iov.iov_base = buf;
1250 iov.iov_len = sizeof(buf);
1251 msg.msg_iov = &iov;
1252 msg.msg_iovlen = 1;
1253
1254 if (sendmsg(sock, &msg, 0) < 0) {
1255 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1256 strerror(errno));
1257 if (errno == 3)
1258 return SEND_CREDS_NOTSK;
1259 return SEND_CREDS_FAIL;
1260 }
1261
1262 return SEND_CREDS_OK;
1263}
1264
/*
 * Receive one byte plus SCM_CREDENTIALS from @sock.
 *
 * SO_PASSCRED is enabled on the socket and a single '1' byte is written
 * to the peer as a go-ahead; then the reply is awaited for at most about
 * two seconds.
 *
 * On success true is returned, *v holds the received byte and *cred the
 * received credentials.  If the message carried no matching credentials
 * control message, *cred keeps pid/uid/gid = -1.  On failure false is
 * returned with *v == '1' and *cred set to -1.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1] = { '1' };
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int optval = 1;

	*v = '1';

	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* tell the sender we are ready */
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			strerror(errno));
		return false;
	}
	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1324
1325
/*
 * pid_to_ns - read pids from a ucred over a socket, then write the int
 * value back over the socket.  This shifts the pid from the sender's
 * pidns into tpid's pidns.
 *
 * Never returns: exits 0 when the sender passes the byte '1' or when
 * receiving fails, and exits 1 if the reply cannot be written in full.
 * @tpid is not used here.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1344
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns. Only children which you fork will be in the target
 * pidns. So this wrapper does the setns into @tpid's pid namespace, then
 * forks a child which runs pid_to_ns() to actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe; the wrapper
 * waits up to 1 second for that ack and then waits for the child to finish.
 * Never returns: exits 0 on success and 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	char ack;
	int ackpipe[2];
	int nsfd = -1, len;
	pid_t worker;

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	worker = fork();
	if (worker < 0)
		_exit(1);

	if (worker == 0) {
		char token = '1';

		close(ackpipe[0]);
		if (write(ackpipe[1], &token, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(ackpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(ackpipe[0], 1))
		_exit(1);
	if (read(ackpipe[0], &ack, 1) != sizeof(char) || ack != '1')
		_exit(1);

	_exit(wait_for_pid(worker) ? 0 : 1);
}
1398
1399/*
1400 * To read cgroup files with a particular pid, we will setns into the child
1401 * pidns, open a pipe, fork a child - which will be the first to really be in
1402 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1403 */
1404static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1405{
1406 int sock[2] = {-1, -1};
1407 char *tmpdata = NULL;
1408 int ret;
1409 pid_t qpid, cpid = -1;
1410 bool answer = false;
1411 char v = '0';
1412 struct ucred cred;
1413 size_t sz = 0, asz = 0;
1414
1415 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
1416 return false;
1417
1418 /*
1419 * Now we read the pids from returned data one by one, pass
1420 * them into a child in the target namespace, read back the
1421 * translated pids, and put them into our to-return data
1422 */
1423
1424 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1425 perror("socketpair");
1426 free(tmpdata);
1427 return false;
1428 }
1429
1430 cpid = fork();
1431 if (cpid == -1)
1432 goto out;
1433
1434 if (!cpid) // child - exits when done
1435 pid_to_ns_wrapper(sock[1], tpid);
1436
1437 char *ptr = tmpdata;
1438 cred.uid = 0;
1439 cred.gid = 0;
1440 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1441 cred.pid = qpid;
1442 ret = send_creds(sock[0], &cred, v, true);
1443
1444 if (ret == SEND_CREDS_NOTSK)
1445 goto next;
1446 if (ret == SEND_CREDS_FAIL)
1447 goto out;
1448
1449 // read converted results
1450 if (!wait_for_sock(sock[0], 2)) {
1451 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
1452 __func__, strerror(errno));
1453 goto out;
1454 }
1455 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1456 fprintf(stderr, "%s: error reading pid from child: %s\n",
1457 __func__, strerror(errno));
1458 goto out;
1459 }
1460 must_strcat_pid(d, &sz, &asz, qpid);
1461next:
1462 ptr = strchr(ptr, '\n');
1463 if (!ptr)
1464 break;
1465 ptr++;
1466 }
1467
1468 cred.pid = getpid();
1469 v = '1';
1470 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1471 // failed to ask child to exit
1472 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1473 __func__, strerror(errno));
1474 goto out;
1475 }
1476
1477 answer = true;
1478
1479out:
1480 free(tmpdata);
1481 if (cpid != -1)
1482 wait_for_pid(cpid);
1483 if (sock[0] != -1) {
1484 close(sock[0]);
1485 close(sock[1]);
1486 }
1487 return answer;
1488}
1489
1490static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1491 struct fuse_file_info *fi)
1492{
1493 struct fuse_context *fc = fuse_get_context();
1494 struct file_info *f = (struct file_info *)fi->fh;
1495 struct cgfs_files *k = NULL;
1496 char *data = NULL;
1497 int ret, s;
1498 bool r;
1499
1500 if (f->type != LXC_TYPE_CGFILE) {
1501 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1502 return -EIO;
1503 }
1504
1505 if (offset)
1506 return 0;
1507
1508 if (!fc)
1509 return -EIO;
1510
1511 if (!f->controller)
1512 return -EINVAL;
1513
1514 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1515 return -EINVAL;
1516 }
1517 free_key(k);
1518
1519
1520 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1521 ret = -EACCES;
1522 goto out;
1523 }
1524
1525 if (strcmp(f->file, "tasks") == 0 ||
1526 strcmp(f->file, "/tasks") == 0 ||
1527 strcmp(f->file, "/cgroup.procs") == 0 ||
1528 strcmp(f->file, "cgroup.procs") == 0)
1529 // special case - we have to translate the pids
1530 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1531 else
1532 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
1533
1534 if (!r) {
1535 ret = -EINVAL;
1536 goto out;
1537 }
1538
1539 if (!data) {
1540 ret = 0;
1541 goto out;
1542 }
1543 s = strlen(data);
1544 if (s > size)
1545 s = size;
1546 memcpy(buf, data, s);
1547 if (s > 0 && s < size && data[s-1] != '\n')
1548 buf[s++] = '\n';
1549
1550 ret = s;
1551
1552out:
1553 free(data);
1554 return ret;
1555}
1556
1557static void pid_from_ns(int sock, pid_t tpid)
1558{
1559 pid_t vpid;
1560 struct ucred cred;
1561 char v;
1562 int ret;
1563
1564 cred.uid = 0;
1565 cred.gid = 0;
1566 while (1) {
1567 if (!wait_for_sock(sock, 2)) {
1568 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
1569 _exit(1);
1570 }
1571 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1572 fprintf(stderr, "%s: bad read from parent: %s\n",
1573 __func__, strerror(errno));
1574 _exit(1);
1575 }
1576 if (vpid == -1) // done
1577 break;
1578 v = '0';
1579 cred.pid = vpid;
1580 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1581 v = '1';
1582 cred.pid = getpid();
1583 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1584 _exit(1);
1585 }
1586 }
1587 _exit(0);
1588}
1589
/*
 * pid_from_ns_wrapper - enter @tpid's pid namespace and run pid_from_ns()
 * in a forked child.
 *
 * After setns() only forked children are in the target pidns, so the real
 * work is done by a child. The child acknowledges its start by writing '1'
 * to a pipe. If no valid ack arrives within 1 second the child is killed,
 * reaped, and a new one is forked; this is retried without limit.
 *
 * Never returns: exits 0 when the child finished successfully, 1 otherwise.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	char ack;
	int ackpipe[2];
	int nsfd = -1, len;
	pid_t worker;

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		worker = fork();
		if (worker < 0)
			_exit(1);

		if (worker == 0) {
			char token = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &token, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		if (wait_for_sock(ackpipe[0], 1) &&
		    read(ackpipe[0], &ack, 1) == sizeof(char) &&
		    ack == '1')
			break;

		/* No (valid) ack: get rid of this child and try again. */
		kill(worker, SIGKILL);
		wait_for_pid(worker);
	}

	if (!wait_for_pid(worker))
		_exit(1);
	_exit(0);
}
1645
/*
 * Given host @uid, look up the uid it maps to in @pid's user namespace
 * by passing /proc/@pid/uid_map to convert_id_to_ns().
 *
 * On success the mapped uid is stored in *@answer and true is returned.
 * Returns false when the map file cannot be opened or when the lookup
 * yields -1 (in which case *@answer holds -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char path[400];
	FILE *map;

	sprintf(path, "/proc/%d/uid_map", pid);
	map = fopen(path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
1667
/*
 * get_pid_creds: get the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are set to -1 up front and keep that value when the
 * status file cannot be opened or the corresponding line is missing.
 * Parsing stops at the first malformed Uid/Gid line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;

	*uid = -1;
	*gid = -1;

	/* The same buffer first holds the path, then each line read. */
	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (status == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, 400, status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			uid_t parsed;

			if (sscanf(line + 4, "%u", &parsed) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed;
			continue;
		}

		if (strncmp(line, "Gid:", 4) == 0) {
			gid_t parsed;

			if (sscanf(line + 4, "%u", &parsed) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed;
		}
	}

	fclose(status);
}
1706
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host (@r_uid is 0)
 *   . they are owned by the same uid, or
 *   . @r_uid maps to root in @r's user namespace and @v's uid is
 *     mapped into that namespace as well.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* Requestor must be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped into it. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
1732
/*
 * do_write_pids - move the pids listed in @buf into cgroup @cg of @contrl.
 *
 * The pids in @buf are as seen by the writer @tpid. Each one is written to
 * a forked helper (pid_from_ns_wrapper) running in the writer's pidns,
 * which sends it back as a ucred so that recv_creds() yields the pid as
 * seen here. That pid is written to the file from open_pids_file() if
 * may_move_pid() permits it for @tpid / @tuid.
 *
 * @file is not used here. Returns true only when no pid was refused, no
 * write to the pids file failed and the pids file closed cleanly.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	const char *cursor;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	for (cursor = buf; sscanf(cursor, "%d", &qpid) == 1; cursor++) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		/* Only a reply tagged '0' carries a translated pid. */
		if (recv_creds(sock[0], &cred, &v) && v == '0') {
			if (!may_move_pid(tpid, tuid, cred.pid)) {
				fail = true;
				break;
			}
			if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
				fail = true;
		}

		/* Advance to the next line; the loop increment steps past '\n'. */
		cursor = strchr(cursor, '\n');
		if (!cursor)
			break;
	}

	/* Tell the helper we are done: -1 is its exit marker. */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	answer = !fail;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
1812
1813int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1814 struct fuse_file_info *fi)
1815{
1816 struct fuse_context *fc = fuse_get_context();
1817 char *localbuf = NULL;
1818 struct cgfs_files *k = NULL;
1819 struct file_info *f = (struct file_info *)fi->fh;
1820 bool r;
1821
1822 if (f->type != LXC_TYPE_CGFILE) {
1823 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1824 return -EIO;
1825 }
1826
1827 if (offset)
1828 return 0;
1829
1830 if (!fc)
1831 return -EIO;
1832
1833 localbuf = alloca(size+1);
1834 localbuf[size] = '\0';
1835 memcpy(localbuf, buf, size);
1836
1837 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1838 size = -EINVAL;
1839 goto out;
1840 }
1841
1842 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1843 size = -EACCES;
1844 goto out;
1845 }
1846
1847 if (strcmp(f->file, "tasks") == 0 ||
1848 strcmp(f->file, "/tasks") == 0 ||
1849 strcmp(f->file, "/cgroup.procs") == 0 ||
1850 strcmp(f->file, "cgroup.procs") == 0)
1851 // special case - we have to translate the pids
1852 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1853 else
1854 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1855
1856 if (!r)
1857 size = -EINVAL;
1858
1859out:
1860 free_key(k);
1861 return size;
1862}
1863
1864int cg_chown(const char *path, uid_t uid, gid_t gid)
1865{
1866 struct fuse_context *fc = fuse_get_context();
1867 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1868 struct cgfs_files *k = NULL;
1869 const char *cgroup;
1870 int ret;
1871
1872 if (!fc)
1873 return -EIO;
1874
1875 if (strcmp(path, "/cgroup") == 0)
1876 return -EINVAL;
1877
1878 controller = pick_controller_from_path(fc, path);
1879 if (!controller)
1880 return -EINVAL;
1881 cgroup = find_cgroup_in_path(path);
1882 if (!cgroup)
1883 /* this is just /cgroup/controller */
1884 return -EINVAL;
1885
1886 get_cgdir_and_path(cgroup, &cgdir, &last);
1887
1888 if (!last) {
1889 path1 = "/";
1890 path2 = cgdir;
1891 } else {
1892 path1 = cgdir;
1893 path2 = last;
1894 }
1895
1896 if (is_child_cgroup(controller, path1, path2)) {
1897 // get uid, gid, from '/tasks' file and make up a mode
1898 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1899 k = cgfs_get_key(controller, cgroup, "tasks");
1900
1901 } else
1902 k = cgfs_get_key(controller, path1, path2);
1903
1904 if (!k) {
1905 ret = -EINVAL;
1906 goto out;
1907 }
1908
1909 /*
1910 * This being a fuse request, the uid and gid must be valid
1911 * in the caller's namespace. So we can just check to make
1912 * sure that the caller is root in his uid, and privileged
1913 * over the file's current owner.
1914 */
1915 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1916 ret = -EACCES;
1917 goto out;
1918 }
1919
1920 ret = cgfs_chown_file(controller, cgroup, uid, gid);
1921
1922out:
1923 free_key(k);
1924 free(cgdir);
1925
1926 return ret;
1927}
1928
1929int cg_chmod(const char *path, mode_t mode)
1930{
1931 struct fuse_context *fc = fuse_get_context();
1932 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1933 struct cgfs_files *k = NULL;
1934 const char *cgroup;
1935 int ret;
1936
1937 if (!fc)
1938 return -EIO;
1939
1940 if (strcmp(path, "/cgroup") == 0)
1941 return -EINVAL;
1942
1943 controller = pick_controller_from_path(fc, path);
1944 if (!controller)
1945 return -EINVAL;
1946 cgroup = find_cgroup_in_path(path);
1947 if (!cgroup)
1948 /* this is just /cgroup/controller */
1949 return -EINVAL;
1950
1951 get_cgdir_and_path(cgroup, &cgdir, &last);
1952
1953 if (!last) {
1954 path1 = "/";
1955 path2 = cgdir;
1956 } else {
1957 path1 = cgdir;
1958 path2 = last;
1959 }
1960
1961 if (is_child_cgroup(controller, path1, path2)) {
1962 // get uid, gid, from '/tasks' file and make up a mode
1963 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1964 k = cgfs_get_key(controller, cgroup, "tasks");
1965
1966 } else
1967 k = cgfs_get_key(controller, path1, path2);
1968
1969 if (!k) {
1970 ret = -EINVAL;
1971 goto out;
1972 }
1973
1974 /*
1975 * This being a fuse request, the uid and gid must be valid
1976 * in the caller's namespace. So we can just check to make
1977 * sure that the caller is root in his uid, and privileged
1978 * over the file's current owner.
1979 */
1980 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1981 ret = -EPERM;
1982 goto out;
1983 }
1984
1985 if (!cgfs_chmod_file(controller, cgroup, mode)) {
1986 ret = -EINVAL;
1987 goto out;
1988 }
1989
1990 ret = 0;
1991out:
1992 free_key(k);
1993 free(cgdir);
1994 return ret;
1995}
1996
1997int cg_mkdir(const char *path, mode_t mode)
1998{
1999 struct fuse_context *fc = fuse_get_context();
2000 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2001 const char *cgroup;
2002 int ret;
2003
2004 if (!fc)
2005 return -EIO;
2006
2007
2008 controller = pick_controller_from_path(fc, path);
2009 if (!controller)
2010 return -EINVAL;
2011
2012 cgroup = find_cgroup_in_path(path);
2013 if (!cgroup)
2014 return -EINVAL;
2015
2016 get_cgdir_and_path(cgroup, &cgdir, &last);
2017 if (!last)
2018 path1 = "/";
2019 else
2020 path1 = cgdir;
2021
2022 pid_t initpid = lookup_initpid_in_store(fc->pid);
2023 if (initpid <= 0)
2024 initpid = fc->pid;
2025 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2026 if (!next)
2027 ret = -EINVAL;
2028 else if (last && strcmp(next, last) == 0)
2029 ret = -EEXIST;
2030 else
2031 ret = -ENOENT;
2032 goto out;
2033 }
2034
2035 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2036 ret = -EACCES;
2037 goto out;
2038 }
2039 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2040 ret = -EACCES;
2041 goto out;
2042 }
2043
2044 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2045
2046out:
2047 free(cgdir);
2048 free(next);
2049 return ret;
2050}
2051
2052static int cg_rmdir(const char *path)
2053{
2054 struct fuse_context *fc = fuse_get_context();
2055 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2056 const char *cgroup;
2057 int ret;
2058
2059 if (!fc)
2060 return -EIO;
2061
2062 controller = pick_controller_from_path(fc, path);
2063 if (!controller)
2064 return -EINVAL;
2065
2066 cgroup = find_cgroup_in_path(path);
2067 if (!cgroup)
2068 return -EINVAL;
2069
2070 get_cgdir_and_path(cgroup, &cgdir, &last);
2071 if (!last) {
2072 ret = -EINVAL;
2073 goto out;
2074 }
2075
2076 pid_t initpid = lookup_initpid_in_store(fc->pid);
2077 if (initpid <= 0)
2078 initpid = fc->pid;
2079 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2080 if (!last || strcmp(next, last) == 0)
2081 ret = -EBUSY;
2082 else
2083 ret = -ENOENT;
2084 goto out;
2085 }
2086
2087 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2088 ret = -EACCES;
2089 goto out;
2090 }
2091 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2092 ret = -EACCES;
2093 goto out;
2094 }
2095
2096 if (!cgfs_remove(controller, cgroup)) {
2097 ret = -EINVAL;
2098 goto out;
2099 }
2100
2101 ret = 0;
2102
2103out:
2104 free(cgdir);
2105 free(next);
2106 return ret;
2107}
2108
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2115
/*
 * Scan the newline-separated text @memstat for the first line that starts
 * with "total_cache" and store the number following it, divided by 1024,
 * in *@v. *@v is 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	static const char key[] = "total_cache";
	const size_t keylen = sizeof(key) - 1;
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2133
/*
 * Scan the newline-separated text @str for the first line that starts with
 * "<major>:<minor> <iotype>" and store the number following that prefix in
 * *@v. *@v is 0 when no such line is found.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur; ) {
		char *nl;

		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2156
2157static int read_file(const char *path, char *buf, size_t size,
2158 struct file_info *d)
2159{
2160 size_t linelen = 0, total_len = 0, rv = 0;
2161 char *line = NULL;
2162 char *cache = d->buf;
2163 size_t cache_size = d->buflen;
2164 FILE *f = fopen(path, "r");
2165 if (!f)
2166 return 0;
2167
2168 while (getline(&line, &linelen, f) != -1) {
2169 size_t l = snprintf(cache, cache_size, "%s", line);
2170 if (l < 0) {
2171 perror("Error writing to cache");
2172 rv = 0;
2173 goto err;
2174 }
2175 if (l >= cache_size) {
2176 fprintf(stderr, "Internal error: truncated write to cache\n");
2177 rv = 0;
2178 goto err;
2179 }
2180 if (l < cache_size) {
2181 cache += l;
2182 cache_size -= l;
2183 total_len += l;
2184 } else {
2185 cache += cache_size;
2186 total_len += cache_size;
2187 cache_size = 0;
2188 break;
2189 }
2190 }
2191
2192 d->size = total_len;
2193 if (total_len > size ) total_len = size;
2194
2195 /* read from off 0 */
2196 memcpy(buf, d->buf, total_len);
2197 rv = total_len;
2198 err:
2199 fclose(f);
2200 free(line);
2201 return rv;
2202}
2203
2204/*
2205 * FUSE ops for /proc
2206 */
2207
/*
 * Return memory.limit_in_bytes of the "memory" cgroup @cgroup parsed as a
 * base-10 unsigned long, or (unsigned long)-1 when the value cannot be
 * read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
2220
/*
 * Return the smallest memory limit found on @cgroup or any of its
 * ancestors, walking up with dirname() until "/" is reached. Ancestors
 * whose limit cannot be read (get_memlimit() returned -1) are ignored.
 *
 * NOTE(review): the walk only terminates once the path becomes "/", so
 * this presumably expects @cgroup to be an absolute path - confirm
 * against callers.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	/* dirname() may modify its argument, so work on a stack copy. */
	char *dir = strdupa(cgroup);
	unsigned long lowest = get_memlimit(dir);

	for (;;) {
		unsigned long cur;

		if (strcmp(dir, "/") == 0)
			break;

		dir = dirname(dir);
		cur = get_memlimit(dir);
		if (cur != -1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
2237
2238static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2239 struct fuse_file_info *fi)
2240{
2241 struct fuse_context *fc = fuse_get_context();
2242 struct file_info *d = (struct file_info *)fi->fh;
2243 char *cg;
2244 char *memusage_str = NULL, *memstat_str = NULL,
2245 *memswlimit_str = NULL, *memswusage_str = NULL,
2246 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2247 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2248 cached = 0, hosttotal = 0;
2249 char *line = NULL;
2250 size_t linelen = 0, total_len = 0, rv = 0;
2251 char *cache = d->buf;
2252 size_t cache_size = d->buflen;
2253 FILE *f = NULL;
2254
2255 if (offset){
2256 if (offset > d->size)
2257 return -EINVAL;
2258 if (!d->cached)
2259 return 0;
2260 int left = d->size - offset;
2261 total_len = left > size ? size: left;
2262 memcpy(buf, cache + offset, total_len);
2263 return total_len;
2264 }
2265
2266 pid_t initpid = lookup_initpid_in_store(fc->pid);
2267 if (initpid <= 0)
2268 initpid = fc->pid;
2269 cg = get_pid_cgroup(initpid, "memory");
2270 if (!cg)
2271 return read_file("/proc/meminfo", buf, size, d);
2272
2273 memlimit = get_min_memlimit(cg);
2274 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2275 goto err;
2276 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2277 goto err;
2278
2279 // Following values are allowed to fail, because swapaccount might be turned
2280 // off for current kernel
2281 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
2282 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
2283 {
2284 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
2285 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
2286 goto err;
2287 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
2288 goto err;
2289
2290 memswlimit = strtoul(memswlimit_str, NULL, 10);
2291 memswusage = strtoul(memswusage_str, NULL, 10);
2292
2293 if (!strcmp(memswlimit_str, memswlimit_default_str))
2294 memswlimit = 0;
2295 if (!strcmp(memswusage_str, memswusage_default_str))
2296 memswusage = 0;
2297
2298 memswlimit = memswlimit / 1024;
2299 memswusage = memswusage / 1024;
2300 }
2301
2302 memusage = strtoul(memusage_str, NULL, 10);
2303 memlimit /= 1024;
2304 memusage /= 1024;
2305
2306 get_mem_cached(memstat_str, &cached);
2307
2308 f = fopen("/proc/meminfo", "r");
2309 if (!f)
2310 goto err;
2311
2312 while (getline(&line, &linelen, f) != -1) {
2313 size_t l;
2314 char *printme, lbuf[100];
2315
2316 memset(lbuf, 0, 100);
2317 if (startswith(line, "MemTotal:")) {
2318 sscanf(line+14, "%lu", &hosttotal);
2319 if (hosttotal < memlimit)
2320 memlimit = hosttotal;
2321 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2322 printme = lbuf;
2323 } else if (startswith(line, "MemFree:")) {
2324 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2325 printme = lbuf;
2326 } else if (startswith(line, "MemAvailable:")) {
2327 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2328 printme = lbuf;
2329 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2330 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2331 printme = lbuf;
2332 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2333 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2334 (memswlimit - memlimit) - (memswusage - memusage));
2335 printme = lbuf;
2336 } else if (startswith(line, "Buffers:")) {
2337 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2338 printme = lbuf;
2339 } else if (startswith(line, "Cached:")) {
2340 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2341 printme = lbuf;
2342 } else if (startswith(line, "SwapCached:")) {
2343 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2344 printme = lbuf;
2345 } else
2346 printme = line;
2347
2348 l = snprintf(cache, cache_size, "%s", printme);
2349 if (l < 0) {
2350 perror("Error writing to cache");
2351 rv = 0;
2352 goto err;
2353
2354 }
2355 if (l >= cache_size) {
2356 fprintf(stderr, "Internal error: truncated write to cache\n");
2357 rv = 0;
2358 goto err;
2359 }
2360
2361 cache += l;
2362 cache_size -= l;
2363 total_len += l;
2364 }
2365
2366 d->cached = 1;
2367 d->size = total_len;
2368 if (total_len > size ) total_len = size;
2369 memcpy(buf, d->buf, total_len);
2370
2371 rv = total_len;
2372err:
2373 if (f)
2374 fclose(f);
2375 free(line);
2376 free(cg);
2377 free(memusage_str);
2378 free(memswlimit_str);
2379 free(memswusage_str);
2380 free(memstat_str);
2381 free(memswlimit_default_str);
2382 free(memswusage_default_str);
2383 return rv;
2384}
2385
/*
 * Read cpuset.cpus for the "cpuset" cgroup @cg.
 * Returns the value as obtained from cgfs_get_value(), which the caller
 * must free, or NULL when it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2398
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Parse the cpu number out of a "processor : N" line of /proc/cpuinfo and
 * report whether cpu_in_cpuset() accepts it for @cpuset. Lines that do not
 * have that form yield false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
2409
/*
 * Check whether @line is a "processor : N" line of /proc/cpuinfo, i.e.
 * starts with "processor", a colon and a decimal number.
 */
static bool is_processor_line(const char *line)
{
	int ignored;

	return sscanf(line, "processor : %d", &ignored) == 1;
}
2421
2422static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2423 struct fuse_file_info *fi)
2424{
2425 struct fuse_context *fc = fuse_get_context();
2426 struct file_info *d = (struct file_info *)fi->fh;
2427 char *cg;
2428 char *cpuset = NULL;
2429 char *line = NULL;
2430 size_t linelen = 0, total_len = 0, rv = 0;
2431 bool am_printing = false;
2432 int curcpu = -1;
2433 char *cache = d->buf;
2434 size_t cache_size = d->buflen;
2435 FILE *f = NULL;
2436
2437 if (offset){
2438 if (offset > d->size)
2439 return -EINVAL;
2440 if (!d->cached)
2441 return 0;
2442 int left = d->size - offset;
2443 total_len = left > size ? size: left;
2444 memcpy(buf, cache + offset, total_len);
2445 return total_len;
2446 }
2447
2448 pid_t initpid = lookup_initpid_in_store(fc->pid);
2449 if (initpid <= 0)
2450 initpid = fc->pid;
2451 cg = get_pid_cgroup(initpid, "cpuset");
2452 if (!cg)
2453 return read_file("proc/cpuinfo", buf, size, d);
2454
2455 cpuset = get_cpuset(cg);
2456 if (!cpuset)
2457 goto err;
2458
2459 f = fopen("/proc/cpuinfo", "r");
2460 if (!f)
2461 goto err;
2462
2463 while (getline(&line, &linelen, f) != -1) {
2464 size_t l;
2465 if (is_processor_line(line)) {
2466 am_printing = cpuline_in_cpuset(line, cpuset);
2467 if (am_printing) {
2468 curcpu ++;
2469 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2470 if (l < 0) {
2471 perror("Error writing to cache");
2472 rv = 0;
2473 goto err;
2474 }
2475 if (l >= cache_size) {
2476 fprintf(stderr, "Internal error: truncated write to cache\n");
2477 rv = 0;
2478 goto err;
2479 }
2480 if (l < cache_size){
2481 cache += l;
2482 cache_size -= l;
2483 total_len += l;
2484 }else{
2485 cache += cache_size;
2486 total_len += cache_size;
2487 cache_size = 0;
2488 break;
2489 }
2490 }
2491 continue;
2492 }
2493 if (am_printing) {
2494 l = snprintf(cache, cache_size, "%s", line);
2495 if (l < 0) {
2496 perror("Error writing to cache");
2497 rv = 0;
2498 goto err;
2499 }
2500 if (l >= cache_size) {
2501 fprintf(stderr, "Internal error: truncated write to cache\n");
2502 rv = 0;
2503 goto err;
2504 }
2505 if (l < cache_size) {
2506 cache += l;
2507 cache_size -= l;
2508 total_len += l;
2509 } else {
2510 cache += cache_size;
2511 total_len += cache_size;
2512 cache_size = 0;
2513 break;
2514 }
2515 }
2516 }
2517
2518 d->cached = 1;
2519 d->size = total_len;
2520 if (total_len > size ) total_len = size;
2521
2522 /* read from off 0 */
2523 memcpy(buf, d->buf, total_len);
2524 rv = total_len;
2525err:
2526 if (f)
2527 fclose(f);
2528 free(line);
2529 free(cpuset);
2530 free(cg);
2531 return rv;
2532}
2533
2534static int proc_stat_read(char *buf, size_t size, off_t offset,
2535 struct fuse_file_info *fi)
2536{
2537 struct fuse_context *fc = fuse_get_context();
2538 struct file_info *d = (struct file_info *)fi->fh;
2539 char *cg;
2540 char *cpuset = NULL;
2541 char *line = NULL;
2542 size_t linelen = 0, total_len = 0, rv = 0;
2543 int curcpu = -1; /* cpu numbering starts at 0 */
2544 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2545 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2546 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2547#define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2548 char cpuall[CPUALL_MAX_SIZE];
2549 /* reserve for cpu all */
2550 char *cache = d->buf + CPUALL_MAX_SIZE;
2551 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2552 FILE *f = NULL;
2553
2554 if (offset){
2555 if (offset > d->size)
2556 return -EINVAL;
2557 if (!d->cached)
2558 return 0;
2559 int left = d->size - offset;
2560 total_len = left > size ? size: left;
2561 memcpy(buf, d->buf + offset, total_len);
2562 return total_len;
2563 }
2564
2565 pid_t initpid = lookup_initpid_in_store(fc->pid);
2566 if (initpid <= 0)
2567 initpid = fc->pid;
2568 cg = get_pid_cgroup(initpid, "cpuset");
2569 if (!cg)
2570 return read_file("/proc/stat", buf, size, d);
2571
2572 cpuset = get_cpuset(cg);
2573 if (!cpuset)
2574 goto err;
2575
2576 f = fopen("/proc/stat", "r");
2577 if (!f)
2578 goto err;
2579
2580 //skip first line
2581 if (getline(&line, &linelen, f) < 0) {
2582 fprintf(stderr, "proc_stat_read read first line failed\n");
2583 goto err;
2584 }
2585
2586 while (getline(&line, &linelen, f) != -1) {
2587 size_t l;
2588 int cpu;
2589 char cpu_char[10]; /* That's a lot of cores */
2590 char *c;
2591
2592 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2593 /* not a ^cpuN line containing a number N, just print it */
2594 l = snprintf(cache, cache_size, "%s", line);
2595 if (l < 0) {
2596 perror("Error writing to cache");
2597 rv = 0;
2598 goto err;
2599 }
2600 if (l >= cache_size) {
2601 fprintf(stderr, "Internal error: truncated write to cache\n");
2602 rv = 0;
2603 goto err;
2604 }
2605 if (l < cache_size) {
2606 cache += l;
2607 cache_size -= l;
2608 total_len += l;
2609 continue;
2610 } else {
2611 //no more space, break it
2612 cache += cache_size;
2613 total_len += cache_size;
2614 cache_size = 0;
2615 break;
2616 }
2617 }
2618
2619 if (sscanf(cpu_char, "%d", &cpu) != 1)
2620 continue;
2621 if (!cpu_in_cpuset(cpu, cpuset))
2622 continue;
2623 curcpu ++;
2624
2625 c = strchr(line, ' ');
2626 if (!c)
2627 continue;
2628 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2629 if (l < 0) {
2630 perror("Error writing to cache");
2631 rv = 0;
2632 goto err;
2633
2634 }
2635 if (l >= cache_size) {
2636 fprintf(stderr, "Internal error: truncated write to cache\n");
2637 rv = 0;
2638 goto err;
2639 }
2640
2641 cache += l;
2642 cache_size -= l;
2643 total_len += l;
2644
2645 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2646 &softirq, &steal, &guest) != 9)
2647 continue;
2648 user_sum += user;
2649 nice_sum += nice;
2650 system_sum += system;
2651 idle_sum += idle;
2652 iowait_sum += iowait;
2653 irq_sum += irq;
2654 softirq_sum += softirq;
2655 steal_sum += steal;
2656 guest_sum += guest;
2657 }
2658
2659 cache = d->buf;
2660
2661 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2662 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2663 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2664 memcpy(cache, cpuall, cpuall_len);
2665 cache += cpuall_len;
2666 } else{
2667 /* shouldn't happen */
2668 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2669 cpuall_len = 0;
2670 }
2671
2672 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2673 total_len += cpuall_len;
2674 d->cached = 1;
2675 d->size = total_len;
2676 if (total_len > size ) total_len = size;
2677
2678 memcpy(buf, d->buf, total_len);
2679 rv = total_len;
2680
2681err:
2682 if (f)
2683 fclose(f);
2684 free(line);
2685 free(cpuset);
2686 free(cg);
2687 return rv;
2688}
2689
/*
 * Return the number of seconds elapsed since the st_ctime of /proc/<initpid>,
 * where <initpid> is the init process cached for @pid's pid namespace.
 * Returns 0 if no init pid is known, the path cannot be built, or lstat fails.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
2710
/*
 * Return the "cpuacct.usage" value of the cpuacct cgroup that the init
 * process of @task's pid namespace lives in, divided by 10^9
 * (presumably nanoseconds -> seconds; confirm against the cgroup docs).
 * Returns 0 when the init pid, the cgroup or the value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
		busy = strtoul(raw, NULL, 10) / 1000000000;

	/* free(NULL) is a no-op, so both failure paths can fall through here */
	free(cg);
	free(raw);
	return busy;
}
2733
2734/*
2735 * We read /proc/uptime and reuse its second field.
2736 * For the first field, we use the mtime for the reaper for
2737 * the calling pid as returned by getreaperage
2738 */
2739static int proc_uptime_read(char *buf, size_t size, off_t offset,
2740 struct fuse_file_info *fi)
2741{
2742 struct fuse_context *fc = fuse_get_context();
2743 struct file_info *d = (struct file_info *)fi->fh;
2744 long int reaperage = getreaperage(fc->pid);
2745 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
2746 char *cache = d->buf;
2747 size_t total_len = 0;
2748
2749 if (offset){
2750 if (offset > d->size)
2751 return -EINVAL;
2752 if (!d->cached)
2753 return 0;
2754 int left = d->size - offset;
2755 total_len = left > size ? size: left;
2756 memcpy(buf, cache + offset, total_len);
2757 return total_len;
2758 }
2759
2760 idletime = reaperage - busytime;
2761 if (idletime > reaperage)
2762 idletime = reaperage;
2763
2764 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
2765 if (total_len < 0){
2766 perror("Error writing to cache");
2767 return 0;
2768 }
2769
2770 d->size = (int)total_len;
2771 d->cached = 1;
2772
2773 if (total_len > size) total_len = size;
2774
2775 memcpy(buf, d->buf, total_len);
2776 return total_len;
2777}
2778
2779static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2780 struct fuse_file_info *fi)
2781{
2782 char dev_name[72];
2783 struct fuse_context *fc = fuse_get_context();
2784 struct file_info *d = (struct file_info *)fi->fh;
2785 char *cg;
2786 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2787 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2788 unsigned long read = 0, write = 0;
2789 unsigned long read_merged = 0, write_merged = 0;
2790 unsigned long read_sectors = 0, write_sectors = 0;
2791 unsigned long read_ticks = 0, write_ticks = 0;
2792 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2793 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2794 char *cache = d->buf;
2795 size_t cache_size = d->buflen;
2796 char *line = NULL;
2797 size_t linelen = 0, total_len = 0, rv = 0;
2798 unsigned int major = 0, minor = 0;
2799 int i = 0;
2800 FILE *f = NULL;
2801
2802 if (offset){
2803 if (offset > d->size)
2804 return -EINVAL;
2805 if (!d->cached)
2806 return 0;
2807 int left = d->size - offset;
2808 total_len = left > size ? size: left;
2809 memcpy(buf, cache + offset, total_len);
2810 return total_len;
2811 }
2812
2813 pid_t initpid = lookup_initpid_in_store(fc->pid);
2814 if (initpid <= 0)
2815 initpid = fc->pid;
2816 cg = get_pid_cgroup(initpid, "blkio");
2817 if (!cg)
2818 return read_file("/proc/diskstats", buf, size, d);
2819
2820 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2821 goto err;
2822 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2823 goto err;
2824 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2825 goto err;
2826 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2827 goto err;
2828 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2829 goto err;
2830
2831
2832 f = fopen("/proc/diskstats", "r");
2833 if (!f)
2834 goto err;
2835
2836 while (getline(&line, &linelen, f) != -1) {
2837 size_t l;
2838 char *printme, lbuf[256];
2839
2840 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2841 if(i == 3){
2842 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2843 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2844 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2845 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2846 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2847 read_sectors = read_sectors/512;
2848 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2849 write_sectors = write_sectors/512;
2850
2851 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2852 rd_svctm = rd_svctm/1000000;
2853 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2854 rd_wait = rd_wait/1000000;
2855 read_ticks = rd_svctm + rd_wait;
2856
2857 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2858 wr_svctm = wr_svctm/1000000;
2859 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2860 wr_wait = wr_wait/1000000;
2861 write_ticks = wr_svctm + wr_wait;
2862
2863 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2864 tot_ticks = tot_ticks/1000000;
2865 }else{
2866 continue;
2867 }
2868
2869 memset(lbuf, 0, 256);
2870 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2871 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2872 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2873 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2874 printme = lbuf;
2875 } else
2876 continue;
2877
2878 l = snprintf(cache, cache_size, "%s", printme);
2879 if (l < 0) {
2880 perror("Error writing to fuse buf");
2881 rv = 0;
2882 goto err;
2883 }
2884 if (l >= cache_size) {
2885 fprintf(stderr, "Internal error: truncated write to cache\n");
2886 rv = 0;
2887 goto err;
2888 }
2889 cache += l;
2890 cache_size -= l;
2891 total_len += l;
2892 }
2893
2894 d->cached = 1;
2895 d->size = total_len;
2896 if (total_len > size ) total_len = size;
2897 memcpy(buf, d->buf, total_len);
2898
2899 rv = total_len;
2900err:
2901 free(cg);
2902 if (f)
2903 fclose(f);
2904 free(line);
2905 free(io_serviced_str);
2906 free(io_merged_str);
2907 free(io_service_bytes_str);
2908 free(io_wait_time_str);
2909 free(io_service_time_str);
2910 return rv;
2911}
2912
/*
 * Return the total number of bytes obtained by reading @which line by line
 * until getline() fails, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *lineptr = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&lineptr, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	free(lineptr);
	fclose(fp);

	return total;
}
2929
/*
 * Fill @sb for a path below /proc.
 *
 * "/proc" is reported as a root-owned directory (mode 0555, 2 links); the
 * five emulated files are reported as root-owned regular files (mode 0444,
 * 1 link, size 0). All three timestamps are set to the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read and -ENOENT for
 * any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec ts;
	size_t i;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = ts;
	sb->st_mtim = ts;
	sb->st_ctim = ts;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated) / sizeof(emulated[0]); i++) {
		if (strcmp(path, emulated[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2957
2958static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2959 struct fuse_file_info *fi)
2960{
2961 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2962 filler(buf, "meminfo", NULL, 0) != 0 ||
2963 filler(buf, "stat", NULL, 0) != 0 ||
2964 filler(buf, "uptime", NULL, 0) != 0 ||
2965 filler(buf, "diskstats", NULL, 0) != 0)
2966 return -EINVAL;
2967 return 0;
2968}
2969
2970static int proc_open(const char *path, struct fuse_file_info *fi)
2971{
2972 int type = -1;
2973 struct file_info *info;
2974
2975 if (strcmp(path, "/proc/meminfo") == 0)
2976 type = LXC_TYPE_PROC_MEMINFO;
2977 else if (strcmp(path, "/proc/cpuinfo") == 0)
2978 type = LXC_TYPE_PROC_CPUINFO;
2979 else if (strcmp(path, "/proc/uptime") == 0)
2980 type = LXC_TYPE_PROC_UPTIME;
2981 else if (strcmp(path, "/proc/stat") == 0)
2982 type = LXC_TYPE_PROC_STAT;
2983 else if (strcmp(path, "/proc/diskstats") == 0)
2984 type = LXC_TYPE_PROC_DISKSTATS;
2985 if (type == -1)
2986 return -ENOENT;
2987
2988 info = malloc(sizeof(*info));
2989 if (!info)
2990 return -ENOMEM;
2991
2992 memset(info, 0, sizeof(*info));
2993 info->type = type;
2994
2995 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
2996 do {
2997 info->buf = malloc(info->buflen);
2998 } while (!info->buf);
2999 memset(info->buf, 0, info->buflen);
3000 /* set actual size to buffer size */
3001 info->size = info->buflen;
3002
3003 fi->fh = (unsigned long)info;
3004 return 0;
3005}
3006
3007static int proc_release(const char *path, struct fuse_file_info *fi)
3008{
3009 struct file_info *f = (struct file_info *)fi->fh;
3010
3011 do_release_file_info(f);
3012 return 0;
3013}
3014
3015static int proc_read(const char *path, char *buf, size_t size, off_t offset,
3016 struct fuse_file_info *fi)
3017{
3018 struct file_info *f = (struct file_info *) fi->fh;
3019
3020 switch (f->type) {
3021 case LXC_TYPE_PROC_MEMINFO:
3022 return proc_meminfo_read(buf, size, offset, fi);
3023 case LXC_TYPE_PROC_CPUINFO:
3024 return proc_cpuinfo_read(buf, size, offset, fi);
3025 case LXC_TYPE_PROC_UPTIME:
3026 return proc_uptime_read(buf, size, offset, fi);
3027 case LXC_TYPE_PROC_STAT:
3028 return proc_stat_read(buf, size, offset, fi);
3029 case LXC_TYPE_PROC_DISKSTATS:
3030 return proc_diskstats_read(buf, size, offset, fi);
3031 default:
3032 return -EINVAL;
3033 }
3034}
3035
3036/*
3037 * FUSE ops for /
3038 * these just delegate to the /proc and /cgroup ops as
3039 * needed
3040 */
3041
/*
 * getattr for the filesystem root: "/" is a directory (mode 0755, 2 links);
 * paths starting with "/cgroup" or "/proc" are delegated to cg_getattr() and
 * proc_getattr() respectively.
 *
 * Any other path does not exist in this filesystem, so -ENOENT is returned
 * (matching proc_getattr() and lxcfs_opendir()) rather than -EINVAL, which
 * would make stat() on a missing name report "Invalid argument".
 */
static int lxcfs_getattr(const char *path, struct stat *sb)
{
	if (strcmp(path, "/") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}
	if (strncmp(path, "/cgroup", 7) == 0) {
		return cg_getattr(path, sb);
	}
	if (strncmp(path, "/proc", 5) == 0) {
		return proc_getattr(path, sb);
	}
	return -ENOENT;
}
3057
/*
 * opendir: "/" and "/proc" succeed without any per-open state; paths starting
 * with "/cgroup" are delegated to cg_opendir(). Anything else is -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_opendir(path, fi);

	return -ENOENT;
}
3070
3071static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3072 struct fuse_file_info *fi)
3073{
3074 if (strcmp(path, "/") == 0) {
3075 if (filler(buf, "proc", NULL, 0) != 0 ||
3076 filler(buf, "cgroup", NULL, 0) != 0)
3077 return -EINVAL;
3078 return 0;
3079 }
3080 if (strncmp(path, "/cgroup", 7) == 0)
3081 return cg_readdir(path, buf, filler, offset, fi);
3082 if (strcmp(path, "/proc") == 0)
3083 return proc_readdir(path, buf, filler, offset, fi);
3084 return -EINVAL;
3085}
3086
/*
 * releasedir: nothing to release for "/" and "/proc"; paths starting with
 * "/cgroup" are delegated to cg_releasedir(). Anything else is -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (!strcmp(path, "/") || !strcmp(path, "/proc"))
		return 0;

	if (!strncmp(path, "/cgroup", 7))
		return cg_releasedir(path, fi);

	return -EINVAL;
}
3098
/*
 * open: delegate to cg_open() or proc_open() according to the path prefix;
 * -EINVAL for any other path.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_open(path, fi);
	if (under_proc)
		return proc_open(path, fi);

	return -EINVAL;
}
3108
/*
 * read: delegate to cg_read() or proc_read() according to the path prefix;
 * -EINVAL for any other path.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (under_proc)
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
3119
/*
 * write: only paths starting with "/cgroup" are writable (handled by
 * cg_write()); everything else is rejected with -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
3129
/* flush: nothing to do for any path; always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;

	return 0;
}
3134
/*
 * release: delegate to cg_release() or proc_release() according to the path
 * prefix; -EINVAL for any other path.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool under_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool under_proc = strncmp(path, "/proc", 5) == 0;

	if (under_cgroup)
		return cg_release(path, fi);
	if (under_proc)
		return proc_release(path, fi);

	return -EINVAL;
}
3144
/* fsync: nothing to do for any path; always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	return 0;
}
3149
/*
 * mkdir: only supported for paths starting with "/cgroup" (handled by
 * cg_mkdir()); -EINVAL otherwise.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
3157
/*
 * chown: only supported for paths starting with "/cgroup" (handled by
 * cg_chown()); -EINVAL otherwise.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
3165
/*
 * Tools such as cat issue a truncate before ops->write. Truncation has no
 * sensible meaning for cgroup files, so for paths starting with "/cgroup"
 * this reports success without doing anything. Every other path is rejected
 * with -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) ? -EINVAL : 0;
}
3177
/*
 * rmdir: only supported for paths starting with "/cgroup" (handled by
 * cg_rmdir()); -EINVAL otherwise.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
3184
/*
 * chmod: only supported for paths starting with "/cgroup" (handled by
 * cg_chmod()); -EINVAL otherwise.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
3191
3192const struct fuse_operations lxcfs_ops = {
3193 .getattr = lxcfs_getattr,
3194 .readlink = NULL,
3195 .getdir = NULL,
3196 .mknod = NULL,
3197 .mkdir = lxcfs_mkdir,
3198 .unlink = NULL,
3199 .rmdir = lxcfs_rmdir,
3200 .symlink = NULL,
3201 .rename = NULL,
3202 .link = NULL,
3203 .chmod = lxcfs_chmod,
3204 .chown = lxcfs_chown,
3205 .truncate = lxcfs_truncate,
3206 .utime = NULL,
3207
3208 .open = lxcfs_open,
3209 .read = lxcfs_read,
3210 .release = lxcfs_release,
3211 .write = lxcfs_write,
3212
3213 .statfs = NULL,
3214 .flush = lxcfs_flush,
3215 .fsync = lxcfs_fsync,
3216
3217 .setxattr = NULL,
3218 .getxattr = NULL,
3219 .listxattr = NULL,
3220 .removexattr = NULL,
3221
3222 .opendir = lxcfs_opendir,
3223 .readdir = lxcfs_readdir,
3224 .releasedir = lxcfs_releasedir,
3225
3226 .fsyncdir = NULL,
3227 .init = NULL,
3228 .destroy = NULL,
3229 .access = NULL,
3230 .create = NULL,
3231 .ftruncate = NULL,
3232 .fgetattr = NULL,
3233};
3234
/* Print the command-line synopsis for @me to stderr and exit with status 1. */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
3243
/* Return true if @w is one of the accepted spellings of the help option. */
static bool is_help(char *w)
{
	static const char *const spellings[] = { "-h", "--help", "-help", "help" };
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (!strcmp(w, spellings[i]))
			return true;
	}

	return false;
}
3253
/*
 * Remove the first occurrence of @which from the NULL-terminated @argv
 * (argv[0] is never considered), shifting the following entries and the
 * terminating NULL down by one and decrementing *@argcp.
 * Does nothing if @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* find the first matching argument, if any */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap; the last copy moves the NULL terminator */
	for (; *slot; slot++)
		slot[0] = slot[1];

	(*argcp)--;
}
3268
/*
 * Remove the first "@opt @v" pair from the NULL-terminated @argv (argv[0] is
 * never considered), shifting the following entries down by two and
 * decrementing *@argcp by two.
 *
 * If @opt is found but is followed by a value other than @v, the offending
 * value is reported on stderr and the process exits with status 1.
 * An @opt that is the last argument (no value) is left untouched.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually passed, not the value we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+1] is non-NULL here, so argv[i+2] is at most the terminator */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3289
3290int main(int argc, char *argv[])
3291{
3292 int ret = -1;
3293 /*
3294 * what we pass to fuse_main is:
3295 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3296 */
3297 int nargs = 5, cnt = 0;
3298 char *newargv[6];
3299
3300#ifdef FORTRAVIS
3301 /* for travis which runs on 12.04 */
3302 if (glib_check_version (2, 36, 0) != NULL)
3303 g_type_init ();
3304#endif
3305
3306 /* accomodate older init scripts */
3307 swallow_arg(&argc, argv, "-s");
3308 swallow_arg(&argc, argv, "-f");
3309 swallow_option(&argc, argv, "-o", "allow_other");
3310
3311 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3312 fprintf(stderr, "%s\n", VERSION);
3313 exit(0);
3314 }
3315 if (argc != 2 || is_help(argv[1]))
3316 usage(argv[0]);
3317
3318 newargv[cnt++] = argv[0];
3319 newargv[cnt++] = "-f";
3320 newargv[cnt++] = "-o";
3321 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
3322 newargv[cnt++] = argv[1];
3323 newargv[cnt++] = NULL;
3324
3325 if (!cgfs_setup_controllers())
3326 goto out;
3327
3328 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
3329
3330out:
3331 return ret;
3332}