]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
Avoid theoretical underflow in prune_init_{slice,scope}
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <pthread.h>
24 #include <linux/sched.h>
25 #include <sys/socket.h>
26 #include <sys/mount.h>
27 #include <sys/epoll.h>
28 #include <wait.h>
29
30 #ifdef FORTRAVIS
31 #define GLIB_DISABLE_DEPRECATION_WARNINGS
32 #include <glib-object.h>
33 #endif
34
35 #include "cgfs.h"
36 #include "config.h" // for VERSION
37
/*
 * Kinds of object a struct file_info handle can describe.  The value is
 * stored in file_info.type.
 */
enum {
	LXC_TYPE_CGDIR = 0,		/* set by cg_opendir() */
	LXC_TYPE_CGFILE,		/* set by cg_open() */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
47
/*
 * State attached to an open FUSE handle.  cg_opendir() and cg_open()
 * allocate one and store it in fi->fh; do_release_file_info() frees it
 * together with the controller, cgroup, file and buf strings.
 */
struct file_info {
	char *controller;
	char *cgroup;
	char *file;
	int type;	/* one of the LXC_TYPE_* values */
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
58
59 /* reserve buffer size, for cpuall in /proc/stat */
60 #define BUF_RESERVE_SIZE 256
61
/*
 * Cache of which pid is init for a given pid namespace.
 * To find the init pid for $qpid we
 *   1. stat /proc/$qpid/ns/pid, then
 *   2. look the resulting inode number up in the store:
 *      a. if it is absent, fork a child into qpid's namespace which
 *         sends us ucred.pid = 1, and read back the init pid.  Cache
 *         the init pid together with the creation time of
 *         /proc/<initpid> in a new store entry.
 *      b. if it is present, verify that /proc/<initpid> still matches
 *         what was saved.  If it does, return the cached init pid;
 *         otherwise drop the entry and proceed as in (a).
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that namespace */
	long int ctime;			/* st_ctime of /proc/$initpid when cached */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	long int lastcheck;		/* time() of the last successful lookup */
};
83
/* Number of buckets in the pid-namespace init cache. */
#define PIDNS_HASH_SIZE 4096
/* Bucket index for a pid-namespace inode number. */
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the cache; each bucket is a list chained through ->next. */
struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken by store_lock() / released by store_unlock(). */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l.  A failure of pthread_mutex_lock() is reported on stderr
 * and terminates the process with exit status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
99
/*
 * Release @l.  A failure of pthread_mutex_unlock() is reported on stderr
 * and terminates the process with exit status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
109
110 static void store_lock(void)
111 {
112 lock_mutex(&pidns_store_mutex);
113 }
114
115 static void store_unlock(void)
116 {
117 unlock_mutex(&pidns_store_mutex);
118 }
119
120 /* Must be called under store_lock */
121 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
122 {
123 struct stat initsb;
124 char fnam[100];
125
126 snprintf(fnam, 100, "/proc/%d", e->initpid);
127 if (stat(fnam, &initsb) < 0)
128 return false;
129 #if DEBUG
130 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
131 e->ctime, initsb.st_ctime, e->initpid);
132 #endif
133 if (e->ctime != initsb.st_ctime)
134 return false;
135 return true;
136 }
137
138 /* Must be called under store_lock */
139 static void remove_initpid(struct pidns_init_store *e)
140 {
141 struct pidns_init_store *tmp;
142 int h;
143
144 #if DEBUG
145 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
146 #endif
147 h = HASH(e->ino);
148 if (pidns_hash_table[h] == e) {
149 pidns_hash_table[h] = e->next;
150 free(e);
151 return;
152 }
153
154 tmp = pidns_hash_table[h];
155 while (tmp) {
156 if (tmp->next == e) {
157 tmp->next = e->next;
158 free(e);
159 return;
160 }
161 tmp = tmp->next;
162 }
163 }
164
165 #define PURGE_SECS 5
166 /* Must be called under store_lock */
167 static void prune_initpid_store(void)
168 {
169 static long int last_prune = 0;
170 struct pidns_init_store *e, *prev, *delme;
171 long int now, threshold;
172 int i;
173
174 if (!last_prune) {
175 last_prune = time(NULL);
176 return;
177 }
178 now = time(NULL);
179 if (now < last_prune + PURGE_SECS)
180 return;
181 #if DEBUG
182 fprintf(stderr, "pruning\n");
183 #endif
184 last_prune = now;
185 threshold = now - 2 * PURGE_SECS;
186
187 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
188 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
189 if (e->lastcheck < threshold) {
190 #if DEBUG
191 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
192 #endif
193 delme = e;
194 if (prev)
195 prev->next = e->next;
196 else
197 pidns_hash_table[i] = e->next;
198 e = e->next;
199 free(delme);
200 } else {
201 prev = e;
202 e = e->next;
203 }
204 }
205 }
206 }
207
208 /* Must be called under store_lock */
209 static void save_initpid(struct stat *sb, pid_t pid)
210 {
211 struct pidns_init_store *e;
212 char fpath[100];
213 struct stat procsb;
214 int h;
215
216 #if DEBUG
217 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
218 #endif
219 snprintf(fpath, 100, "/proc/%d", pid);
220 if (stat(fpath, &procsb) < 0)
221 return;
222 do {
223 e = malloc(sizeof(*e));
224 } while (!e);
225 e->ino = sb->st_ino;
226 e->initpid = pid;
227 e->ctime = procsb.st_ctime;
228 h = HASH(e->ino);
229 e->next = pidns_hash_table[h];
230 e->lastcheck = time(NULL);
231 pidns_hash_table[h] = e;
232 }
233
234 /*
235 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
236 * entry for the inode number and creation time. Verify that the init pid
237 * is still valid. If not, remove it. Return the entry if valid, NULL
238 * otherwise.
239 * Must be called under store_lock
240 */
241 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
242 {
243 int h = HASH(sb->st_ino);
244 struct pidns_init_store *e = pidns_hash_table[h];
245
246 while (e) {
247 if (e->ino == sb->st_ino) {
248 if (initpid_still_valid(e, sb)) {
249 e->lastcheck = time(NULL);
250 return e;
251 }
252 remove_initpid(e);
253 return NULL;
254 }
255 e = e->next;
256 }
257
258 return NULL;
259 }
260
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);

/*
 * Enter @target's pid namespace and fork a task which sends a ucred with
 * pid = 1 (and the byte '1') over @sock, so that the receiver can read
 * the pid of @target's init as seen from its own namespace.
 *
 * This function never returns: every path ends in _exit().  The
 * intermediate process exits 0 only if the forked sender exited 0, and
 * 1 on any failure.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	struct ucred cred;
	char fnam[100];
	pid_t pid;
	char v;
	int fd, ret;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	/* the namespace fd is not needed once setns() has succeeded */
	close(fd);

	/* only children forked after setns() live in the target pidns */
	pid = fork();
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* wait_for_pid() returns 0 on success and -1 on failure */
		if (wait_for_pid(pid) != 0)
			_exit(1);
		_exit(0);
	}

	/* we are the child */
	cred.uid = 0;
	cred.gid = 0;
	cred.pid = 1;
	v = '1';
	if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
		_exit(1);
	_exit(0);
}
313
/*
 * Determine the pid of the init process of @task's pid namespace.
 *
 * A helper process is forked which runs write_task_init_pid_exit() on one
 * end of a unix datagram socketpair; the credentials it sends are read on
 * the other end with recv_creds().
 *
 * Returns the pid carried in the received credentials, or -1 if the
 * socketpair or fork failed or no credentials could be received.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	int sv[2];
	char tag = '0';
	pid_t helper;
	pid_t initpid = -1;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		close(sv[1]);
		write_task_init_pid_exit(sv[0], task);
		_exit(0);
	}

	/* helper < 0 means fork failed: skip straight to cleanup */
	if (helper > 0 && recv_creds(sv[1], &cred, &tag))
		initpid = cred.pid;

	close(sv[0]);
	close(sv[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return initpid;
}
347
348 static pid_t lookup_initpid_in_store(pid_t qpid)
349 {
350 pid_t answer = 0;
351 struct stat sb;
352 struct pidns_init_store *e;
353 char fnam[100];
354
355 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
356 store_lock();
357 if (stat(fnam, &sb) < 0)
358 goto out;
359 e = lookup_verify_initpid(&sb);
360 if (e) {
361 answer = e->initpid;
362 goto out;
363 }
364 answer = get_init_pid_for_task(qpid);
365 if (answer > 0)
366 save_initpid(&sb, answer);
367
368 out:
369 /* we prune at end in case we are returning
370 * the value we were about to return */
371 prune_initpid_store();
372 store_unlock();
373 return answer;
374 }
375
/*
 * Wait for child @pid to terminate, retrying when waitpid() is interrupted.
 *
 * Returns 0 if the child exited normally with status 0, and -1 if @pid is
 * not positive, waitpid() fails for a reason other than EINTR, or the
 * child did not exit cleanly.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t reaped = waitpid(pid, &status, 0);

		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
396
397
398 /*
399 * append pid to *src.
400 * src: a pointer to a char* in which ot append the pid.
401 * sz: the number of characters printed so far, minus trailing \0.
402 * asz: the allocated size so far
403 * pid: the pid to append
404 */
405 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
406 {
407 char tmp[30];
408
409 int tmplen = sprintf(tmp, "%d\n", (int)pid);
410
411 if (!*src || tmplen + *sz + 1 >= *asz) {
412 char *tmp;
413 do {
414 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
415 } while (!tmp);
416 *src = tmp;
417 *asz += BUF_RESERVE_SIZE;
418 }
419 memcpy((*src) +*sz , tmp, tmplen);
420 *sz += tmplen;
421 (*src)[*sz] = '\0';
422 }
423
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 *
 * The file is rewound and scanned line by line; lines which do not parse
 * as three unsigned numbers are skipped.  Returns the mapped id, or -1
 * (as unsigned int) if no range contains @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,	/* first id of the range inside the namespace */
		     host_base,	/* first id of the range in the caller's namespace */
		     range;	/* number of ids in the range */
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around.  That is unexpected for a
			 * proc file, so give up.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no range matched */
	return -1;
}
467
/*
 * Flags for the req_ns_root argument of is_privileged_over(): whether the
 * calling uid is required to be root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * Decide whether @uid, as seen by task @pid, is privileged over @victim.
 *
 * - false if either id is -1.
 * - true if @req_ns_root is false and @uid equals @victim (e.g. uid 1000
 *   acting on something owned by uid 1000).
 * - otherwise true only if @uid maps to 0 in /proc/<pid>/uid_map and
 *   @victim is mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char mappath[PROCLEN];
	bool answer = false;
	uid_t mapped;
	FILE *map;
	int len;

	if (victim == -1 || uid == -1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	len = snprintf(mappath, PROCLEN, "/proc/%d/uid_map", pid);
	if (len < 0 || len >= PROCLEN)
		return false;

	map = fopen(mappath, "r");
	if (!map)
		return false;

	/* the caller must be root in its own namespace ... */
	mapped = convert_id_to_ns(map, uid);
	if (mapped == 0) {
		/*
		 * ... and the victim must be mapped into the caller's ns.
		 * XXX not sure this check is needed, given that fuse sends
		 * requests whose ids the vfs has already converted.
		 */
		mapped = convert_id_to_ns(map, victim);
		if (mapped != -1)
			answer = true;
	}

	fclose(map);
	return answer;
}
523
/*
 * Check whether the "other" permission bits of @fmode allow the access
 * mode requested in @req_mode (an open(2) flags value; only its
 * O_ACCMODE part is examined).  Callers shift the owner or group bits
 * down into the "other" position first.
 *
 * Returns false for an access mode other than O_RDONLY, O_WRONLY or O_RDWR.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	const mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
543
544
/*
 * Return the path component of @taskcg that follows @querycg.
 *
 * @taskcg must be strictly longer than @querycg.  For example
 *   taskcg  = a/b/c/d/e
 *   querycg = a/b/c
 * yields "d".  When @querycg is "/", the first component of @taskcg after
 * its leading character is returned.
 *
 * Returns a newly allocated string which the caller must free, or NULL if
 * @taskcg is not longer than @querycg or the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* step over querycg and the separator which follows it */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
570
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
577
578 static char *get_pid_cgroup(pid_t pid, const char *contrl)
579 {
580 char fnam[PROCLEN];
581 FILE *f;
582 char *answer = NULL;
583 char *line = NULL;
584 size_t len = 0;
585 int ret;
586 const char *h = find_mounted_controller(contrl);
587 if (!h)
588 return NULL;
589
590 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
591 if (ret < 0 || ret >= PROCLEN)
592 return NULL;
593 if (!(f = fopen(fnam, "r")))
594 return NULL;
595
596 while (getline(&line, &len, f) != -1) {
597 char *c1, *c2;
598 if (!line[0])
599 continue;
600 c1 = strchr(line, ':');
601 if (!c1)
602 goto out;
603 c1++;
604 c2 = strchr(c1, ':');
605 if (!c2)
606 goto out;
607 *c2 = '\0';
608 if (strcmp(c1, h) != 0)
609 continue;
610 c2++;
611 stripnewline(c2);
612 do {
613 answer = strdup(c2);
614 } while (!answer);
615 break;
616 }
617
618 out:
619 fclose(f);
620 free(line);
621 return answer;
622 }
623
624 /*
625 * check whether a fuse context may access a cgroup dir or file
626 *
627 * If file is not null, it is a cgroup file to check under cg.
628 * If file is null, then we are checking perms on cg itself.
629 *
630 * For files we can check the mode of the list_keys result.
631 * For cgroups, we must make assumptions based on the files under the
632 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
633 * yet.
634 */
635 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
636 {
637 struct cgfs_files *k = NULL;
638 bool ret = false;
639
640 k = cgfs_get_key(contrl, cg, file);
641 if (!k)
642 return false;
643
644 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
645 if (perms_include(k->mode >> 6, mode)) {
646 ret = true;
647 goto out;
648 }
649 }
650 if (fc->gid == k->gid) {
651 if (perms_include(k->mode >> 3, mode)) {
652 ret = true;
653 goto out;
654 }
655 }
656 ret = perms_include(k->mode, mode);
657
658 out:
659 free_key(k);
660 return ret;
661 }
662
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place.  If @cg consists of
 * nothing but that suffix, it is reduced to "/" rather than to the empty
 * string.  Strings not ending in the suffix are left untouched.
 */
static void prune_init_slice(char *cg)
{
	const size_t suffix_len = strlen(INITSCOPE);
	const size_t len = strlen(cg);
	char *tail;

	if (len < suffix_len)
		return;

	tail = cg + (len - suffix_len);
	if (strcmp(tail, INITSCOPE) != 0)
		return;

	/* keep the leading '/' when the suffix is the whole string */
	if (tail == cg)
		tail++;
	*tail = '\0';
}
680
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true if @cg starts with the cgroup of @pid for controller
 * @contrl (after a trailing "/init.scope" has been pruned from it).
 * Returns false otherwise, or if the task's cgroup cannot be determined.
 *
 * When the comparison fails and @nextcg is not NULL, *nextcg is set to the
 * result of get_next_cgroup_dir() - the next cgroup directory under @cg on
 * the way to the task's cgroup - which the caller must free.
 *
 * NOTE(review): the comparison is a plain string-prefix test with no
 * check for a '/' boundary after the prefix - confirm that is intended.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	char *cmp;
	bool answer;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * callers pass in '/' for the root cgroup, otherwise they pass
	 * in a cgroup without leading '/'
	 */
	cmp = (*cg == '/') ? taskcg : taskcg + 1;

	answer = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return answer;
}
715
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true if @cg is "/", if the task sits in the root cgroup, or if
 * @cg equals, is a parent of, or is a child of the task's cgroup for
 * controller @contrl (with a trailing "/init.scope" pruned).  Parent and
 * child relations are only recognised at a '/' boundary.  Returns false
 * otherwise, or if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	size_t target_len, task_len;
	bool answer = false;
	char *full, *task_cg;

	if (strcmp(cg, "/") == 0)
		return true;

	full = get_pid_cgroup(pid, contrl);
	if (!full)
		return false;
	prune_init_slice(full);

	/* compare without the leading '/' of the task's cgroup */
	task_cg = full + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cg, so it can see everything.  The
		 * comparisons below cannot handle this case: they look for
		 * a '/' separator, and the only one was chopped off above.
		 */
		answer = true;
	} else if (strcmp(cg, task_cg) == 0) {
		answer = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		answer = strncmp(task_cg, cg, target_len) == 0 &&
			 task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		answer = strncmp(task_cg, cg, task_len) == 0 &&
			 cg[task_len] == '/';
	}

	free(full);
	return answer;
}
766
767 /*
768 * given /cgroup/freezer/a/b, return "freezer".
769 * the returned char* should NOT be freed.
770 */
771 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
772 {
773 const char *p1;
774 char *contr, *slash;
775
776 if (strlen(path) < 9)
777 return NULL;
778 if (*(path+7) != '/')
779 return NULL;
780 p1 = path+8;
781 contr = strdupa(p1);
782 if (!contr)
783 return NULL;
784 slash = strstr(contr, "/");
785 if (slash)
786 *slash = '\0';
787
788 int i;
789 for (i = 0; i < num_hierarchies; i++) {
790 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
791 return hierarchies[i];
792 }
793 return NULL;
794 }
795
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL if @path is shorter than 9 characters or has no such
 * '/'.  Note that the returned value may include files (keynames) etc.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
811
/*
 * Split the last path element off the path in @cg.
 *
 * *dir receives a newly allocated copy of @cg cut at its last '/'; it
 * should be freed by the caller.  *last is set to point at that last '/'
 * inside @cg itself (so it includes the slash) and must not be freed.
 * If @cg contains no '/', *dir is a full copy of @cg and *last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy = NULL;
	char *cut;

	/* the copy is retried until it succeeds */
	while (!copy)
		copy = strdup(cg);
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	cut = strrchr(copy, '/');
	*cut = '\0';
}
831
832 /*
833 * FUSE ops for /cgroup
834 */
835
836 static int cg_getattr(const char *path, struct stat *sb)
837 {
838 struct timespec now;
839 struct fuse_context *fc = fuse_get_context();
840 char * cgdir = NULL;
841 char *last = NULL, *path1, *path2;
842 struct cgfs_files *k = NULL;
843 const char *cgroup;
844 const char *controller = NULL;
845 int ret = -ENOENT;
846
847
848 if (!fc)
849 return -EIO;
850
851 memset(sb, 0, sizeof(struct stat));
852
853 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
854 return -EINVAL;
855
856 sb->st_uid = sb->st_gid = 0;
857 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
858 sb->st_size = 0;
859
860 if (strcmp(path, "/cgroup") == 0) {
861 sb->st_mode = S_IFDIR | 00755;
862 sb->st_nlink = 2;
863 return 0;
864 }
865
866 controller = pick_controller_from_path(fc, path);
867 if (!controller)
868 return -EIO;
869 cgroup = find_cgroup_in_path(path);
870 if (!cgroup) {
871 /* this is just /cgroup/controller, return it as a dir */
872 sb->st_mode = S_IFDIR | 00755;
873 sb->st_nlink = 2;
874 return 0;
875 }
876
877 get_cgdir_and_path(cgroup, &cgdir, &last);
878
879 if (!last) {
880 path1 = "/";
881 path2 = cgdir;
882 } else {
883 path1 = cgdir;
884 path2 = last;
885 }
886
887 pid_t initpid = lookup_initpid_in_store(fc->pid);
888 if (initpid <= 0)
889 initpid = fc->pid;
890 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
891 * Then check that caller's cgroup is under path if last is a child
892 * cgroup, or cgdir if last is a file */
893
894 if (is_child_cgroup(controller, path1, path2)) {
895 if (!caller_may_see_dir(initpid, controller, cgroup)) {
896 ret = -ENOENT;
897 goto out;
898 }
899 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
900 /* this is just /cgroup/controller, return it as a dir */
901 sb->st_mode = S_IFDIR | 00555;
902 sb->st_nlink = 2;
903 ret = 0;
904 goto out;
905 }
906 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
907 ret = -EACCES;
908 goto out;
909 }
910
911 // get uid, gid, from '/tasks' file and make up a mode
912 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
913 sb->st_mode = S_IFDIR | 00755;
914 k = cgfs_get_key(controller, cgroup, NULL);
915 if (!k) {
916 sb->st_uid = sb->st_gid = 0;
917 } else {
918 sb->st_uid = k->uid;
919 sb->st_gid = k->gid;
920 }
921 free_key(k);
922 sb->st_nlink = 2;
923 ret = 0;
924 goto out;
925 }
926
927 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
928 sb->st_mode = S_IFREG | k->mode;
929 sb->st_nlink = 1;
930 sb->st_uid = k->uid;
931 sb->st_gid = k->gid;
932 sb->st_size = 0;
933 free_key(k);
934 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
935 ret = -ENOENT;
936 goto out;
937 }
938 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
939 ret = -EACCES;
940 goto out;
941 }
942
943 ret = 0;
944 }
945
946 out:
947 free(cgdir);
948 return ret;
949 }
950
951 static int cg_opendir(const char *path, struct fuse_file_info *fi)
952 {
953 struct fuse_context *fc = fuse_get_context();
954 const char *cgroup;
955 struct file_info *dir_info;
956 char *controller = NULL;
957
958 if (!fc)
959 return -EIO;
960
961 if (strcmp(path, "/cgroup") == 0) {
962 cgroup = NULL;
963 controller = NULL;
964 } else {
965 // return list of keys for the controller, and list of child cgroups
966 controller = pick_controller_from_path(fc, path);
967 if (!controller)
968 return -EIO;
969
970 cgroup = find_cgroup_in_path(path);
971 if (!cgroup) {
972 /* this is just /cgroup/controller, return its contents */
973 cgroup = "/";
974 }
975 }
976
977 pid_t initpid = lookup_initpid_in_store(fc->pid);
978 if (initpid <= 0)
979 initpid = fc->pid;
980 if (cgroup) {
981 if (!caller_may_see_dir(initpid, controller, cgroup))
982 return -ENOENT;
983 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
984 return -EACCES;
985 }
986
987 /* we'll free this at cg_releasedir */
988 dir_info = malloc(sizeof(*dir_info));
989 if (!dir_info)
990 return -ENOMEM;
991 dir_info->controller = must_copy_string(controller);
992 dir_info->cgroup = must_copy_string(cgroup);
993 dir_info->type = LXC_TYPE_CGDIR;
994 dir_info->buf = NULL;
995 dir_info->file = NULL;
996 dir_info->buflen = 0;
997
998 fi->fh = (unsigned long)dir_info;
999 return 0;
1000 }
1001
1002 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1003 struct fuse_file_info *fi)
1004 {
1005 struct file_info *d = (struct file_info *)fi->fh;
1006 struct cgfs_files **list = NULL;
1007 int i, ret;
1008 char *nextcg = NULL;
1009 struct fuse_context *fc = fuse_get_context();
1010 char **clist = NULL;
1011
1012 if (d->type != LXC_TYPE_CGDIR) {
1013 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1014 return -EIO;
1015 }
1016 if (!d->cgroup && !d->controller) {
1017 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1018 int i;
1019
1020 for (i = 0; i < num_hierarchies; i++) {
1021 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1022 return -EIO;
1023 }
1024 }
1025 return 0;
1026 }
1027
1028 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1029 // not a valid cgroup
1030 ret = -EINVAL;
1031 goto out;
1032 }
1033
1034 pid_t initpid = lookup_initpid_in_store(fc->pid);
1035 if (initpid <= 0)
1036 initpid = fc->pid;
1037 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1038 if (nextcg) {
1039 int ret;
1040 ret = filler(buf, nextcg, NULL, 0);
1041 free(nextcg);
1042 if (ret != 0) {
1043 ret = -EIO;
1044 goto out;
1045 }
1046 }
1047 ret = 0;
1048 goto out;
1049 }
1050
1051 for (i = 0; list[i]; i++) {
1052 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1053 ret = -EIO;
1054 goto out;
1055 }
1056 }
1057
1058 // now get the list of child cgroups
1059
1060 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1061 ret = 0;
1062 goto out;
1063 }
1064 for (i = 0; clist[i]; i++) {
1065 if (filler(buf, clist[i], NULL, 0) != 0) {
1066 ret = -EIO;
1067 goto out;
1068 }
1069 }
1070 ret = 0;
1071
1072 out:
1073 free_keys(list);
1074 if (clist) {
1075 for (i = 0; clist[i]; i++)
1076 free(clist[i]);
1077 free(clist);
1078 }
1079 return ret;
1080 }
1081
1082 static void do_release_file_info(struct file_info *f)
1083 {
1084 if (!f)
1085 return;
1086 free(f->controller);
1087 free(f->cgroup);
1088 free(f->file);
1089 free(f->buf);
1090 free(f);
1091 }
1092
1093 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
1094 {
1095 struct file_info *d = (struct file_info *)fi->fh;
1096
1097 do_release_file_info(d);
1098 return 0;
1099 }
1100
1101 static int cg_open(const char *path, struct fuse_file_info *fi)
1102 {
1103 const char *cgroup;
1104 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1105 struct cgfs_files *k = NULL;
1106 struct file_info *file_info;
1107 struct fuse_context *fc = fuse_get_context();
1108 int ret;
1109
1110 if (!fc)
1111 return -EIO;
1112
1113 controller = pick_controller_from_path(fc, path);
1114 if (!controller)
1115 return -EIO;
1116 cgroup = find_cgroup_in_path(path);
1117 if (!cgroup)
1118 return -EINVAL;
1119
1120 get_cgdir_and_path(cgroup, &cgdir, &last);
1121 if (!last) {
1122 path1 = "/";
1123 path2 = cgdir;
1124 } else {
1125 path1 = cgdir;
1126 path2 = last;
1127 }
1128
1129 k = cgfs_get_key(controller, path1, path2);
1130 if (!k) {
1131 ret = -EINVAL;
1132 goto out;
1133 }
1134 free_key(k);
1135
1136 pid_t initpid = lookup_initpid_in_store(fc->pid);
1137 if (initpid <= 0)
1138 initpid = fc->pid;
1139 if (!caller_may_see_dir(initpid, controller, path1)) {
1140 ret = -ENOENT;
1141 goto out;
1142 }
1143 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1144 // should never get here
1145 ret = -EACCES;
1146 goto out;
1147 }
1148
1149 /* we'll free this at cg_release */
1150 file_info = malloc(sizeof(*file_info));
1151 if (!file_info) {
1152 ret = -ENOMEM;
1153 goto out;
1154 }
1155 file_info->controller = must_copy_string(controller);
1156 file_info->cgroup = must_copy_string(path1);
1157 file_info->file = must_copy_string(path2);
1158 file_info->type = LXC_TYPE_CGFILE;
1159 file_info->buf = NULL;
1160 file_info->buflen = 0;
1161
1162 fi->fh = (unsigned long)file_info;
1163 ret = 0;
1164
1165 out:
1166 free(cgdir);
1167 return ret;
1168 }
1169
1170 static int cg_release(const char *path, struct fuse_file_info *fi)
1171 {
1172 struct file_info *f = (struct file_info *)fi->fh;
1173
1174 do_release_file_info(f);
1175 return 0;
1176 }
1177
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, or until about
 * @timeout seconds have passed since the call started.  An epoll_wait()
 * interrupted by a signal is restarted with the remaining time.
 *
 * Returns true if an event was reported.  Returns false on timeout or on
 * any error; when the deadline is found to have passed before waiting,
 * errno is set to 0, otherwise errno is the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, nready, now, started, remaining, saved_errno;

	started = time(NULL);
	if (started < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	do {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		remaining = (started + timeout) - now;
		if (remaining < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}
		nready = epoll_wait(epfd, &ev, 1, 1000*remaining + 1);
	} while (nready < 0 && errno == EINTR);

	/* close() may change errno, so remember what epoll_wait() left */
	saved_errno = errno;
	close(epfd);

	if (nready <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1225
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about
 * two seconds for data to arrive.  Returns the result of recv(), or -1 if
 * wait_for_sock() reports that nothing became readable.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool readable = wait_for_sock(sockfd, 2);

	return readable ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
1232
1233 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1234 {
1235 struct msghdr msg = { 0 };
1236 struct iovec iov;
1237 struct cmsghdr *cmsg;
1238 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1239 char buf[1];
1240 buf[0] = 'p';
1241
1242 if (pingfirst) {
1243 if (msgrecv(sock, buf, 1) != 1) {
1244 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1245 __func__);
1246 return SEND_CREDS_FAIL;
1247 }
1248 }
1249
1250 msg.msg_control = cmsgbuf;
1251 msg.msg_controllen = sizeof(cmsgbuf);
1252
1253 cmsg = CMSG_FIRSTHDR(&msg);
1254 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1255 cmsg->cmsg_level = SOL_SOCKET;
1256 cmsg->cmsg_type = SCM_CREDENTIALS;
1257 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1258
1259 msg.msg_name = NULL;
1260 msg.msg_namelen = 0;
1261
1262 buf[0] = v;
1263 iov.iov_base = buf;
1264 iov.iov_len = sizeof(buf);
1265 msg.msg_iov = &iov;
1266 msg.msg_iovlen = 1;
1267
1268 if (sendmsg(sock, &msg, 0) < 0) {
1269 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1270 strerror(errno));
1271 if (errno == 3)
1272 return SEND_CREDS_NOTSK;
1273 return SEND_CREDS_FAIL;
1274 }
1275
1276 return SEND_CREDS_OK;
1277 }
1278
/*
 * Receive one byte plus SCM_CREDENTIALS from @sock.
 *
 * SO_PASSCRED is enabled on @sock and a single byte '1' is written to the
 * peer first; then a reply is awaited for at most about two seconds.
 *
 * On success *v holds the received byte and *cred the received
 * credentials; if the message carried no credentials control message,
 * all three fields of *cred stay at -1.  Returns false - with *v set to
 * '1' and *cred to -1 - if any step fails or times out.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	struct iovec iov;
	char data[1];
	int enable = 1;

	/* defaults reported when nothing usable is received */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* tell the peer that we are ready */
	data[0] = '1';
	if (write(sock, data, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = data;
	iov.iov_len = sizeof(data);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			strerror(errno));
		return false;
	}
	if (recvmsg(sock, &msg, MSG_DONTWAIT) < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = data[0];
	return true;
}
1338
1339
/*
 * pid_to_ns - translation loop run inside the target pid namespace.
 *
 * Repeatedly receives a ucred over @sock and writes the pid found in it
 * back over the same socket as a raw pid_t.  Because the credentials are
 * passed through the socket, the pid read back is the sender's pid
 * shifted into this process's pid namespace.
 *
 * A message whose payload byte is '1' means "done" and ends the process
 * with status 0; a failed write ends it with status 1.  This function
 * never returns.  @tpid is not used here.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1358
/*
 * pid_to_ns_wrapper - enter @tpid's pid namespace and start pid_to_ns().
 *
 * A process that calls setns() on a pid namespace stays in its old pid
 * namespace itself; only the children it forks afterwards live in the
 * new one.  So this wrapper does the setns(), then forks a child which
 * performs the actual pid conversion on @sock.
 *
 * The child acknowledges over a pipe that it is running; the wrapper
 * waits for that ack (wait_for_sock with an argument of 1), then waits
 * for the child to finish.  Always terminates via _exit(): status 0 on
 * success, 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int ackpipe[2];
	int nsfd = -1;
	int ret;
	pid_t child;
	char ack;

	ret = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	child = fork();
	if (child < 0)
		_exit(1);

	if (child == 0) {
		char ready = '1';

		close(ackpipe[0]);
		if (write(ackpipe[1], &ready, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(ackpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); /* not reached */
	}

	/* Give the child 1 second to finish forking and write its ack. */
	if (!wait_for_sock(ackpipe[0], 1))
		_exit(1);
	ret = read(ackpipe[0], &ack, 1);
	if (ret != sizeof(char) || ack != '1')
		_exit(1);

	if (!wait_for_pid(child))
		_exit(1);
	_exit(0);
}
1412
1413 /*
1414 * To read cgroup files with a particular pid, we will setns into the child
1415 * pidns, open a pipe, fork a child - which will be the first to really be in
1416 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1417 */
1418 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1419 {
1420 int sock[2] = {-1, -1};
1421 char *tmpdata = NULL;
1422 int ret;
1423 pid_t qpid, cpid = -1;
1424 bool answer = false;
1425 char v = '0';
1426 struct ucred cred;
1427 size_t sz = 0, asz = 0;
1428
1429 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
1430 return false;
1431
1432 /*
1433 * Now we read the pids from returned data one by one, pass
1434 * them into a child in the target namespace, read back the
1435 * translated pids, and put them into our to-return data
1436 */
1437
1438 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1439 perror("socketpair");
1440 free(tmpdata);
1441 return false;
1442 }
1443
1444 cpid = fork();
1445 if (cpid == -1)
1446 goto out;
1447
1448 if (!cpid) // child - exits when done
1449 pid_to_ns_wrapper(sock[1], tpid);
1450
1451 char *ptr = tmpdata;
1452 cred.uid = 0;
1453 cred.gid = 0;
1454 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1455 cred.pid = qpid;
1456 ret = send_creds(sock[0], &cred, v, true);
1457
1458 if (ret == SEND_CREDS_NOTSK)
1459 goto next;
1460 if (ret == SEND_CREDS_FAIL)
1461 goto out;
1462
1463 // read converted results
1464 if (!wait_for_sock(sock[0], 2)) {
1465 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
1466 __func__, strerror(errno));
1467 goto out;
1468 }
1469 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1470 fprintf(stderr, "%s: error reading pid from child: %s\n",
1471 __func__, strerror(errno));
1472 goto out;
1473 }
1474 must_strcat_pid(d, &sz, &asz, qpid);
1475 next:
1476 ptr = strchr(ptr, '\n');
1477 if (!ptr)
1478 break;
1479 ptr++;
1480 }
1481
1482 cred.pid = getpid();
1483 v = '1';
1484 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1485 // failed to ask child to exit
1486 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1487 __func__, strerror(errno));
1488 goto out;
1489 }
1490
1491 answer = true;
1492
1493 out:
1494 free(tmpdata);
1495 if (cpid != -1)
1496 wait_for_pid(cpid);
1497 if (sock[0] != -1) {
1498 close(sock[0]);
1499 close(sock[1]);
1500 }
1501 return answer;
1502 }
1503
1504 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1505 struct fuse_file_info *fi)
1506 {
1507 struct fuse_context *fc = fuse_get_context();
1508 struct file_info *f = (struct file_info *)fi->fh;
1509 struct cgfs_files *k = NULL;
1510 char *data = NULL;
1511 int ret, s;
1512 bool r;
1513
1514 if (f->type != LXC_TYPE_CGFILE) {
1515 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1516 return -EIO;
1517 }
1518
1519 if (offset)
1520 return 0;
1521
1522 if (!fc)
1523 return -EIO;
1524
1525 if (!f->controller)
1526 return -EINVAL;
1527
1528 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1529 return -EINVAL;
1530 }
1531 free_key(k);
1532
1533
1534 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1535 ret = -EACCES;
1536 goto out;
1537 }
1538
1539 if (strcmp(f->file, "tasks") == 0 ||
1540 strcmp(f->file, "/tasks") == 0 ||
1541 strcmp(f->file, "/cgroup.procs") == 0 ||
1542 strcmp(f->file, "cgroup.procs") == 0)
1543 // special case - we have to translate the pids
1544 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1545 else
1546 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
1547
1548 if (!r) {
1549 ret = -EINVAL;
1550 goto out;
1551 }
1552
1553 if (!data) {
1554 ret = 0;
1555 goto out;
1556 }
1557 s = strlen(data);
1558 if (s > size)
1559 s = size;
1560 memcpy(buf, data, s);
1561 if (s > 0 && s < size && data[s-1] != '\n')
1562 buf[s++] = '\n';
1563
1564 ret = s;
1565
1566 out:
1567 free(data);
1568 return ret;
1569 }
1570
1571 static void pid_from_ns(int sock, pid_t tpid)
1572 {
1573 pid_t vpid;
1574 struct ucred cred;
1575 char v;
1576 int ret;
1577
1578 cred.uid = 0;
1579 cred.gid = 0;
1580 while (1) {
1581 if (!wait_for_sock(sock, 2)) {
1582 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
1583 _exit(1);
1584 }
1585 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1586 fprintf(stderr, "%s: bad read from parent: %s\n",
1587 __func__, strerror(errno));
1588 _exit(1);
1589 }
1590 if (vpid == -1) // done
1591 break;
1592 v = '0';
1593 cred.pid = vpid;
1594 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1595 v = '1';
1596 cred.pid = getpid();
1597 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1598 _exit(1);
1599 }
1600 }
1601 _exit(0);
1602 }
1603
/*
 * pid_from_ns_wrapper - enter @tpid's pid namespace and run pid_from_ns().
 *
 * After setns() into /proc/@tpid/ns/pid a child is forked, which writes a
 * one-byte ack to a pipe and then services @sock in pid_from_ns().  If
 * the ack does not arrive (wait_for_sock with an argument of 1) or is
 * not '1', the child is killed with SIGKILL, reaped, and a new one is
 * forked; this is repeated until an ack is received.
 *
 * Always terminates via _exit(): 0 when the child completed successfully
 * according to wait_for_pid(), 1 on any failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int ackpipe[2];
	int nsfd = -1;
	int ret;
	pid_t child;
	char ack;

	ret = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			_exit(1);

		if (child == 0) {
			char ready = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &ready, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			pid_from_ns(sock, tpid);
		}

		/* Give the child 1 second to finish forking and write its ack. */
		if (wait_for_sock(ackpipe[0], 1)) {
			ret = read(ackpipe[0], &ack, 1);
			if (ret == sizeof(char) && ack == '1')
				break;
		}

		/* No valid ack: get rid of this child and try again. */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}

	if (!wait_for_pid(child))
		_exit(1);
	_exit(0);
}
1659
/*
 * hostuid_to_ns - map host @uid through /proc/@pid/uid_map.
 *
 * The mapping itself is done by convert_id_to_ns(); its result is stored
 * in *@answer.
 *
 * Returns false when the uid_map file cannot be opened or the conversion
 * yields -1, true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);
	map = fopen(map_path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
1681
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/@pid/status.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are first set to -1 and are overwritten with the first
 * number on the "Uid:" and "Gid:" lines respectively.  If the status
 * file cannot be opened, or one of those lines cannot be parsed, an
 * error is printed to stderr and whatever has not been filled in yet
 * stays -1.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;

	/* line doubles as the path buffer and, below, as the read buffer */
	snprintf(line, sizeof(line), "/proc/%d/status", (int)pid);
	f = fopen(line, "r");
	if (f == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				/* pid_t is signed: print it with %d, not %u */
				fprintf(stderr, "bad uid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %d\n", (int)pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
1720
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are owned by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* @r must map to uid 0 in its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped))
		return false;
	if (mapped != 0)
		return false;

	/* ... and the victim's uid must be mapped in that namespace too. */
	return hostuid_to_ns(v_uid, r, &mapped);
}
1746
/*
 * do_write_pids - move the pids listed in @buf into cgroup @cg of
 * controller @contrl on behalf of task @tpid (host uid @tuid).
 *
 * The pids in @buf are as seen by the writer.  A helper process is
 * forked into the writer's pid namespace (pid_from_ns_wrapper); each pid
 * is written to it over a socketpair and comes back as a ucred.  When
 * the reply has payload '0', may_move_pid() is consulted and the pid
 * from the ucred is written to the file returned by open_pids_file().
 *
 * Each pid is flushed to the file on its own: the cgroup pid files take
 * one pid per write(2), so letting stdio accumulate several pids in its
 * buffer would hand the kernel their digits concatenated into a single
 * bogus number.
 *
 * Returns true only if no pid was refused, every write succeeded and the
 * pid file closed cleanly.  @file is not used here.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				/*
				 * One pid per write(2): push this pid out
				 * before the next one is formatted.  After a
				 * failed write stop, so that leftover buffered
				 * digits cannot merge with a later pid.
				 */
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0 ||
				    fflush(pids_file) != 0) {
					fail = true;
					break;
				}
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Done with the list (or gave up): ask the helper to exit */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
1826
1827 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1828 struct fuse_file_info *fi)
1829 {
1830 struct fuse_context *fc = fuse_get_context();
1831 char *localbuf = NULL;
1832 struct cgfs_files *k = NULL;
1833 struct file_info *f = (struct file_info *)fi->fh;
1834 bool r;
1835
1836 if (f->type != LXC_TYPE_CGFILE) {
1837 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1838 return -EIO;
1839 }
1840
1841 if (offset)
1842 return 0;
1843
1844 if (!fc)
1845 return -EIO;
1846
1847 localbuf = alloca(size+1);
1848 localbuf[size] = '\0';
1849 memcpy(localbuf, buf, size);
1850
1851 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1852 size = -EINVAL;
1853 goto out;
1854 }
1855
1856 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1857 size = -EACCES;
1858 goto out;
1859 }
1860
1861 if (strcmp(f->file, "tasks") == 0 ||
1862 strcmp(f->file, "/tasks") == 0 ||
1863 strcmp(f->file, "/cgroup.procs") == 0 ||
1864 strcmp(f->file, "cgroup.procs") == 0)
1865 // special case - we have to translate the pids
1866 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1867 else
1868 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1869
1870 if (!r)
1871 size = -EINVAL;
1872
1873 out:
1874 free_key(k);
1875 return size;
1876 }
1877
1878 int cg_chown(const char *path, uid_t uid, gid_t gid)
1879 {
1880 struct fuse_context *fc = fuse_get_context();
1881 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1882 struct cgfs_files *k = NULL;
1883 const char *cgroup;
1884 int ret;
1885
1886 if (!fc)
1887 return -EIO;
1888
1889 if (strcmp(path, "/cgroup") == 0)
1890 return -EINVAL;
1891
1892 controller = pick_controller_from_path(fc, path);
1893 if (!controller)
1894 return -EINVAL;
1895 cgroup = find_cgroup_in_path(path);
1896 if (!cgroup)
1897 /* this is just /cgroup/controller */
1898 return -EINVAL;
1899
1900 get_cgdir_and_path(cgroup, &cgdir, &last);
1901
1902 if (!last) {
1903 path1 = "/";
1904 path2 = cgdir;
1905 } else {
1906 path1 = cgdir;
1907 path2 = last;
1908 }
1909
1910 if (is_child_cgroup(controller, path1, path2)) {
1911 // get uid, gid, from '/tasks' file and make up a mode
1912 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1913 k = cgfs_get_key(controller, cgroup, "tasks");
1914
1915 } else
1916 k = cgfs_get_key(controller, path1, path2);
1917
1918 if (!k) {
1919 ret = -EINVAL;
1920 goto out;
1921 }
1922
1923 /*
1924 * This being a fuse request, the uid and gid must be valid
1925 * in the caller's namespace. So we can just check to make
1926 * sure that the caller is root in his uid, and privileged
1927 * over the file's current owner.
1928 */
1929 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1930 ret = -EACCES;
1931 goto out;
1932 }
1933
1934 ret = cgfs_chown_file(controller, cgroup, uid, gid);
1935
1936 out:
1937 free_key(k);
1938 free(cgdir);
1939
1940 return ret;
1941 }
1942
1943 int cg_chmod(const char *path, mode_t mode)
1944 {
1945 struct fuse_context *fc = fuse_get_context();
1946 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1947 struct cgfs_files *k = NULL;
1948 const char *cgroup;
1949 int ret;
1950
1951 if (!fc)
1952 return -EIO;
1953
1954 if (strcmp(path, "/cgroup") == 0)
1955 return -EINVAL;
1956
1957 controller = pick_controller_from_path(fc, path);
1958 if (!controller)
1959 return -EINVAL;
1960 cgroup = find_cgroup_in_path(path);
1961 if (!cgroup)
1962 /* this is just /cgroup/controller */
1963 return -EINVAL;
1964
1965 get_cgdir_and_path(cgroup, &cgdir, &last);
1966
1967 if (!last) {
1968 path1 = "/";
1969 path2 = cgdir;
1970 } else {
1971 path1 = cgdir;
1972 path2 = last;
1973 }
1974
1975 if (is_child_cgroup(controller, path1, path2)) {
1976 // get uid, gid, from '/tasks' file and make up a mode
1977 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1978 k = cgfs_get_key(controller, cgroup, "tasks");
1979
1980 } else
1981 k = cgfs_get_key(controller, path1, path2);
1982
1983 if (!k) {
1984 ret = -EINVAL;
1985 goto out;
1986 }
1987
1988 /*
1989 * This being a fuse request, the uid and gid must be valid
1990 * in the caller's namespace. So we can just check to make
1991 * sure that the caller is root in his uid, and privileged
1992 * over the file's current owner.
1993 */
1994 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1995 ret = -EPERM;
1996 goto out;
1997 }
1998
1999 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2000 ret = -EINVAL;
2001 goto out;
2002 }
2003
2004 ret = 0;
2005 out:
2006 free_key(k);
2007 free(cgdir);
2008 return ret;
2009 }
2010
2011 int cg_mkdir(const char *path, mode_t mode)
2012 {
2013 struct fuse_context *fc = fuse_get_context();
2014 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2015 const char *cgroup;
2016 int ret;
2017
2018 if (!fc)
2019 return -EIO;
2020
2021
2022 controller = pick_controller_from_path(fc, path);
2023 if (!controller)
2024 return -EINVAL;
2025
2026 cgroup = find_cgroup_in_path(path);
2027 if (!cgroup)
2028 return -EINVAL;
2029
2030 get_cgdir_and_path(cgroup, &cgdir, &last);
2031 if (!last)
2032 path1 = "/";
2033 else
2034 path1 = cgdir;
2035
2036 pid_t initpid = lookup_initpid_in_store(fc->pid);
2037 if (initpid <= 0)
2038 initpid = fc->pid;
2039 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2040 if (!next)
2041 ret = -EINVAL;
2042 else if (last && strcmp(next, last) == 0)
2043 ret = -EEXIST;
2044 else
2045 ret = -ENOENT;
2046 goto out;
2047 }
2048
2049 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2050 ret = -EACCES;
2051 goto out;
2052 }
2053 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2054 ret = -EACCES;
2055 goto out;
2056 }
2057
2058 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2059
2060 out:
2061 free(cgdir);
2062 free(next);
2063 return ret;
2064 }
2065
2066 static int cg_rmdir(const char *path)
2067 {
2068 struct fuse_context *fc = fuse_get_context();
2069 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2070 const char *cgroup;
2071 int ret;
2072
2073 if (!fc)
2074 return -EIO;
2075
2076 controller = pick_controller_from_path(fc, path);
2077 if (!controller)
2078 return -EINVAL;
2079
2080 cgroup = find_cgroup_in_path(path);
2081 if (!cgroup)
2082 return -EINVAL;
2083
2084 get_cgdir_and_path(cgroup, &cgdir, &last);
2085 if (!last) {
2086 ret = -EINVAL;
2087 goto out;
2088 }
2089
2090 pid_t initpid = lookup_initpid_in_store(fc->pid);
2091 if (initpid <= 0)
2092 initpid = fc->pid;
2093 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2094 if (!last || strcmp(next, last) == 0)
2095 ret = -EBUSY;
2096 else
2097 ret = -ENOENT;
2098 goto out;
2099 }
2100
2101 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2102 ret = -EACCES;
2103 goto out;
2104 }
2105 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2106 ret = -EACCES;
2107 goto out;
2108 }
2109
2110 if (!cgfs_remove(controller, cgroup)) {
2111 ret = -EINVAL;
2112 goto out;
2113 }
2114
2115 ret = 0;
2116
2117 out:
2118 free(cgdir);
2119 free(next);
2120 return ret;
2121 }
2122
/*
 * startswith - does @line begin with the string @pref?
 * An empty @pref matches every line.
 */
static bool startswith(const char *line, const char *pref)
{
	size_t preflen = strlen(pref);

	return strncmp(line, pref, preflen) == 0;
}
2129
/*
 * get_mem_cached - extract the "total_cache" value from @memstat.
 *
 * @memstat is scanned line by line for the first line starting with
 * "total_cache"; the number following that key is divided by 1024 and
 * stored in *@v.  *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur, *nl;

	*v = 0;
	for (cur = memstat; *cur; cur = nl + 1) {
		if (startswith(cur, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
2147
/*
 * get_blkio_io_value - look up one counter in blkio statistics text.
 *
 * @str is scanned line by line for the first line starting with
 * "<major>:<minor> <iotype>"; the number following that key is stored
 * in *@v.  *@v is 0 when no such line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur, *nl;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (cur = str; *cur; cur = nl + 1) {
		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}
		nl = strchr(cur, '\n');
		if (nl == NULL)
			return;
	}
}
2170
2171 static int read_file(const char *path, char *buf, size_t size,
2172 struct file_info *d)
2173 {
2174 size_t linelen = 0, total_len = 0, rv = 0;
2175 char *line = NULL;
2176 char *cache = d->buf;
2177 size_t cache_size = d->buflen;
2178 FILE *f = fopen(path, "r");
2179 if (!f)
2180 return 0;
2181
2182 while (getline(&line, &linelen, f) != -1) {
2183 size_t l = snprintf(cache, cache_size, "%s", line);
2184 if (l < 0) {
2185 perror("Error writing to cache");
2186 rv = 0;
2187 goto err;
2188 }
2189 if (l >= cache_size) {
2190 fprintf(stderr, "Internal error: truncated write to cache\n");
2191 rv = 0;
2192 goto err;
2193 }
2194 if (l < cache_size) {
2195 cache += l;
2196 cache_size -= l;
2197 total_len += l;
2198 } else {
2199 cache += cache_size;
2200 total_len += cache_size;
2201 cache_size = 0;
2202 break;
2203 }
2204 }
2205
2206 d->size = total_len;
2207 if (total_len > size ) total_len = size;
2208
2209 /* read from off 0 */
2210 memcpy(buf, d->buf, total_len);
2211 rv = total_len;
2212 err:
2213 fclose(f);
2214 free(line);
2215 return rv;
2216 }
2217
2218 /*
2219 * FUSE ops for /proc
2220 */
2221
/*
 * get_memlimit - read memory.limit_in_bytes of @cgroup.
 *
 * Returns the value parsed as a base-10 unsigned long, or
 * (unsigned long)-1 when the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *text = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &text))
		limit = strtoul(text, NULL, 10);
	free(text);

	return limit;
}
2234
/*
 * get_min_memlimit - smallest memory limit that applies to @cgroup.
 *
 * Starts with the limit of @cgroup itself, then walks up through every
 * parent directory until "/" and keeps the lowest limit seen.  Parents
 * whose limit cannot be read (get_memlimit() == -1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *dir = strdupa(cgroup);
	unsigned long lowest = get_memlimit(dir);

	while (strcmp(dir, "/") != 0) {
		unsigned long limit;

		dir = dirname(dir);
		limit = get_memlimit(dir);
		if (limit == (unsigned long)-1)
			continue;
		if (limit < lowest)
			lowest = limit;
	}

	return lowest;
}
2251
2252 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2253 struct fuse_file_info *fi)
2254 {
2255 struct fuse_context *fc = fuse_get_context();
2256 struct file_info *d = (struct file_info *)fi->fh;
2257 char *cg;
2258 char *memusage_str = NULL, *memstat_str = NULL,
2259 *memswlimit_str = NULL, *memswusage_str = NULL,
2260 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2261 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2262 cached = 0, hosttotal = 0;
2263 char *line = NULL;
2264 size_t linelen = 0, total_len = 0, rv = 0;
2265 char *cache = d->buf;
2266 size_t cache_size = d->buflen;
2267 FILE *f = NULL;
2268
2269 if (offset){
2270 if (offset > d->size)
2271 return -EINVAL;
2272 if (!d->cached)
2273 return 0;
2274 int left = d->size - offset;
2275 total_len = left > size ? size: left;
2276 memcpy(buf, cache + offset, total_len);
2277 return total_len;
2278 }
2279
2280 pid_t initpid = lookup_initpid_in_store(fc->pid);
2281 if (initpid <= 0)
2282 initpid = fc->pid;
2283 cg = get_pid_cgroup(initpid, "memory");
2284 if (!cg)
2285 return read_file("/proc/meminfo", buf, size, d);
2286
2287 memlimit = get_min_memlimit(cg);
2288 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2289 goto err;
2290 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2291 goto err;
2292
2293 // Following values are allowed to fail, because swapaccount might be turned
2294 // off for current kernel
2295 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
2296 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
2297 {
2298 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
2299 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
2300 goto err;
2301 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
2302 goto err;
2303
2304 memswlimit = strtoul(memswlimit_str, NULL, 10);
2305 memswusage = strtoul(memswusage_str, NULL, 10);
2306
2307 if (!strcmp(memswlimit_str, memswlimit_default_str))
2308 memswlimit = 0;
2309 if (!strcmp(memswusage_str, memswusage_default_str))
2310 memswusage = 0;
2311
2312 memswlimit = memswlimit / 1024;
2313 memswusage = memswusage / 1024;
2314 }
2315
2316 memusage = strtoul(memusage_str, NULL, 10);
2317 memlimit /= 1024;
2318 memusage /= 1024;
2319
2320 get_mem_cached(memstat_str, &cached);
2321
2322 f = fopen("/proc/meminfo", "r");
2323 if (!f)
2324 goto err;
2325
2326 while (getline(&line, &linelen, f) != -1) {
2327 size_t l;
2328 char *printme, lbuf[100];
2329
2330 memset(lbuf, 0, 100);
2331 if (startswith(line, "MemTotal:")) {
2332 sscanf(line+14, "%lu", &hosttotal);
2333 if (hosttotal < memlimit)
2334 memlimit = hosttotal;
2335 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2336 printme = lbuf;
2337 } else if (startswith(line, "MemFree:")) {
2338 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2339 printme = lbuf;
2340 } else if (startswith(line, "MemAvailable:")) {
2341 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2342 printme = lbuf;
2343 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2344 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2345 printme = lbuf;
2346 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2347 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2348 (memswlimit - memlimit) - (memswusage - memusage));
2349 printme = lbuf;
2350 } else if (startswith(line, "Buffers:")) {
2351 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2352 printme = lbuf;
2353 } else if (startswith(line, "Cached:")) {
2354 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2355 printme = lbuf;
2356 } else if (startswith(line, "SwapCached:")) {
2357 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2358 printme = lbuf;
2359 } else
2360 printme = line;
2361
2362 l = snprintf(cache, cache_size, "%s", printme);
2363 if (l < 0) {
2364 perror("Error writing to cache");
2365 rv = 0;
2366 goto err;
2367
2368 }
2369 if (l >= cache_size) {
2370 fprintf(stderr, "Internal error: truncated write to cache\n");
2371 rv = 0;
2372 goto err;
2373 }
2374
2375 cache += l;
2376 cache_size -= l;
2377 total_len += l;
2378 }
2379
2380 d->cached = 1;
2381 d->size = total_len;
2382 if (total_len > size ) total_len = size;
2383 memcpy(buf, d->buf, total_len);
2384
2385 rv = total_len;
2386 err:
2387 if (f)
2388 fclose(f);
2389 free(line);
2390 free(cg);
2391 free(memusage_str);
2392 free(memswlimit_str);
2393 free(memswusage_str);
2394 free(memstat_str);
2395 free(memswlimit_default_str);
2396 free(memswusage_default_str);
2397 return rv;
2398 }
2399
/*
 * Read the cpuset.cpus for cg
 * Return the answer in a newly allocated string which must be freed,
 * or NULL if the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2412
2413 bool cpu_in_cpuset(int cpu, const char *cpuset);
2414
/*
 * cpuline_in_cpuset - is the cpu named on a "processor : N" line of
 * /proc/cpuinfo a member of @cpuset?
 *
 * Returns false when @line is not such a line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);
	return false;
}
2423
/*
 * check whether this is a "^processor" line in /proc/cpuinfo, i.e. one
 * of the form "processor : N" with a number N
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
2435
2436 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2437 struct fuse_file_info *fi)
2438 {
2439 struct fuse_context *fc = fuse_get_context();
2440 struct file_info *d = (struct file_info *)fi->fh;
2441 char *cg;
2442 char *cpuset = NULL;
2443 char *line = NULL;
2444 size_t linelen = 0, total_len = 0, rv = 0;
2445 bool am_printing = false;
2446 int curcpu = -1;
2447 char *cache = d->buf;
2448 size_t cache_size = d->buflen;
2449 FILE *f = NULL;
2450
2451 if (offset){
2452 if (offset > d->size)
2453 return -EINVAL;
2454 if (!d->cached)
2455 return 0;
2456 int left = d->size - offset;
2457 total_len = left > size ? size: left;
2458 memcpy(buf, cache + offset, total_len);
2459 return total_len;
2460 }
2461
2462 pid_t initpid = lookup_initpid_in_store(fc->pid);
2463 if (initpid <= 0)
2464 initpid = fc->pid;
2465 cg = get_pid_cgroup(initpid, "cpuset");
2466 if (!cg)
2467 return read_file("proc/cpuinfo", buf, size, d);
2468
2469 cpuset = get_cpuset(cg);
2470 if (!cpuset)
2471 goto err;
2472
2473 f = fopen("/proc/cpuinfo", "r");
2474 if (!f)
2475 goto err;
2476
2477 while (getline(&line, &linelen, f) != -1) {
2478 size_t l;
2479 if (is_processor_line(line)) {
2480 am_printing = cpuline_in_cpuset(line, cpuset);
2481 if (am_printing) {
2482 curcpu ++;
2483 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2484 if (l < 0) {
2485 perror("Error writing to cache");
2486 rv = 0;
2487 goto err;
2488 }
2489 if (l >= cache_size) {
2490 fprintf(stderr, "Internal error: truncated write to cache\n");
2491 rv = 0;
2492 goto err;
2493 }
2494 if (l < cache_size){
2495 cache += l;
2496 cache_size -= l;
2497 total_len += l;
2498 }else{
2499 cache += cache_size;
2500 total_len += cache_size;
2501 cache_size = 0;
2502 break;
2503 }
2504 }
2505 continue;
2506 }
2507 if (am_printing) {
2508 l = snprintf(cache, cache_size, "%s", line);
2509 if (l < 0) {
2510 perror("Error writing to cache");
2511 rv = 0;
2512 goto err;
2513 }
2514 if (l >= cache_size) {
2515 fprintf(stderr, "Internal error: truncated write to cache\n");
2516 rv = 0;
2517 goto err;
2518 }
2519 if (l < cache_size) {
2520 cache += l;
2521 cache_size -= l;
2522 total_len += l;
2523 } else {
2524 cache += cache_size;
2525 total_len += cache_size;
2526 cache_size = 0;
2527 break;
2528 }
2529 }
2530 }
2531
2532 d->cached = 1;
2533 d->size = total_len;
2534 if (total_len > size ) total_len = size;
2535
2536 /* read from off 0 */
2537 memcpy(buf, d->buf, total_len);
2538 rv = total_len;
2539 err:
2540 if (f)
2541 fclose(f);
2542 free(line);
2543 free(cpuset);
2544 free(cg);
2545 return rv;
2546 }
2547
2548 static int proc_stat_read(char *buf, size_t size, off_t offset,
2549 struct fuse_file_info *fi)
2550 {
2551 struct fuse_context *fc = fuse_get_context();
2552 struct file_info *d = (struct file_info *)fi->fh;
2553 char *cg;
2554 char *cpuset = NULL;
2555 char *line = NULL;
2556 size_t linelen = 0, total_len = 0, rv = 0;
2557 int curcpu = -1; /* cpu numbering starts at 0 */
2558 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2559 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2560 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2561 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2562 char cpuall[CPUALL_MAX_SIZE];
2563 /* reserve for cpu all */
2564 char *cache = d->buf + CPUALL_MAX_SIZE;
2565 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2566 FILE *f = NULL;
2567
2568 if (offset){
2569 if (offset > d->size)
2570 return -EINVAL;
2571 if (!d->cached)
2572 return 0;
2573 int left = d->size - offset;
2574 total_len = left > size ? size: left;
2575 memcpy(buf, d->buf + offset, total_len);
2576 return total_len;
2577 }
2578
2579 pid_t initpid = lookup_initpid_in_store(fc->pid);
2580 if (initpid <= 0)
2581 initpid = fc->pid;
2582 cg = get_pid_cgroup(initpid, "cpuset");
2583 if (!cg)
2584 return read_file("/proc/stat", buf, size, d);
2585
2586 cpuset = get_cpuset(cg);
2587 if (!cpuset)
2588 goto err;
2589
2590 f = fopen("/proc/stat", "r");
2591 if (!f)
2592 goto err;
2593
2594 //skip first line
2595 if (getline(&line, &linelen, f) < 0) {
2596 fprintf(stderr, "proc_stat_read read first line failed\n");
2597 goto err;
2598 }
2599
2600 while (getline(&line, &linelen, f) != -1) {
2601 size_t l;
2602 int cpu;
2603 char cpu_char[10]; /* That's a lot of cores */
2604 char *c;
2605
2606 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2607 /* not a ^cpuN line containing a number N, just print it */
2608 l = snprintf(cache, cache_size, "%s", line);
2609 if (l < 0) {
2610 perror("Error writing to cache");
2611 rv = 0;
2612 goto err;
2613 }
2614 if (l >= cache_size) {
2615 fprintf(stderr, "Internal error: truncated write to cache\n");
2616 rv = 0;
2617 goto err;
2618 }
2619 if (l < cache_size) {
2620 cache += l;
2621 cache_size -= l;
2622 total_len += l;
2623 continue;
2624 } else {
2625 //no more space, break it
2626 cache += cache_size;
2627 total_len += cache_size;
2628 cache_size = 0;
2629 break;
2630 }
2631 }
2632
2633 if (sscanf(cpu_char, "%d", &cpu) != 1)
2634 continue;
2635 if (!cpu_in_cpuset(cpu, cpuset))
2636 continue;
2637 curcpu ++;
2638
2639 c = strchr(line, ' ');
2640 if (!c)
2641 continue;
2642 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2643 if (l < 0) {
2644 perror("Error writing to cache");
2645 rv = 0;
2646 goto err;
2647
2648 }
2649 if (l >= cache_size) {
2650 fprintf(stderr, "Internal error: truncated write to cache\n");
2651 rv = 0;
2652 goto err;
2653 }
2654
2655 cache += l;
2656 cache_size -= l;
2657 total_len += l;
2658
2659 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2660 &softirq, &steal, &guest) != 9)
2661 continue;
2662 user_sum += user;
2663 nice_sum += nice;
2664 system_sum += system;
2665 idle_sum += idle;
2666 iowait_sum += iowait;
2667 irq_sum += irq;
2668 softirq_sum += softirq;
2669 steal_sum += steal;
2670 guest_sum += guest;
2671 }
2672
2673 cache = d->buf;
2674
2675 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2676 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2677 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2678 memcpy(cache, cpuall, cpuall_len);
2679 cache += cpuall_len;
2680 } else{
2681 /* shouldn't happen */
2682 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2683 cpuall_len = 0;
2684 }
2685
2686 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2687 total_len += cpuall_len;
2688 d->cached = 1;
2689 d->size = total_len;
2690 if (total_len > size ) total_len = size;
2691
2692 memcpy(buf, d->buf, total_len);
2693 rv = total_len;
2694
2695 err:
2696 if (f)
2697 fclose(f);
2698 free(line);
2699 free(cpuset);
2700 free(cg);
2701 return rv;
2702 }
2703
/*
 * Age, in seconds, of the init process that lookup_initpid_in_store()
 * returns for @pid: the current time minus the st_ctime of /proc/<initpid>.
 *
 * Returns 0 when no init pid is found, when the path cannot be built, or
 * when the lstat() fails.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	int n;
	pid_t reaper = lookup_initpid_in_store(pid);

	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || (size_t)n >= sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
2724
/*
 * Busy time of the cgroup that the init process of @task lives in: the
 * value of cpuacct.usage for that cgroup divided by 1000000000
 * (presumably nanoseconds converted to seconds - confirm against the
 * cpuacct controller documentation).
 *
 * Returns 0 when the init pid, its cpuacct cgroup or the usage value cannot
 * be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw)) {
		busy = strtoul(raw, NULL, 10);
		busy /= 1000000000;
	}

	free(cg);
	free(raw);
	return busy;
}
2747
2748 /*
2749 * We read /proc/uptime and reuse its second field.
2750 * For the first field, we use the mtime for the reaper for
2751 * the calling pid as returned by getreaperage
2752 */
2753 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2754 struct fuse_file_info *fi)
2755 {
2756 struct fuse_context *fc = fuse_get_context();
2757 struct file_info *d = (struct file_info *)fi->fh;
2758 long int reaperage = getreaperage(fc->pid);
2759 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
2760 char *cache = d->buf;
2761 size_t total_len = 0;
2762
2763 if (offset){
2764 if (offset > d->size)
2765 return -EINVAL;
2766 if (!d->cached)
2767 return 0;
2768 int left = d->size - offset;
2769 total_len = left > size ? size: left;
2770 memcpy(buf, cache + offset, total_len);
2771 return total_len;
2772 }
2773
2774 idletime = reaperage - busytime;
2775 if (idletime > reaperage)
2776 idletime = reaperage;
2777
2778 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
2779 if (total_len < 0){
2780 perror("Error writing to cache");
2781 return 0;
2782 }
2783
2784 d->size = (int)total_len;
2785 d->cached = 1;
2786
2787 if (total_len > size) total_len = size;
2788
2789 memcpy(buf, d->buf, total_len);
2790 return total_len;
2791 }
2792
2793 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2794 struct fuse_file_info *fi)
2795 {
2796 char dev_name[72];
2797 struct fuse_context *fc = fuse_get_context();
2798 struct file_info *d = (struct file_info *)fi->fh;
2799 char *cg;
2800 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2801 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2802 unsigned long read = 0, write = 0;
2803 unsigned long read_merged = 0, write_merged = 0;
2804 unsigned long read_sectors = 0, write_sectors = 0;
2805 unsigned long read_ticks = 0, write_ticks = 0;
2806 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2807 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2808 char *cache = d->buf;
2809 size_t cache_size = d->buflen;
2810 char *line = NULL;
2811 size_t linelen = 0, total_len = 0, rv = 0;
2812 unsigned int major = 0, minor = 0;
2813 int i = 0;
2814 FILE *f = NULL;
2815
2816 if (offset){
2817 if (offset > d->size)
2818 return -EINVAL;
2819 if (!d->cached)
2820 return 0;
2821 int left = d->size - offset;
2822 total_len = left > size ? size: left;
2823 memcpy(buf, cache + offset, total_len);
2824 return total_len;
2825 }
2826
2827 pid_t initpid = lookup_initpid_in_store(fc->pid);
2828 if (initpid <= 0)
2829 initpid = fc->pid;
2830 cg = get_pid_cgroup(initpid, "blkio");
2831 if (!cg)
2832 return read_file("/proc/diskstats", buf, size, d);
2833
2834 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2835 goto err;
2836 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2837 goto err;
2838 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2839 goto err;
2840 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2841 goto err;
2842 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2843 goto err;
2844
2845
2846 f = fopen("/proc/diskstats", "r");
2847 if (!f)
2848 goto err;
2849
2850 while (getline(&line, &linelen, f) != -1) {
2851 size_t l;
2852 char *printme, lbuf[256];
2853
2854 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2855 if(i == 3){
2856 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2857 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2858 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2859 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2860 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2861 read_sectors = read_sectors/512;
2862 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2863 write_sectors = write_sectors/512;
2864
2865 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2866 rd_svctm = rd_svctm/1000000;
2867 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2868 rd_wait = rd_wait/1000000;
2869 read_ticks = rd_svctm + rd_wait;
2870
2871 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2872 wr_svctm = wr_svctm/1000000;
2873 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2874 wr_wait = wr_wait/1000000;
2875 write_ticks = wr_svctm + wr_wait;
2876
2877 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2878 tot_ticks = tot_ticks/1000000;
2879 }else{
2880 continue;
2881 }
2882
2883 memset(lbuf, 0, 256);
2884 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2885 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2886 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2887 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2888 printme = lbuf;
2889 } else
2890 continue;
2891
2892 l = snprintf(cache, cache_size, "%s", printme);
2893 if (l < 0) {
2894 perror("Error writing to fuse buf");
2895 rv = 0;
2896 goto err;
2897 }
2898 if (l >= cache_size) {
2899 fprintf(stderr, "Internal error: truncated write to cache\n");
2900 rv = 0;
2901 goto err;
2902 }
2903 cache += l;
2904 cache_size -= l;
2905 total_len += l;
2906 }
2907
2908 d->cached = 1;
2909 d->size = total_len;
2910 if (total_len > size ) total_len = size;
2911 memcpy(buf, d->buf, total_len);
2912
2913 rv = total_len;
2914 err:
2915 free(cg);
2916 if (f)
2917 fclose(f);
2918 free(line);
2919 free(io_serviced_str);
2920 free(io_merged_str);
2921 free(io_service_bytes_str);
2922 free(io_wait_time_str);
2923 free(io_service_time_str);
2924 return rv;
2925 }
2926
/*
 * Total number of bytes obtained by reading @which line by line until
 * getline() reports -1.  Returns 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp;

	fp = fopen(which, "r");
	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&line, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	free(line);
	fclose(fp);

	return total;
}
2943
2944 static int proc_getattr(const char *path, struct stat *sb)
2945 {
2946 struct timespec now;
2947
2948 memset(sb, 0, sizeof(struct stat));
2949 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
2950 return -EINVAL;
2951 sb->st_uid = sb->st_gid = 0;
2952 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
2953 if (strcmp(path, "/proc") == 0) {
2954 sb->st_mode = S_IFDIR | 00555;
2955 sb->st_nlink = 2;
2956 return 0;
2957 }
2958 if (strcmp(path, "/proc/meminfo") == 0 ||
2959 strcmp(path, "/proc/cpuinfo") == 0 ||
2960 strcmp(path, "/proc/uptime") == 0 ||
2961 strcmp(path, "/proc/stat") == 0 ||
2962 strcmp(path, "/proc/diskstats") == 0) {
2963 sb->st_size = 0;
2964 sb->st_mode = S_IFREG | 00444;
2965 sb->st_nlink = 1;
2966 return 0;
2967 }
2968
2969 return -ENOENT;
2970 }
2971
2972 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2973 struct fuse_file_info *fi)
2974 {
2975 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2976 filler(buf, "meminfo", NULL, 0) != 0 ||
2977 filler(buf, "stat", NULL, 0) != 0 ||
2978 filler(buf, "uptime", NULL, 0) != 0 ||
2979 filler(buf, "diskstats", NULL, 0) != 0)
2980 return -EINVAL;
2981 return 0;
2982 }
2983
2984 static int proc_open(const char *path, struct fuse_file_info *fi)
2985 {
2986 int type = -1;
2987 struct file_info *info;
2988
2989 if (strcmp(path, "/proc/meminfo") == 0)
2990 type = LXC_TYPE_PROC_MEMINFO;
2991 else if (strcmp(path, "/proc/cpuinfo") == 0)
2992 type = LXC_TYPE_PROC_CPUINFO;
2993 else if (strcmp(path, "/proc/uptime") == 0)
2994 type = LXC_TYPE_PROC_UPTIME;
2995 else if (strcmp(path, "/proc/stat") == 0)
2996 type = LXC_TYPE_PROC_STAT;
2997 else if (strcmp(path, "/proc/diskstats") == 0)
2998 type = LXC_TYPE_PROC_DISKSTATS;
2999 if (type == -1)
3000 return -ENOENT;
3001
3002 info = malloc(sizeof(*info));
3003 if (!info)
3004 return -ENOMEM;
3005
3006 memset(info, 0, sizeof(*info));
3007 info->type = type;
3008
3009 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3010 do {
3011 info->buf = malloc(info->buflen);
3012 } while (!info->buf);
3013 memset(info->buf, 0, info->buflen);
3014 /* set actual size to buffer size */
3015 info->size = info->buflen;
3016
3017 fi->fh = (unsigned long)info;
3018 return 0;
3019 }
3020
3021 static int proc_release(const char *path, struct fuse_file_info *fi)
3022 {
3023 struct file_info *f = (struct file_info *)fi->fh;
3024
3025 do_release_file_info(f);
3026 return 0;
3027 }
3028
3029 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
3030 struct fuse_file_info *fi)
3031 {
3032 struct file_info *f = (struct file_info *) fi->fh;
3033
3034 switch (f->type) {
3035 case LXC_TYPE_PROC_MEMINFO:
3036 return proc_meminfo_read(buf, size, offset, fi);
3037 case LXC_TYPE_PROC_CPUINFO:
3038 return proc_cpuinfo_read(buf, size, offset, fi);
3039 case LXC_TYPE_PROC_UPTIME:
3040 return proc_uptime_read(buf, size, offset, fi);
3041 case LXC_TYPE_PROC_STAT:
3042 return proc_stat_read(buf, size, offset, fi);
3043 case LXC_TYPE_PROC_DISKSTATS:
3044 return proc_diskstats_read(buf, size, offset, fi);
3045 default:
3046 return -EINVAL;
3047 }
3048 }
3049
3050 /*
3051 * FUSE ops for /
3052 * these just delegate to the /proc and /cgroup ops as
3053 * needed
3054 */
3055
3056 static int lxcfs_getattr(const char *path, struct stat *sb)
3057 {
3058 if (strcmp(path, "/") == 0) {
3059 sb->st_mode = S_IFDIR | 00755;
3060 sb->st_nlink = 2;
3061 return 0;
3062 }
3063 if (strncmp(path, "/cgroup", 7) == 0) {
3064 return cg_getattr(path, sb);
3065 }
3066 if (strncmp(path, "/proc", 5) == 0) {
3067 return proc_getattr(path, sb);
3068 }
3069 return -EINVAL;
3070 }
3071
/*
 * Top-level opendir: "/" and "/proc" need no state and succeed at once;
 * paths starting with "/cgroup" go to cg_opendir(); everything else is
 * -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -ENOENT;
}
3084
3085 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3086 struct fuse_file_info *fi)
3087 {
3088 if (strcmp(path, "/") == 0) {
3089 if (filler(buf, "proc", NULL, 0) != 0 ||
3090 filler(buf, "cgroup", NULL, 0) != 0)
3091 return -EINVAL;
3092 return 0;
3093 }
3094 if (strncmp(path, "/cgroup", 7) == 0)
3095 return cg_readdir(path, buf, filler, offset, fi);
3096 if (strcmp(path, "/proc") == 0)
3097 return proc_readdir(path, buf, filler, offset, fi);
3098 return -EINVAL;
3099 }
3100
/*
 * Top-level releasedir: nothing to release for "/" and "/proc"; paths
 * starting with "/cgroup" go to cg_releasedir(); everything else is
 * -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	if (strcmp(path, "/") == 0 || strcmp(path, "/proc") == 0)
		return 0;

	return -EINVAL;
}
3112
/*
 * Top-level open: routes on the path prefix to cg_open() or proc_open();
 * -EINVAL for anything outside both subtrees.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_open(path, fi);
	return in_proc ? proc_open(path, fi) : -EINVAL;
}
3122
/*
 * Top-level read: routes on the path prefix to cg_read() or proc_read();
 * -EINVAL for anything outside both subtrees.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_read(path, buf, size, offset, fi);
	return in_proc ? proc_read(path, buf, size, offset, fi) : -EINVAL;
}
3133
/*
 * Top-level write: only the /cgroup subtree is writable; every other path
 * yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
3143
/* flush is a no-op for every path; always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;
	return 0;
}
3148
/*
 * Top-level release: routes on the path prefix to cg_release() or
 * proc_release(); -EINVAL for anything outside both subtrees.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_release(path, fi);
	return in_proc ? proc_release(path, fi) : -EINVAL;
}
3158
/* fsync is a no-op for every path; always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;
	return 0;
}
3163
/* mkdir is only supported below /cgroup; every other path yields -EINVAL. */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
3171
/* chown is only supported below /cgroup; every other path yields -EINVAL. */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
3179
/*
 * Tools such as cat issue a truncate before they call ops->write.  Truncating
 * has no meaning for cgroup files, so for anything below /cgroup the request
 * is acknowledged without doing any work.  Other paths yield -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;
	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
3191
/* rmdir is only supported below /cgroup; every other path yields -EINVAL. */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
3198
/* chmod is only supported below /cgroup; every other path yields -EINVAL. */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
3205
3206 const struct fuse_operations lxcfs_ops = {
3207 .getattr = lxcfs_getattr,
3208 .readlink = NULL,
3209 .getdir = NULL,
3210 .mknod = NULL,
3211 .mkdir = lxcfs_mkdir,
3212 .unlink = NULL,
3213 .rmdir = lxcfs_rmdir,
3214 .symlink = NULL,
3215 .rename = NULL,
3216 .link = NULL,
3217 .chmod = lxcfs_chmod,
3218 .chown = lxcfs_chown,
3219 .truncate = lxcfs_truncate,
3220 .utime = NULL,
3221
3222 .open = lxcfs_open,
3223 .read = lxcfs_read,
3224 .release = lxcfs_release,
3225 .write = lxcfs_write,
3226
3227 .statfs = NULL,
3228 .flush = lxcfs_flush,
3229 .fsync = lxcfs_fsync,
3230
3231 .setxattr = NULL,
3232 .getxattr = NULL,
3233 .listxattr = NULL,
3234 .removexattr = NULL,
3235
3236 .opendir = lxcfs_opendir,
3237 .readdir = lxcfs_readdir,
3238 .releasedir = lxcfs_releasedir,
3239
3240 .fsyncdir = NULL,
3241 .init = NULL,
3242 .destroy = NULL,
3243 .access = NULL,
3244 .create = NULL,
3245 .ftruncate = NULL,
3246 .fgetattr = NULL,
3247 };
3248
/* Print the command line synopsis for program name @me to stderr and exit with status 1. */
static void usage(const char *me)
{
	fputs("Usage:\n\n", stderr);
	fprintf(stderr, "%s mountpoint\n%s -h\n", me, me);
	exit(1);
}
3257
/* True when @w is one of the accepted spellings of the help switch. */
static bool is_help(char *w)
{
	static const char *const help_words[] = { "-h", "--help", "-help", "help" };
	size_t i;

	for (i = 0; i < sizeof(help_words) / sizeof(help_words[0]); i++) {
		if (strcmp(w, help_words[i]) == 0)
			return true;
	}
	return false;
}
3267
/*
 * Remove the first occurrence of @which from the NULL-terminated vector
 * @argv (argv[0] is never considered), shifting the following entries and
 * the terminating NULL down by one and decrementing *@argcp.  Nothing is
 * changed when @which is not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot = argv + 1;

	/* find the first matching entry */
	while (*slot && strcmp(*slot, which) != 0)
		slot++;
	if (!*slot)
		return;

	/* close the gap, copying the terminating NULL as the last step */
	do {
		slot[0] = slot[1];
		slot++;
	} while (*slot);

	(*argcp)--;
}
3282
/*
 * Remove the first "@opt @v" pair from the NULL-terminated vector @argv
 * (argv[0] is never considered), shifting the following entries and the
 * terminating NULL down by two and reducing *@argcp by two.
 *
 * If @opt is found but is followed by a value other than @v, the offending
 * value is reported on stderr and the process exits with status 1.
 * Nothing is changed when @opt is absent or is the last argument.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* name the value that was actually given, not the expected one */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		/* argv[i+1] != NULL guarantees argv[i+2] is still inside the vector */
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3303
3304 int main(int argc, char *argv[])
3305 {
3306 int ret = -1;
3307 /*
3308 * what we pass to fuse_main is:
3309 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3310 */
3311 int nargs = 5, cnt = 0;
3312 char *newargv[6];
3313
3314 #ifdef FORTRAVIS
3315 /* for travis which runs on 12.04 */
3316 if (glib_check_version (2, 36, 0) != NULL)
3317 g_type_init ();
3318 #endif
3319
3320 /* accomodate older init scripts */
3321 swallow_arg(&argc, argv, "-s");
3322 swallow_arg(&argc, argv, "-f");
3323 swallow_option(&argc, argv, "-o", "allow_other");
3324
3325 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3326 fprintf(stderr, "%s\n", VERSION);
3327 exit(0);
3328 }
3329 if (argc != 2 || is_help(argv[1]))
3330 usage(argv[0]);
3331
3332 newargv[cnt++] = argv[0];
3333 newargv[cnt++] = "-f";
3334 newargv[cnt++] = "-o";
3335 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
3336 newargv[cnt++] = argv[1];
3337 newargv[cnt++] = NULL;
3338
3339 if (!cgfs_setup_controllers())
3340 goto out;
3341
3342 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
3343
3344 out:
3345 return ret;
3346 }