]> git.proxmox.com Git - mirror_lxcfs.git/blob - lxcfs.c
make HASH a smidge safer
[mirror_lxcfs.git] / lxcfs.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <pthread.h>
24 #include <linux/sched.h>
25 #include <sys/socket.h>
26 #include <sys/mount.h>
27 #include <sys/epoll.h>
28 #include <wait.h>
29
30 #ifdef FORTRAVIS
31 #define GLIB_DISABLE_DEPRECATION_WARNINGS
32 #include <glib-object.h>
33 #endif
34
35 #include "cgfs.h"
36 #include "config.h" // for VERSION
37
/*
 * Kind of object a struct file_info handle describes.  Only the two
 * cgroup kinds are assigned in the code visible here (opendir/open);
 * the PROC_* kinds are named after the /proc files they stand for.
 */
enum {
	LXC_TYPE_CGDIR,          /* an opened cgroup directory */
	LXC_TYPE_CGFILE,         /* an opened cgroup file */
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
47
/*
 * Per-open-handle state stored in fuse_file_info->fh.
 * The string members and buf are heap-owned and released together by
 * do_release_file_info().
 */
struct file_info {
	char *controller;	/* controller name, may be NULL */
	char *cgroup;		/* cgroup path, may be NULL */
	char *file;		/* file name inside the cgroup, NULL for directories */
	int type;		/* one of the LXC_TYPE_* values */
	char *buf;		/* unused as of yet */
	int buflen;		/* allocated length of buf */
	int size;		/* actual data size */
	int cached;
};
58
/*
 * Growth increment (in bytes) for dynamically sized buffers; originally
 * reserved for the cpuall line in /proc/stat.
 */
#define BUF_RESERVE_SIZE 256
61
/*
 * Cache of "which pid is init for a given pid namespace".
 *
 * To find init for $qpid we:
 *   1. stat /proc/$qpid/ns/pid;
 *   2. look the resulting inode number up in the store:
 *      a. on a miss, fork a child into qpid's namespace which sends us
 *         ucred.pid = 1; the pid we read back is the init pid.  It is
 *         cached together with the creation time of /proc/initpid.
 *      b. on a hit, check that /proc/initpid still matches what was
 *         saved.  If it does, return the cached initpid; otherwise
 *         drop the entry and proceed as in (a).
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long int ctime;			/* time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry in the same hash bucket */
	long int lastcheck;		/* time of the last successful lookup */
};
83
/*
 * Bucket count of the pidns store and the hash that maps an inode
 * number onto a bucket.  (Original note: "lol - look at how they are
 * allocated in the kernel".)
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* The store itself: singly linked chains, guarded by pidns_store_mutex. */
struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l.  A locking failure is reported on stderr and terminates
 * the process with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
99
/*
 * Release @l.  An unlocking failure is reported on stderr and
 * terminates the process with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
109
110 static void store_lock(void)
111 {
112 lock_mutex(&pidns_store_mutex);
113 }
114
115 static void store_unlock(void)
116 {
117 unlock_mutex(&pidns_store_mutex);
118 }
119
120 /* Must be called under store_lock */
121 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
122 {
123 struct stat initsb;
124 char fnam[100];
125
126 snprintf(fnam, 100, "/proc/%d", e->initpid);
127 if (stat(fnam, &initsb) < 0)
128 return false;
129 #if DEBUG
130 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
131 e->ctime, initsb.st_ctime, e->initpid);
132 #endif
133 if (e->ctime != initsb.st_ctime)
134 return false;
135 return true;
136 }
137
138 /* Must be called under store_lock */
139 static void remove_initpid(struct pidns_init_store *e)
140 {
141 struct pidns_init_store *tmp;
142 int h;
143
144 #if DEBUG
145 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
146 #endif
147 h = HASH(e->ino);
148 if (pidns_hash_table[h] == e) {
149 pidns_hash_table[h] = e->next;
150 free(e);
151 return;
152 }
153
154 tmp = pidns_hash_table[h];
155 while (tmp) {
156 if (tmp->next == e) {
157 tmp->next = e->next;
158 free(e);
159 return;
160 }
161 tmp = tmp->next;
162 }
163 }
164
165 #define PURGE_SECS 5
166 /* Must be called under store_lock */
167 static void prune_initpid_store(void)
168 {
169 static long int last_prune = 0;
170 struct pidns_init_store *e, *prev, *delme;
171 long int now, threshold;
172 int i;
173
174 if (!last_prune) {
175 last_prune = time(NULL);
176 return;
177 }
178 now = time(NULL);
179 if (now < last_prune + PURGE_SECS)
180 return;
181 #if DEBUG
182 fprintf(stderr, "pruning\n");
183 #endif
184 last_prune = now;
185 threshold = now - 2 * PURGE_SECS;
186
187 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
188 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
189 if (e->lastcheck < threshold) {
190 #if DEBUG
191 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
192 #endif
193 delme = e;
194 if (prev)
195 prev->next = e->next;
196 else
197 pidns_hash_table[i] = e->next;
198 e = e->next;
199 free(delme);
200 } else {
201 prev = e;
202 e = e->next;
203 }
204 }
205 }
206 }
207
208 /* Must be called under store_lock */
209 static void save_initpid(struct stat *sb, pid_t pid)
210 {
211 struct pidns_init_store *e;
212 char fpath[100];
213 struct stat procsb;
214 int h;
215
216 #if DEBUG
217 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
218 #endif
219 snprintf(fpath, 100, "/proc/%d", pid);
220 if (stat(fpath, &procsb) < 0)
221 return;
222 do {
223 e = malloc(sizeof(*e));
224 } while (!e);
225 e->ino = sb->st_ino;
226 e->initpid = pid;
227 e->ctime = procsb.st_ctime;
228 h = HASH(e->ino);
229 e->next = pidns_hash_table[h];
230 e->lastcheck = time(NULL);
231 pidns_hash_table[h] = e;
232 }
233
234 /*
235 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
236 * entry for the inode number and creation time. Verify that the init pid
237 * is still valid. If not, remove it. Return the entry if valid, NULL
238 * otherwise.
239 * Must be called under store_lock
240 */
241 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
242 {
243 int h = HASH(sb->st_ino);
244 struct pidns_init_store *e = pidns_hash_table[h];
245
246 while (e) {
247 if (e->ino == sb->st_ino) {
248 if (initpid_still_valid(e, sb)) {
249 e->lastcheck = time(NULL);
250 return e;
251 }
252 remove_initpid(e);
253 return NULL;
254 }
255 e = e->next;
256 }
257
258 return NULL;
259 }
260
/* Result codes of send_creds(). */
#define SEND_CREDS_OK 0		/* message was sent */
#define SEND_CREDS_NOTSK 1	/* sendmsg failed with "no such process" */
#define SEND_CREDS_FAIL 2	/* any other failure */

/* Forward declarations for helpers defined further down. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
267
268 /*
269 * fork a task which switches to @task's namespace and writes '1'.
270 * over a unix sock so we can read the task's reaper's pid in our
271 * namespace
272 */
273 static void write_task_init_pid_exit(int sock, pid_t target)
274 {
275 struct ucred cred;
276 char fnam[100];
277 pid_t pid;
278 char v;
279 int fd, ret;
280
281 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
282 if (ret < 0 || ret >= sizeof(fnam))
283 _exit(1);
284
285 fd = open(fnam, O_RDONLY);
286 if (fd < 0) {
287 perror("write_task_init_pid_exit open of ns/pid");
288 _exit(1);
289 }
290 if (setns(fd, 0)) {
291 perror("write_task_init_pid_exit setns 1");
292 close(fd);
293 _exit(1);
294 }
295 pid = fork();
296 if (pid < 0)
297 _exit(1);
298 if (pid != 0) {
299 if (!wait_for_pid(pid))
300 _exit(1);
301 _exit(0);
302 }
303
304 /* we are the child */
305 cred.uid = 0;
306 cred.gid = 0;
307 cred.pid = 1;
308 v = '1';
309 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
310 _exit(1);
311 _exit(0);
312 }
313
/*
 * Determine, in our own pid namespace, the pid of the init process of
 * @task's pid namespace.
 *
 * A helper is forked which runs write_task_init_pid_exit() on one end
 * of a datagram socketpair; the credentials it sends are read on the
 * other end.  Returns the pid carried by those credentials, or -1 if
 * the socketpair, the fork or the receive failed.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	int sock[2];
	pid_t helper;
	pid_t initpid = -1;
	char v = '0';

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return initpid;
}
347
348 static pid_t lookup_initpid_in_store(pid_t qpid)
349 {
350 pid_t answer = 0;
351 struct stat sb;
352 struct pidns_init_store *e;
353 char fnam[100];
354
355 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
356 store_lock();
357 if (stat(fnam, &sb) < 0)
358 goto out;
359 e = lookup_verify_initpid(&sb);
360 if (e) {
361 answer = e->initpid;
362 goto out;
363 }
364 answer = get_init_pid_for_task(qpid);
365 if (answer > 0)
366 save_initpid(&sb, answer);
367
368 out:
369 /* we prune at end in case we are returning
370 * the value we were about to return */
371 prune_initpid_store();
372 store_unlock();
373 return answer;
374 }
375
/*
 * Reap child @pid, retrying when waitpid() is interrupted.
 * Returns 0 only if the child exited normally with status 0; returns
 * -1 for an invalid pid, a waitpid() error, or any other termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	for (;;) {
		reaped = waitpid(pid, &status, 0);
		if (reaped == pid)
			break;
		if (reaped == -1 && errno != EINTR)
			return -1;
	}

	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
}
396
397
398 /*
399 * append pid to *src.
400 * src: a pointer to a char* in which ot append the pid.
401 * sz: the number of characters printed so far, minus trailing \0.
402 * asz: the allocated size so far
403 * pid: the pid to append
404 */
405 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
406 {
407 char tmp[30];
408
409 int tmplen = sprintf(tmp, "%d\n", (int)pid);
410
411 if (!*src || tmplen + *sz + 1 >= *asz) {
412 char *tmp;
413 do {
414 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
415 } while (!tmp);
416 *src = tmp;
417 *asz += BUF_RESERVE_SIZE;
418 }
419 memcpy((*src) +*sz , tmp, tmplen);
420 *sz += tmplen;
421 (*src)[*sz] = '\0';
422 }
423
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in
 * the caller's namespace, return that id as mapped into pid's
 * namespace.  The file is rewound first, so it can be reused across
 * calls.  Lines that do not parse as three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) if no range contains
 * in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,	/* first id of the range inside the namespace */
		     host_base,	/* first id of the range in the caller's namespace */
		     range;	/* number of ids in the range */
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The range wrapped around - unexpected since this is
			 * a proc file, so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no answer found */
	return -1;
}
467
/*
 * Values for the req_ns_root argument of is_privileged_over(): whether
 * the calling uid must be root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Buffer size used for /proc/<pid>/... path names. */
#define PROCLEN 100
477
478 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
479 {
480 char fpath[PROCLEN];
481 int ret;
482 bool answer = false;
483 uid_t nsuid;
484
485 if (victim == -1 || uid == -1)
486 return false;
487
488 /*
489 * If the request is one not requiring root in the namespace,
490 * then having the same uid suffices. (i.e. uid 1000 has write
491 * access to files owned by uid 1000
492 */
493 if (!req_ns_root && uid == victim)
494 return true;
495
496 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
497 if (ret < 0 || ret >= PROCLEN)
498 return false;
499 FILE *f = fopen(fpath, "r");
500 if (!f)
501 return false;
502
503 /* if caller's not root in his namespace, reject */
504 nsuid = convert_id_to_ns(f, uid);
505 if (nsuid)
506 goto out;
507
508 /*
509 * If victim is not mapped into caller's ns, reject.
510 * XXX I'm not sure this check is needed given that fuse
511 * will be sending requests where the vfs has converted
512 */
513 nsuid = convert_id_to_ns(f, victim);
514 if (nsuid == -1)
515 goto out;
516
517 answer = true;
518
519 out:
520 fclose(f);
521 return answer;
522 }
523
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * requested by the open(2) access mode in @req_mode.  Callers shift
 * the mode right to test the owner or group bits instead.
 * An unknown access mode yields false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
543
544
/*
 * Return the path component of @taskcg that directly follows @querycg.
 *
 * taskcg is a/b/c/d/e
 * querycg is a/b/c        (or "/" for the root, with taskcg then
 *                          starting with '/')
 * we return 'd'
 *
 * @taskcg must be strictly longer than @querycg, otherwise NULL is
 * returned.  The result is newly allocated and must be freed by the
 * caller; NULL is also returned if the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* skip the query prefix plus the '/' that separates it */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;
	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
570
/* Remove a single trailing '\n' from @x, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
577
578 static char *get_pid_cgroup(pid_t pid, const char *contrl)
579 {
580 char fnam[PROCLEN];
581 FILE *f;
582 char *answer = NULL;
583 char *line = NULL;
584 size_t len = 0;
585 int ret;
586 const char *h = find_mounted_controller(contrl);
587 if (!h)
588 return NULL;
589
590 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
591 if (ret < 0 || ret >= PROCLEN)
592 return NULL;
593 if (!(f = fopen(fnam, "r")))
594 return NULL;
595
596 while (getline(&line, &len, f) != -1) {
597 char *c1, *c2;
598 if (!line[0])
599 continue;
600 c1 = strchr(line, ':');
601 if (!c1)
602 goto out;
603 c1++;
604 c2 = strchr(c1, ':');
605 if (!c2)
606 goto out;
607 *c2 = '\0';
608 if (strcmp(c1, h) != 0)
609 continue;
610 c2++;
611 stripnewline(c2);
612 do {
613 answer = strdup(c2);
614 } while (!answer);
615 break;
616 }
617
618 out:
619 fclose(f);
620 free(line);
621 return answer;
622 }
623
624 /*
625 * check whether a fuse context may access a cgroup dir or file
626 *
627 * If file is not null, it is a cgroup file to check under cg.
628 * If file is null, then we are checking perms on cg itself.
629 *
630 * For files we can check the mode of the list_keys result.
631 * For cgroups, we must make assumptions based on the files under the
632 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
633 * yet.
634 */
635 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
636 {
637 struct cgfs_files *k = NULL;
638 bool ret = false;
639
640 k = cgfs_get_key(contrl, cg, file);
641 if (!k)
642 return false;
643
644 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
645 if (perms_include(k->mode >> 6, mode)) {
646 ret = true;
647 goto out;
648 }
649 }
650 if (fc->gid == k->gid) {
651 if (perms_include(k->mode >> 3, mode)) {
652 ret = true;
653 goto out;
654 }
655 }
656 ret = perms_include(k->mode, mode);
657
658 out:
659 free_key(k);
660 return ret;
661 }
662
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" component from the cgroup path @cg,
 * in place.  If the whole path is "/init.scope" it becomes "/".
 * Paths that do not end in INITSCOPE, including paths shorter than
 * INITSCOPE, are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t scope_len = strlen(INITSCOPE);
	char *point;

	/* Compare lengths first: forming cg + cg_len - scope_len for a
	 * shorter string would point before the start of the buffer. */
	if (cg_len < scope_len)
		return;

	point = cg + (cg_len - scope_len);
	if (strcmp(point, INITSCOPE) != 0)
		return;

	if (point == cg)
		*(point + 1) = '\0';	/* keep the leading '/' */
	else
		*point = '\0';
}
677
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Returns true if @pid's cgroup (for controller @contrl) is @cg itself
 * or an ancestor of @cg.  The match is made on whole path components,
 * so a task in a/b is not an ancestor of a/bc.
 *
 * If the answer is false because @cg does not start with the task's
 * cgroup, and @nextcg is not NULL, *nextcg is set to the result of
 * get_next_cgroup_dir() (possibly NULL); a non-NULL result must be
 * freed by the caller.  If @pid's cgroup cannot be determined, false
 * is returned and *nextcg is left untouched.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *c2 = get_pid_cgroup(pid, contrl);
	char *linecmp;
	size_t len;

	if (!c2)
		return false;
	prune_init_slice(c2);

	/*
	 * callers pass in '/' for root cgroup, otherwise they pass
	 * in a cgroup without leading '/'
	 */
	linecmp = *cg == '/' ? c2 : c2+1;
	len = strlen(linecmp);
	if (strncmp(linecmp, cg, len) != 0) {
		if (nextcg) {
			*nextcg = get_next_cgroup_dir(linecmp, cg);
		}
		goto out;
	}

	/*
	 * The task's cgroup is a string prefix of cg.  It is only an
	 * ancestor if the prefix ends on a component boundary: the prefix
	 * is empty or ends in '/', or cg ends or continues with '/' right
	 * after it.  cg[len] is readable because the first len characters
	 * matched.
	 */
	if (len > 0 && linecmp[len - 1] != '/' &&
	    cg[len] != '\0' && cg[len] != '/') {
		if (nextcg)
			*nextcg = NULL;
		goto out;
	}
	answer = true;

out:
	free(c2);
	return answer;
}
712
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * Returns true if @cg is the root, the task's own cgroup, one of its
 * parents, or something below it.  A task in the root cgroup may see
 * everything.  Returns false if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *c2, *task_cg;
	size_t target_len, task_len;

	if (strcmp(cg, "/") == 0)
		return true;

	c2 = get_pid_cgroup(pid, contrl);
	if (!c2)
		return false;
	prune_init_slice(c2);

	task_cg = c2 + 1;	/* drop the leading '/' */
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0 || strcmp(cg, task_cg) == 0) {
		/*
		 * Either the task is in the root cgroup (nothing is left
		 * after dropping the leading '/', which the prefix tests
		 * below could not handle) or cg is exactly the task's cgroup.
		 */
		answer = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		answer = strncmp(task_cg, cg, target_len) == 0 &&
			 task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		answer = strncmp(task_cg, cg, task_len) == 0 &&
			 cg[task_len] == '/';
	}

	free(c2);
	return answer;
}
763
764 /*
765 * given /cgroup/freezer/a/b, return "freezer".
766 * the returned char* should NOT be freed.
767 */
768 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
769 {
770 const char *p1;
771 char *contr, *slash;
772
773 if (strlen(path) < 9)
774 return NULL;
775 if (*(path+7) != '/')
776 return NULL;
777 p1 = path+8;
778 contr = strdupa(p1);
779 if (!contr)
780 return NULL;
781 slash = strstr(contr, "/");
782 if (slash)
783 *slash = '\0';
784
785 int i;
786 for (i = 0; i < num_hierarchies; i++) {
787 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
788 return hierarchies[i];
789 }
790 return NULL;
791 }
792
/*
 * Find the start of the cgroup in /cgroup/controller/the/cgroup/path.
 * Note that the returned value may include files (keynames) etc.
 * The result points into @path.  Returns NULL if the path is shorter
 * than 9 characters or has no '/' after the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	if (strlen(path) < 9)
		return NULL;

	/* first '/' after "/cgroup/" ends the controller name */
	slash = strchr(path + 8, '/');
	return slash ? slash + 1 : NULL;
}
808
/*
 * Split the last path element from the path in @cg.
 *
 * @dir receives a newly allocated copy of @cg, cut off at its last '/'
 * (or the full copy if @cg has no '/'); it must be freed by the caller.
 * @last receives a pointer to the last '/' inside @cg itself, or NULL
 * if there is none; it must not be freed.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;

	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last)
		*strrchr(copy, '/') = '\0';
}
828
829 /*
830 * FUSE ops for /cgroup
831 */
832
833 static int cg_getattr(const char *path, struct stat *sb)
834 {
835 struct timespec now;
836 struct fuse_context *fc = fuse_get_context();
837 char * cgdir = NULL;
838 char *last = NULL, *path1, *path2;
839 struct cgfs_files *k = NULL;
840 const char *cgroup;
841 const char *controller = NULL;
842 int ret = -ENOENT;
843
844
845 if (!fc)
846 return -EIO;
847
848 memset(sb, 0, sizeof(struct stat));
849
850 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
851 return -EINVAL;
852
853 sb->st_uid = sb->st_gid = 0;
854 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
855 sb->st_size = 0;
856
857 if (strcmp(path, "/cgroup") == 0) {
858 sb->st_mode = S_IFDIR | 00755;
859 sb->st_nlink = 2;
860 return 0;
861 }
862
863 controller = pick_controller_from_path(fc, path);
864 if (!controller)
865 return -EIO;
866 cgroup = find_cgroup_in_path(path);
867 if (!cgroup) {
868 /* this is just /cgroup/controller, return it as a dir */
869 sb->st_mode = S_IFDIR | 00755;
870 sb->st_nlink = 2;
871 return 0;
872 }
873
874 get_cgdir_and_path(cgroup, &cgdir, &last);
875
876 if (!last) {
877 path1 = "/";
878 path2 = cgdir;
879 } else {
880 path1 = cgdir;
881 path2 = last;
882 }
883
884 pid_t initpid = lookup_initpid_in_store(fc->pid);
885 if (initpid <= 0)
886 initpid = fc->pid;
887 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
888 * Then check that caller's cgroup is under path if last is a child
889 * cgroup, or cgdir if last is a file */
890
891 if (is_child_cgroup(controller, path1, path2)) {
892 if (!caller_may_see_dir(initpid, controller, cgroup)) {
893 ret = -ENOENT;
894 goto out;
895 }
896 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
897 /* this is just /cgroup/controller, return it as a dir */
898 sb->st_mode = S_IFDIR | 00555;
899 sb->st_nlink = 2;
900 ret = 0;
901 goto out;
902 }
903 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
904 ret = -EACCES;
905 goto out;
906 }
907
908 // get uid, gid, from '/tasks' file and make up a mode
909 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
910 sb->st_mode = S_IFDIR | 00755;
911 k = cgfs_get_key(controller, cgroup, NULL);
912 if (!k) {
913 sb->st_uid = sb->st_gid = 0;
914 } else {
915 sb->st_uid = k->uid;
916 sb->st_gid = k->gid;
917 }
918 free_key(k);
919 sb->st_nlink = 2;
920 ret = 0;
921 goto out;
922 }
923
924 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
925 sb->st_mode = S_IFREG | k->mode;
926 sb->st_nlink = 1;
927 sb->st_uid = k->uid;
928 sb->st_gid = k->gid;
929 sb->st_size = 0;
930 free_key(k);
931 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
932 ret = -ENOENT;
933 goto out;
934 }
935 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
936 ret = -EACCES;
937 goto out;
938 }
939
940 ret = 0;
941 }
942
943 out:
944 free(cgdir);
945 return ret;
946 }
947
948 static int cg_opendir(const char *path, struct fuse_file_info *fi)
949 {
950 struct fuse_context *fc = fuse_get_context();
951 const char *cgroup;
952 struct file_info *dir_info;
953 char *controller = NULL;
954
955 if (!fc)
956 return -EIO;
957
958 if (strcmp(path, "/cgroup") == 0) {
959 cgroup = NULL;
960 controller = NULL;
961 } else {
962 // return list of keys for the controller, and list of child cgroups
963 controller = pick_controller_from_path(fc, path);
964 if (!controller)
965 return -EIO;
966
967 cgroup = find_cgroup_in_path(path);
968 if (!cgroup) {
969 /* this is just /cgroup/controller, return its contents */
970 cgroup = "/";
971 }
972 }
973
974 pid_t initpid = lookup_initpid_in_store(fc->pid);
975 if (initpid <= 0)
976 initpid = fc->pid;
977 if (cgroup) {
978 if (!caller_may_see_dir(initpid, controller, cgroup))
979 return -ENOENT;
980 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
981 return -EACCES;
982 }
983
984 /* we'll free this at cg_releasedir */
985 dir_info = malloc(sizeof(*dir_info));
986 if (!dir_info)
987 return -ENOMEM;
988 dir_info->controller = must_copy_string(controller);
989 dir_info->cgroup = must_copy_string(cgroup);
990 dir_info->type = LXC_TYPE_CGDIR;
991 dir_info->buf = NULL;
992 dir_info->file = NULL;
993 dir_info->buflen = 0;
994
995 fi->fh = (unsigned long)dir_info;
996 return 0;
997 }
998
999 static int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1000 struct fuse_file_info *fi)
1001 {
1002 struct file_info *d = (struct file_info *)fi->fh;
1003 struct cgfs_files **list = NULL;
1004 int i, ret;
1005 char *nextcg = NULL;
1006 struct fuse_context *fc = fuse_get_context();
1007 char **clist = NULL;
1008
1009 if (d->type != LXC_TYPE_CGDIR) {
1010 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1011 return -EIO;
1012 }
1013 if (!d->cgroup && !d->controller) {
1014 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1015 int i;
1016
1017 for (i = 0; i < num_hierarchies; i++) {
1018 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1019 return -EIO;
1020 }
1021 }
1022 return 0;
1023 }
1024
1025 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1026 // not a valid cgroup
1027 ret = -EINVAL;
1028 goto out;
1029 }
1030
1031 pid_t initpid = lookup_initpid_in_store(fc->pid);
1032 if (initpid <= 0)
1033 initpid = fc->pid;
1034 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1035 if (nextcg) {
1036 int ret;
1037 ret = filler(buf, nextcg, NULL, 0);
1038 free(nextcg);
1039 if (ret != 0) {
1040 ret = -EIO;
1041 goto out;
1042 }
1043 }
1044 ret = 0;
1045 goto out;
1046 }
1047
1048 for (i = 0; list[i]; i++) {
1049 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1050 ret = -EIO;
1051 goto out;
1052 }
1053 }
1054
1055 // now get the list of child cgroups
1056
1057 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1058 ret = 0;
1059 goto out;
1060 }
1061 for (i = 0; clist[i]; i++) {
1062 if (filler(buf, clist[i], NULL, 0) != 0) {
1063 ret = -EIO;
1064 goto out;
1065 }
1066 }
1067 ret = 0;
1068
1069 out:
1070 free_keys(list);
1071 if (clist) {
1072 for (i = 0; clist[i]; i++)
1073 free(clist[i]);
1074 free(clist);
1075 }
1076 return ret;
1077 }
1078
1079 static void do_release_file_info(struct file_info *f)
1080 {
1081 if (!f)
1082 return;
1083 free(f->controller);
1084 free(f->cgroup);
1085 free(f->file);
1086 free(f->buf);
1087 free(f);
1088 }
1089
1090 static int cg_releasedir(const char *path, struct fuse_file_info *fi)
1091 {
1092 struct file_info *d = (struct file_info *)fi->fh;
1093
1094 do_release_file_info(d);
1095 return 0;
1096 }
1097
1098 static int cg_open(const char *path, struct fuse_file_info *fi)
1099 {
1100 const char *cgroup;
1101 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1102 struct cgfs_files *k = NULL;
1103 struct file_info *file_info;
1104 struct fuse_context *fc = fuse_get_context();
1105 int ret;
1106
1107 if (!fc)
1108 return -EIO;
1109
1110 controller = pick_controller_from_path(fc, path);
1111 if (!controller)
1112 return -EIO;
1113 cgroup = find_cgroup_in_path(path);
1114 if (!cgroup)
1115 return -EINVAL;
1116
1117 get_cgdir_and_path(cgroup, &cgdir, &last);
1118 if (!last) {
1119 path1 = "/";
1120 path2 = cgdir;
1121 } else {
1122 path1 = cgdir;
1123 path2 = last;
1124 }
1125
1126 k = cgfs_get_key(controller, path1, path2);
1127 if (!k) {
1128 ret = -EINVAL;
1129 goto out;
1130 }
1131 free_key(k);
1132
1133 pid_t initpid = lookup_initpid_in_store(fc->pid);
1134 if (initpid <= 0)
1135 initpid = fc->pid;
1136 if (!caller_may_see_dir(initpid, controller, path1)) {
1137 ret = -ENOENT;
1138 goto out;
1139 }
1140 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1141 // should never get here
1142 ret = -EACCES;
1143 goto out;
1144 }
1145
1146 /* we'll free this at cg_release */
1147 file_info = malloc(sizeof(*file_info));
1148 if (!file_info) {
1149 ret = -ENOMEM;
1150 goto out;
1151 }
1152 file_info->controller = must_copy_string(controller);
1153 file_info->cgroup = must_copy_string(path1);
1154 file_info->file = must_copy_string(path2);
1155 file_info->type = LXC_TYPE_CGFILE;
1156 file_info->buf = NULL;
1157 file_info->buflen = 0;
1158
1159 fi->fh = (unsigned long)file_info;
1160 ret = 0;
1161
1162 out:
1163 free(cgdir);
1164 return ret;
1165 }
1166
1167 static int cg_release(const char *path, struct fuse_file_info *fi)
1168 {
1169 struct file_info *f = (struct file_info *)fi->fh;
1170
1171 do_release_file_info(f);
1172 return 0;
1173 }
1174
/* Events that mean "there is something to read, or the peer went away". */
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, for at most
 * roughly @timeout seconds measured from entry.  The wait is restarted
 * with the remaining time when interrupted by a signal.
 * Returns true if an event arrived.  Returns false on error or timeout;
 * errno is 0 when the deadline had already passed before waiting, and
 * otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, nready, now, start, remaining, saved_errno;

	start = time(NULL);
	if (start < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	do {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		remaining = (start + timeout) - now;
		if (remaining < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}
		nready = epoll_wait(epfd, &ev, 1, 1000*remaining + 1);
	} while (nready < 0 && errno == EINTR);

	/* close() may change errno; keep the one from epoll_wait() */
	saved_errno = errno;
	close(epfd);

	if (nready <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1222
/*
 * Receive up to @len bytes from @sockfd, waiting at most about two
 * seconds for data to arrive.  Returns what recv() returned, or -1 if
 * nothing became readable in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	return wait_for_sock(sockfd, 2) ? recv(sockfd, buf, len, MSG_DONTWAIT) : -1;
}
1229
1230 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1231 {
1232 struct msghdr msg = { 0 };
1233 struct iovec iov;
1234 struct cmsghdr *cmsg;
1235 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1236 char buf[1];
1237 buf[0] = 'p';
1238
1239 if (pingfirst) {
1240 if (msgrecv(sock, buf, 1) != 1) {
1241 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1242 __func__);
1243 return SEND_CREDS_FAIL;
1244 }
1245 }
1246
1247 msg.msg_control = cmsgbuf;
1248 msg.msg_controllen = sizeof(cmsgbuf);
1249
1250 cmsg = CMSG_FIRSTHDR(&msg);
1251 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1252 cmsg->cmsg_level = SOL_SOCKET;
1253 cmsg->cmsg_type = SCM_CREDENTIALS;
1254 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1255
1256 msg.msg_name = NULL;
1257 msg.msg_namelen = 0;
1258
1259 buf[0] = v;
1260 iov.iov_base = buf;
1261 iov.iov_len = sizeof(buf);
1262 msg.msg_iov = &iov;
1263 msg.msg_iovlen = 1;
1264
1265 if (sendmsg(sock, &msg, 0) < 0) {
1266 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1267 strerror(errno));
1268 if (errno == 3)
1269 return SEND_CREDS_NOTSK;
1270 return SEND_CREDS_FAIL;
1271 }
1272
1273 return SEND_CREDS_OK;
1274 }
1275
/*
 * Receive one byte plus SCM_CREDENTIALS from @sock.
 *
 * Enables SO_PASSCRED on the socket, writes a single '1' byte to tell
 * the peer we are ready, then waits about two seconds for its message.
 * *v receives the payload byte.  *cred receives the credentials if the
 * message carried them and keeps pid/uid/gid of -1 otherwise.
 * Returns false if any step fails or times out (with *v == '1'), true
 * once a message has been received.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1] = { '1' };
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int optval = 1;
	int ret;

	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1335
1336
1337 /*
1338 * pid_to_ns - reads pids from a ucred over a socket, then writes the
1339 * int value back over the socket. This shifts the pid from the
1340 * sender's pidns into tpid's pidns.
1341 */
1342 static void pid_to_ns(int sock, pid_t tpid)
1343 {
1344 char v = '0';
1345 struct ucred cred;
1346
1347 while (recv_creds(sock, &cred, &v)) {
1348 if (v == '1')
1349 _exit(0);
1350 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
1351 _exit(1);
1352 }
1353 _exit(0);
1354 }
1355
/*
 * pid_to_ns_wrapper: setns() into a pidns leaves the caller itself in its
 * old pidns; only children forked afterwards live in the target pidns.
 * So this wrapper joins @tpid's pid namespace and then forks the child
 * that runs pid_to_ns() to actually convert pids.
 *
 * The child acknowledges over a pipe that it is running; the wrapper then
 * waits for it and exits 0 if wait_for_pid() succeeds, 1 on any failure.
 * Never returns.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int ackpipe[2];
	int nsfd = -1, n;
	pid_t child;
	char ack;

	n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (n < 0 || n >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	child = fork();
	if (child < 0)
		_exit(1);

	if (child == 0) {
		char b = '1';

		close(ackpipe[0]);
		if (write(ackpipe[1], &b, sizeof(char)) < 0)
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		close(ackpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(ackpipe[0], 1))
		_exit(1);
	n = read(ackpipe[0], &ack, 1);
	if (n != sizeof(char) || ack != '1')
		_exit(1);

	_exit(wait_for_pid(child) ? 0 : 1);
}
1409
1410 /*
1411 * To read cgroup files with a particular pid, we will setns into the child
1412 * pidns, open a pipe, fork a child - which will be the first to really be in
1413 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
1414 */
1415 static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
1416 {
1417 int sock[2] = {-1, -1};
1418 char *tmpdata = NULL;
1419 int ret;
1420 pid_t qpid, cpid = -1;
1421 bool answer = false;
1422 char v = '0';
1423 struct ucred cred;
1424 size_t sz = 0, asz = 0;
1425
1426 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
1427 return false;
1428
1429 /*
1430 * Now we read the pids from returned data one by one, pass
1431 * them into a child in the target namespace, read back the
1432 * translated pids, and put them into our to-return data
1433 */
1434
1435 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1436 perror("socketpair");
1437 free(tmpdata);
1438 return false;
1439 }
1440
1441 cpid = fork();
1442 if (cpid == -1)
1443 goto out;
1444
1445 if (!cpid) // child - exits when done
1446 pid_to_ns_wrapper(sock[1], tpid);
1447
1448 char *ptr = tmpdata;
1449 cred.uid = 0;
1450 cred.gid = 0;
1451 while (sscanf(ptr, "%d\n", &qpid) == 1) {
1452 cred.pid = qpid;
1453 ret = send_creds(sock[0], &cred, v, true);
1454
1455 if (ret == SEND_CREDS_NOTSK)
1456 goto next;
1457 if (ret == SEND_CREDS_FAIL)
1458 goto out;
1459
1460 // read converted results
1461 if (!wait_for_sock(sock[0], 2)) {
1462 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
1463 __func__, strerror(errno));
1464 goto out;
1465 }
1466 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1467 fprintf(stderr, "%s: error reading pid from child: %s\n",
1468 __func__, strerror(errno));
1469 goto out;
1470 }
1471 must_strcat_pid(d, &sz, &asz, qpid);
1472 next:
1473 ptr = strchr(ptr, '\n');
1474 if (!ptr)
1475 break;
1476 ptr++;
1477 }
1478
1479 cred.pid = getpid();
1480 v = '1';
1481 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
1482 // failed to ask child to exit
1483 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
1484 __func__, strerror(errno));
1485 goto out;
1486 }
1487
1488 answer = true;
1489
1490 out:
1491 free(tmpdata);
1492 if (cpid != -1)
1493 wait_for_pid(cpid);
1494 if (sock[0] != -1) {
1495 close(sock[0]);
1496 close(sock[1]);
1497 }
1498 return answer;
1499 }
1500
1501 static int cg_read(const char *path, char *buf, size_t size, off_t offset,
1502 struct fuse_file_info *fi)
1503 {
1504 struct fuse_context *fc = fuse_get_context();
1505 struct file_info *f = (struct file_info *)fi->fh;
1506 struct cgfs_files *k = NULL;
1507 char *data = NULL;
1508 int ret, s;
1509 bool r;
1510
1511 if (f->type != LXC_TYPE_CGFILE) {
1512 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
1513 return -EIO;
1514 }
1515
1516 if (offset)
1517 return 0;
1518
1519 if (!fc)
1520 return -EIO;
1521
1522 if (!f->controller)
1523 return -EINVAL;
1524
1525 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1526 return -EINVAL;
1527 }
1528 free_key(k);
1529
1530
1531 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
1532 ret = -EACCES;
1533 goto out;
1534 }
1535
1536 if (strcmp(f->file, "tasks") == 0 ||
1537 strcmp(f->file, "/tasks") == 0 ||
1538 strcmp(f->file, "/cgroup.procs") == 0 ||
1539 strcmp(f->file, "cgroup.procs") == 0)
1540 // special case - we have to translate the pids
1541 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
1542 else
1543 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
1544
1545 if (!r) {
1546 ret = -EINVAL;
1547 goto out;
1548 }
1549
1550 if (!data) {
1551 ret = 0;
1552 goto out;
1553 }
1554 s = strlen(data);
1555 if (s > size)
1556 s = size;
1557 memcpy(buf, data, s);
1558 if (s > 0 && s < size && data[s-1] != '\n')
1559 buf[s++] = '\n';
1560
1561 ret = s;
1562
1563 out:
1564 free(data);
1565 return ret;
1566 }
1567
1568 static void pid_from_ns(int sock, pid_t tpid)
1569 {
1570 pid_t vpid;
1571 struct ucred cred;
1572 char v;
1573 int ret;
1574
1575 cred.uid = 0;
1576 cred.gid = 0;
1577 while (1) {
1578 if (!wait_for_sock(sock, 2)) {
1579 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
1580 _exit(1);
1581 }
1582 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
1583 fprintf(stderr, "%s: bad read from parent: %s\n",
1584 __func__, strerror(errno));
1585 _exit(1);
1586 }
1587 if (vpid == -1) // done
1588 break;
1589 v = '0';
1590 cred.pid = vpid;
1591 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
1592 v = '1';
1593 cred.pid = getpid();
1594 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
1595 _exit(1);
1596 }
1597 }
1598 _exit(0);
1599 }
1600
/*
 * pid_from_ns_wrapper - join @tpid's pid namespace and fork the child that
 * runs pid_from_ns() on @sock.
 *
 * The child acknowledges over a pipe that it is running.  If no valid ack
 * is read, the child is killed, reaped, and a new one is forked; this
 * repeats until an ack arrives.  Once acked, the wrapper waits for the
 * child and exits 0 if wait_for_pid() succeeds, 1 otherwise.  Setup
 * failures (snprintf, open, setns, pipe, fork) exit 1.  Never returns.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int ackpipe[2];
	int nsfd = -1, n;
	pid_t child;
	char ack;

	n = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (n < 0 || n >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			_exit(1);

		if (child == 0) {
			char b = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &b, sizeof(char)) < 0)
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			close(ackpipe[1]);
			pid_from_ns(sock, tpid); /* exits, never returns */
		}

		// give the child 1 second to be done forking and
		// write its ack
		if (wait_for_sock(ackpipe[0], 1)) {
			n = read(ackpipe[0], &ack, 1);
			if (n == sizeof(char) && ack == '1')
				break;
		}

		/* no valid ack: discard this child and try again */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}

	_exit(wait_for_pid(child) ? 0 : 1);
}
1656
/*
 * Given host @uid, look up the uid it maps to according to
 * /proc/@pid/uid_map and store it in *@answer.
 *
 * Returns true when a mapping was found.  Returns false when the map file
 * cannot be opened or convert_id_to_ns() yields -1; in the latter case
 * *@answer holds that -1.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mappath[400];
	FILE *map;

	sprintf(mappath, "/proc/%d/uid_map", pid);
	map = fopen(mappath, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}
1678
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/@pid/status, i.e. the first number on the "Uid:" and "Gid:"
 * lines.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are set to -1 first, so they keep that value when the
 * status file cannot be opened or the matching line is missing.  Parsing
 * stops at the first malformed Uid:/Gid: line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	FILE *status;
	uid_t u;
	gid_t g;

	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "r");
	if (status == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, 400, status) != NULL) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &u) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = u;
			continue;
		}
		if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line + 4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = g;
		}
	}

	fclose(status);
}
1717
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a
 * new cgroup?
 * This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* the requestor must be root inside its own user namespace ... */
	if (!hostuid_to_ns(r_uid, r, &mapped) || mapped != 0)
		return false;

	/* ... and the victim's uid must be visible in that namespace */
	return hostuid_to_ns(v_uid, r, &mapped);
}
1743
1744 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
1745 const char *file, const char *buf)
1746 {
1747 int sock[2] = {-1, -1};
1748 pid_t qpid, cpid = -1;
1749 FILE *pids_file = NULL;
1750 bool answer = false, fail = false;
1751
1752 pids_file = open_pids_file(contrl, cg);
1753 if (!pids_file)
1754 return false;
1755
1756 /*
1757 * write the pids to a socket, have helper in writer's pidns
1758 * call movepid for us
1759 */
1760 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
1761 perror("socketpair");
1762 goto out;
1763 }
1764
1765 cpid = fork();
1766 if (cpid == -1)
1767 goto out;
1768
1769 if (!cpid) { // child
1770 fclose(pids_file);
1771 pid_from_ns_wrapper(sock[1], tpid);
1772 }
1773
1774 const char *ptr = buf;
1775 while (sscanf(ptr, "%d", &qpid) == 1) {
1776 struct ucred cred;
1777 char v;
1778
1779 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
1780 fprintf(stderr, "%s: error writing pid to child: %s\n",
1781 __func__, strerror(errno));
1782 goto out;
1783 }
1784
1785 if (recv_creds(sock[0], &cred, &v)) {
1786 if (v == '0') {
1787 if (!may_move_pid(tpid, tuid, cred.pid)) {
1788 fail = true;
1789 break;
1790 }
1791 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
1792 fail = true;
1793 }
1794 }
1795
1796 ptr = strchr(ptr, '\n');
1797 if (!ptr)
1798 break;
1799 ptr++;
1800 }
1801
1802 /* All good, write the value */
1803 qpid = -1;
1804 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
1805 fprintf(stderr, "Warning: failed to ask child to exit\n");
1806
1807 if (!fail)
1808 answer = true;
1809
1810 out:
1811 if (cpid != -1)
1812 wait_for_pid(cpid);
1813 if (sock[0] != -1) {
1814 close(sock[0]);
1815 close(sock[1]);
1816 }
1817 if (pids_file) {
1818 if (fclose(pids_file) != 0)
1819 answer = false;
1820 }
1821 return answer;
1822 }
1823
1824 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
1825 struct fuse_file_info *fi)
1826 {
1827 struct fuse_context *fc = fuse_get_context();
1828 char *localbuf = NULL;
1829 struct cgfs_files *k = NULL;
1830 struct file_info *f = (struct file_info *)fi->fh;
1831 bool r;
1832
1833 if (f->type != LXC_TYPE_CGFILE) {
1834 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
1835 return -EIO;
1836 }
1837
1838 if (offset)
1839 return 0;
1840
1841 if (!fc)
1842 return -EIO;
1843
1844 localbuf = alloca(size+1);
1845 localbuf[size] = '\0';
1846 memcpy(localbuf, buf, size);
1847
1848 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
1849 size = -EINVAL;
1850 goto out;
1851 }
1852
1853 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
1854 size = -EACCES;
1855 goto out;
1856 }
1857
1858 if (strcmp(f->file, "tasks") == 0 ||
1859 strcmp(f->file, "/tasks") == 0 ||
1860 strcmp(f->file, "/cgroup.procs") == 0 ||
1861 strcmp(f->file, "cgroup.procs") == 0)
1862 // special case - we have to translate the pids
1863 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
1864 else
1865 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
1866
1867 if (!r)
1868 size = -EINVAL;
1869
1870 out:
1871 free_key(k);
1872 return size;
1873 }
1874
1875 int cg_chown(const char *path, uid_t uid, gid_t gid)
1876 {
1877 struct fuse_context *fc = fuse_get_context();
1878 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1879 struct cgfs_files *k = NULL;
1880 const char *cgroup;
1881 int ret;
1882
1883 if (!fc)
1884 return -EIO;
1885
1886 if (strcmp(path, "/cgroup") == 0)
1887 return -EINVAL;
1888
1889 controller = pick_controller_from_path(fc, path);
1890 if (!controller)
1891 return -EINVAL;
1892 cgroup = find_cgroup_in_path(path);
1893 if (!cgroup)
1894 /* this is just /cgroup/controller */
1895 return -EINVAL;
1896
1897 get_cgdir_and_path(cgroup, &cgdir, &last);
1898
1899 if (!last) {
1900 path1 = "/";
1901 path2 = cgdir;
1902 } else {
1903 path1 = cgdir;
1904 path2 = last;
1905 }
1906
1907 if (is_child_cgroup(controller, path1, path2)) {
1908 // get uid, gid, from '/tasks' file and make up a mode
1909 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1910 k = cgfs_get_key(controller, cgroup, "tasks");
1911
1912 } else
1913 k = cgfs_get_key(controller, path1, path2);
1914
1915 if (!k) {
1916 ret = -EINVAL;
1917 goto out;
1918 }
1919
1920 /*
1921 * This being a fuse request, the uid and gid must be valid
1922 * in the caller's namespace. So we can just check to make
1923 * sure that the caller is root in his uid, and privileged
1924 * over the file's current owner.
1925 */
1926 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
1927 ret = -EACCES;
1928 goto out;
1929 }
1930
1931 ret = cgfs_chown_file(controller, cgroup, uid, gid);
1932
1933 out:
1934 free_key(k);
1935 free(cgdir);
1936
1937 return ret;
1938 }
1939
1940 int cg_chmod(const char *path, mode_t mode)
1941 {
1942 struct fuse_context *fc = fuse_get_context();
1943 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
1944 struct cgfs_files *k = NULL;
1945 const char *cgroup;
1946 int ret;
1947
1948 if (!fc)
1949 return -EIO;
1950
1951 if (strcmp(path, "/cgroup") == 0)
1952 return -EINVAL;
1953
1954 controller = pick_controller_from_path(fc, path);
1955 if (!controller)
1956 return -EINVAL;
1957 cgroup = find_cgroup_in_path(path);
1958 if (!cgroup)
1959 /* this is just /cgroup/controller */
1960 return -EINVAL;
1961
1962 get_cgdir_and_path(cgroup, &cgdir, &last);
1963
1964 if (!last) {
1965 path1 = "/";
1966 path2 = cgdir;
1967 } else {
1968 path1 = cgdir;
1969 path2 = last;
1970 }
1971
1972 if (is_child_cgroup(controller, path1, path2)) {
1973 // get uid, gid, from '/tasks' file and make up a mode
1974 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1975 k = cgfs_get_key(controller, cgroup, "tasks");
1976
1977 } else
1978 k = cgfs_get_key(controller, path1, path2);
1979
1980 if (!k) {
1981 ret = -EINVAL;
1982 goto out;
1983 }
1984
1985 /*
1986 * This being a fuse request, the uid and gid must be valid
1987 * in the caller's namespace. So we can just check to make
1988 * sure that the caller is root in his uid, and privileged
1989 * over the file's current owner.
1990 */
1991 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1992 ret = -EPERM;
1993 goto out;
1994 }
1995
1996 if (!cgfs_chmod_file(controller, cgroup, mode)) {
1997 ret = -EINVAL;
1998 goto out;
1999 }
2000
2001 ret = 0;
2002 out:
2003 free_key(k);
2004 free(cgdir);
2005 return ret;
2006 }
2007
2008 int cg_mkdir(const char *path, mode_t mode)
2009 {
2010 struct fuse_context *fc = fuse_get_context();
2011 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2012 const char *cgroup;
2013 int ret;
2014
2015 if (!fc)
2016 return -EIO;
2017
2018
2019 controller = pick_controller_from_path(fc, path);
2020 if (!controller)
2021 return -EINVAL;
2022
2023 cgroup = find_cgroup_in_path(path);
2024 if (!cgroup)
2025 return -EINVAL;
2026
2027 get_cgdir_and_path(cgroup, &cgdir, &last);
2028 if (!last)
2029 path1 = "/";
2030 else
2031 path1 = cgdir;
2032
2033 pid_t initpid = lookup_initpid_in_store(fc->pid);
2034 if (initpid <= 0)
2035 initpid = fc->pid;
2036 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2037 if (!next)
2038 ret = -EINVAL;
2039 else if (last && strcmp(next, last) == 0)
2040 ret = -EEXIST;
2041 else
2042 ret = -ENOENT;
2043 goto out;
2044 }
2045
2046 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2047 ret = -EACCES;
2048 goto out;
2049 }
2050 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2051 ret = -EACCES;
2052 goto out;
2053 }
2054
2055 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2056
2057 out:
2058 free(cgdir);
2059 free(next);
2060 return ret;
2061 }
2062
2063 static int cg_rmdir(const char *path)
2064 {
2065 struct fuse_context *fc = fuse_get_context();
2066 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2067 const char *cgroup;
2068 int ret;
2069
2070 if (!fc)
2071 return -EIO;
2072
2073 controller = pick_controller_from_path(fc, path);
2074 if (!controller)
2075 return -EINVAL;
2076
2077 cgroup = find_cgroup_in_path(path);
2078 if (!cgroup)
2079 return -EINVAL;
2080
2081 get_cgdir_and_path(cgroup, &cgdir, &last);
2082 if (!last) {
2083 ret = -EINVAL;
2084 goto out;
2085 }
2086
2087 pid_t initpid = lookup_initpid_in_store(fc->pid);
2088 if (initpid <= 0)
2089 initpid = fc->pid;
2090 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2091 if (!last || strcmp(next, last) == 0)
2092 ret = -EBUSY;
2093 else
2094 ret = -ENOENT;
2095 goto out;
2096 }
2097
2098 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2099 ret = -EACCES;
2100 goto out;
2101 }
2102 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2103 ret = -EACCES;
2104 goto out;
2105 }
2106
2107 if (!cgfs_remove(controller, cgroup)) {
2108 ret = -EINVAL;
2109 goto out;
2110 }
2111
2112 ret = 0;
2113
2114 out:
2115 free(cgdir);
2116 free(next);
2117 return ret;
2118 }
2119
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2126
/*
 * Scan the newline-separated text @memstat for the first line starting
 * with "total_cache", parse the number that follows and store it, divided
 * by 1024, in *@v.  *@v is 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *nl;

	*v = 0;
	for (; *memstat; memstat = nl + 1) {
		if (startswith(memstat, "total_cache")) {
			sscanf(memstat + 11, "%lu", v);
			*v /= 1024;
			return;
		}
		nl = strchr(memstat, '\n');
		if (nl == NULL)
			return;
	}
}
2144
/*
 * Scan the newline-separated text @str for the first line starting with
 * "<major>:<minor> <iotype>" and store the number that follows in *@v.
 * *@v is 0 when no such line is found.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *nl;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	for (; *str; str = nl + 1) {
		if (startswith(str, key)) {
			sscanf(str + keylen, "%lu", v);
			return;
		}
		nl = strchr(str, '\n');
		if (nl == NULL)
			return;
	}
}
2167
2168 static int read_file(const char *path, char *buf, size_t size,
2169 struct file_info *d)
2170 {
2171 size_t linelen = 0, total_len = 0, rv = 0;
2172 char *line = NULL;
2173 char *cache = d->buf;
2174 size_t cache_size = d->buflen;
2175 FILE *f = fopen(path, "r");
2176 if (!f)
2177 return 0;
2178
2179 while (getline(&line, &linelen, f) != -1) {
2180 size_t l = snprintf(cache, cache_size, "%s", line);
2181 if (l < 0) {
2182 perror("Error writing to cache");
2183 rv = 0;
2184 goto err;
2185 }
2186 if (l >= cache_size) {
2187 fprintf(stderr, "Internal error: truncated write to cache\n");
2188 rv = 0;
2189 goto err;
2190 }
2191 if (l < cache_size) {
2192 cache += l;
2193 cache_size -= l;
2194 total_len += l;
2195 } else {
2196 cache += cache_size;
2197 total_len += cache_size;
2198 cache_size = 0;
2199 break;
2200 }
2201 }
2202
2203 d->size = total_len;
2204 if (total_len > size ) total_len = size;
2205
2206 /* read from off 0 */
2207 memcpy(buf, d->buf, total_len);
2208 rv = total_len;
2209 err:
2210 fclose(f);
2211 free(line);
2212 return rv;
2213 }
2214
2215 /*
2216 * FUSE ops for /proc
2217 */
2218
/*
 * Return memory.limit_in_bytes of the memory cgroup @cgroup, or
 * (unsigned long)-1 when the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	char *raw = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);
	else
		limit = -1;

	free(raw);
	return limit;
}
2231
/*
 * Return the smallest memory limit that applies to @cgroup: its own
 * get_memlimit() value or that of any ancestor, walking up with
 * dirname() until "/" has been checked.  Ancestors whose limit cannot be
 * read (get_memlimit() returns -1) are ignored.
 *
 * The walk also stops at ".", which is where dirname() ends up for a path
 * without a leading '/'; otherwise such a path would loop forever.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit = 0, retlimit;

	retlimit = get_memlimit(copy);

	while (strcmp(copy, "/") != 0 && strcmp(copy, ".") != 0) {
		copy = dirname(copy);
		memlimit = get_memlimit(copy);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
2248
2249 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2250 struct fuse_file_info *fi)
2251 {
2252 struct fuse_context *fc = fuse_get_context();
2253 struct file_info *d = (struct file_info *)fi->fh;
2254 char *cg;
2255 char *memusage_str = NULL, *memstat_str = NULL,
2256 *memswlimit_str = NULL, *memswusage_str = NULL,
2257 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2258 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2259 cached = 0, hosttotal = 0;
2260 char *line = NULL;
2261 size_t linelen = 0, total_len = 0, rv = 0;
2262 char *cache = d->buf;
2263 size_t cache_size = d->buflen;
2264 FILE *f = NULL;
2265
2266 if (offset){
2267 if (offset > d->size)
2268 return -EINVAL;
2269 if (!d->cached)
2270 return 0;
2271 int left = d->size - offset;
2272 total_len = left > size ? size: left;
2273 memcpy(buf, cache + offset, total_len);
2274 return total_len;
2275 }
2276
2277 pid_t initpid = lookup_initpid_in_store(fc->pid);
2278 if (initpid <= 0)
2279 initpid = fc->pid;
2280 cg = get_pid_cgroup(initpid, "memory");
2281 if (!cg)
2282 return read_file("/proc/meminfo", buf, size, d);
2283
2284 memlimit = get_min_memlimit(cg);
2285 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2286 goto err;
2287 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2288 goto err;
2289
2290 // Following values are allowed to fail, because swapaccount might be turned
2291 // off for current kernel
2292 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
2293 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
2294 {
2295 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
2296 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
2297 goto err;
2298 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
2299 goto err;
2300
2301 memswlimit = strtoul(memswlimit_str, NULL, 10);
2302 memswusage = strtoul(memswusage_str, NULL, 10);
2303
2304 if (!strcmp(memswlimit_str, memswlimit_default_str))
2305 memswlimit = 0;
2306 if (!strcmp(memswusage_str, memswusage_default_str))
2307 memswusage = 0;
2308
2309 memswlimit = memswlimit / 1024;
2310 memswusage = memswusage / 1024;
2311 }
2312
2313 memusage = strtoul(memusage_str, NULL, 10);
2314 memlimit /= 1024;
2315 memusage /= 1024;
2316
2317 get_mem_cached(memstat_str, &cached);
2318
2319 f = fopen("/proc/meminfo", "r");
2320 if (!f)
2321 goto err;
2322
2323 while (getline(&line, &linelen, f) != -1) {
2324 size_t l;
2325 char *printme, lbuf[100];
2326
2327 memset(lbuf, 0, 100);
2328 if (startswith(line, "MemTotal:")) {
2329 sscanf(line+14, "%lu", &hosttotal);
2330 if (hosttotal < memlimit)
2331 memlimit = hosttotal;
2332 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2333 printme = lbuf;
2334 } else if (startswith(line, "MemFree:")) {
2335 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2336 printme = lbuf;
2337 } else if (startswith(line, "MemAvailable:")) {
2338 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2339 printme = lbuf;
2340 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2341 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2342 printme = lbuf;
2343 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2344 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2345 (memswlimit - memlimit) - (memswusage - memusage));
2346 printme = lbuf;
2347 } else if (startswith(line, "Buffers:")) {
2348 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2349 printme = lbuf;
2350 } else if (startswith(line, "Cached:")) {
2351 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2352 printme = lbuf;
2353 } else if (startswith(line, "SwapCached:")) {
2354 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2355 printme = lbuf;
2356 } else
2357 printme = line;
2358
2359 l = snprintf(cache, cache_size, "%s", printme);
2360 if (l < 0) {
2361 perror("Error writing to cache");
2362 rv = 0;
2363 goto err;
2364
2365 }
2366 if (l >= cache_size) {
2367 fprintf(stderr, "Internal error: truncated write to cache\n");
2368 rv = 0;
2369 goto err;
2370 }
2371
2372 cache += l;
2373 cache_size -= l;
2374 total_len += l;
2375 }
2376
2377 d->cached = 1;
2378 d->size = total_len;
2379 if (total_len > size ) total_len = size;
2380 memcpy(buf, d->buf, total_len);
2381
2382 rv = total_len;
2383 err:
2384 if (f)
2385 fclose(f);
2386 free(line);
2387 free(cg);
2388 free(memusage_str);
2389 free(memswlimit_str);
2390 free(memswusage_str);
2391 free(memstat_str);
2392 free(memswlimit_default_str);
2393 free(memswusage_default_str);
2394 return rv;
2395 }
2396
/*
 * Read cpuset.cpus for cgroup @cg.
 * Returns the value as a newly allocated string which the caller must
 * free, or NULL if it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2409
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line and cpu_in_cpuset()
 * accepts N for @cpuset; false for any other line.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
2420
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. one from which a cpu number can be parsed.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
2432
2433 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
2434 struct fuse_file_info *fi)
2435 {
2436 struct fuse_context *fc = fuse_get_context();
2437 struct file_info *d = (struct file_info *)fi->fh;
2438 char *cg;
2439 char *cpuset = NULL;
2440 char *line = NULL;
2441 size_t linelen = 0, total_len = 0, rv = 0;
2442 bool am_printing = false;
2443 int curcpu = -1;
2444 char *cache = d->buf;
2445 size_t cache_size = d->buflen;
2446 FILE *f = NULL;
2447
2448 if (offset){
2449 if (offset > d->size)
2450 return -EINVAL;
2451 if (!d->cached)
2452 return 0;
2453 int left = d->size - offset;
2454 total_len = left > size ? size: left;
2455 memcpy(buf, cache + offset, total_len);
2456 return total_len;
2457 }
2458
2459 pid_t initpid = lookup_initpid_in_store(fc->pid);
2460 if (initpid <= 0)
2461 initpid = fc->pid;
2462 cg = get_pid_cgroup(initpid, "cpuset");
2463 if (!cg)
2464 return read_file("proc/cpuinfo", buf, size, d);
2465
2466 cpuset = get_cpuset(cg);
2467 if (!cpuset)
2468 goto err;
2469
2470 f = fopen("/proc/cpuinfo", "r");
2471 if (!f)
2472 goto err;
2473
2474 while (getline(&line, &linelen, f) != -1) {
2475 size_t l;
2476 if (is_processor_line(line)) {
2477 am_printing = cpuline_in_cpuset(line, cpuset);
2478 if (am_printing) {
2479 curcpu ++;
2480 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
2481 if (l < 0) {
2482 perror("Error writing to cache");
2483 rv = 0;
2484 goto err;
2485 }
2486 if (l >= cache_size) {
2487 fprintf(stderr, "Internal error: truncated write to cache\n");
2488 rv = 0;
2489 goto err;
2490 }
2491 if (l < cache_size){
2492 cache += l;
2493 cache_size -= l;
2494 total_len += l;
2495 }else{
2496 cache += cache_size;
2497 total_len += cache_size;
2498 cache_size = 0;
2499 break;
2500 }
2501 }
2502 continue;
2503 }
2504 if (am_printing) {
2505 l = snprintf(cache, cache_size, "%s", line);
2506 if (l < 0) {
2507 perror("Error writing to cache");
2508 rv = 0;
2509 goto err;
2510 }
2511 if (l >= cache_size) {
2512 fprintf(stderr, "Internal error: truncated write to cache\n");
2513 rv = 0;
2514 goto err;
2515 }
2516 if (l < cache_size) {
2517 cache += l;
2518 cache_size -= l;
2519 total_len += l;
2520 } else {
2521 cache += cache_size;
2522 total_len += cache_size;
2523 cache_size = 0;
2524 break;
2525 }
2526 }
2527 }
2528
2529 d->cached = 1;
2530 d->size = total_len;
2531 if (total_len > size ) total_len = size;
2532
2533 /* read from off 0 */
2534 memcpy(buf, d->buf, total_len);
2535 rv = total_len;
2536 err:
2537 if (f)
2538 fclose(f);
2539 free(line);
2540 free(cpuset);
2541 free(cg);
2542 return rv;
2543 }
2544
2545 static int proc_stat_read(char *buf, size_t size, off_t offset,
2546 struct fuse_file_info *fi)
2547 {
2548 struct fuse_context *fc = fuse_get_context();
2549 struct file_info *d = (struct file_info *)fi->fh;
2550 char *cg;
2551 char *cpuset = NULL;
2552 char *line = NULL;
2553 size_t linelen = 0, total_len = 0, rv = 0;
2554 int curcpu = -1; /* cpu numbering starts at 0 */
2555 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
2556 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
2557 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
2558 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
2559 char cpuall[CPUALL_MAX_SIZE];
2560 /* reserve for cpu all */
2561 char *cache = d->buf + CPUALL_MAX_SIZE;
2562 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
2563 FILE *f = NULL;
2564
2565 if (offset){
2566 if (offset > d->size)
2567 return -EINVAL;
2568 if (!d->cached)
2569 return 0;
2570 int left = d->size - offset;
2571 total_len = left > size ? size: left;
2572 memcpy(buf, d->buf + offset, total_len);
2573 return total_len;
2574 }
2575
2576 pid_t initpid = lookup_initpid_in_store(fc->pid);
2577 if (initpid <= 0)
2578 initpid = fc->pid;
2579 cg = get_pid_cgroup(initpid, "cpuset");
2580 if (!cg)
2581 return read_file("/proc/stat", buf, size, d);
2582
2583 cpuset = get_cpuset(cg);
2584 if (!cpuset)
2585 goto err;
2586
2587 f = fopen("/proc/stat", "r");
2588 if (!f)
2589 goto err;
2590
2591 //skip first line
2592 if (getline(&line, &linelen, f) < 0) {
2593 fprintf(stderr, "proc_stat_read read first line failed\n");
2594 goto err;
2595 }
2596
2597 while (getline(&line, &linelen, f) != -1) {
2598 size_t l;
2599 int cpu;
2600 char cpu_char[10]; /* That's a lot of cores */
2601 char *c;
2602
2603 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
2604 /* not a ^cpuN line containing a number N, just print it */
2605 l = snprintf(cache, cache_size, "%s", line);
2606 if (l < 0) {
2607 perror("Error writing to cache");
2608 rv = 0;
2609 goto err;
2610 }
2611 if (l >= cache_size) {
2612 fprintf(stderr, "Internal error: truncated write to cache\n");
2613 rv = 0;
2614 goto err;
2615 }
2616 if (l < cache_size) {
2617 cache += l;
2618 cache_size -= l;
2619 total_len += l;
2620 continue;
2621 } else {
2622 //no more space, break it
2623 cache += cache_size;
2624 total_len += cache_size;
2625 cache_size = 0;
2626 break;
2627 }
2628 }
2629
2630 if (sscanf(cpu_char, "%d", &cpu) != 1)
2631 continue;
2632 if (!cpu_in_cpuset(cpu, cpuset))
2633 continue;
2634 curcpu ++;
2635
2636 c = strchr(line, ' ');
2637 if (!c)
2638 continue;
2639 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
2640 if (l < 0) {
2641 perror("Error writing to cache");
2642 rv = 0;
2643 goto err;
2644
2645 }
2646 if (l >= cache_size) {
2647 fprintf(stderr, "Internal error: truncated write to cache\n");
2648 rv = 0;
2649 goto err;
2650 }
2651
2652 cache += l;
2653 cache_size -= l;
2654 total_len += l;
2655
2656 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
2657 &softirq, &steal, &guest) != 9)
2658 continue;
2659 user_sum += user;
2660 nice_sum += nice;
2661 system_sum += system;
2662 idle_sum += idle;
2663 iowait_sum += iowait;
2664 irq_sum += irq;
2665 softirq_sum += softirq;
2666 steal_sum += steal;
2667 guest_sum += guest;
2668 }
2669
2670 cache = d->buf;
2671
2672 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2673 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
2674 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
2675 memcpy(cache, cpuall, cpuall_len);
2676 cache += cpuall_len;
2677 } else{
2678 /* shouldn't happen */
2679 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
2680 cpuall_len = 0;
2681 }
2682
2683 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
2684 total_len += cpuall_len;
2685 d->cached = 1;
2686 d->size = total_len;
2687 if (total_len > size ) total_len = size;
2688
2689 memcpy(buf, d->buf, total_len);
2690 rv = total_len;
2691
2692 err:
2693 if (f)
2694 fclose(f);
2695 free(line);
2696 free(cpuset);
2697 free(cg);
2698 return rv;
2699 }
2700
/*
 * Age, in seconds, of the init process ("reaper") that
 * lookup_initpid_in_store() reports for pid: the current time minus the
 * ctime of its /proc/<initpid> directory.  Returns 0 whenever the reaper
 * cannot be determined or its /proc entry cannot be stat'ed.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	int n;
	pid_t reaper = lookup_initpid_in_store(pid);

	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
2721
/*
 * CPU time consumed by the cpuacct cgroup of task's init process.
 * The "cpuacct.usage" value is parsed as a decimal number and divided by
 * 1,000,000,000 (presumably nanoseconds to seconds - confirm against the
 * cgroup documentation).  Returns 0 if the init pid, its cgroup or the
 * usage value cannot be obtained.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;
	pid_t reaper = lookup_initpid_in_store(task);

	if (reaper <= 0)
		return 0;

	cg = get_pid_cgroup(reaper, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
		busy = strtoul(raw, NULL, 10) / 1000000000;

	free(cg);
	free(raw);
	return busy;
}
2744
2745 /*
2746 * We read /proc/uptime and reuse its second field.
2747 * For the first field, we use the mtime for the reaper for
2748 * the calling pid as returned by getreaperage
2749 */
2750 static int proc_uptime_read(char *buf, size_t size, off_t offset,
2751 struct fuse_file_info *fi)
2752 {
2753 struct fuse_context *fc = fuse_get_context();
2754 struct file_info *d = (struct file_info *)fi->fh;
2755 long int reaperage = getreaperage(fc->pid);
2756 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
2757 char *cache = d->buf;
2758 size_t total_len = 0;
2759
2760 if (offset){
2761 if (offset > d->size)
2762 return -EINVAL;
2763 if (!d->cached)
2764 return 0;
2765 int left = d->size - offset;
2766 total_len = left > size ? size: left;
2767 memcpy(buf, cache + offset, total_len);
2768 return total_len;
2769 }
2770
2771 idletime = reaperage - busytime;
2772 if (idletime > reaperage)
2773 idletime = reaperage;
2774
2775 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
2776 if (total_len < 0){
2777 perror("Error writing to cache");
2778 return 0;
2779 }
2780
2781 d->size = (int)total_len;
2782 d->cached = 1;
2783
2784 if (total_len > size) total_len = size;
2785
2786 memcpy(buf, d->buf, total_len);
2787 return total_len;
2788 }
2789
2790 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
2791 struct fuse_file_info *fi)
2792 {
2793 char dev_name[72];
2794 struct fuse_context *fc = fuse_get_context();
2795 struct file_info *d = (struct file_info *)fi->fh;
2796 char *cg;
2797 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
2798 *io_wait_time_str = NULL, *io_service_time_str = NULL;
2799 unsigned long read = 0, write = 0;
2800 unsigned long read_merged = 0, write_merged = 0;
2801 unsigned long read_sectors = 0, write_sectors = 0;
2802 unsigned long read_ticks = 0, write_ticks = 0;
2803 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
2804 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
2805 char *cache = d->buf;
2806 size_t cache_size = d->buflen;
2807 char *line = NULL;
2808 size_t linelen = 0, total_len = 0, rv = 0;
2809 unsigned int major = 0, minor = 0;
2810 int i = 0;
2811 FILE *f = NULL;
2812
2813 if (offset){
2814 if (offset > d->size)
2815 return -EINVAL;
2816 if (!d->cached)
2817 return 0;
2818 int left = d->size - offset;
2819 total_len = left > size ? size: left;
2820 memcpy(buf, cache + offset, total_len);
2821 return total_len;
2822 }
2823
2824 pid_t initpid = lookup_initpid_in_store(fc->pid);
2825 if (initpid <= 0)
2826 initpid = fc->pid;
2827 cg = get_pid_cgroup(initpid, "blkio");
2828 if (!cg)
2829 return read_file("/proc/diskstats", buf, size, d);
2830
2831 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
2832 goto err;
2833 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
2834 goto err;
2835 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
2836 goto err;
2837 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
2838 goto err;
2839 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
2840 goto err;
2841
2842
2843 f = fopen("/proc/diskstats", "r");
2844 if (!f)
2845 goto err;
2846
2847 while (getline(&line, &linelen, f) != -1) {
2848 size_t l;
2849 char *printme, lbuf[256];
2850
2851 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
2852 if(i == 3){
2853 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
2854 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
2855 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
2856 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
2857 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
2858 read_sectors = read_sectors/512;
2859 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
2860 write_sectors = write_sectors/512;
2861
2862 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
2863 rd_svctm = rd_svctm/1000000;
2864 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
2865 rd_wait = rd_wait/1000000;
2866 read_ticks = rd_svctm + rd_wait;
2867
2868 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
2869 wr_svctm = wr_svctm/1000000;
2870 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
2871 wr_wait = wr_wait/1000000;
2872 write_ticks = wr_svctm + wr_wait;
2873
2874 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
2875 tot_ticks = tot_ticks/1000000;
2876 }else{
2877 continue;
2878 }
2879
2880 memset(lbuf, 0, 256);
2881 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
2882 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
2883 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
2884 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
2885 printme = lbuf;
2886 } else
2887 continue;
2888
2889 l = snprintf(cache, cache_size, "%s", printme);
2890 if (l < 0) {
2891 perror("Error writing to fuse buf");
2892 rv = 0;
2893 goto err;
2894 }
2895 if (l >= cache_size) {
2896 fprintf(stderr, "Internal error: truncated write to cache\n");
2897 rv = 0;
2898 goto err;
2899 }
2900 cache += l;
2901 cache_size -= l;
2902 total_len += l;
2903 }
2904
2905 d->cached = 1;
2906 d->size = total_len;
2907 if (total_len > size ) total_len = size;
2908 memcpy(buf, d->buf, total_len);
2909
2910 rv = total_len;
2911 err:
2912 free(cg);
2913 if (f)
2914 fclose(f);
2915 free(line);
2916 free(io_serviced_str);
2917 free(io_merged_str);
2918 free(io_service_bytes_str);
2919 free(io_wait_time_str);
2920 free(io_service_time_str);
2921 return rv;
2922 }
2923
/*
 * Number of bytes obtained by reading the file `which` line by line until
 * getline() fails.  Returns 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t nread, total = 0;
	FILE *fp = fopen(which, "r");

	if (fp == NULL)
		return 0;

	for (;;) {
		nread = getline(&linebuf, &cap, fp);
		if (nread == -1)
			break;
		total += nread;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
2940
/*
 * Fill *sb for a path below /proc.
 *
 * "/proc" is reported as a read-only directory (mode 0555, two links); the
 * five emulated files are reported as zero-length regular files (mode
 * 0444).  Owner and group are 0 and all three timestamps are set to the
 * current real time.  Returns 0 on success, -EINVAL if the clock cannot be
 * read and -ENOENT for any other path.
 */
static int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec ts;
	size_t i;

	memset(sb, 0, sizeof(*sb));
	if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = ts;
	sb->st_mtim = ts;
	sb->st_ctim = ts;

	if (strcmp(path, "/proc") == 0) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(regular_files) / sizeof(regular_files[0]); i++) {
		if (strcmp(path, regular_files[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
2968
2969 static int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
2970 struct fuse_file_info *fi)
2971 {
2972 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
2973 filler(buf, "meminfo", NULL, 0) != 0 ||
2974 filler(buf, "stat", NULL, 0) != 0 ||
2975 filler(buf, "uptime", NULL, 0) != 0 ||
2976 filler(buf, "diskstats", NULL, 0) != 0)
2977 return -EINVAL;
2978 return 0;
2979 }
2980
2981 static int proc_open(const char *path, struct fuse_file_info *fi)
2982 {
2983 int type = -1;
2984 struct file_info *info;
2985
2986 if (strcmp(path, "/proc/meminfo") == 0)
2987 type = LXC_TYPE_PROC_MEMINFO;
2988 else if (strcmp(path, "/proc/cpuinfo") == 0)
2989 type = LXC_TYPE_PROC_CPUINFO;
2990 else if (strcmp(path, "/proc/uptime") == 0)
2991 type = LXC_TYPE_PROC_UPTIME;
2992 else if (strcmp(path, "/proc/stat") == 0)
2993 type = LXC_TYPE_PROC_STAT;
2994 else if (strcmp(path, "/proc/diskstats") == 0)
2995 type = LXC_TYPE_PROC_DISKSTATS;
2996 if (type == -1)
2997 return -ENOENT;
2998
2999 info = malloc(sizeof(*info));
3000 if (!info)
3001 return -ENOMEM;
3002
3003 memset(info, 0, sizeof(*info));
3004 info->type = type;
3005
3006 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3007 do {
3008 info->buf = malloc(info->buflen);
3009 } while (!info->buf);
3010 memset(info->buf, 0, info->buflen);
3011 /* set actual size to buffer size */
3012 info->size = info->buflen;
3013
3014 fi->fh = (unsigned long)info;
3015 return 0;
3016 }
3017
3018 static int proc_release(const char *path, struct fuse_file_info *fi)
3019 {
3020 struct file_info *f = (struct file_info *)fi->fh;
3021
3022 do_release_file_info(f);
3023 return 0;
3024 }
3025
3026 static int proc_read(const char *path, char *buf, size_t size, off_t offset,
3027 struct fuse_file_info *fi)
3028 {
3029 struct file_info *f = (struct file_info *) fi->fh;
3030
3031 switch (f->type) {
3032 case LXC_TYPE_PROC_MEMINFO:
3033 return proc_meminfo_read(buf, size, offset, fi);
3034 case LXC_TYPE_PROC_CPUINFO:
3035 return proc_cpuinfo_read(buf, size, offset, fi);
3036 case LXC_TYPE_PROC_UPTIME:
3037 return proc_uptime_read(buf, size, offset, fi);
3038 case LXC_TYPE_PROC_STAT:
3039 return proc_stat_read(buf, size, offset, fi);
3040 case LXC_TYPE_PROC_DISKSTATS:
3041 return proc_diskstats_read(buf, size, offset, fi);
3042 default:
3043 return -EINVAL;
3044 }
3045 }
3046
3047 /*
3048 * FUSE ops for /
3049 * these just delegate to the /proc and /cgroup ops as
3050 * needed
3051 */
3052
3053 static int lxcfs_getattr(const char *path, struct stat *sb)
3054 {
3055 if (strcmp(path, "/") == 0) {
3056 sb->st_mode = S_IFDIR | 00755;
3057 sb->st_nlink = 2;
3058 return 0;
3059 }
3060 if (strncmp(path, "/cgroup", 7) == 0) {
3061 return cg_getattr(path, sb);
3062 }
3063 if (strncmp(path, "/proc", 5) == 0) {
3064 return proc_getattr(path, sb);
3065 }
3066 return -EINVAL;
3067 }
3068
/*
 * opendir: "/" and "/proc" need no per-open state and succeed directly;
 * paths starting with "/cgroup" go to cg_opendir(); everything else is
 * -ENOENT.
 */
static int lxcfs_opendir(const char *path, struct fuse_file_info *fi)
{
	const bool stateless_dir = strcmp(path, "/") == 0 ||
				   strcmp(path, "/proc") == 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_opendir(path, fi);

	return stateless_dir ? 0 : -ENOENT;
}
3081
3082 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3083 struct fuse_file_info *fi)
3084 {
3085 if (strcmp(path, "/") == 0) {
3086 if (filler(buf, "proc", NULL, 0) != 0 ||
3087 filler(buf, "cgroup", NULL, 0) != 0)
3088 return -EINVAL;
3089 return 0;
3090 }
3091 if (strncmp(path, "/cgroup", 7) == 0)
3092 return cg_readdir(path, buf, filler, offset, fi);
3093 if (strcmp(path, "/proc") == 0)
3094 return proc_readdir(path, buf, filler, offset, fi);
3095 return -EINVAL;
3096 }
3097
/*
 * releasedir: nothing to release for "/" and "/proc"; paths starting with
 * "/cgroup" go to cg_releasedir(); everything else is -EINVAL.
 */
static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi)
{
	const bool stateless_dir = strcmp(path, "/") == 0 ||
				   strcmp(path, "/proc") == 0;

	if (strncmp(path, "/cgroup", 7) == 0)
		return cg_releasedir(path, fi);

	return stateless_dir ? 0 : -EINVAL;
}
3109
/*
 * open: route by path prefix to cg_open() or proc_open(); -EINVAL for
 * anything else.
 */
static int lxcfs_open(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_open(path, fi);
	if (in_proc)
		return proc_open(path, fi);

	return -EINVAL;
}
3119
/*
 * read: route by path prefix to cg_read() or proc_read(); -EINVAL for
 * anything else.
 */
static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_read(path, buf, size, offset, fi);
	if (in_proc)
		return proc_read(path, buf, size, offset, fi);

	return -EINVAL;
}
3130
/*
 * write: only paths starting with "/cgroup" are handled (by cg_write());
 * every other path yields -EINVAL.
 */
int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_write(path, buf, size, offset, fi);
}
3140
/* flush: nothing to do; always reports success. */
static int lxcfs_flush(const char *path, struct fuse_file_info *fi)
{
	(void)path;
	(void)fi;
	return 0;
}
3145
/*
 * release: route by path prefix to cg_release() or proc_release();
 * -EINVAL for anything else.
 */
static int lxcfs_release(const char *path, struct fuse_file_info *fi)
{
	const bool in_cgroup = strncmp(path, "/cgroup", 7) == 0;
	const bool in_proc = strncmp(path, "/proc", 5) == 0;

	if (in_cgroup)
		return cg_release(path, fi);
	if (in_proc)
		return proc_release(path, fi);

	return -EINVAL;
}
3155
/* fsync: nothing to do; always reports success. */
static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;
	return 0;
}
3160
/*
 * mkdir: only paths starting with "/cgroup" are handled (by cg_mkdir());
 * every other path yields -EINVAL.
 */
int lxcfs_mkdir(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_mkdir(path, mode);
}
3168
/*
 * chown: only paths starting with "/cgroup" are handled (by cg_chown());
 * every other path yields -EINVAL.
 */
int lxcfs_chown(const char *path, uid_t uid, gid_t gid)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chown(path, uid, gid);
}
3176
/*
 * cat truncates a file before it calls ops->write.  Truncation has no
 * sensible meaning for cgroup files, so for paths starting with "/cgroup"
 * the request is accepted and ignored.  Any other path yields -EINVAL.
 */
int lxcfs_truncate(const char *path, off_t newsize)
{
	(void)newsize;

	return strncmp(path, "/cgroup", 7) == 0 ? 0 : -EINVAL;
}
3188
/*
 * rmdir: only paths starting with "/cgroup" are handled (by cg_rmdir());
 * every other path yields -EINVAL.
 */
int lxcfs_rmdir(const char *path)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_rmdir(path);
}
3195
/*
 * chmod: only paths starting with "/cgroup" are handled (by cg_chmod());
 * every other path yields -EINVAL.
 */
int lxcfs_chmod(const char *path, mode_t mode)
{
	if (strncmp(path, "/cgroup", 7) != 0)
		return -EINVAL;

	return cg_chmod(path, mode);
}
3202
3203 const struct fuse_operations lxcfs_ops = {
3204 .getattr = lxcfs_getattr,
3205 .readlink = NULL,
3206 .getdir = NULL,
3207 .mknod = NULL,
3208 .mkdir = lxcfs_mkdir,
3209 .unlink = NULL,
3210 .rmdir = lxcfs_rmdir,
3211 .symlink = NULL,
3212 .rename = NULL,
3213 .link = NULL,
3214 .chmod = lxcfs_chmod,
3215 .chown = lxcfs_chown,
3216 .truncate = lxcfs_truncate,
3217 .utime = NULL,
3218
3219 .open = lxcfs_open,
3220 .read = lxcfs_read,
3221 .release = lxcfs_release,
3222 .write = lxcfs_write,
3223
3224 .statfs = NULL,
3225 .flush = lxcfs_flush,
3226 .fsync = lxcfs_fsync,
3227
3228 .setxattr = NULL,
3229 .getxattr = NULL,
3230 .listxattr = NULL,
3231 .removexattr = NULL,
3232
3233 .opendir = lxcfs_opendir,
3234 .readdir = lxcfs_readdir,
3235 .releasedir = lxcfs_releasedir,
3236
3237 .fsyncdir = NULL,
3238 .init = NULL,
3239 .destroy = NULL,
3240 .access = NULL,
3241 .create = NULL,
3242 .ftruncate = NULL,
3243 .fgetattr = NULL,
3244 };
3245
/* Print the command-line synopsis to stderr and exit with status 1. */
static void usage(const char *me)
{
	fputs("Usage:\n", stderr);
	fputs("\n", stderr);
	fprintf(stderr, "%s mountpoint\n", me);
	fprintf(stderr, "%s -h\n", me);
	exit(1);
}
3254
/* True if w is one of the accepted spellings of the help option. */
static bool is_help(char *w)
{
	static const char *const spellings[] = {
		"-h",
		"--help",
		"-help",
		"help",
	};
	size_t i;

	for (i = 0; i < sizeof(spellings) / sizeof(spellings[0]); i++) {
		if (strcmp(w, spellings[i]) == 0)
			return true;
	}
	return false;
}
3264
/*
 * Remove the first occurrence of `which` from argv (searching from
 * argv[1]) by shifting the following entries, including the terminating
 * NULL, one slot down, and decrement *argcp.  Does nothing if `which` is
 * not present.
 */
void swallow_arg(int *argcp, char *argv[], char *which)
{
	char **slot;

	for (slot = argv + 1; *slot; slot++) {
		if (strcmp(*slot, which) == 0)
			break;
	}
	if (!*slot)
		return;

	for (; *slot; slot++)
		*slot = slot[1];

	(*argcp)--;
}
3279
/*
 * Remove the first "opt v" pair from argv (searching from argv[1]) by
 * shifting the following entries, including the terminating NULL, two
 * slots down, and reduce *argcp by two.
 *
 * If `opt` is found but is followed by a value other than `v`, the
 * offending value is reported on stderr and the process exits with
 * status 1.  An `opt` that is the last argument is left untouched.
 */
void swallow_option(int *argcp, char *argv[], char *opt, char *v)
{
	int i;

	for (i = 1; argv[i]; i++) {
		if (!argv[i+1])
			continue;
		if (strcmp(argv[i], opt) != 0)
			continue;
		if (strcmp(argv[i+1], v) != 0) {
			/* report what was actually passed, not the value we expected */
			fprintf(stderr, "Warning: unexpected fuse option %s\n", argv[i+1]);
			exit(1);
		}
		for (; argv[i+1]; i++) {
			argv[i] = argv[i+2];
		}
		(*argcp) -= 2;
		return;
	}
}
3300
3301 int main(int argc, char *argv[])
3302 {
3303 int ret = -1;
3304 /*
3305 * what we pass to fuse_main is:
3306 * argv[0] -s -f -o allow_other,directio argv[1] NULL
3307 */
3308 int nargs = 5, cnt = 0;
3309 char *newargv[6];
3310
3311 #ifdef FORTRAVIS
3312 /* for travis which runs on 12.04 */
3313 if (glib_check_version (2, 36, 0) != NULL)
3314 g_type_init ();
3315 #endif
3316
3317 /* accomodate older init scripts */
3318 swallow_arg(&argc, argv, "-s");
3319 swallow_arg(&argc, argv, "-f");
3320 swallow_option(&argc, argv, "-o", "allow_other");
3321
3322 if (argc == 2 && strcmp(argv[1], "--version") == 0) {
3323 fprintf(stderr, "%s\n", VERSION);
3324 exit(0);
3325 }
3326 if (argc != 2 || is_help(argv[1]))
3327 usage(argv[0]);
3328
3329 newargv[cnt++] = argv[0];
3330 newargv[cnt++] = "-f";
3331 newargv[cnt++] = "-o";
3332 newargv[cnt++] = "allow_other,direct_io,entry_timeout=0.5,attr_timeout=0.5";
3333 newargv[cnt++] = argv[1];
3334 newargv[cnt++] = NULL;
3335
3336 if (!cgfs_setup_controllers())
3337 goto out;
3338
3339 ret = fuse_main(nargs, newargv, &lxcfs_ops, NULL);
3340
3341 out:
3342 return ret;
3343 }