]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Don't tie entries in 'hierarchies' to their subsystem id
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <pthread.h>
24 #include <linux/sched.h>
25 #include <sys/param.h>
26 #include <sys/socket.h>
27 #include <sys/mount.h>
28 #include <sys/epoll.h>
29 #include <wait.h>
30
31 #ifdef FORTRAVIS
32 #define GLIB_DISABLE_DEPRECATION_WARNINGS
33 #include <glib-object.h>
34 #endif
35
36 #include "bindings.h"
37
38 #include "config.h" // for VERSION
39
/*
 * Kinds of virtual files handled by lxcfs: cgroup directories and files,
 * plus the emulated /proc entries.
 * NOTE(review): presumably the values stored in file_info.type - confirm
 * against the users of struct file_info.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
};
49
/*
 * Per-file bookkeeping record.
 * NOTE(review): the users of this struct are outside this section; the
 * field descriptions marked "presumably" should be confirmed there.
 */
struct file_info {
	char *controller; // presumably the cgroup controller name
	char *cgroup;     // presumably the cgroup path
	char *file;       // presumably the file name inside the cgroup
	int type;         // presumably one of the LXC_TYPE_* values
	char *buf;  // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
60
/*
 * reserve buffer size, for cpuall in /proc/stat.
 * must_strcat_pid() also grows its output buffer in steps of this size.
 */
#define BUF_RESERVE_SIZE 256
63
64 /*
65 * A table caching which pid is init for a pid namespace.
66 * When looking up which pid is init for $qpid, we first
67 * 1. Stat /proc/$qpid/ns/pid.
68 * 2. Check whether the ino_t is in our store.
69 * a. if not, fork a child in qpid's ns to send us
70 * ucred.pid = 1, and read the initpid. Cache
71 * initpid and creation time for /proc/initpid
72 * in a new store entry.
73 * b. if so, verify that /proc/initpid still matches
74 * what we have saved. If not, clear the store
75 * entry and go back to a. If so, return the
76 * cached initpid.
77 */
/* One cached "pid namespace -> init pid" mapping, chained per hash bucket. */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // the time at which /proc/$initpid was created
	struct pidns_init_store *next; // next entry in the same hash bucket
	long int lastcheck; // time(NULL) when the entry was saved or last validated
};
85
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* Bucket heads of the init-pid cache, indexed by HASH(namespace inode). */
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/* Taken through store_lock()/store_unlock() around accesses to the cache. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire mutex @l.  A locking failure is treated as fatal: the error is
 * reported on stderr and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
101
/*
 * Release mutex @l.  An unlock failure is treated as fatal: the error is
 * reported on stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
111
/* Take pidns_store_mutex; exits the process if locking fails (see lock_mutex). */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}
116
/* Release pidns_store_mutex; exits the process if unlocking fails (see unlock_mutex). */
static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
121
122 /* Must be called under store_lock */
123 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
124 {
125 struct stat initsb;
126 char fnam[100];
127
128 snprintf(fnam, 100, "/proc/%d", e->initpid);
129 if (stat(fnam, &initsb) < 0)
130 return false;
131 #if DEBUG
132 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
133 e->ctime, initsb.st_ctime, e->initpid);
134 #endif
135 if (e->ctime != initsb.st_ctime)
136 return false;
137 return true;
138 }
139
140 /* Must be called under store_lock */
141 static void remove_initpid(struct pidns_init_store *e)
142 {
143 struct pidns_init_store *tmp;
144 int h;
145
146 #if DEBUG
147 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
148 #endif
149 h = HASH(e->ino);
150 if (pidns_hash_table[h] == e) {
151 pidns_hash_table[h] = e->next;
152 free(e);
153 return;
154 }
155
156 tmp = pidns_hash_table[h];
157 while (tmp) {
158 if (tmp->next == e) {
159 tmp->next = e->next;
160 free(e);
161 return;
162 }
163 tmp = tmp->next;
164 }
165 }
166
167 #define PURGE_SECS 5
168 /* Must be called under store_lock */
169 static void prune_initpid_store(void)
170 {
171 static long int last_prune = 0;
172 struct pidns_init_store *e, *prev, *delme;
173 long int now, threshold;
174 int i;
175
176 if (!last_prune) {
177 last_prune = time(NULL);
178 return;
179 }
180 now = time(NULL);
181 if (now < last_prune + PURGE_SECS)
182 return;
183 #if DEBUG
184 fprintf(stderr, "pruning\n");
185 #endif
186 last_prune = now;
187 threshold = now - 2 * PURGE_SECS;
188
189 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
190 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
191 if (e->lastcheck < threshold) {
192 #if DEBUG
193 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
194 #endif
195 delme = e;
196 if (prev)
197 prev->next = e->next;
198 else
199 pidns_hash_table[i] = e->next;
200 e = e->next;
201 free(delme);
202 } else {
203 prev = e;
204 e = e->next;
205 }
206 }
207 }
208 }
209
210 /* Must be called under store_lock */
211 static void save_initpid(struct stat *sb, pid_t pid)
212 {
213 struct pidns_init_store *e;
214 char fpath[100];
215 struct stat procsb;
216 int h;
217
218 #if DEBUG
219 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
220 #endif
221 snprintf(fpath, 100, "/proc/%d", pid);
222 if (stat(fpath, &procsb) < 0)
223 return;
224 do {
225 e = malloc(sizeof(*e));
226 } while (!e);
227 e->ino = sb->st_ino;
228 e->initpid = pid;
229 e->ctime = procsb.st_ctime;
230 h = HASH(e->ino);
231 e->next = pidns_hash_table[h];
232 e->lastcheck = time(NULL);
233 pidns_hash_table[h] = e;
234 }
235
236 /*
237 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
238 * entry for the inode number and creation time. Verify that the init pid
239 * is still valid. If not, remove it. Return the entry if valid, NULL
240 * otherwise.
241 * Must be called under store_lock
242 */
243 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
244 {
245 int h = HASH(sb->st_ino);
246 struct pidns_init_store *e = pidns_hash_table[h];
247
248 while (e) {
249 if (e->ino == sb->st_ino) {
250 if (initpid_still_valid(e, sb)) {
251 e->lastcheck = time(NULL);
252 return e;
253 }
254 remove_initpid(e);
255 return NULL;
256 }
257 e = e->next;
258 }
259
260 return NULL;
261 }
262
/*
 * Return 1 when @path can be stat()ed and is a directory, 0 otherwise
 * (including when stat() fails).  Symlinks are followed by stat().
 */
static int is_dir(const char *path)
{
	struct stat sb;

	if (stat(path, &sb) != 0)
		return 0;

	return S_ISDIR(sb.st_mode) ? 1 : 0;
}
271
/*
 * Duplicate @str on the heap, retrying until the allocation succeeds.
 * Returns NULL only when @str itself is NULL.  The caller frees the copy.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (str == NULL)
		return NULL;

	while ((copy = strdup(str)) == NULL)
		;

	return copy;
}
283
/* Strip every '\n' at the end of @s in place; other characters are untouched. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		end--;
		s[end] = '\0';
	}
}
291
#define BATCH_SIZE 50
/*
 * Make sure *mem can hold @newlen bytes, allocating in multiples of
 * BATCH_SIZE.  The buffer is (re)allocated when *mem is NULL or when
 * @newlen needs more batches than @oldlen did; otherwise it is left
 * alone.  Allocation is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	while (!(grown = realloc(*mem, want * BATCH_SIZE)))
		;
	*mem = grown;
}
/*
 * Append @line (of @linelen bytes, followed by its NUL) to the growing
 * buffer *contents whose current length is *len.  The buffer is enlarged
 * as needed and *len is advanced by @linelen; the NUL terminator is
 * copied but not counted.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t used = *len;
	size_t total = used + linelen;

	dorealloc(contents, used, total + 1);
	memcpy(*contents + used, line, linelen + 1);
	*len = total;
}
313
/*
 * Read the whole of file @from into a heap buffer, line by line, and
 * strip trailing newlines from the result.  Returns NULL when the file
 * cannot be opened or yields no lines; otherwise the caller frees the
 * returned string.
 */
static char *slurp_file(const char *from)
{
	FILE *in = fopen(from, "r");
	char *linebuf = NULL;
	char *contents = NULL;
	size_t linecap = 0, total = 0;
	ssize_t got;

	if (!in)
		return NULL;

	for (;;) {
		got = getline(&linebuf, &linecap, in);
		if (got == -1)
			break;
		append_line(&contents, &total, linebuf, got);
	}
	fclose(in);

	if (contents)
		drop_trailing_newlines(contents);
	free(linebuf);
	return contents;
}
335
/*
 * Overwrite file @fnam with @string (no terminator is written).
 * Returns true only when the file was opened, every byte was written and
 * the stream was closed without error; failures after a successful open
 * are reported on stderr.
 */
static bool write_string(const char *fnam, const char *string)
{
	FILE *out = fopen(fnam, "w");
	size_t want, wrote;

	if (!out)
		return false;

	want = strlen(string);
	wrote = fwrite(string, 1, want, out);
	if (wrote != want) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		fclose(out);
		return false;
	}

	/* fclose flushes, so a late write error shows up here */
	if (fclose(out) < 0) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
356
/*
 * hierarchies, i.e. 'cpu,cpuacct'
 *
 * Array of num_hierarchies entries appended by store_hierarchy(); each
 * entry is a heap copy of one hierarchy's controller list.
 */
char **hierarchies;
int num_hierarchies;
362
/* Name, ownership and mode of one cgroup file or directory (see cgfs_get_key). */
struct cgfs_files {
	char *name;        // heap-allocated; released by free_key()
	uint32_t uid, gid; // owner and group taken from stat()
	uint32_t mode;     // st_mode taken from stat()
};
368
369 #define ALLOC_NUM 20
370 static bool store_hierarchy(char *stridx, char *h)
371 {
372 if (num_hierarchies % ALLOC_NUM == 0) {
373 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
374 n *= ALLOC_NUM;
375 char **tmp = realloc(hierarchies, n * sizeof(char *));
376 printf("allocated %d\n", n);
377 if (!tmp) {
378 fprintf(stderr, "Out of memory\n");
379 exit(1);
380 }
381 hierarchies = tmp;
382 }
383
384 hierarchies[num_hierarchies++] = must_copy_string(h);
385 return true;
386 }
387
388 static void print_subsystems(void)
389 {
390 int i;
391
392 fprintf(stderr, "hierarchies:");
393 for (i = 0; i < num_hierarchies; i++) {
394 if (hierarchies[i])
395 fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
396 }
397 }
398
/*
 * Return true when @needle is exactly one of the comma-separated items of
 * @haystack (e.g. "cpuacct" in "cpu,cpuacct").  Partial matches such as
 * "cp" do not count.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *s = haystack, *e;
	size_t nlen = strlen(needle);

	/* every item that is followed by a comma */
	while (*s && (e = strchr(s, ','))) {
		if (nlen == (size_t)(e - s) && strncmp(needle, s, nlen) == 0)
			return true;
		s = e + 1;
	}

	/* the last (or only) item */
	return strcmp(needle, s) == 0;
}
417
418 /* do we need to do any massaging here? I'm not sure... */
419 static char *find_mounted_controller(const char *controller)
420 {
421 int i;
422
423 for (i = 0; i < num_hierarchies; i++) {
424 if (!hierarchies[i])
425 continue;
426 if (strcmp(hierarchies[i], controller) == 0)
427 return hierarchies[i];
428 if (in_comma_list(controller, hierarchies[i]))
429 return hierarchies[i];
430 }
431
432 return NULL;
433 }
434
435 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
436 const char *value)
437 {
438 size_t len;
439 char *fnam, *tmpc = find_mounted_controller(controller);
440
441 if (!tmpc)
442 return false;
443 /* basedir / tmpc / cgroup / file \0 */
444 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
445 fnam = alloca(len);
446 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
447
448 return write_string(fnam, value);
449 }
450
// Chown all the files in the cgroup directory.  We do this when we create
// a cgroup on behalf of a user.
//
// Every entry of @dirname except "." and ".." is chowned to @uid:@gid.
// This is best effort: failures are reported on stderr and the remaining
// entries are still processed.
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
		return;
	}

	d = opendir(dirname);
	if (!d) {
		fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
		return;
	}

	/* readdir() on a private DIR stream replaces the deprecated
	 * readdir_r() and its fixed-size entry buffer. */
	while ((direntp = readdir(d)) != NULL) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
			continue;
		}
		if (chown(path, uid, gid) < 0)
			fprintf(stderr, "Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	closedir(d);
}
486
487 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
488 {
489 size_t len;
490 char *dirnam, *tmpc = find_mounted_controller(controller);
491
492 if (!tmpc)
493 return -EINVAL;
494 /* basedir / tmpc / cg \0 */
495 len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
496 dirnam = alloca(len);
497 snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
498
499 if (mkdir(dirnam, 0755) < 0)
500 return -errno;
501
502 if (uid == 0 && gid == 0)
503 return 0;
504
505 if (chown(dirnam, uid, gid) < 0)
506 return -errno;
507
508 chown_all_cgroup_files(dirnam, uid, gid);
509
510 return 0;
511 }
512
/*
 * Remove directory @dirname after recursively removing every
 * sub-directory below it.  Non-directory entries are not unlinked here.
 *
 * Returns true only when the directory stream was closed cleanly and the
 * final rmdir() succeeded.  Failures on individual children are skipped
 * (and logged when DEBUG is set); they surface through the rmdir() of
 * the parent.
 */
static bool recursive_rmdir(const char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
#if DEBUG
		fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno));
#endif
		return false;
	}

	/* readdir() on a private DIR stream replaces the deprecated
	 * readdir_r() and its fixed-size entry buffer. */
	while ((direntp = readdir(dir)) != NULL) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			fprintf(stderr, "pathname too long\n");
			continue;
		}

		/* keep the syscall result in an int, not in the bool result */
		rc = lstat(pathname, &mystat);
		if (rc) {
#if DEBUG
			fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
#endif
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			if (!recursive_rmdir(pathname)) {
#if DEBUG
				fprintf(stderr, "Error removing %s\n", pathname);
#endif
			}
		}
	}

	ret = true;
	if (closedir(dir) < 0) {
		fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
		ret = false;
	}

	if (rmdir(dirname) < 0) {
#if DEBUG
		fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno));
#endif
		ret = false;
	}

	return ret;
}
576
577 bool cgfs_remove(const char *controller, const char *cg)
578 {
579 size_t len;
580 char *dirnam, *tmpc = find_mounted_controller(controller);
581
582 if (!tmpc)
583 return false;
584 /* basedir / tmpc / cg \0 */
585 len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
586 dirnam = alloca(len);
587 snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
588 return recursive_rmdir(dirnam);
589 }
590
591 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
592 {
593 size_t len;
594 char *pathname, *tmpc = find_mounted_controller(controller);
595
596 if (!tmpc)
597 return false;
598 /* basedir / tmpc / file \0 */
599 len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
600 pathname = alloca(len);
601 snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
602 if (chmod(pathname, mode) < 0)
603 return false;
604 return true;
605 }
606
/*
 * Chown the "tasks" and "cgroup.procs" files inside cgroup directory
 * @dirname to @uid:@gid.  Returns 0 on success or -errno of the first
 * chown() that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid)
{
	/* sized for the longer of the two names, NUL included */
	size_t pathsz = strlen(dirname) + sizeof("/cgroup.procs");
	char *path = alloca(pathsz);

	snprintf(path, pathsz, "%s/tasks", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	snprintf(path, pathsz, "%s/cgroup.procs", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	return 0;
}
622
623 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
624 {
625 size_t len;
626 char *pathname, *tmpc = find_mounted_controller(controller);
627
628 if (!tmpc)
629 return -EINVAL;
630 /* basedir / tmpc / file \0 */
631 len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
632 pathname = alloca(len);
633 snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
634 if (chown(pathname, uid, gid) < 0)
635 return -errno;
636
637 if (is_dir(pathname))
638 // like cgmanager did, we want to chown the tasks file as well
639 return chown_tasks_files(pathname, uid, gid);
640
641 return 0;
642 }
643
644 FILE *open_pids_file(const char *controller, const char *cgroup)
645 {
646 size_t len;
647 char *pathname, *tmpc = find_mounted_controller(controller);
648
649 if (!tmpc)
650 return NULL;
651 /* basedir / tmpc / cgroup / "cgroup.procs" \0 */
652 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
653 pathname = alloca(len);
654 snprintf(pathname, len, "%s/%s/%s/cgroup.procs", basedir, tmpc, cgroup);
655 return fopen(pathname, "w");
656 }
657
658 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
659 {
660 size_t len;
661 char *dirname, *tmpc = find_mounted_controller(controller);
662 char pathname[MAXPATHLEN];
663 size_t sz = 0, asz = BATCH_SIZE;
664 struct dirent dirent, *direntp;
665 DIR *dir;
666 int ret;
667
668 do {
669 *list = malloc(asz * sizeof(char *));
670 } while (!*list);
671 (*list)[0] = NULL;
672
673 if (!tmpc)
674 return NULL;
675
676 /* basedir / tmpc / cgroup \0 */
677 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
678 dirname = alloca(len);
679 snprintf(dirname, len, "%s/%s/%s", basedir, tmpc, cgroup);
680
681 dir = opendir(dirname);
682 if (!dir)
683 return false;
684
685 while (!readdir_r(dir, &dirent, &direntp)) {
686 struct stat mystat;
687 int rc;
688
689 if (!direntp)
690 break;
691
692 if (!strcmp(direntp->d_name, ".") ||
693 !strcmp(direntp->d_name, ".."))
694 continue;
695
696 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
697 if (rc < 0 || rc >= MAXPATHLEN) {
698 fprintf(stderr, "%s: pathname too long under %s\n", __func__, dirname);
699 continue;
700 }
701
702 ret = lstat(pathname, &mystat);
703 if (ret) {
704 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
705 continue;
706 }
707 if (!S_ISDIR(mystat.st_mode))
708 continue;
709
710 if (sz+2 >= asz) {
711 char **tmp;
712 asz += BATCH_SIZE;
713 do {
714 tmp = realloc(*list, asz * sizeof(char *));
715 } while (!tmp);
716 *list = tmp;
717 }
718 do {
719 (*list)[sz] = strdup(direntp->d_name);
720 } while (!(*list)[sz]);
721 (*list)[sz+1] = NULL;
722 sz++;
723 }
724 if (closedir(dir) < 0) {
725 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, dirname, strerror(errno));
726 return false;
727 }
728 return true;
729 }
730
731 void free_key(struct cgfs_files *k)
732 {
733 if (!k)
734 return;
735 free(k->name);
736 free(k);
737 }
738
/*
 * Release a NULL-terminated array of keys: every element first, then the
 * array itself.  A NULL array is accepted.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (keys == NULL)
		return;

	for (cur = keys; *cur != NULL; cur++)
		free_key(*cur);
	free(keys);
}
750
751 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
752 {
753 size_t len;
754 char *fnam, *tmpc = find_mounted_controller(controller);
755
756 if (!tmpc)
757 return false;
758 /* basedir / tmpc / cgroup / file \0 */
759 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
760 fnam = alloca(len);
761 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
762
763 *value = slurp_file(fnam);
764 return *value != NULL;
765 }
766
767 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
768 {
769 size_t len;
770 char *fnam, *tmpc = find_mounted_controller(controller);
771 struct stat sb;
772 struct cgfs_files *newkey;
773 int ret;
774
775 if (!tmpc)
776 return false;
777
778 if (file && *file == '/')
779 file++;
780
781 if (file && index(file, '/'))
782 return NULL;
783
784 /* basedir / tmpc / cgroup / file \0 */
785 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
786 if (file)
787 len += strlen(file) + 1;
788 fnam = alloca(len);
789 snprintf(fnam, len, "%s/%s/%s%s%s", basedir, tmpc, cgroup,
790 file ? "/" : "", file ? file : "");
791
792 ret = stat(fnam, &sb);
793 if (ret < 0)
794 return NULL;
795
796 do {
797 newkey = malloc(sizeof(struct cgfs_files));
798 } while (!newkey);
799 if (file)
800 newkey->name = must_copy_string(file);
801 else if (rindex(cgroup, '/'))
802 newkey->name = must_copy_string(rindex(cgroup, '/'));
803 else
804 newkey->name = must_copy_string(cgroup);
805 newkey->uid = sb.st_uid;
806 newkey->gid = sb.st_gid;
807 newkey->mode = sb.st_mode;
808
809 return newkey;
810 }
811
812 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
813 {
814 size_t len;
815 char *dirname, *tmpc = find_mounted_controller(controller);
816 char pathname[MAXPATHLEN];
817 size_t sz = 0, asz = 0;
818 struct dirent dirent, *direntp;
819 DIR *dir;
820 int ret;
821
822 *keys = NULL;
823 if (!tmpc)
824 return NULL;
825
826 /* basedir / tmpc / cgroup \0 */
827 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
828 dirname = alloca(len);
829 snprintf(dirname, len, "%s/%s/%s", basedir, tmpc, cgroup);
830
831 dir = opendir(dirname);
832 if (!dir)
833 return false;
834
835 while (!readdir_r(dir, &dirent, &direntp)) {
836 struct stat mystat;
837 int rc;
838
839 if (!direntp)
840 break;
841
842 if (!strcmp(direntp->d_name, ".") ||
843 !strcmp(direntp->d_name, ".."))
844 continue;
845
846 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
847 if (rc < 0 || rc >= MAXPATHLEN) {
848 fprintf(stderr, "%s: pathname too long under %s\n", __func__, dirname);
849 continue;
850 }
851
852 ret = lstat(pathname, &mystat);
853 if (ret) {
854 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
855 continue;
856 }
857 if (!S_ISREG(mystat.st_mode))
858 continue;
859
860 if (sz+2 >= asz) {
861 struct cgfs_files **tmp;
862 asz += BATCH_SIZE;
863 do {
864 tmp = realloc(*keys, asz * sizeof(struct cgfs_files *));
865 } while (!tmp);
866 *keys = tmp;
867 }
868 (*keys)[sz] = cgfs_get_key(controller, cgroup, direntp->d_name);
869 (*keys)[sz+1] = NULL;
870 if (!(*keys)[sz]) {
871 fprintf(stderr, "%s: Error getting files under %s:%s\n",
872 __func__, controller, cgroup);
873 continue;
874 }
875 sz++;
876 }
877 if (closedir(dir) < 0) {
878 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, dirname, strerror(errno));
879 return false;
880 }
881 return true;
882 }
883
884 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
885 { size_t len;
886 char *fnam, *tmpc = find_mounted_controller(controller);
887 int ret;
888 struct stat sb;
889
890 if (!tmpc)
891 return false;
892 /* basedir / tmpc / cgroup / f \0 */
893 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(f) + 4;
894 fnam = alloca(len);
895 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, f);
896
897 ret = stat(fnam, &sb);
898 if (ret < 0 || !S_ISDIR(sb.st_mode))
899 return false;
900 return true;
901 }
902
/*
 * Result codes of send_creds(); callers in this section only compare
 * against SEND_CREDS_OK.
 * NOTE(review): the exact meaning of NOTSK and FAIL is defined by
 * send_creds(), which is outside this section - confirm there.
 */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2
/* Forward declarations for helpers that are used before they are defined. */
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
909
910 /*
911 * fork a task which switches to @task's namespace and writes '1'.
912 * over a unix sock so we can read the task's reaper's pid in our
913 * namespace
914 */
915 static void write_task_init_pid_exit(int sock, pid_t target)
916 {
917 struct ucred cred;
918 char fnam[100];
919 pid_t pid;
920 char v;
921 int fd, ret;
922
923 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
924 if (ret < 0 || ret >= sizeof(fnam))
925 _exit(1);
926
927 fd = open(fnam, O_RDONLY);
928 if (fd < 0) {
929 perror("write_task_init_pid_exit open of ns/pid");
930 _exit(1);
931 }
932 if (setns(fd, 0)) {
933 perror("write_task_init_pid_exit setns 1");
934 close(fd);
935 _exit(1);
936 }
937 pid = fork();
938 if (pid < 0)
939 _exit(1);
940 if (pid != 0) {
941 if (!wait_for_pid(pid))
942 _exit(1);
943 _exit(0);
944 }
945
946 /* we are the child */
947 cred.uid = 0;
948 cred.gid = 0;
949 cred.pid = 1;
950 v = '1';
951 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
952 _exit(1);
953 _exit(0);
954 }
955
956 static pid_t get_init_pid_for_task(pid_t task)
957 {
958 int sock[2];
959 pid_t pid;
960 pid_t ret = -1;
961 char v = '0';
962 struct ucred cred;
963
964 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
965 perror("socketpair");
966 return -1;
967 }
968
969 pid = fork();
970 if (pid < 0)
971 goto out;
972 if (!pid) {
973 close(sock[1]);
974 write_task_init_pid_exit(sock[0], task);
975 _exit(0);
976 }
977
978 if (!recv_creds(sock[1], &cred, &v))
979 goto out;
980 ret = cred.pid;
981
982 out:
983 close(sock[0]);
984 close(sock[1]);
985 if (pid > 0)
986 wait_for_pid(pid);
987 return ret;
988 }
989
990 static pid_t lookup_initpid_in_store(pid_t qpid)
991 {
992 pid_t answer = 0;
993 struct stat sb;
994 struct pidns_init_store *e;
995 char fnam[100];
996
997 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
998 store_lock();
999 if (stat(fnam, &sb) < 0)
1000 goto out;
1001 e = lookup_verify_initpid(&sb);
1002 if (e) {
1003 answer = e->initpid;
1004 goto out;
1005 }
1006 answer = get_init_pid_for_task(qpid);
1007 if (answer > 0)
1008 save_initpid(&sb, answer);
1009
1010 out:
1011 /* we prune at end in case we are returning
1012 * the value we were about to return */
1013 prune_initpid_store();
1014 store_unlock();
1015 return answer;
1016 }
1017
/*
 * Reap child @pid, restarting waitpid() when it is interrupted.
 * Returns 0 when the child exited normally with status 0, and -1 for an
 * invalid pid (<= 0), a waitpid() failure, or any other termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	pid_t reaped;

	if (pid <= 0)
		return -1;

	do {
		reaped = waitpid(pid, &status, 0);
		if (reaped == -1 && errno != EINTR)
			return -1;
	} while (reaped != pid);

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1038
1039
1040 /*
1041 * append pid to *src.
1042 * src: a pointer to a char* in which ot append the pid.
1043 * sz: the number of characters printed so far, minus trailing \0.
1044 * asz: the allocated size so far
1045 * pid: the pid to append
1046 */
1047 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
1048 {
1049 char tmp[30];
1050
1051 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1052
1053 if (!*src || tmplen + *sz + 1 >= *asz) {
1054 char *tmp;
1055 do {
1056 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1057 } while (!tmp);
1058 *src = tmp;
1059 *asz += BUF_RESERVE_SIZE;
1060 }
1061 memcpy((*src) +*sz , tmp, tmplen);
1062 *sz += tmplen;
1063 (*src)[*sz] = '\0';
1064 }
1065
/*
 * Translate @in_id, an id valid in the caller's namespace, into the
 * namespace described by @idfile, an open /proc/pid/{u,g}id_map stream.
 * The stream is rewound first; each line is read as
 * "<ns base> <host base> <count>" and lines that do not parse are
 * skipped.
 * Returns the mapped id, or -1 (as unsigned) when no range contains
 * @in_id or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   // first id of the range inside the namespace
		     host_base, // first id of the range in the caller's namespace
		     range;     // number of ids in the range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * A range that wraps around is not expected from a
			 * proc file, so give up.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither sum
		 * wraps, so ns_base + (in_id - host_base) cannot wrap.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1109
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the stack buffers used to build /proc/<pid>/... paths. */
#define PROCLEN 100
1119
1120 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1121 {
1122 char fpath[PROCLEN];
1123 int ret;
1124 bool answer = false;
1125 uid_t nsuid;
1126
1127 if (victim == -1 || uid == -1)
1128 return false;
1129
1130 /*
1131 * If the request is one not requiring root in the namespace,
1132 * then having the same uid suffices. (i.e. uid 1000 has write
1133 * access to files owned by uid 1000
1134 */
1135 if (!req_ns_root && uid == victim)
1136 return true;
1137
1138 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1139 if (ret < 0 || ret >= PROCLEN)
1140 return false;
1141 FILE *f = fopen(fpath, "r");
1142 if (!f)
1143 return false;
1144
1145 /* if caller's not root in his namespace, reject */
1146 nsuid = convert_id_to_ns(f, uid);
1147 if (nsuid)
1148 goto out;
1149
1150 /*
1151 * If victim is not mapped into caller's ns, reject.
1152 * XXX I'm not sure this check is needed given that fuse
1153 * will be sending requests where the vfs has converted
1154 */
1155 nsuid = convert_id_to_ns(f, victim);
1156 if (nsuid == -1)
1157 goto out;
1158
1159 answer = true;
1160
1161 out:
1162 fclose(f);
1163 return answer;
1164 }
1165
/*
 * Check whether the "other" permission bits of @fmode grant the access
 * mode requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR).  Callers
 * shift the owner or group bits down into the "other" position first.
 * Any other access mode is refused.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;
	int access = req_mode & O_ACCMODE;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1185
1186
/*
 * taskcg is /a/b/c/d/e
 * querycg is /a/b/c
 * we return 'd': the first path component of taskcg below querycg.
 *
 * taskcg must be strictly longer than querycg, otherwise the input is
 * rejected with a message on stderr.  Returns a heap string the caller
 * frees, or NULL on bad input or allocation failure.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* for the root cgroup only the leading '/' is skipped; otherwise
	 * the query prefix and the '/' following it */
	skip = (strcmp(querycg, "/") == 0) ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (next == NULL)
		return NULL;

	slash = strchr(next, '/');
	if (slash != NULL)
		*slash = '\0';
	return next;
}
1212
/* Remove a single trailing '\n' from @x, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1219
1220 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1221 {
1222 char fnam[PROCLEN];
1223 FILE *f;
1224 char *answer = NULL;
1225 char *line = NULL;
1226 size_t len = 0;
1227 int ret;
1228 const char *h = find_mounted_controller(contrl);
1229 if (!h)
1230 return NULL;
1231
1232 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1233 if (ret < 0 || ret >= PROCLEN)
1234 return NULL;
1235 if (!(f = fopen(fnam, "r")))
1236 return NULL;
1237
1238 while (getline(&line, &len, f) != -1) {
1239 char *c1, *c2;
1240 if (!line[0])
1241 continue;
1242 c1 = strchr(line, ':');
1243 if (!c1)
1244 goto out;
1245 c1++;
1246 c2 = strchr(c1, ':');
1247 if (!c2)
1248 goto out;
1249 *c2 = '\0';
1250 if (strcmp(c1, h) != 0)
1251 continue;
1252 c2++;
1253 stripnewline(c2);
1254 do {
1255 answer = strdup(c2);
1256 } while (!answer);
1257 break;
1258 }
1259
1260 out:
1261 fclose(f);
1262 free(line);
1263 return answer;
1264 }
1265
1266 /*
1267 * check whether a fuse context may access a cgroup dir or file
1268 *
1269 * If file is not null, it is a cgroup file to check under cg.
1270 * If file is null, then we are checking perms on cg itself.
1271 *
1272 * For files we can check the mode of the list_keys result.
1273 * For cgroups, we must make assumptions based on the files under the
1274 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1275 * yet.
1276 */
1277 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1278 {
1279 struct cgfs_files *k = NULL;
1280 bool ret = false;
1281
1282 k = cgfs_get_key(contrl, cg, file);
1283 if (!k)
1284 return false;
1285
1286 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1287 if (perms_include(k->mode >> 6, mode)) {
1288 ret = true;
1289 goto out;
1290 }
1291 }
1292 if (fc->gid == k->gid) {
1293 if (perms_include(k->mode >> 3, mode)) {
1294 ret = true;
1295 goto out;
1296 }
1297 }
1298 ret = perms_include(k->mode, mode);
1299
1300 out:
1301 free_key(k);
1302 return ret;
1303 }
1304
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from @cg in place.
 * A @cg that is exactly "/init.scope" becomes "/"; any longer path simply
 * loses the suffix. Strings not ending in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
	const size_t suffix_len = strlen(INITSCOPE);
	size_t len = strlen(cg);
	char *suffix;

	if (len < suffix_len)
		return;

	suffix = cg + (len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';	/* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1322
/*
 * Check whether @cg lies at or below the cgroup that @pid is in for
 * controller @contrl: a task in /a/b/c/d may only act on things under
 * /a/b/c/d, and a task in /a may act on /a/b but not on /b.
 *
 * The test is a plain prefix comparison of the task's cgroup (with any
 * trailing "/init.scope" pruned) against @cg.
 *
 * When the answer is false and @nextcg is not NULL, *nextcg is set to the
 * result of get_next_cgroup_dir(), i.e. the next cgroup directory under @cg;
 * the caller must free it.
 *
 * Returns false as well if the task's cgroup cannot be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *task_cg = get_pid_cgroup(pid, contrl);
	char *own;
	bool inside;

	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	/*
	 * Callers pass "/" for the root cgroup and otherwise a path without
	 * a leading '/', so drop our own leading '/' to match in that case.
	 */
	own = (cg[0] == '/') ? task_cg : task_cg + 1;

	inside = strncmp(own, cg, strlen(own)) == 0;
	if (!inside && nextcg)
		*nextcg = get_next_cgroup_dir(own, cg);

	free(task_cg);
	return inside;
}
1357
/*
 * Decide whether the task @pid may see the cgroup directory @cg of
 * controller @contrl.
 *
 * A task in /a/b/c may see that /a exists, but not /b or /a/c: visible are
 * the root, the task's own cgroup, its ancestors and its descendants. A task
 * in the root cgroup sees everything.
 *
 * Returns false if the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *full_cg, *task_cg;
	size_t target_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0)
		return true;

	full_cg = get_pid_cgroup(pid, contrl);
	if (!full_cg)
		return false;
	prune_init_slice(full_cg);

	/* compare without the leading '/', like the paths callers hand us */
	task_cg = full_cg + 1;
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cgroup. The prefix checks below look
		 * for a '/' boundary, which we just chopped off, so handle
		 * this case explicitly.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (target_len < task_len) {
		/* @cg may be an ancestor of the task's cgroup */
		visible = strncmp(task_cg, cg, target_len) == 0 &&
			  task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* @cg may be a descendant of the task's cgroup */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		visible = false;
	}

	free(full_cg);
	return visible;
}
1408
1409 /*
1410 * given /cgroup/freezer/a/b, return "freezer".
1411 * the returned char* should NOT be freed.
1412 */
1413 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1414 {
1415 const char *p1;
1416 char *contr, *slash;
1417
1418 if (strlen(path) < 9)
1419 return NULL;
1420 if (*(path+7) != '/')
1421 return NULL;
1422 p1 = path+8;
1423 contr = strdupa(p1);
1424 if (!contr)
1425 return NULL;
1426 slash = strstr(contr, "/");
1427 if (slash)
1428 *slash = '\0';
1429
1430 int i;
1431 for (i = 0; i < num_hierarchies; i++) {
1432 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1433 return hierarchies[i];
1434 }
1435 return NULL;
1436 }
1437
/*
 * Locate the cgroup portion of /cgroup/<controller>/the/cgroup/path.
 *
 * Returns a pointer into @path just past the first '/' found at or after
 * offset 8, or NULL if @path is shorter than 9 bytes or has no such '/'.
 * The returned string may still include a trailing file (key) name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash = NULL;

	if (strlen(path) >= 9)
		slash = strchr(path + 8, '/');

	return slash ? slash + 1 : NULL;
}
1453
/*
 * Split the last path element off @cg.
 *
 * *dir receives a newly allocated copy of @cg, truncated at its final '/'
 * when there is one; the caller must free it.
 * *last is set to point at the final '/' inside @cg itself (so it includes
 * the slash and must not be freed), or to NULL if @cg contains no '/', in
 * which case *dir is an unmodified copy of @cg.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy, *sep;

	/* keep retrying until the copy can be allocated */
	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	/* cut the copy at the same final '/' */
	sep = strrchr(copy, '/');
	*sep = '\0';
}
1473
1474 /*
1475 * FUSE ops for /cgroup
1476 */
1477
1478 int cg_getattr(const char *path, struct stat *sb)
1479 {
1480 struct timespec now;
1481 struct fuse_context *fc = fuse_get_context();
1482 char * cgdir = NULL;
1483 char *last = NULL, *path1, *path2;
1484 struct cgfs_files *k = NULL;
1485 const char *cgroup;
1486 const char *controller = NULL;
1487 int ret = -ENOENT;
1488
1489
1490 if (!fc)
1491 return -EIO;
1492
1493 memset(sb, 0, sizeof(struct stat));
1494
1495 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1496 return -EINVAL;
1497
1498 sb->st_uid = sb->st_gid = 0;
1499 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1500 sb->st_size = 0;
1501
1502 if (strcmp(path, "/cgroup") == 0) {
1503 sb->st_mode = S_IFDIR | 00755;
1504 sb->st_nlink = 2;
1505 return 0;
1506 }
1507
1508 controller = pick_controller_from_path(fc, path);
1509 if (!controller)
1510 return -EIO;
1511 cgroup = find_cgroup_in_path(path);
1512 if (!cgroup) {
1513 /* this is just /cgroup/controller, return it as a dir */
1514 sb->st_mode = S_IFDIR | 00755;
1515 sb->st_nlink = 2;
1516 return 0;
1517 }
1518
1519 get_cgdir_and_path(cgroup, &cgdir, &last);
1520
1521 if (!last) {
1522 path1 = "/";
1523 path2 = cgdir;
1524 } else {
1525 path1 = cgdir;
1526 path2 = last;
1527 }
1528
1529 pid_t initpid = lookup_initpid_in_store(fc->pid);
1530 if (initpid <= 0)
1531 initpid = fc->pid;
1532 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1533 * Then check that caller's cgroup is under path if last is a child
1534 * cgroup, or cgdir if last is a file */
1535
1536 if (is_child_cgroup(controller, path1, path2)) {
1537 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1538 ret = -ENOENT;
1539 goto out;
1540 }
1541 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1542 /* this is just /cgroup/controller, return it as a dir */
1543 sb->st_mode = S_IFDIR | 00555;
1544 sb->st_nlink = 2;
1545 ret = 0;
1546 goto out;
1547 }
1548 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1549 ret = -EACCES;
1550 goto out;
1551 }
1552
1553 // get uid, gid, from '/tasks' file and make up a mode
1554 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1555 sb->st_mode = S_IFDIR | 00755;
1556 k = cgfs_get_key(controller, cgroup, NULL);
1557 if (!k) {
1558 sb->st_uid = sb->st_gid = 0;
1559 } else {
1560 sb->st_uid = k->uid;
1561 sb->st_gid = k->gid;
1562 }
1563 free_key(k);
1564 sb->st_nlink = 2;
1565 ret = 0;
1566 goto out;
1567 }
1568
1569 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1570 sb->st_mode = S_IFREG | k->mode;
1571 sb->st_nlink = 1;
1572 sb->st_uid = k->uid;
1573 sb->st_gid = k->gid;
1574 sb->st_size = 0;
1575 free_key(k);
1576 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1577 ret = -ENOENT;
1578 goto out;
1579 }
1580 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1581 ret = -EACCES;
1582 goto out;
1583 }
1584
1585 ret = 0;
1586 }
1587
1588 out:
1589 free(cgdir);
1590 return ret;
1591 }
1592
1593 int cg_opendir(const char *path, struct fuse_file_info *fi)
1594 {
1595 struct fuse_context *fc = fuse_get_context();
1596 const char *cgroup;
1597 struct file_info *dir_info;
1598 char *controller = NULL;
1599
1600 if (!fc)
1601 return -EIO;
1602
1603 if (strcmp(path, "/cgroup") == 0) {
1604 cgroup = NULL;
1605 controller = NULL;
1606 } else {
1607 // return list of keys for the controller, and list of child cgroups
1608 controller = pick_controller_from_path(fc, path);
1609 if (!controller)
1610 return -EIO;
1611
1612 cgroup = find_cgroup_in_path(path);
1613 if (!cgroup) {
1614 /* this is just /cgroup/controller, return its contents */
1615 cgroup = "/";
1616 }
1617 }
1618
1619 pid_t initpid = lookup_initpid_in_store(fc->pid);
1620 if (initpid <= 0)
1621 initpid = fc->pid;
1622 if (cgroup) {
1623 if (!caller_may_see_dir(initpid, controller, cgroup))
1624 return -ENOENT;
1625 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1626 return -EACCES;
1627 }
1628
1629 /* we'll free this at cg_releasedir */
1630 dir_info = malloc(sizeof(*dir_info));
1631 if (!dir_info)
1632 return -ENOMEM;
1633 dir_info->controller = must_copy_string(controller);
1634 dir_info->cgroup = must_copy_string(cgroup);
1635 dir_info->type = LXC_TYPE_CGDIR;
1636 dir_info->buf = NULL;
1637 dir_info->file = NULL;
1638 dir_info->buflen = 0;
1639
1640 fi->fh = (unsigned long)dir_info;
1641 return 0;
1642 }
1643
1644 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1645 struct fuse_file_info *fi)
1646 {
1647 struct file_info *d = (struct file_info *)fi->fh;
1648 struct cgfs_files **list = NULL;
1649 int i, ret;
1650 char *nextcg = NULL;
1651 struct fuse_context *fc = fuse_get_context();
1652 char **clist = NULL;
1653
1654 if (d->type != LXC_TYPE_CGDIR) {
1655 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1656 return -EIO;
1657 }
1658 if (!d->cgroup && !d->controller) {
1659 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1660 int i;
1661
1662 for (i = 0; i < num_hierarchies; i++) {
1663 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1664 return -EIO;
1665 }
1666 }
1667 return 0;
1668 }
1669
1670 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1671 // not a valid cgroup
1672 ret = -EINVAL;
1673 goto out;
1674 }
1675
1676 pid_t initpid = lookup_initpid_in_store(fc->pid);
1677 if (initpid <= 0)
1678 initpid = fc->pid;
1679 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1680 if (nextcg) {
1681 ret = filler(buf, nextcg, NULL, 0);
1682 free(nextcg);
1683 if (ret != 0) {
1684 ret = -EIO;
1685 goto out;
1686 }
1687 }
1688 ret = 0;
1689 goto out;
1690 }
1691
1692 for (i = 0; list[i]; i++) {
1693 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1694 ret = -EIO;
1695 goto out;
1696 }
1697 }
1698
1699 // now get the list of child cgroups
1700
1701 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1702 ret = 0;
1703 goto out;
1704 }
1705 for (i = 0; clist[i]; i++) {
1706 if (filler(buf, clist[i], NULL, 0) != 0) {
1707 ret = -EIO;
1708 goto out;
1709 }
1710 }
1711 ret = 0;
1712
1713 out:
1714 free_keys(list);
1715 if (clist) {
1716 for (i = 0; clist[i]; i++)
1717 free(clist[i]);
1718 free(clist);
1719 }
1720 return ret;
1721 }
1722
1723 static void do_release_file_info(struct file_info *f)
1724 {
1725 if (!f)
1726 return;
1727 free(f->controller);
1728 free(f->cgroup);
1729 free(f->file);
1730 free(f->buf);
1731 free(f);
1732 }
1733
1734 int cg_releasedir(const char *path, struct fuse_file_info *fi)
1735 {
1736 struct file_info *d = (struct file_info *)fi->fh;
1737
1738 do_release_file_info(d);
1739 return 0;
1740 }
1741
1742 int cg_open(const char *path, struct fuse_file_info *fi)
1743 {
1744 const char *cgroup;
1745 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1746 struct cgfs_files *k = NULL;
1747 struct file_info *file_info;
1748 struct fuse_context *fc = fuse_get_context();
1749 int ret;
1750
1751 if (!fc)
1752 return -EIO;
1753
1754 controller = pick_controller_from_path(fc, path);
1755 if (!controller)
1756 return -EIO;
1757 cgroup = find_cgroup_in_path(path);
1758 if (!cgroup)
1759 return -EINVAL;
1760
1761 get_cgdir_and_path(cgroup, &cgdir, &last);
1762 if (!last) {
1763 path1 = "/";
1764 path2 = cgdir;
1765 } else {
1766 path1 = cgdir;
1767 path2 = last;
1768 }
1769
1770 k = cgfs_get_key(controller, path1, path2);
1771 if (!k) {
1772 ret = -EINVAL;
1773 goto out;
1774 }
1775 free_key(k);
1776
1777 pid_t initpid = lookup_initpid_in_store(fc->pid);
1778 if (initpid <= 0)
1779 initpid = fc->pid;
1780 if (!caller_may_see_dir(initpid, controller, path1)) {
1781 ret = -ENOENT;
1782 goto out;
1783 }
1784 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1785 // should never get here
1786 ret = -EACCES;
1787 goto out;
1788 }
1789
1790 /* we'll free this at cg_release */
1791 file_info = malloc(sizeof(*file_info));
1792 if (!file_info) {
1793 ret = -ENOMEM;
1794 goto out;
1795 }
1796 file_info->controller = must_copy_string(controller);
1797 file_info->cgroup = must_copy_string(path1);
1798 file_info->file = must_copy_string(path2);
1799 file_info->type = LXC_TYPE_CGFILE;
1800 file_info->buf = NULL;
1801 file_info->buflen = 0;
1802
1803 fi->fh = (unsigned long)file_info;
1804 ret = 0;
1805
1806 out:
1807 free(cgdir);
1808 return ret;
1809 }
1810
1811 int cg_release(const char *path, struct fuse_file_info *fi)
1812 {
1813 struct file_info *f = (struct file_info *)fi->fh;
1814
1815 do_release_file_info(f);
1816 return 0;
1817 }
1818
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock becomes readable (or is hung up), for at most about
 * @timeout seconds.
 *
 * The deadline is computed once from time(); waits interrupted by a signal
 * (EINTR) are restarted with the remaining time.
 *
 * Returns true if an event was reported. Returns false on timeout or error;
 * errno is 0 when the deadline was found to have passed before waiting, and
 * otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, nready, now, start, remaining, saved_errno;

	start = time(NULL);
	if (start < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	for (;;) {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		remaining = (start + timeout) - now;
		if (remaining < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}

		nready = epoll_wait(epfd, &ev, 1, 1000*remaining + 1);
		if (nready >= 0 || errno != EINTR)
			break;
		/* interrupted by a signal: recompute the remaining time */
	}

	/* close() may clobber errno, so keep epoll_wait's value around */
	saved_errno = errno;
	close(epfd);

	if (nready <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1866
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about two
 * seconds for data to arrive.
 *
 * Returns -1 if nothing became readable in time, otherwise the result of the
 * non-blocking recv().
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	int got = -1;

	if (wait_for_sock(sockfd, 2))
		got = recv(sockfd, buf, len, MSG_DONTWAIT);

	return got;
}
1873
1874 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1875 {
1876 struct msghdr msg = { 0 };
1877 struct iovec iov;
1878 struct cmsghdr *cmsg;
1879 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1880 char buf[1];
1881 buf[0] = 'p';
1882
1883 if (pingfirst) {
1884 if (msgrecv(sock, buf, 1) != 1) {
1885 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1886 __func__);
1887 return SEND_CREDS_FAIL;
1888 }
1889 }
1890
1891 msg.msg_control = cmsgbuf;
1892 msg.msg_controllen = sizeof(cmsgbuf);
1893
1894 cmsg = CMSG_FIRSTHDR(&msg);
1895 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1896 cmsg->cmsg_level = SOL_SOCKET;
1897 cmsg->cmsg_type = SCM_CREDENTIALS;
1898 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1899
1900 msg.msg_name = NULL;
1901 msg.msg_namelen = 0;
1902
1903 buf[0] = v;
1904 iov.iov_base = buf;
1905 iov.iov_len = sizeof(buf);
1906 msg.msg_iov = &iov;
1907 msg.msg_iovlen = 1;
1908
1909 if (sendmsg(sock, &msg, 0) < 0) {
1910 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1911 strerror(errno));
1912 if (errno == 3)
1913 return SEND_CREDS_NOTSK;
1914 return SEND_CREDS_FAIL;
1915 }
1916
1917 return SEND_CREDS_OK;
1918 }
1919
/*
 * Receive one message with SCM_CREDENTIALS from @sock.
 *
 * SO_PASSCRED is enabled on @sock and a single '1' byte is written to the
 * peer to tell it we are ready; the function then waits up to about two
 * seconds for the reply.
 *
 * On success *v receives the one-byte payload and *cred the transmitted
 * credentials. If the reply carries no well-formed credentials control
 * message, *cred keeps pid/uid/gid of -1 and the call still succeeds.
 *
 * Returns true on success, false if any step fails or times out (in which
 * case *v is '1' and *cred is all -1).
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	struct iovec iov = {
		.iov_base = buf,
		.iov_len = sizeof(buf),
	};
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cmsgbuf,
		.msg_controllen = sizeof(cmsgbuf),
	};
	struct cmsghdr *cmsg;
	int enable = 1;
	int received;

	/* defaults reported to the caller on any failure */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* tell the peer we are ready to receive */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	received = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (received < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg != NULL &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1979
1980
/*
 * pid_to_ns - repeatedly receive a ucred over @sock and write the pid found
 * in it back over the same socket as a raw pid_t. Since the credentials are
 * received inside the target pid namespace, this shifts each pid from the
 * sender's pidns into ours.
 *
 * A message whose payload byte is '1' asks us to stop. The function never
 * returns: it exits with status 0 when told to stop or when receiving
 * fails, and with status 1 if writing the reply fails. @tpid is not used.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char tag = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &tag))
			break;
		if (tag == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1999
/*
 * pid_to_ns_wrapper: after setns() into a pid namespace the calling process
 * itself stays in its old pidns; only children forked afterwards live in the
 * target namespace. So this wrapper enters @tpid's pid namespace and forks a
 * child that does the actual pid conversion via pid_to_ns().
 *
 * The child acknowledges its start by writing '1' into a pipe; the wrapper
 * waits up to one second for that ack and then for the child to finish.
 * Never returns: exits 0 if the child completed successfully, 1 otherwise.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int nsfd = -1, len, ackpipe[2];
	pid_t child;
	char ack;

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	child = fork();
	if (child < 0)
		_exit(1);

	if (child == 0) {
		char b = '1';

		close(ackpipe[0]);
		if (write(ackpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(ackpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(ackpipe[0], 1))
		_exit(1);
	len = read(ackpipe[0], &ack, 1);
	if (len != sizeof(char) || ack != '1')
		_exit(1);

	if (!wait_for_pid(child))
		_exit(1);
	_exit(0);
}
2053
2054 /*
2055 * To read cgroup files with a particular pid, we will setns into the child
2056 * pidns, open a pipe, fork a child - which will be the first to really be in
2057 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2058 */
2059 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2060 {
2061 int sock[2] = {-1, -1};
2062 char *tmpdata = NULL;
2063 int ret;
2064 pid_t qpid, cpid = -1;
2065 bool answer = false;
2066 char v = '0';
2067 struct ucred cred;
2068 size_t sz = 0, asz = 0;
2069
2070 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2071 return false;
2072
2073 /*
2074 * Now we read the pids from returned data one by one, pass
2075 * them into a child in the target namespace, read back the
2076 * translated pids, and put them into our to-return data
2077 */
2078
2079 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2080 perror("socketpair");
2081 free(tmpdata);
2082 return false;
2083 }
2084
2085 cpid = fork();
2086 if (cpid == -1)
2087 goto out;
2088
2089 if (!cpid) // child - exits when done
2090 pid_to_ns_wrapper(sock[1], tpid);
2091
2092 char *ptr = tmpdata;
2093 cred.uid = 0;
2094 cred.gid = 0;
2095 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2096 cred.pid = qpid;
2097 ret = send_creds(sock[0], &cred, v, true);
2098
2099 if (ret == SEND_CREDS_NOTSK)
2100 goto next;
2101 if (ret == SEND_CREDS_FAIL)
2102 goto out;
2103
2104 // read converted results
2105 if (!wait_for_sock(sock[0], 2)) {
2106 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2107 __func__, strerror(errno));
2108 goto out;
2109 }
2110 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2111 fprintf(stderr, "%s: error reading pid from child: %s\n",
2112 __func__, strerror(errno));
2113 goto out;
2114 }
2115 must_strcat_pid(d, &sz, &asz, qpid);
2116 next:
2117 ptr = strchr(ptr, '\n');
2118 if (!ptr)
2119 break;
2120 ptr++;
2121 }
2122
2123 cred.pid = getpid();
2124 v = '1';
2125 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2126 // failed to ask child to exit
2127 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2128 __func__, strerror(errno));
2129 goto out;
2130 }
2131
2132 answer = true;
2133
2134 out:
2135 free(tmpdata);
2136 if (cpid != -1)
2137 wait_for_pid(cpid);
2138 if (sock[0] != -1) {
2139 close(sock[0]);
2140 close(sock[1]);
2141 }
2142 return answer;
2143 }
2144
2145 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2146 struct fuse_file_info *fi)
2147 {
2148 struct fuse_context *fc = fuse_get_context();
2149 struct file_info *f = (struct file_info *)fi->fh;
2150 struct cgfs_files *k = NULL;
2151 char *data = NULL;
2152 int ret, s;
2153 bool r;
2154
2155 if (f->type != LXC_TYPE_CGFILE) {
2156 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2157 return -EIO;
2158 }
2159
2160 if (offset)
2161 return 0;
2162
2163 if (!fc)
2164 return -EIO;
2165
2166 if (!f->controller)
2167 return -EINVAL;
2168
2169 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2170 return -EINVAL;
2171 }
2172 free_key(k);
2173
2174
2175 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
2176 ret = -EACCES;
2177 goto out;
2178 }
2179
2180 if (strcmp(f->file, "tasks") == 0 ||
2181 strcmp(f->file, "/tasks") == 0 ||
2182 strcmp(f->file, "/cgroup.procs") == 0 ||
2183 strcmp(f->file, "cgroup.procs") == 0)
2184 // special case - we have to translate the pids
2185 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2186 else
2187 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2188
2189 if (!r) {
2190 ret = -EINVAL;
2191 goto out;
2192 }
2193
2194 if (!data) {
2195 ret = 0;
2196 goto out;
2197 }
2198 s = strlen(data);
2199 if (s > size)
2200 s = size;
2201 memcpy(buf, data, s);
2202 if (s > 0 && s < size && data[s-1] != '\n')
2203 buf[s++] = '\n';
2204
2205 ret = s;
2206
2207 out:
2208 free(data);
2209 return ret;
2210 }
2211
2212 static void pid_from_ns(int sock, pid_t tpid)
2213 {
2214 pid_t vpid;
2215 struct ucred cred;
2216 char v;
2217 int ret;
2218
2219 cred.uid = 0;
2220 cred.gid = 0;
2221 while (1) {
2222 if (!wait_for_sock(sock, 2)) {
2223 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
2224 _exit(1);
2225 }
2226 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2227 fprintf(stderr, "%s: bad read from parent: %s\n",
2228 __func__, strerror(errno));
2229 _exit(1);
2230 }
2231 if (vpid == -1) // done
2232 break;
2233 v = '0';
2234 cred.pid = vpid;
2235 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2236 v = '1';
2237 cred.pid = getpid();
2238 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2239 _exit(1);
2240 }
2241 }
2242 _exit(0);
2243 }
2244
/*
 * Enter the pid namespace of @tpid and fork a child which runs
 * pid_from_ns() on @sock (only children forked after setns() actually live
 * in the new namespace).
 *
 * The child acknowledges its start by writing '1' into a pipe. If no valid
 * ack arrives within one second the child is killed, reaped, and a new one
 * is forked. Never returns: exits 0 if the child completed successfully,
 * 1 on any setup failure or if the child failed.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char nspath[100];
	int nsfd = -1, len, ackpipe[2];
	pid_t child;
	char ack;

	len = snprintf(nspath, sizeof(nspath), "/proc/%d/ns/pid", tpid);
	if (len < 0 || len >= sizeof(nspath))
		_exit(1);

	nsfd = open(nspath, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(ackpipe) < 0)
		_exit(1);

	for (;;) {
		child = fork();
		if (child < 0)
			_exit(1);

		if (child == 0) {
			char b = '1';

			close(ackpipe[0]);
			if (write(ackpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(ackpipe[1]);
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		if (wait_for_sock(ackpipe[0], 1)) {
			len = read(ackpipe[0], &ack, 1);
			if (len == sizeof(char) && ack == '1')
				break;
		}

		/* no (valid) ack: get rid of this child and try again */
		kill(child, SIGKILL);
		wait_for_pid(child);
	}

	if (!wait_for_pid(child))
		_exit(1);
	_exit(0);
}
2300
/*
 * Translate host @uid through /proc/<pid>/uid_map.
 *
 * *answer receives whatever convert_id_to_ns() yields for @uid (presumably
 * the uid it maps to inside @pid's user namespace - see that helper).
 *
 * Returns false if the uid_map file cannot be opened or the conversion
 * yields -1, true otherwise.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char mappath[400];
	FILE *map;

	sprintf(mappath, "/proc/%d/uid_map", pid);
	map = fopen(mappath, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2322
/*
 * get_pid_creds: read the uid and gid of @pid from /proc/<pid>/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used, i.e. the real
 * ids. (XXX should we use euid here?)
 *
 * *uid and *gid are preset to -1 and are only overwritten by values that
 * were parsed successfully, so on any failure (file cannot be opened,
 * malformed line) the affected outputs remain -1. Errors are reported on
 * stderr.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;
	snprintf(line, sizeof(line), "/proc/%d/status", (int)pid);
	if ((f = fopen(line, "r")) == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				/* pid_t is signed: print it with %d, not %u */
				fprintf(stderr, "bad uid line for pid %d\n", (int)pid);
				break;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %d\n", (int)pid);
				break;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2361
/*
 * May the requestor @r (running as host uid @r_uid) move victim @v to a new
 * cgroup? This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r is root in its own user namespace and @v's uid is mapped into
 *     that namespace.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* requestor must map to uid 0, and the victim's uid must map at all */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2387
/*
 * Move the pids listed in @buf (as seen from @tpid's pid namespace) into
 * cgroup @cg of controller @contrl.
 *
 * A helper is forked into @tpid's pid namespace (see pid_from_ns_wrapper).
 * Each pid parsed from @buf is written to it over a socketpair; the helper
 * answers with socket credentials carrying the pid as seen from here. If
 * the answer's payload is '0' and may_move_pid() allows it for the writer
 * (@tpid / @tuid), the pid is written to the cgroup's pids file obtained
 * from open_pids_file(). Finally a pid of -1 tells the helper to exit.
 *
 * Returns true only if no move was refused, no write to the pids file
 * failed, and closing the pids file succeeded. @file is not used.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t pid, helper = -1;
	FILE *pids_file = NULL;
	bool ok = false, failed = false;
	const char *cursor;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	helper = fork();
	if (helper == -1)
		goto out;

	if (helper == 0) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	for (cursor = buf; sscanf(cursor, "%d", &pid) == 1; cursor++) {
		struct ucred cred;
		char tag;

		if (write(sock[0], &pid, sizeof(pid)) != sizeof(pid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		if (recv_creds(sock[0], &cred, &tag) && tag == '0') {
			if (!may_move_pid(tpid, tuid, cred.pid)) {
				failed = true;
				break;
			}
			if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
				failed = true;
		}

		/* advance to the newline; the loop increment steps past it */
		cursor = strchr(cursor, '\n');
		if (!cursor)
			break;
	}

	/* tell the helper we are done */
	pid = -1;
	if (write(sock[0], &pid ,sizeof(pid)) != sizeof(pid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	ok = !failed;

out:
	if (helper != -1)
		wait_for_pid(helper);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		if (fclose(pids_file) != 0)
			ok = false;
	}
	return ok;
}
2467
2468 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2469 struct fuse_file_info *fi)
2470 {
2471 struct fuse_context *fc = fuse_get_context();
2472 char *localbuf = NULL;
2473 struct cgfs_files *k = NULL;
2474 struct file_info *f = (struct file_info *)fi->fh;
2475 bool r;
2476
2477 if (f->type != LXC_TYPE_CGFILE) {
2478 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2479 return -EIO;
2480 }
2481
2482 if (offset)
2483 return 0;
2484
2485 if (!fc)
2486 return -EIO;
2487
2488 localbuf = alloca(size+1);
2489 localbuf[size] = '\0';
2490 memcpy(localbuf, buf, size);
2491
2492 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2493 size = -EINVAL;
2494 goto out;
2495 }
2496
2497 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2498 size = -EACCES;
2499 goto out;
2500 }
2501
2502 if (strcmp(f->file, "tasks") == 0 ||
2503 strcmp(f->file, "/tasks") == 0 ||
2504 strcmp(f->file, "/cgroup.procs") == 0 ||
2505 strcmp(f->file, "cgroup.procs") == 0)
2506 // special case - we have to translate the pids
2507 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2508 else
2509 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2510
2511 if (!r)
2512 size = -EINVAL;
2513
2514 out:
2515 free_key(k);
2516 return size;
2517 }
2518
2519 int cg_chown(const char *path, uid_t uid, gid_t gid)
2520 {
2521 struct fuse_context *fc = fuse_get_context();
2522 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2523 struct cgfs_files *k = NULL;
2524 const char *cgroup;
2525 int ret;
2526
2527 if (!fc)
2528 return -EIO;
2529
2530 if (strcmp(path, "/cgroup") == 0)
2531 return -EINVAL;
2532
2533 controller = pick_controller_from_path(fc, path);
2534 if (!controller)
2535 return -EINVAL;
2536 cgroup = find_cgroup_in_path(path);
2537 if (!cgroup)
2538 /* this is just /cgroup/controller */
2539 return -EINVAL;
2540
2541 get_cgdir_and_path(cgroup, &cgdir, &last);
2542
2543 if (!last) {
2544 path1 = "/";
2545 path2 = cgdir;
2546 } else {
2547 path1 = cgdir;
2548 path2 = last;
2549 }
2550
2551 if (is_child_cgroup(controller, path1, path2)) {
2552 // get uid, gid, from '/tasks' file and make up a mode
2553 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2554 k = cgfs_get_key(controller, cgroup, "tasks");
2555
2556 } else
2557 k = cgfs_get_key(controller, path1, path2);
2558
2559 if (!k) {
2560 ret = -EINVAL;
2561 goto out;
2562 }
2563
2564 /*
2565 * This being a fuse request, the uid and gid must be valid
2566 * in the caller's namespace. So we can just check to make
2567 * sure that the caller is root in his uid, and privileged
2568 * over the file's current owner.
2569 */
2570 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2571 ret = -EACCES;
2572 goto out;
2573 }
2574
2575 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2576
2577 out:
2578 free_key(k);
2579 free(cgdir);
2580
2581 return ret;
2582 }
2583
2584 int cg_chmod(const char *path, mode_t mode)
2585 {
2586 struct fuse_context *fc = fuse_get_context();
2587 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2588 struct cgfs_files *k = NULL;
2589 const char *cgroup;
2590 int ret;
2591
2592 if (!fc)
2593 return -EIO;
2594
2595 if (strcmp(path, "/cgroup") == 0)
2596 return -EINVAL;
2597
2598 controller = pick_controller_from_path(fc, path);
2599 if (!controller)
2600 return -EINVAL;
2601 cgroup = find_cgroup_in_path(path);
2602 if (!cgroup)
2603 /* this is just /cgroup/controller */
2604 return -EINVAL;
2605
2606 get_cgdir_and_path(cgroup, &cgdir, &last);
2607
2608 if (!last) {
2609 path1 = "/";
2610 path2 = cgdir;
2611 } else {
2612 path1 = cgdir;
2613 path2 = last;
2614 }
2615
2616 if (is_child_cgroup(controller, path1, path2)) {
2617 // get uid, gid, from '/tasks' file and make up a mode
2618 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2619 k = cgfs_get_key(controller, cgroup, "tasks");
2620
2621 } else
2622 k = cgfs_get_key(controller, path1, path2);
2623
2624 if (!k) {
2625 ret = -EINVAL;
2626 goto out;
2627 }
2628
2629 /*
2630 * This being a fuse request, the uid and gid must be valid
2631 * in the caller's namespace. So we can just check to make
2632 * sure that the caller is root in his uid, and privileged
2633 * over the file's current owner.
2634 */
2635 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2636 ret = -EPERM;
2637 goto out;
2638 }
2639
2640 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2641 ret = -EINVAL;
2642 goto out;
2643 }
2644
2645 ret = 0;
2646 out:
2647 free_key(k);
2648 free(cgdir);
2649 return ret;
2650 }
2651
2652 int cg_mkdir(const char *path, mode_t mode)
2653 {
2654 struct fuse_context *fc = fuse_get_context();
2655 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2656 const char *cgroup;
2657 int ret;
2658
2659 if (!fc)
2660 return -EIO;
2661
2662
2663 controller = pick_controller_from_path(fc, path);
2664 if (!controller)
2665 return -EINVAL;
2666
2667 cgroup = find_cgroup_in_path(path);
2668 if (!cgroup)
2669 return -EINVAL;
2670
2671 get_cgdir_and_path(cgroup, &cgdir, &last);
2672 if (!last)
2673 path1 = "/";
2674 else
2675 path1 = cgdir;
2676
2677 pid_t initpid = lookup_initpid_in_store(fc->pid);
2678 if (initpid <= 0)
2679 initpid = fc->pid;
2680 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2681 if (!next)
2682 ret = -EINVAL;
2683 else if (last && strcmp(next, last) == 0)
2684 ret = -EEXIST;
2685 else
2686 ret = -ENOENT;
2687 goto out;
2688 }
2689
2690 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2691 ret = -EACCES;
2692 goto out;
2693 }
2694 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2695 ret = -EACCES;
2696 goto out;
2697 }
2698
2699 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2700
2701 out:
2702 free(cgdir);
2703 free(next);
2704 return ret;
2705 }
2706
2707 int cg_rmdir(const char *path)
2708 {
2709 struct fuse_context *fc = fuse_get_context();
2710 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2711 const char *cgroup;
2712 int ret;
2713
2714 if (!fc)
2715 return -EIO;
2716
2717 controller = pick_controller_from_path(fc, path);
2718 if (!controller)
2719 return -EINVAL;
2720
2721 cgroup = find_cgroup_in_path(path);
2722 if (!cgroup)
2723 return -EINVAL;
2724
2725 get_cgdir_and_path(cgroup, &cgdir, &last);
2726 if (!last) {
2727 ret = -EINVAL;
2728 goto out;
2729 }
2730
2731 pid_t initpid = lookup_initpid_in_store(fc->pid);
2732 if (initpid <= 0)
2733 initpid = fc->pid;
2734 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2735 if (!last || strcmp(next, last) == 0)
2736 ret = -EBUSY;
2737 else
2738 ret = -ENOENT;
2739 goto out;
2740 }
2741
2742 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2743 ret = -EACCES;
2744 goto out;
2745 }
2746 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2747 ret = -EACCES;
2748 goto out;
2749 }
2750
2751 if (!cgfs_remove(controller, cgroup)) {
2752 ret = -EINVAL;
2753 goto out;
2754 }
2755
2756 ret = 0;
2757
2758 out:
2759 free(cgdir);
2760 free(next);
2761 return ret;
2762 }
2763
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2770
/*
 * Scan the newline-separated text in @memstat for the first line starting
 * with "total_cache" and store its numeric value divided by 1024 in *@v.
 * *@v is 0 when no such line is found.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur;

	*v = 0;
	for (cur = memstat; *cur; ) {
		char *nl;

		if (startswith(cur, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2788
/*
 * Scan the newline-separated text in @str for the first line starting with
 * "<major>:<minor> <iotype>" and store the number that follows in *@v.
 * *@v is 0 when no such line is found.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	while (*cur) {
		char *nl;

		/* Prefix comparison against the "<major>:<minor> <iotype>" key. */
		if (strncmp(cur, key, keylen) == 0) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2811
2812 static int read_file(const char *path, char *buf, size_t size,
2813 struct file_info *d)
2814 {
2815 size_t linelen = 0, total_len = 0, rv = 0;
2816 char *line = NULL;
2817 char *cache = d->buf;
2818 size_t cache_size = d->buflen;
2819 FILE *f = fopen(path, "r");
2820 if (!f)
2821 return 0;
2822
2823 while (getline(&line, &linelen, f) != -1) {
2824 size_t l = snprintf(cache, cache_size, "%s", line);
2825 if (l < 0) {
2826 perror("Error writing to cache");
2827 rv = 0;
2828 goto err;
2829 }
2830 if (l >= cache_size) {
2831 fprintf(stderr, "Internal error: truncated write to cache\n");
2832 rv = 0;
2833 goto err;
2834 }
2835 cache += l;
2836 cache_size -= l;
2837 total_len += l;
2838 }
2839
2840 d->size = total_len;
2841 if (total_len > size ) total_len = size;
2842
2843 /* read from off 0 */
2844 memcpy(buf, d->buf, total_len);
2845 rv = total_len;
2846 err:
2847 fclose(f);
2848 free(line);
2849 return rv;
2850 }
2851
2852 /*
2853 * FUSE ops for /proc
2854 */
2855
/*
 * Read memory.limit_in_bytes of @cgroup in the "memory" controller.
 * Returns the parsed value, or (unsigned long)-1 when it cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	char *value = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &value))
		limit = strtoul(value, NULL, 10);
	else
		limit = -1;

	free(value);

	return limit;
}
2868
/*
 * Return the smallest memory limit found on @cgroup or any of its ancestors
 * up to and including "/". Ancestors whose limit cannot be read
 * (get_memlimit() returned -1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *copy = strdupa(cgroup);
	unsigned long memlimit, retlimit;

	retlimit = get_memlimit(copy);

	while (strcmp(copy, "/") != 0) {
		copy = dirname(copy);
		/*
		 * For a path without a leading '/', dirname() bottoms out at
		 * "." and never reaches "/"; stop there instead of looping
		 * forever.
		 */
		if (strcmp(copy, ".") == 0)
			break;
		memlimit = get_memlimit(copy);
		if (memlimit != (unsigned long)-1 && memlimit < retlimit)
			retlimit = memlimit;
	}

	return retlimit;
}
2885
2886 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2887 struct fuse_file_info *fi)
2888 {
2889 struct fuse_context *fc = fuse_get_context();
2890 struct file_info *d = (struct file_info *)fi->fh;
2891 char *cg;
2892 char *memusage_str = NULL, *memstat_str = NULL,
2893 *memswlimit_str = NULL, *memswusage_str = NULL,
2894 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2895 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2896 cached = 0, hosttotal = 0;
2897 char *line = NULL;
2898 size_t linelen = 0, total_len = 0, rv = 0;
2899 char *cache = d->buf;
2900 size_t cache_size = d->buflen;
2901 FILE *f = NULL;
2902
2903 if (offset){
2904 if (offset > d->size)
2905 return -EINVAL;
2906 if (!d->cached)
2907 return 0;
2908 int left = d->size - offset;
2909 total_len = left > size ? size: left;
2910 memcpy(buf, cache + offset, total_len);
2911 return total_len;
2912 }
2913
2914 pid_t initpid = lookup_initpid_in_store(fc->pid);
2915 if (initpid <= 0)
2916 initpid = fc->pid;
2917 cg = get_pid_cgroup(initpid, "memory");
2918 if (!cg)
2919 return read_file("/proc/meminfo", buf, size, d);
2920
2921 memlimit = get_min_memlimit(cg);
2922 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2923 goto err;
2924 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2925 goto err;
2926
2927 // Following values are allowed to fail, because swapaccount might be turned
2928 // off for current kernel
2929 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
2930 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
2931 {
2932 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
2933 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
2934 goto err;
2935 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
2936 goto err;
2937
2938 memswlimit = strtoul(memswlimit_str, NULL, 10);
2939 memswusage = strtoul(memswusage_str, NULL, 10);
2940
2941 if (!strcmp(memswlimit_str, memswlimit_default_str))
2942 memswlimit = 0;
2943 if (!strcmp(memswusage_str, memswusage_default_str))
2944 memswusage = 0;
2945
2946 memswlimit = memswlimit / 1024;
2947 memswusage = memswusage / 1024;
2948 }
2949
2950 memusage = strtoul(memusage_str, NULL, 10);
2951 memlimit /= 1024;
2952 memusage /= 1024;
2953
2954 get_mem_cached(memstat_str, &cached);
2955
2956 f = fopen("/proc/meminfo", "r");
2957 if (!f)
2958 goto err;
2959
2960 while (getline(&line, &linelen, f) != -1) {
2961 size_t l;
2962 char *printme, lbuf[100];
2963
2964 memset(lbuf, 0, 100);
2965 if (startswith(line, "MemTotal:")) {
2966 sscanf(line+14, "%lu", &hosttotal);
2967 if (hosttotal < memlimit)
2968 memlimit = hosttotal;
2969 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2970 printme = lbuf;
2971 } else if (startswith(line, "MemFree:")) {
2972 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2973 printme = lbuf;
2974 } else if (startswith(line, "MemAvailable:")) {
2975 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2976 printme = lbuf;
2977 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2978 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2979 printme = lbuf;
2980 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2981 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2982 (memswlimit - memlimit) - (memswusage - memusage));
2983 printme = lbuf;
2984 } else if (startswith(line, "Buffers:")) {
2985 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2986 printme = lbuf;
2987 } else if (startswith(line, "Cached:")) {
2988 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2989 printme = lbuf;
2990 } else if (startswith(line, "SwapCached:")) {
2991 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2992 printme = lbuf;
2993 } else
2994 printme = line;
2995
2996 l = snprintf(cache, cache_size, "%s", printme);
2997 if (l < 0) {
2998 perror("Error writing to cache");
2999 rv = 0;
3000 goto err;
3001
3002 }
3003 if (l >= cache_size) {
3004 fprintf(stderr, "Internal error: truncated write to cache\n");
3005 rv = 0;
3006 goto err;
3007 }
3008
3009 cache += l;
3010 cache_size -= l;
3011 total_len += l;
3012 }
3013
3014 d->cached = 1;
3015 d->size = total_len;
3016 if (total_len > size ) total_len = size;
3017 memcpy(buf, d->buf, total_len);
3018
3019 rv = total_len;
3020 err:
3021 if (f)
3022 fclose(f);
3023 free(line);
3024 free(cg);
3025 free(memusage_str);
3026 free(memswlimit_str);
3027 free(memswusage_str);
3028 free(memstat_str);
3029 free(memswlimit_default_str);
3030 free(memswusage_default_str);
3031 return rv;
3032 }
3033
/*
 * Fetch cpuset.cpus of cgroup @cg in the "cpuset" controller.
 * Returns the string handed back by cgfs_get_value(), which the caller
 * must free, or NULL when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3046
3047 bool cpu_in_cpuset(int cpu, const char *cpuset);
3048
/*
 * Return true when @line is a "processor : N" line and cpu N is a member
 * of @cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3057
/*
 * Return true when @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. it starts with "processor", a colon and a number.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3069
3070 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3071 struct fuse_file_info *fi)
3072 {
3073 struct fuse_context *fc = fuse_get_context();
3074 struct file_info *d = (struct file_info *)fi->fh;
3075 char *cg;
3076 char *cpuset = NULL;
3077 char *line = NULL;
3078 size_t linelen = 0, total_len = 0, rv = 0;
3079 bool am_printing = false;
3080 int curcpu = -1;
3081 char *cache = d->buf;
3082 size_t cache_size = d->buflen;
3083 FILE *f = NULL;
3084
3085 if (offset){
3086 if (offset > d->size)
3087 return -EINVAL;
3088 if (!d->cached)
3089 return 0;
3090 int left = d->size - offset;
3091 total_len = left > size ? size: left;
3092 memcpy(buf, cache + offset, total_len);
3093 return total_len;
3094 }
3095
3096 pid_t initpid = lookup_initpid_in_store(fc->pid);
3097 if (initpid <= 0)
3098 initpid = fc->pid;
3099 cg = get_pid_cgroup(initpid, "cpuset");
3100 if (!cg)
3101 return read_file("proc/cpuinfo", buf, size, d);
3102
3103 cpuset = get_cpuset(cg);
3104 if (!cpuset)
3105 goto err;
3106
3107 f = fopen("/proc/cpuinfo", "r");
3108 if (!f)
3109 goto err;
3110
3111 while (getline(&line, &linelen, f) != -1) {
3112 size_t l;
3113 if (is_processor_line(line)) {
3114 am_printing = cpuline_in_cpuset(line, cpuset);
3115 if (am_printing) {
3116 curcpu ++;
3117 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3118 if (l < 0) {
3119 perror("Error writing to cache");
3120 rv = 0;
3121 goto err;
3122 }
3123 if (l >= cache_size) {
3124 fprintf(stderr, "Internal error: truncated write to cache\n");
3125 rv = 0;
3126 goto err;
3127 }
3128 cache += l;
3129 cache_size -= l;
3130 total_len += l;
3131 }
3132 continue;
3133 }
3134 if (am_printing) {
3135 l = snprintf(cache, cache_size, "%s", line);
3136 if (l < 0) {
3137 perror("Error writing to cache");
3138 rv = 0;
3139 goto err;
3140 }
3141 if (l >= cache_size) {
3142 fprintf(stderr, "Internal error: truncated write to cache\n");
3143 rv = 0;
3144 goto err;
3145 }
3146 cache += l;
3147 cache_size -= l;
3148 total_len += l;
3149 }
3150 }
3151
3152 d->cached = 1;
3153 d->size = total_len;
3154 if (total_len > size ) total_len = size;
3155
3156 /* read from off 0 */
3157 memcpy(buf, d->buf, total_len);
3158 rv = total_len;
3159 err:
3160 if (f)
3161 fclose(f);
3162 free(line);
3163 free(cpuset);
3164 free(cg);
3165 return rv;
3166 }
3167
3168 static int proc_stat_read(char *buf, size_t size, off_t offset,
3169 struct fuse_file_info *fi)
3170 {
3171 struct fuse_context *fc = fuse_get_context();
3172 struct file_info *d = (struct file_info *)fi->fh;
3173 char *cg;
3174 char *cpuset = NULL;
3175 char *line = NULL;
3176 size_t linelen = 0, total_len = 0, rv = 0;
3177 int curcpu = -1; /* cpu numbering starts at 0 */
3178 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3179 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3180 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3181 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3182 char cpuall[CPUALL_MAX_SIZE];
3183 /* reserve for cpu all */
3184 char *cache = d->buf + CPUALL_MAX_SIZE;
3185 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3186 FILE *f = NULL;
3187
3188 if (offset){
3189 if (offset > d->size)
3190 return -EINVAL;
3191 if (!d->cached)
3192 return 0;
3193 int left = d->size - offset;
3194 total_len = left > size ? size: left;
3195 memcpy(buf, d->buf + offset, total_len);
3196 return total_len;
3197 }
3198
3199 pid_t initpid = lookup_initpid_in_store(fc->pid);
3200 if (initpid <= 0)
3201 initpid = fc->pid;
3202 cg = get_pid_cgroup(initpid, "cpuset");
3203 if (!cg)
3204 return read_file("/proc/stat", buf, size, d);
3205
3206 cpuset = get_cpuset(cg);
3207 if (!cpuset)
3208 goto err;
3209
3210 f = fopen("/proc/stat", "r");
3211 if (!f)
3212 goto err;
3213
3214 //skip first line
3215 if (getline(&line, &linelen, f) < 0) {
3216 fprintf(stderr, "proc_stat_read read first line failed\n");
3217 goto err;
3218 }
3219
3220 while (getline(&line, &linelen, f) != -1) {
3221 size_t l;
3222 int cpu;
3223 char cpu_char[10]; /* That's a lot of cores */
3224 char *c;
3225
3226 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3227 /* not a ^cpuN line containing a number N, just print it */
3228 l = snprintf(cache, cache_size, "%s", line);
3229 if (l < 0) {
3230 perror("Error writing to cache");
3231 rv = 0;
3232 goto err;
3233 }
3234 if (l >= cache_size) {
3235 fprintf(stderr, "Internal error: truncated write to cache\n");
3236 rv = 0;
3237 goto err;
3238 }
3239 cache += l;
3240 cache_size -= l;
3241 total_len += l;
3242 continue;
3243 }
3244
3245 if (sscanf(cpu_char, "%d", &cpu) != 1)
3246 continue;
3247 if (!cpu_in_cpuset(cpu, cpuset))
3248 continue;
3249 curcpu ++;
3250
3251 c = strchr(line, ' ');
3252 if (!c)
3253 continue;
3254 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3255 if (l < 0) {
3256 perror("Error writing to cache");
3257 rv = 0;
3258 goto err;
3259
3260 }
3261 if (l >= cache_size) {
3262 fprintf(stderr, "Internal error: truncated write to cache\n");
3263 rv = 0;
3264 goto err;
3265 }
3266
3267 cache += l;
3268 cache_size -= l;
3269 total_len += l;
3270
3271 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3272 &softirq, &steal, &guest) != 9)
3273 continue;
3274 user_sum += user;
3275 nice_sum += nice;
3276 system_sum += system;
3277 idle_sum += idle;
3278 iowait_sum += iowait;
3279 irq_sum += irq;
3280 softirq_sum += softirq;
3281 steal_sum += steal;
3282 guest_sum += guest;
3283 }
3284
3285 cache = d->buf;
3286
3287 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3288 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3289 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3290 memcpy(cache, cpuall, cpuall_len);
3291 cache += cpuall_len;
3292 } else{
3293 /* shouldn't happen */
3294 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3295 cpuall_len = 0;
3296 }
3297
3298 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3299 total_len += cpuall_len;
3300 d->cached = 1;
3301 d->size = total_len;
3302 if (total_len > size ) total_len = size;
3303
3304 memcpy(buf, d->buf, total_len);
3305 rv = total_len;
3306
3307 err:
3308 if (f)
3309 fclose(f);
3310 free(line);
3311 free(cpuset);
3312 free(cg);
3313 return rv;
3314 }
3315
/*
 * Return the number of seconds between the st_ctime of /proc/<initpid> and
 * now, where <initpid> is what lookup_initpid_in_store() reports for @pid.
 * Returns 0 when the init pid is unknown or its /proc entry cannot be
 * stat'ed.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3336
/*
 * Return cpuacct.usage of the "cpuacct" cgroup of @task's init process,
 * divided by 1000000000. Returns 0 when the init pid, its cgroup or the
 * usage value cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cgroup = NULL, *usage_str = NULL;
	unsigned long usage = 0;

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (cgroup && cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		usage = strtoul(usage_str, NULL, 10) / 1000000000;

	free(cgroup);
	free(usage_str);
	return usage;
}
3359
#if RELOADTEST
/*
 * Test hook: create an empty file named "iwashere" in the current working
 * directory. Exits the process when the working directory cannot be
 * determined.
 */
void iwashere(void)
{
	char *cwd = get_current_dir_name();
	char *marker;
	size_t need;
	int fd;

	if (!cwd)
		exit(1);

	/* Room for "<cwd>/iwashere" plus the terminating NUL. */
	need = strlen(cwd) + strlen("/iwashere") + 1;
	marker = alloca(need);
	snprintf(marker, need, "%s/iwashere", cwd);
	free(cwd);

	fd = creat(marker, 0755);
	if (fd < 0)
		return;
	close(fd);
}
#endif
3378
3379 /*
3380 * We read /proc/uptime and reuse its second field.
3381 * For the first field, we use the mtime for the reaper for
3382 * the calling pid as returned by getreaperage
3383 */
3384 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3385 struct fuse_file_info *fi)
3386 {
3387 struct fuse_context *fc = fuse_get_context();
3388 struct file_info *d = (struct file_info *)fi->fh;
3389 long int reaperage = getreaperage(fc->pid);
3390 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3391 char *cache = d->buf;
3392 size_t total_len = 0;
3393
3394 #if RELOADTEST
3395 iwashere();
3396 #endif
3397
3398 if (offset){
3399 if (offset > d->size)
3400 return -EINVAL;
3401 if (!d->cached)
3402 return 0;
3403 int left = d->size - offset;
3404 total_len = left > size ? size: left;
3405 memcpy(buf, cache + offset, total_len);
3406 return total_len;
3407 }
3408
3409 idletime = reaperage - busytime;
3410 if (idletime > reaperage)
3411 idletime = reaperage;
3412
3413 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3414 if (total_len < 0){
3415 perror("Error writing to cache");
3416 return 0;
3417 }
3418
3419 d->size = (int)total_len;
3420 d->cached = 1;
3421
3422 if (total_len > size) total_len = size;
3423
3424 memcpy(buf, d->buf, total_len);
3425 return total_len;
3426 }
3427
3428 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3429 struct fuse_file_info *fi)
3430 {
3431 char dev_name[72];
3432 struct fuse_context *fc = fuse_get_context();
3433 struct file_info *d = (struct file_info *)fi->fh;
3434 char *cg;
3435 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3436 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3437 unsigned long read = 0, write = 0;
3438 unsigned long read_merged = 0, write_merged = 0;
3439 unsigned long read_sectors = 0, write_sectors = 0;
3440 unsigned long read_ticks = 0, write_ticks = 0;
3441 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3442 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3443 char *cache = d->buf;
3444 size_t cache_size = d->buflen;
3445 char *line = NULL;
3446 size_t linelen = 0, total_len = 0, rv = 0;
3447 unsigned int major = 0, minor = 0;
3448 int i = 0;
3449 FILE *f = NULL;
3450
3451 if (offset){
3452 if (offset > d->size)
3453 return -EINVAL;
3454 if (!d->cached)
3455 return 0;
3456 int left = d->size - offset;
3457 total_len = left > size ? size: left;
3458 memcpy(buf, cache + offset, total_len);
3459 return total_len;
3460 }
3461
3462 pid_t initpid = lookup_initpid_in_store(fc->pid);
3463 if (initpid <= 0)
3464 initpid = fc->pid;
3465 cg = get_pid_cgroup(initpid, "blkio");
3466 if (!cg)
3467 return read_file("/proc/diskstats", buf, size, d);
3468
3469 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
3470 goto err;
3471 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
3472 goto err;
3473 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
3474 goto err;
3475 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
3476 goto err;
3477 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
3478 goto err;
3479
3480
3481 f = fopen("/proc/diskstats", "r");
3482 if (!f)
3483 goto err;
3484
3485 while (getline(&line, &linelen, f) != -1) {
3486 size_t l;
3487 char *printme, lbuf[256];
3488
3489 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3490 if(i == 3){
3491 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3492 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3493 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3494 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3495 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3496 read_sectors = read_sectors/512;
3497 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3498 write_sectors = write_sectors/512;
3499
3500 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3501 rd_svctm = rd_svctm/1000000;
3502 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3503 rd_wait = rd_wait/1000000;
3504 read_ticks = rd_svctm + rd_wait;
3505
3506 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3507 wr_svctm = wr_svctm/1000000;
3508 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3509 wr_wait = wr_wait/1000000;
3510 write_ticks = wr_svctm + wr_wait;
3511
3512 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3513 tot_ticks = tot_ticks/1000000;
3514 }else{
3515 continue;
3516 }
3517
3518 memset(lbuf, 0, 256);
3519 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
3520 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3521 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3522 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3523 printme = lbuf;
3524 } else
3525 continue;
3526
3527 l = snprintf(cache, cache_size, "%s", printme);
3528 if (l < 0) {
3529 perror("Error writing to fuse buf");
3530 rv = 0;
3531 goto err;
3532 }
3533 if (l >= cache_size) {
3534 fprintf(stderr, "Internal error: truncated write to cache\n");
3535 rv = 0;
3536 goto err;
3537 }
3538 cache += l;
3539 cache_size -= l;
3540 total_len += l;
3541 }
3542
3543 d->cached = 1;
3544 d->size = total_len;
3545 if (total_len > size ) total_len = size;
3546 memcpy(buf, d->buf, total_len);
3547
3548 rv = total_len;
3549 err:
3550 free(cg);
3551 if (f)
3552 fclose(f);
3553 free(line);
3554 free(io_serviced_str);
3555 free(io_merged_str);
3556 free(io_service_bytes_str);
3557 free(io_wait_time_str);
3558 free(io_service_time_str);
3559 return rv;
3560 }
3561
/*
 * Return the number of bytes obtained by reading the file @which line by
 * line until getline() fails, or 0 when the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t n, total = 0;
	FILE *f;

	f = fopen(which, "r");
	if (!f)
		return 0;

	for (;;) {
		n = getline(&line, &cap, f);
		if (n == -1)
			break;
		total += n;
	}

	fclose(f);
	free(line);

	return total;
}
3578
/*
 * FUSE getattr handler for /proc.
 *
 * "/proc" is reported as a directory with mode 0555 and two links; the
 * emulated files are reported as regular files with mode 0444, one link and
 * size 0. Owner and group are 0 and all three timestamps are the current
 * time.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const emulated[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t i;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (i = 0; i < sizeof(emulated) / sizeof(emulated[0]); i++) {
		if (strcmp(path, emulated[i]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
3606
3607 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3608 struct fuse_file_info *fi)
3609 {
3610 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
3611 filler(buf, "meminfo", NULL, 0) != 0 ||
3612 filler(buf, "stat", NULL, 0) != 0 ||
3613 filler(buf, "uptime", NULL, 0) != 0 ||
3614 filler(buf, "diskstats", NULL, 0) != 0)
3615 return -EINVAL;
3616 return 0;
3617 }
3618
3619 int proc_open(const char *path, struct fuse_file_info *fi)
3620 {
3621 int type = -1;
3622 struct file_info *info;
3623
3624 if (strcmp(path, "/proc/meminfo") == 0)
3625 type = LXC_TYPE_PROC_MEMINFO;
3626 else if (strcmp(path, "/proc/cpuinfo") == 0)
3627 type = LXC_TYPE_PROC_CPUINFO;
3628 else if (strcmp(path, "/proc/uptime") == 0)
3629 type = LXC_TYPE_PROC_UPTIME;
3630 else if (strcmp(path, "/proc/stat") == 0)
3631 type = LXC_TYPE_PROC_STAT;
3632 else if (strcmp(path, "/proc/diskstats") == 0)
3633 type = LXC_TYPE_PROC_DISKSTATS;
3634 if (type == -1)
3635 return -ENOENT;
3636
3637 info = malloc(sizeof(*info));
3638 if (!info)
3639 return -ENOMEM;
3640
3641 memset(info, 0, sizeof(*info));
3642 info->type = type;
3643
3644 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3645 do {
3646 info->buf = malloc(info->buflen);
3647 } while (!info->buf);
3648 memset(info->buf, 0, info->buflen);
3649 /* set actual size to buffer size */
3650 info->size = info->buflen;
3651
3652 fi->fh = (unsigned long)info;
3653 return 0;
3654 }
3655
3656 int proc_release(const char *path, struct fuse_file_info *fi)
3657 {
3658 struct file_info *f = (struct file_info *)fi->fh;
3659
3660 do_release_file_info(f);
3661 return 0;
3662 }
3663
3664 int proc_read(const char *path, char *buf, size_t size, off_t offset,
3665 struct fuse_file_info *fi)
3666 {
3667 struct file_info *f = (struct file_info *) fi->fh;
3668
3669 switch (f->type) {
3670 case LXC_TYPE_PROC_MEMINFO:
3671 return proc_meminfo_read(buf, size, offset, fi);
3672 case LXC_TYPE_PROC_CPUINFO:
3673 return proc_cpuinfo_read(buf, size, offset, fi);
3674 case LXC_TYPE_PROC_UPTIME:
3675 return proc_uptime_read(buf, size, offset, fi);
3676 case LXC_TYPE_PROC_STAT:
3677 return proc_stat_read(buf, size, offset, fi);
3678 case LXC_TYPE_PROC_DISKSTATS:
3679 return proc_diskstats_read(buf, size, offset, fi);
3680 default:
3681 return -EINVAL;
3682 }
3683 }
3684
/*
 * Constructor: read /proc/self/cgroup and register every line with
 * store_hierarchy().
 *
 * Each line is split at its first and at its last ':'; the text before the
 * first one and the text between the two are passed to store_hierarchy().
 * Processing stops at the first malformed line or store_hierarchy()
 * failure; print_subsystems() is only called when every line was handled.
 */
static void __attribute__((constructor)) collect_subsystems(void)
{
	FILE *f = fopen("/proc/self/cgroup", "r");
	char *line = NULL;
	size_t cap = 0;
	bool complete = true;

	if (!f) {
		fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (complete && getline(&line, &cap, f) != -1) {
		char *middle, *tail;

		complete = false;

		middle = strchr(line, ':');
		if (!middle)
			continue;
		*middle++ = '\0';

		tail = strrchr(middle, ':');
		if (!tail)
			continue;
		*tail = '\0';

		if (store_hierarchy(line, middle))
			complete = true;
	}

	if (complete)
		print_subsystems();

	free(line);
	fclose(f);
}
3718
3719 static void __attribute__((destructor)) free_subsystems(void)
3720 {
3721 int i;
3722
3723 for (i = 0; i < num_hierarchies; i++)
3724 if (hierarchies[i])
3725 free(hierarchies[i]);
3726 free(hierarchies);
3727 }