]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
bindings: avoid allocating an unused buffer
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <stdio.h>
12 #include <dirent.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include <stdbool.h>
18 #include <time.h>
19 #include <string.h>
20 #include <stdlib.h>
21 #include <libgen.h>
22 #include <sched.h>
23 #include <pthread.h>
24 #include <linux/sched.h>
25 #include <sys/param.h>
26 #include <sys/socket.h>
27 #include <sys/mount.h>
28 #include <sys/epoll.h>
29 #include <wait.h>
30
31 #include "bindings.h"
32
33 #include "config.h" // for VERSION
34
/* File type identifiers; the values are consecutive, starting at 0. */
enum {
	LXC_TYPE_CGDIR = 0,
	LXC_TYPE_CGFILE = 1,
	LXC_TYPE_PROC_MEMINFO = 2,
	LXC_TYPE_PROC_CPUINFO = 3,
	LXC_TYPE_PROC_UPTIME = 4,
	LXC_TYPE_PROC_STAT = 5,
	LXC_TYPE_PROC_DISKSTATS = 6,
};
44
/*
 * Record describing one file: the controller/cgroup/file strings that
 * name it, a type tag, and buffer bookkeeping.
 */
struct file_info {
	char *controller, *cgroup, *file;
	int type;
	char *buf;	/* unused as of yet */
	int buflen;
	int size;	/* actual data size */
	int cached;
};
55
/* Size of the buffer reserve, for cpuall in /proc/stat. */
#define BUF_RESERVE_SIZE (256)
58
/*
 * Cache recording which pid is init for a pid namespace.
 * To find init for $qpid we:
 *   1. stat /proc/$qpid/ns/pid;
 *   2. look the resulting inode number up in the store:
 *      a. no entry: fork a child into qpid's namespace which sends us
 *         ucred.pid = 1, and read the init pid from that. The init pid
 *         and the ctime of /proc/initpid go into a new store entry.
 *      b. entry found: verify that /proc/initpid still matches what
 *         was saved. If it does, return the cached init pid; if not,
 *         drop the entry and proceed as in a.
 */
struct pidns_init_store {
	ino_t ino;			/* inode number of /proc/$pid/ns/pid */
	pid_t initpid;			/* the pid of init in that ns */
	long ctime;			/* time at which /proc/$initpid was created */
	struct pidns_init_store *next;	/* next entry in the same hash chain */
	long lastcheck;			/* time() of the last store or valid lookup */
};
80
/*
 * Number of buckets in the pidns init store (for comparison, look at how
 * these are allocated in the kernel). HASH() maps a value to a bucket.
 */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

/* The mutex behind store_lock()/store_unlock(), and the bucket heads. */
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
/*
 * Lock the mutex l. If pthread_mutex_lock() fails, the error is printed
 * on stderr and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_lock returned:%d %s\n", err, strerror(err));
	exit(1);
}
96
/*
 * Unlock the mutex l. If pthread_mutex_unlock() fails, the error is
 * printed on stderr and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	fprintf(stderr, "pthread_mutex_unlock returned:%d %s\n", err, strerror(err));
	exit(1);
}
106
107 static void store_lock(void)
108 {
109 lock_mutex(&pidns_store_mutex);
110 }
111
112 static void store_unlock(void)
113 {
114 unlock_mutex(&pidns_store_mutex);
115 }
116
117 /* Must be called under store_lock */
118 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
119 {
120 struct stat initsb;
121 char fnam[100];
122
123 snprintf(fnam, 100, "/proc/%d", e->initpid);
124 if (stat(fnam, &initsb) < 0)
125 return false;
126 #if DEBUG
127 fprintf(stderr, "comparing ctime %ld %ld for pid %d\n",
128 e->ctime, initsb.st_ctime, e->initpid);
129 #endif
130 if (e->ctime != initsb.st_ctime)
131 return false;
132 return true;
133 }
134
135 /* Must be called under store_lock */
136 static void remove_initpid(struct pidns_init_store *e)
137 {
138 struct pidns_init_store *tmp;
139 int h;
140
141 #if DEBUG
142 fprintf(stderr, "remove_initpid: removing entry for %d\n", e->initpid);
143 #endif
144 h = HASH(e->ino);
145 if (pidns_hash_table[h] == e) {
146 pidns_hash_table[h] = e->next;
147 free(e);
148 return;
149 }
150
151 tmp = pidns_hash_table[h];
152 while (tmp) {
153 if (tmp->next == e) {
154 tmp->next = e->next;
155 free(e);
156 return;
157 }
158 tmp = tmp->next;
159 }
160 }
161
162 #define PURGE_SECS 5
163 /* Must be called under store_lock */
164 static void prune_initpid_store(void)
165 {
166 static long int last_prune = 0;
167 struct pidns_init_store *e, *prev, *delme;
168 long int now, threshold;
169 int i;
170
171 if (!last_prune) {
172 last_prune = time(NULL);
173 return;
174 }
175 now = time(NULL);
176 if (now < last_prune + PURGE_SECS)
177 return;
178 #if DEBUG
179 fprintf(stderr, "pruning\n");
180 #endif
181 last_prune = now;
182 threshold = now - 2 * PURGE_SECS;
183
184 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
185 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
186 if (e->lastcheck < threshold) {
187 #if DEBUG
188 fprintf(stderr, "Removing cached entry for %d\n", e->initpid);
189 #endif
190 delme = e;
191 if (prev)
192 prev->next = e->next;
193 else
194 pidns_hash_table[i] = e->next;
195 e = e->next;
196 free(delme);
197 } else {
198 prev = e;
199 e = e->next;
200 }
201 }
202 }
203 }
204
205 /* Must be called under store_lock */
206 static void save_initpid(struct stat *sb, pid_t pid)
207 {
208 struct pidns_init_store *e;
209 char fpath[100];
210 struct stat procsb;
211 int h;
212
213 #if DEBUG
214 fprintf(stderr, "save_initpid: adding entry for %d\n", pid);
215 #endif
216 snprintf(fpath, 100, "/proc/%d", pid);
217 if (stat(fpath, &procsb) < 0)
218 return;
219 do {
220 e = malloc(sizeof(*e));
221 } while (!e);
222 e->ino = sb->st_ino;
223 e->initpid = pid;
224 e->ctime = procsb.st_ctime;
225 h = HASH(e->ino);
226 e->next = pidns_hash_table[h];
227 e->lastcheck = time(NULL);
228 pidns_hash_table[h] = e;
229 }
230
231 /*
232 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
233 * entry for the inode number and creation time. Verify that the init pid
234 * is still valid. If not, remove it. Return the entry if valid, NULL
235 * otherwise.
236 * Must be called under store_lock
237 */
238 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
239 {
240 int h = HASH(sb->st_ino);
241 struct pidns_init_store *e = pidns_hash_table[h];
242
243 while (e) {
244 if (e->ino == sb->st_ino) {
245 if (initpid_still_valid(e, sb)) {
246 e->lastcheck = time(NULL);
247 return e;
248 }
249 remove_initpid(e);
250 return NULL;
251 }
252 e = e->next;
253 }
254
255 return NULL;
256 }
257
/* Return 1 if path can be stat'ed and is a directory, 0 otherwise. */
static int is_dir(const char *path)
{
	struct stat st;

	if (stat(path, &st) != 0)
		return 0;
	return S_ISDIR(st.st_mode) ? 1 : 0;
}
266
/*
 * Return a strdup()ed copy of str, retrying until the allocation
 * succeeds. A NULL str yields NULL.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (!str)
		return NULL;

	while (!(copy = strdup(str)))
		;

	return copy;
}
278
/* Overwrite every '\n' at the end of s with '\0'. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n')
		s[--end] = '\0';
}
286
#define BATCH_SIZE 50
/*
 * Make sure *mem is large enough for newlen bytes. Memory is sized in
 * multiples of BATCH_SIZE: realloc() is only called when *mem is NULL
 * or when newlen needs more batches than oldlen did, and it is retried
 * until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown = NULL;

	if (*mem && want <= have)
		return;

	while (!grown)
		grown = realloc(*mem, want * BATCH_SIZE);
	*mem = grown;
}
/*
 * Append the linelen bytes of line, plus its terminating \0, to
 * *contents (which currently holds *len bytes), growing the buffer as
 * needed, and update *len to the new length.
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	/* linelen + 1: copy the terminating \0 as well */
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
308
/*
 * Read the whole file `from` into a newly allocated string with its
 * trailing newlines removed. Returns NULL if the file cannot be opened
 * or no line could be read. The caller frees the result.
 */
static char *slurp_file(const char *from)
{
	char *result = NULL, *linebuf = NULL;
	size_t linecap = 0, total = 0;
	ssize_t n;
	FILE *in;

	in = fopen(from, "r");
	if (!in)
		return NULL;

	for (;;) {
		n = getline(&linebuf, &linecap, in);
		if (n == -1)
			break;
		append_line(&result, &total, linebuf, n);
	}
	fclose(in);
	free(linebuf);

	if (result)
		drop_trailing_newlines(result);
	return result;
}
330
/*
 * Open fnam for writing (mode "w") and write string to it. Returns true
 * only if the file could be opened, all bytes were written and fclose()
 * succeeded; write and close errors are reported on stderr.
 */
static bool write_string(const char *fnam, const char *string)
{
	size_t want, wrote;
	FILE *out = fopen(fnam, "w");

	if (!out)
		return false;

	want = strlen(string);
	wrote = fwrite(string, 1, want, out);
	if (wrote != want) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		fclose(out);
		return false;
	}

	/* buffered data may only hit the file here, so the close result matters */
	if (fclose(out) < 0) {
		fprintf(stderr, "Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
351
/*
 * Table of hierarchy names, e.g. "cpu,cpuacct", and the number of
 * entries stored in it. store_hierarchy() appends to the table.
 */
int num_hierarchies;
char **hierarchies;
357
/* Name, ownership and mode of one cgroup file or directory. */
struct cgfs_files {
	char *name;
	uint32_t uid;
	uint32_t gid;
	uint32_t mode;
};
363
364 #define ALLOC_NUM 20
365 static bool store_hierarchy(char *stridx, char *h)
366 {
367 if (num_hierarchies % ALLOC_NUM == 0) {
368 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
369 n *= ALLOC_NUM;
370 char **tmp = realloc(hierarchies, n * sizeof(char *));
371 if (!tmp) {
372 fprintf(stderr, "Out of memory\n");
373 exit(1);
374 }
375 hierarchies = tmp;
376 }
377
378 hierarchies[num_hierarchies++] = must_copy_string(h);
379 return true;
380 }
381
382 static void print_subsystems(void)
383 {
384 int i;
385
386 fprintf(stderr, "hierarchies:");
387 for (i = 0; i < num_hierarchies; i++) {
388 if (hierarchies[i])
389 fprintf(stderr, " %d: %s\n", i, hierarchies[i]);
390 }
391 }
392
/*
 * Return true if needle is exactly one of the comma-separated fields of
 * haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *field = haystack;
	size_t nlen = strlen(needle);

	/* every field that is followed by a comma */
	while (*field) {
		const char *comma = strchr(field, ',');

		if (!comma)
			break;
		if ((size_t)(comma - field) == nlen && strncmp(needle, field, nlen) == 0)
			return true;
		field = comma + 1;
	}

	/* whatever follows the last comma (or the whole string if there is none) */
	return strcmp(needle, field) == 0;
}
411
412 /* do we need to do any massaging here? I'm not sure... */
413 static char *find_mounted_controller(const char *controller)
414 {
415 int i;
416
417 for (i = 0; i < num_hierarchies; i++) {
418 if (!hierarchies[i])
419 continue;
420 if (strcmp(hierarchies[i], controller) == 0)
421 return hierarchies[i];
422 if (in_comma_list(controller, hierarchies[i]))
423 return hierarchies[i];
424 }
425
426 return NULL;
427 }
428
429 bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
430 const char *value)
431 {
432 size_t len;
433 char *fnam, *tmpc = find_mounted_controller(controller);
434
435 if (!tmpc)
436 return false;
437 /* basedir / tmpc / cgroup / file \0 */
438 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
439 fnam = alloca(len);
440 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
441
442 return write_string(fnam, value);
443 }
444
/*
 * Chown every entry of the cgroup directory dirname (except "." and
 * "..") to uid:gid. We do this when we create a cgroup on behalf of a
 * user. Failures are reported on stderr and otherwise ignored; entries
 * whose path would not fit in MAXPATHLEN are skipped.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	DIR *d;
	int ret;

	if (strlen(dirname) >= MAXPATHLEN) {
		fprintf(stderr, "chown_all_cgroup_files: pathname too long: %s\n", dirname);
		return;
	}

	d = opendir(dirname);
	if (!d) {
		fprintf(stderr, "chown_all_cgroup_files: failed to open %s\n", dirname);
		return;
	}

	/*
	 * readdir() instead of the deprecated readdir_r(): the stream is
	 * private to this call, and no fixed-size struct dirent is needed.
	 */
	while ((direntp = readdir(d)) != NULL) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			fprintf(stderr, "chown_all_cgroup_files: pathname too long under %s\n", dirname);
			continue;
		}
		if (chown(path, uid, gid) < 0)
			fprintf(stderr, "Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	closedir(d);
}
480
481 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
482 {
483 size_t len;
484 char *dirnam, *tmpc = find_mounted_controller(controller);
485
486 if (!tmpc)
487 return -EINVAL;
488 /* basedir / tmpc / cg \0 */
489 len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
490 dirnam = alloca(len);
491 snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
492
493 if (mkdir(dirnam, 0755) < 0)
494 return -errno;
495
496 if (uid == 0 && gid == 0)
497 return 0;
498
499 if (chown(dirnam, uid, gid) < 0)
500 return -errno;
501
502 chown_all_cgroup_files(dirnam, uid, gid);
503
504 return 0;
505 }
506
/*
 * Remove the directory dirname after recursively removing every
 * directory below it. Entries that are not directories are not
 * unlinked, so rmdir() fails if any are left behind.
 * Returns true only if dirname could be opened, closed and removed;
 * failures inside subdirectories do not by themselves change the result.
 */
static bool recursive_rmdir(const char *dirname)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret;
	char pathname[MAXPATHLEN];

	dir = opendir(dirname);
	if (!dir) {
#if DEBUG
		fprintf(stderr, "%s: failed to open %s: %s\n", __func__, dirname, strerror(errno));
#endif
		return false;
	}

	/*
	 * readdir() instead of the deprecated readdir_r(): the stream is
	 * private to this call, and no fixed-size struct dirent is needed.
	 */
	while ((direntp = readdir(dir)) != NULL) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			fprintf(stderr, "pathname too long\n");
			continue;
		}

		/* keep the int result of lstat() out of the bool return value */
		rc = lstat(pathname, &mystat);
		if (rc) {
#if DEBUG
			fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
#endif
			continue;
		}
		if (S_ISDIR(mystat.st_mode)) {
			if (!recursive_rmdir(pathname)) {
#if DEBUG
				fprintf(stderr, "Error removing %s\n", pathname);
#endif
			}
		}
	}

	ret = true;
	if (closedir(dir) < 0) {
		fprintf(stderr, "%s: failed to close directory %s: %s\n", __func__, dirname, strerror(errno));
		ret = false;
	}

	if (rmdir(dirname) < 0) {
#if DEBUG
		fprintf(stderr, "%s: failed to delete %s: %s\n", __func__, dirname, strerror(errno));
#endif
		ret = false;
	}

	return ret;
}
570
571 bool cgfs_remove(const char *controller, const char *cg)
572 {
573 size_t len;
574 char *dirnam, *tmpc = find_mounted_controller(controller);
575
576 if (!tmpc)
577 return false;
578 /* basedir / tmpc / cg \0 */
579 len = strlen(basedir) + strlen(tmpc) + strlen(cg) + 3;
580 dirnam = alloca(len);
581 snprintf(dirnam, len, "%s/%s/%s", basedir,tmpc, cg);
582 return recursive_rmdir(dirnam);
583 }
584
585 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
586 {
587 size_t len;
588 char *pathname, *tmpc = find_mounted_controller(controller);
589
590 if (!tmpc)
591 return false;
592 /* basedir / tmpc / file \0 */
593 len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
594 pathname = alloca(len);
595 snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
596 if (chmod(pathname, mode) < 0)
597 return false;
598 return true;
599 }
600
/*
 * Chown dirname/tasks and then dirname/cgroup.procs to uid:gid.
 * Returns 0 on success, or -errno of the first chown() that fails.
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid)
{
	/* sized for the longer of the two names, so the buffer serves both */
	size_t buflen = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *path = alloca(buflen);

	snprintf(path, buflen, "%s/tasks", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	snprintf(path, buflen, "%s/cgroup.procs", dirname);
	if (chown(path, uid, gid) != 0)
		return -errno;

	return 0;
}
616
617 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
618 {
619 size_t len;
620 char *pathname, *tmpc = find_mounted_controller(controller);
621
622 if (!tmpc)
623 return -EINVAL;
624 /* basedir / tmpc / file \0 */
625 len = strlen(basedir) + strlen(tmpc) + strlen(file) + 3;
626 pathname = alloca(len);
627 snprintf(pathname, len, "%s/%s/%s", basedir, tmpc, file);
628 if (chown(pathname, uid, gid) < 0)
629 return -errno;
630
631 if (is_dir(pathname))
632 // like cgmanager did, we want to chown the tasks file as well
633 return chown_tasks_files(pathname, uid, gid);
634
635 return 0;
636 }
637
638 FILE *open_pids_file(const char *controller, const char *cgroup)
639 {
640 size_t len;
641 char *pathname, *tmpc = find_mounted_controller(controller);
642
643 if (!tmpc)
644 return NULL;
645 /* basedir / tmpc / cgroup / "cgroup.procs" \0 */
646 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 4 + strlen("cgroup.procs");
647 pathname = alloca(len);
648 snprintf(pathname, len, "%s/%s/%s/cgroup.procs", basedir, tmpc, cgroup);
649 return fopen(pathname, "w");
650 }
651
652 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
653 void ***list, size_t typesize,
654 void* (*iterator)(const char*, const char*, const char*))
655 {
656 size_t len;
657 char *dirname, *tmpc = find_mounted_controller(controller);
658 char pathname[MAXPATHLEN];
659 size_t sz = 0, asz = 0;
660 struct dirent dirent, *direntp;
661 DIR *dir;
662 int ret;
663
664 *list = NULL;
665 if (!tmpc)
666 return false;
667
668 /* basedir / tmpc / cgroup \0 */
669 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
670 dirname = alloca(len);
671 snprintf(dirname, len, "%s/%s/%s", basedir, tmpc, cgroup);
672
673 dir = opendir(dirname);
674 if (!dir)
675 return false;
676
677 while (!readdir_r(dir, &dirent, &direntp)) {
678 struct stat mystat;
679 int rc;
680
681 if (!direntp)
682 break;
683
684 if (!strcmp(direntp->d_name, ".") ||
685 !strcmp(direntp->d_name, ".."))
686 continue;
687
688 rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
689 if (rc < 0 || rc >= MAXPATHLEN) {
690 fprintf(stderr, "%s: pathname too long under %s\n", __func__, dirname);
691 continue;
692 }
693
694 ret = lstat(pathname, &mystat);
695 if (ret) {
696 fprintf(stderr, "%s: failed to stat %s: %s\n", __func__, pathname, strerror(errno));
697 continue;
698 }
699 if ((!directories && !S_ISREG(mystat.st_mode)) ||
700 (directories && !S_ISDIR(mystat.st_mode)))
701 continue;
702
703 if (sz+2 >= asz) {
704 void **tmp;
705 asz += BATCH_SIZE;
706 do {
707 tmp = realloc(*list, asz * typesize);
708 } while (!tmp);
709 *list = tmp;
710 }
711 (*list)[sz] = (*iterator)(controller, cgroup, direntp->d_name);
712 (*list)[sz+1] = NULL;
713 sz++;
714 }
715 if (closedir(dir) < 0) {
716 fprintf(stderr, "%s: failed closedir for %s: %s\n", __func__, dirname, strerror(errno));
717 return false;
718 }
719 return true;
720 }
721
/*
 * List-element constructor for cgfs_list_children(): returns a
 * strdup()ed copy of dir_entry, retrying until the allocation succeeds.
 * controller and cgroup are not used.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy = NULL;

	while (!copy)
		copy = strdup(dir_entry);
	return copy;
}
730
731 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
732 {
733 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
734 }
735
736 void free_key(struct cgfs_files *k)
737 {
738 if (!k)
739 return;
740 free(k->name);
741 free(k);
742 }
743
/*
 * Free a NULL-terminated array of keys: every key up to the first NULL,
 * then the array itself. A NULL array is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cur;

	if (!keys)
		return;

	for (cur = keys; *cur; cur++)
		free_key(*cur);
	free(keys);
}
755
756 bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
757 {
758 size_t len;
759 char *fnam, *tmpc = find_mounted_controller(controller);
760
761 if (!tmpc)
762 return false;
763 /* basedir / tmpc / cgroup / file \0 */
764 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(file) + 4;
765 fnam = alloca(len);
766 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, file);
767
768 *value = slurp_file(fnam);
769 return *value != NULL;
770 }
771
772 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
773 {
774 size_t len;
775 char *fnam, *tmpc = find_mounted_controller(controller);
776 struct stat sb;
777 struct cgfs_files *newkey;
778 int ret;
779
780 if (!tmpc)
781 return false;
782
783 if (file && *file == '/')
784 file++;
785
786 if (file && index(file, '/'))
787 return NULL;
788
789 /* basedir / tmpc / cgroup / file \0 */
790 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + 3;
791 if (file)
792 len += strlen(file) + 1;
793 fnam = alloca(len);
794 snprintf(fnam, len, "%s/%s/%s%s%s", basedir, tmpc, cgroup,
795 file ? "/" : "", file ? file : "");
796
797 ret = stat(fnam, &sb);
798 if (ret < 0)
799 return NULL;
800
801 do {
802 newkey = malloc(sizeof(struct cgfs_files));
803 } while (!newkey);
804 if (file)
805 newkey->name = must_copy_string(file);
806 else if (rindex(cgroup, '/'))
807 newkey->name = must_copy_string(rindex(cgroup, '/'));
808 else
809 newkey->name = must_copy_string(cgroup);
810 newkey->uid = sb.st_uid;
811 newkey->gid = sb.st_gid;
812 newkey->mode = sb.st_mode;
813
814 return newkey;
815 }
816
/*
 * List-element constructor for cgfs_list_keys(): returns the key for
 * dir_entry inside cgroup, or NULL (after reporting on stderr) if the
 * key could not be obtained.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	fprintf(stderr, "%s: Error getting files under %s:%s\n",
		__func__, controller, cgroup);
	return NULL;
}
826
827 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
828 {
829 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
830 }
831
832 bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
833 { size_t len;
834 char *fnam, *tmpc = find_mounted_controller(controller);
835 int ret;
836 struct stat sb;
837
838 if (!tmpc)
839 return false;
840 /* basedir / tmpc / cgroup / f \0 */
841 len = strlen(basedir) + strlen(tmpc) + strlen(cgroup) + strlen(f) + 4;
842 fnam = alloca(len);
843 snprintf(fnam, len, "%s/%s/%s/%s", basedir, tmpc, cgroup, f);
844
845 ret = stat(fnam, &sb);
846 if (ret < 0 || !S_ISDIR(sb.st_mode))
847 return false;
848 return true;
849 }
850
/* Values that the result of send_creds() is compared against. */
#define SEND_CREDS_OK 0
#define SEND_CREDS_NOTSK 1
#define SEND_CREDS_FAIL 2

/* Forward declarations for helpers used by the init pid lookup code. */
static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
static bool recv_creds(int sock, struct ucred *cred, char *v);
static int wait_for_pid(pid_t pid);
857
858 /*
859 * fork a task which switches to @task's namespace and writes '1'.
860 * over a unix sock so we can read the task's reaper's pid in our
861 * namespace
862 */
863 static void write_task_init_pid_exit(int sock, pid_t target)
864 {
865 struct ucred cred;
866 char fnam[100];
867 pid_t pid;
868 char v;
869 int fd, ret;
870
871 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
872 if (ret < 0 || ret >= sizeof(fnam))
873 _exit(1);
874
875 fd = open(fnam, O_RDONLY);
876 if (fd < 0) {
877 perror("write_task_init_pid_exit open of ns/pid");
878 _exit(1);
879 }
880 if (setns(fd, 0)) {
881 perror("write_task_init_pid_exit setns 1");
882 close(fd);
883 _exit(1);
884 }
885 pid = fork();
886 if (pid < 0)
887 _exit(1);
888 if (pid != 0) {
889 if (!wait_for_pid(pid))
890 _exit(1);
891 _exit(0);
892 }
893
894 /* we are the child */
895 cred.uid = 0;
896 cred.gid = 0;
897 cred.pid = 1;
898 v = '1';
899 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
900 _exit(1);
901 _exit(0);
902 }
903
/*
 * Find the pid, as seen from our namespace, of init in the pid
 * namespace of task. A forked child enters that namespace and sends
 * credentials over a socketpair; the pid we receive is the answer.
 * Returns -1 if the socketpair or fork fails or no credentials arrive.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	struct ucred cred;
	pid_t child, initpid = -1;
	int sock[2];
	char v = '0';

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	child = fork();
	if (child == 0) {
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* only the parent of a successful fork has anything to receive */
	if (child > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (child > 0)
		wait_for_pid(child);
	return initpid;
}
937
938 static pid_t lookup_initpid_in_store(pid_t qpid)
939 {
940 pid_t answer = 0;
941 struct stat sb;
942 struct pidns_init_store *e;
943 char fnam[100];
944
945 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
946 store_lock();
947 if (stat(fnam, &sb) < 0)
948 goto out;
949 e = lookup_verify_initpid(&sb);
950 if (e) {
951 answer = e->initpid;
952 goto out;
953 }
954 answer = get_init_pid_for_task(qpid);
955 if (answer > 0)
956 save_initpid(&sb, answer);
957
958 out:
959 /* we prune at end in case we are returning
960 * the value we were about to return */
961 prune_initpid_store();
962 store_unlock();
963 return answer;
964 }
965
/*
 * Wait for the child pid to terminate, restarting waitpid() when it is
 * interrupted. Returns 0 if the child exited normally with status 0,
 * and -1 otherwise (pid <= 0, waitpid() error, abnormal termination or
 * non-zero exit status).
 */
static int wait_for_pid(pid_t pid)
{
	int status, ret;

	if (pid <= 0)
		return -1;

	for (;;) {
		ret = waitpid(pid, &status, 0);
		if (ret == pid)
			break;
		/* EINTR or an unexpected pid: wait again */
		if (ret == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
986
987
988 /*
989 * append pid to *src.
990 * src: a pointer to a char* in which ot append the pid.
991 * sz: the number of characters printed so far, minus trailing \0.
992 * asz: the allocated size so far
993 * pid: the pid to append
994 */
995 static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
996 {
997 char tmp[30];
998
999 int tmplen = sprintf(tmp, "%d\n", (int)pid);
1000
1001 if (!*src || tmplen + *sz + 1 >= *asz) {
1002 char *tmp;
1003 do {
1004 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1005 } while (!tmp);
1006 *src = tmp;
1007 *asz += BUF_RESERVE_SIZE;
1008 }
1009 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1010 *sz += tmplen;
1011 }
1012
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id mapped into pid's namespace.
 * The file is rewound and read from the start on every call; lines that
 * do not parse as three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) if no range contains in_id
 * or a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base,	/* base id of the range in the idfile's namespace */
			     host_base,	/* base id of the range in the caller's namespace */
			     range;	/* number of ids in this range */

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * The ids wrapped around - unexpected as this is a
			 * procfile, so just bail.
			 */
			fprintf(stderr, "pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range, and neither range
		 * end wraps, so ns_base + (in_id - host_base) is below
		 * ns_base + range and cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	/* no answer found */
	return -1;
}
1056
/*
 * For is_privileged_over: whether the calling uid is required to be
 * root in its own namespace.
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* buffer size for /proc/<pid>/... path names */
#define PROCLEN 100

/*
 * Decide whether uid, acting from the namespace of process pid, is
 * privileged over victim:
 *  - never, if either id is -1;
 *  - always, if req_ns_root is false and uid == victim (e.g. uid 1000
 *    has write access to files owned by uid 1000);
 *  - otherwise only if uid maps to 0 through /proc/<pid>/uid_map and
 *    victim is mapped there at all.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	bool answer;
	FILE *f;
	int ret;

	if (victim == (uid_t)-1 || uid == (uid_t)-1)
		return false;

	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;
	f = fopen(fpath, "r");
	if (!f)
		return false;

	/*
	 * The caller must be root in his namespace, and the victim must be
	 * mapped into it.
	 * XXX it is not certain the second check is needed, given that fuse
	 * sends requests in which the vfs has already converted the ids.
	 */
	answer = convert_id_to_ns(f, uid) == 0 &&
		 convert_id_to_ns(f, victim) != (unsigned int)-1;

	fclose(f);
	return answer;
}
1112
/*
 * Check whether the low three permission bits of fmode (the "other"
 * position) allow the access mode in req_mode: read for O_RDONLY, write
 * for O_WRONLY, both for O_RDWR. Any other access mode gives false.
 * Callers shift the mode so that the bits of interest sit in that
 * position.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1132
1133
/*
 * Return the path component of taskcg that follows querycg.
 *   querycg is /a/b/c
 *   taskcg is  /a/b/c/d/e
 *   we return 'd'
 * For querycg "/" the first component of taskcg is returned.
 * taskcg must be longer than querycg, otherwise an error is printed and
 * NULL is returned. The result is newly allocated and must be freed by
 * the caller; NULL is also returned if the allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *slash;

	if (strlen(taskcg) <= strlen(querycg)) {
		fprintf(stderr, "%s: I was fed bad input\n", __func__);
		return NULL;
	}

	/* step over querycg and the '/' after it; "/" is its own separator */
	skip = strcmp(querycg, "/") == 0 ? 1 : strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';
	return next;
}
1159
/* Remove a single trailing '\n' from x, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;
	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1166
1167 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1168 {
1169 char fnam[PROCLEN];
1170 FILE *f;
1171 char *answer = NULL;
1172 char *line = NULL;
1173 size_t len = 0;
1174 int ret;
1175 const char *h = find_mounted_controller(contrl);
1176 if (!h)
1177 return NULL;
1178
1179 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1180 if (ret < 0 || ret >= PROCLEN)
1181 return NULL;
1182 if (!(f = fopen(fnam, "r")))
1183 return NULL;
1184
1185 while (getline(&line, &len, f) != -1) {
1186 char *c1, *c2;
1187 if (!line[0])
1188 continue;
1189 c1 = strchr(line, ':');
1190 if (!c1)
1191 goto out;
1192 c1++;
1193 c2 = strchr(c1, ':');
1194 if (!c2)
1195 goto out;
1196 *c2 = '\0';
1197 if (strcmp(c1, h) != 0)
1198 continue;
1199 c2++;
1200 stripnewline(c2);
1201 do {
1202 answer = strdup(c2);
1203 } while (!answer);
1204 break;
1205 }
1206
1207 out:
1208 fclose(f);
1209 free(line);
1210 return answer;
1211 }
1212
1213 /*
1214 * check whether a fuse context may access a cgroup dir or file
1215 *
1216 * If file is not null, it is a cgroup file to check under cg.
1217 * If file is null, then we are checking perms on cg itself.
1218 *
1219 * For files we can check the mode of the list_keys result.
1220 * For cgroups, we must make assumptions based on the files under the
1221 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1222 * yet.
1223 */
1224 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1225 {
1226 struct cgfs_files *k = NULL;
1227 bool ret = false;
1228
1229 k = cgfs_get_key(contrl, cg, file);
1230 if (!k)
1231 return false;
1232
1233 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1234 if (perms_include(k->mode >> 6, mode)) {
1235 ret = true;
1236 goto out;
1237 }
1238 }
1239 if (fc->gid == k->gid) {
1240 if (perms_include(k->mode >> 3, mode)) {
1241 ret = true;
1242 goto out;
1243 }
1244 }
1245 ret = perms_include(k->mode, mode);
1246
1247 out:
1248 free_key(k);
1249 return ret;
1250 }
1251
#define INITSCOPE "/init.scope"
/*
 * Strip a trailing "/init.scope" from cg, in place. If cg is exactly
 * "/init.scope" it becomes "/". Anything else is left unchanged.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';	/* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1269
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * Returns true if the cgroup of pid (with a trailing /init.scope
 * removed) is a prefix of cg. If the answer is false and nextcg is not
 * NULL, *nextcg is set to the result of get_next_cgroup_dir() - the
 * next cgroup directory under cg, to be freed by the caller, or NULL.
 * If the cgroup of pid cannot be determined, false is returned and
 * *nextcg is left untouched.
 *
 * NOTE(review): the prefix comparison does not check for a '/' boundary,
 * so a task in a/b also matches cg = a/bc - confirm this is intended.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *taskcg = get_pid_cgroup(pid, contrl);
	const char *cmp;
	bool answer;

	if (!taskcg)
		return false;
	prune_init_slice(taskcg);

	/*
	 * Callers pass in '/' for the root cgroup, otherwise they pass in
	 * a cgroup without leading '/'.
	 */
	cmp = (*cg == '/') ? taskcg : taskcg + 1;

	answer = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!answer && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(taskcg);
	return answer;
}
1304
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 *
 * @cg is visible when it is "/", when the task sits in the root cgroup,
 * when it equals the task's cgroup, or when one of the two paths is a
 * whole-component prefix of the other (i.e. @cg is an ancestor or a
 * descendant of the task's cgroup).
 *
 * The task's cgroup comes from get_pid_cgroup() with a trailing
 * "/init.scope" pruned; its first character is skipped before comparing,
 * since @cg is given without a leading '/'.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool visible;
	char *pidcg;
	const char *own;
	size_t cg_len, own_len;

	if (strcmp(cg, "/") == 0)
		return true;

	pidcg = get_pid_cgroup(pid, contrl);
	if (!pidcg)
		return false;
	prune_init_slice(pidcg);

	own = pidcg + 1;
	cg_len = strlen(cg);
	own_len = strlen(own);

	if (own_len == 0 || strcmp(cg, own) == 0) {
		/*
		 * Either the task is in the root cgroup and can see
		 * everything (the prefix tests below cannot express that,
		 * because the only '/' was skipped above), or @cg is exactly
		 * the task's cgroup.
		 */
		visible = true;
	} else if (cg_len < own_len) {
		/* looking up a parent dir */
		visible = strncmp(own, cg, cg_len) == 0 && own[cg_len] == '/';
	} else if (cg_len > own_len) {
		/* looking up a child dir */
		visible = strncmp(own, cg, own_len) == 0 && cg[own_len] == '/';
	} else {
		/* same length but different names */
		visible = false;
	}

	free(pidcg);
	return visible;
}
1355
1356 /*
1357 * given /cgroup/freezer/a/b, return "freezer".
1358 * the returned char* should NOT be freed.
1359 */
1360 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1361 {
1362 const char *p1;
1363 char *contr, *slash;
1364
1365 if (strlen(path) < 9)
1366 return NULL;
1367 if (*(path+7) != '/')
1368 return NULL;
1369 p1 = path+8;
1370 contr = strdupa(p1);
1371 if (!contr)
1372 return NULL;
1373 slash = strstr(contr, "/");
1374 if (slash)
1375 *slash = '\0';
1376
1377 int i;
1378 for (i = 0; i < num_hierarchies; i++) {
1379 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1380 return hierarchies[i];
1381 }
1382 return NULL;
1383 }
1384
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 *
 * Skips the first 8 bytes of @path and returns a pointer just past the next
 * '/' found after them.  The returned pointer aliases @path and may include
 * files (keynames) etc; it may also be an empty string when @path ends in
 * that '/'.
 *
 * Returns NULL if @path is shorter than 9 bytes or has no such '/'.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9)
		return NULL;

	sep = strchr(path + 8, '/');
	return sep ? sep + 1 : NULL;
}
1400
/*
 * split the last path element from the path in @cg.
 *
 * @dir  receives a newly allocated copy of @cg, cut at its last '/'; it
 *       should be freed by the caller.  If @cg has no '/', it is a full
 *       copy of @cg.
 * @last receives a pointer into @cg at the last '/' (so the string it
 *       points to starts with that '/'), or NULL if @cg has no '/'.  It
 *       must not be freed.
 *
 * The copy is retried until the allocation succeeds.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;

	/* allocation is not allowed to fail here: keep trying */
	while (!(copy = strdup(cg)))
		;
	*dir = copy;

	*last = strrchr(cg, '/');
	if (*last == NULL)
		return;

	/* cut the copy at the same offset as the last '/' in the original */
	copy[*last - cg] = '\0';
}
1420
1421 /*
1422 * FUSE ops for /cgroup
1423 */
1424
1425 int cg_getattr(const char *path, struct stat *sb)
1426 {
1427 struct timespec now;
1428 struct fuse_context *fc = fuse_get_context();
1429 char * cgdir = NULL;
1430 char *last = NULL, *path1, *path2;
1431 struct cgfs_files *k = NULL;
1432 const char *cgroup;
1433 const char *controller = NULL;
1434 int ret = -ENOENT;
1435
1436
1437 if (!fc)
1438 return -EIO;
1439
1440 memset(sb, 0, sizeof(struct stat));
1441
1442 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1443 return -EINVAL;
1444
1445 sb->st_uid = sb->st_gid = 0;
1446 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1447 sb->st_size = 0;
1448
1449 if (strcmp(path, "/cgroup") == 0) {
1450 sb->st_mode = S_IFDIR | 00755;
1451 sb->st_nlink = 2;
1452 return 0;
1453 }
1454
1455 controller = pick_controller_from_path(fc, path);
1456 if (!controller)
1457 return -EIO;
1458 cgroup = find_cgroup_in_path(path);
1459 if (!cgroup) {
1460 /* this is just /cgroup/controller, return it as a dir */
1461 sb->st_mode = S_IFDIR | 00755;
1462 sb->st_nlink = 2;
1463 return 0;
1464 }
1465
1466 get_cgdir_and_path(cgroup, &cgdir, &last);
1467
1468 if (!last) {
1469 path1 = "/";
1470 path2 = cgdir;
1471 } else {
1472 path1 = cgdir;
1473 path2 = last;
1474 }
1475
1476 pid_t initpid = lookup_initpid_in_store(fc->pid);
1477 if (initpid <= 0)
1478 initpid = fc->pid;
1479 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1480 * Then check that caller's cgroup is under path if last is a child
1481 * cgroup, or cgdir if last is a file */
1482
1483 if (is_child_cgroup(controller, path1, path2)) {
1484 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1485 ret = -ENOENT;
1486 goto out;
1487 }
1488 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1489 /* this is just /cgroup/controller, return it as a dir */
1490 sb->st_mode = S_IFDIR | 00555;
1491 sb->st_nlink = 2;
1492 ret = 0;
1493 goto out;
1494 }
1495 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1496 ret = -EACCES;
1497 goto out;
1498 }
1499
1500 // get uid, gid, from '/tasks' file and make up a mode
1501 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1502 sb->st_mode = S_IFDIR | 00755;
1503 k = cgfs_get_key(controller, cgroup, NULL);
1504 if (!k) {
1505 sb->st_uid = sb->st_gid = 0;
1506 } else {
1507 sb->st_uid = k->uid;
1508 sb->st_gid = k->gid;
1509 }
1510 free_key(k);
1511 sb->st_nlink = 2;
1512 ret = 0;
1513 goto out;
1514 }
1515
1516 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1517 sb->st_mode = S_IFREG | k->mode;
1518 sb->st_nlink = 1;
1519 sb->st_uid = k->uid;
1520 sb->st_gid = k->gid;
1521 sb->st_size = 0;
1522 free_key(k);
1523 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1524 ret = -ENOENT;
1525 goto out;
1526 }
1527 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1528 ret = -EACCES;
1529 goto out;
1530 }
1531
1532 ret = 0;
1533 }
1534
1535 out:
1536 free(cgdir);
1537 return ret;
1538 }
1539
1540 int cg_opendir(const char *path, struct fuse_file_info *fi)
1541 {
1542 struct fuse_context *fc = fuse_get_context();
1543 const char *cgroup;
1544 struct file_info *dir_info;
1545 char *controller = NULL;
1546
1547 if (!fc)
1548 return -EIO;
1549
1550 if (strcmp(path, "/cgroup") == 0) {
1551 cgroup = NULL;
1552 controller = NULL;
1553 } else {
1554 // return list of keys for the controller, and list of child cgroups
1555 controller = pick_controller_from_path(fc, path);
1556 if (!controller)
1557 return -EIO;
1558
1559 cgroup = find_cgroup_in_path(path);
1560 if (!cgroup) {
1561 /* this is just /cgroup/controller, return its contents */
1562 cgroup = "/";
1563 }
1564 }
1565
1566 pid_t initpid = lookup_initpid_in_store(fc->pid);
1567 if (initpid <= 0)
1568 initpid = fc->pid;
1569 if (cgroup) {
1570 if (!caller_may_see_dir(initpid, controller, cgroup))
1571 return -ENOENT;
1572 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1573 return -EACCES;
1574 }
1575
1576 /* we'll free this at cg_releasedir */
1577 dir_info = malloc(sizeof(*dir_info));
1578 if (!dir_info)
1579 return -ENOMEM;
1580 dir_info->controller = must_copy_string(controller);
1581 dir_info->cgroup = must_copy_string(cgroup);
1582 dir_info->type = LXC_TYPE_CGDIR;
1583 dir_info->buf = NULL;
1584 dir_info->file = NULL;
1585 dir_info->buflen = 0;
1586
1587 fi->fh = (unsigned long)dir_info;
1588 return 0;
1589 }
1590
1591 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1592 struct fuse_file_info *fi)
1593 {
1594 struct file_info *d = (struct file_info *)fi->fh;
1595 struct cgfs_files **list = NULL;
1596 int i, ret;
1597 char *nextcg = NULL;
1598 struct fuse_context *fc = fuse_get_context();
1599 char **clist = NULL;
1600
1601 if (d->type != LXC_TYPE_CGDIR) {
1602 fprintf(stderr, "Internal error: file cache info used in readdir\n");
1603 return -EIO;
1604 }
1605 if (!d->cgroup && !d->controller) {
1606 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1607 int i;
1608
1609 for (i = 0; i < num_hierarchies; i++) {
1610 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1611 return -EIO;
1612 }
1613 }
1614 return 0;
1615 }
1616
1617 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1618 // not a valid cgroup
1619 ret = -EINVAL;
1620 goto out;
1621 }
1622
1623 pid_t initpid = lookup_initpid_in_store(fc->pid);
1624 if (initpid <= 0)
1625 initpid = fc->pid;
1626 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1627 if (nextcg) {
1628 ret = filler(buf, nextcg, NULL, 0);
1629 free(nextcg);
1630 if (ret != 0) {
1631 ret = -EIO;
1632 goto out;
1633 }
1634 }
1635 ret = 0;
1636 goto out;
1637 }
1638
1639 for (i = 0; list[i]; i++) {
1640 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1641 ret = -EIO;
1642 goto out;
1643 }
1644 }
1645
1646 // now get the list of child cgroups
1647
1648 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1649 ret = 0;
1650 goto out;
1651 }
1652 if (clist) {
1653 for (i = 0; clist[i]; i++) {
1654 if (filler(buf, clist[i], NULL, 0) != 0) {
1655 ret = -EIO;
1656 goto out;
1657 }
1658 }
1659 }
1660 ret = 0;
1661
1662 out:
1663 free_keys(list);
1664 if (clist) {
1665 for (i = 0; clist[i]; i++)
1666 free(clist[i]);
1667 free(clist);
1668 }
1669 return ret;
1670 }
1671
1672 static void do_release_file_info(struct file_info *f)
1673 {
1674 if (!f)
1675 return;
1676 free(f->controller);
1677 free(f->cgroup);
1678 free(f->file);
1679 free(f->buf);
1680 free(f);
1681 }
1682
1683 int cg_releasedir(const char *path, struct fuse_file_info *fi)
1684 {
1685 struct file_info *d = (struct file_info *)fi->fh;
1686
1687 do_release_file_info(d);
1688 return 0;
1689 }
1690
1691 int cg_open(const char *path, struct fuse_file_info *fi)
1692 {
1693 const char *cgroup;
1694 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1695 struct cgfs_files *k = NULL;
1696 struct file_info *file_info;
1697 struct fuse_context *fc = fuse_get_context();
1698 int ret;
1699
1700 if (!fc)
1701 return -EIO;
1702
1703 controller = pick_controller_from_path(fc, path);
1704 if (!controller)
1705 return -EIO;
1706 cgroup = find_cgroup_in_path(path);
1707 if (!cgroup)
1708 return -EINVAL;
1709
1710 get_cgdir_and_path(cgroup, &cgdir, &last);
1711 if (!last) {
1712 path1 = "/";
1713 path2 = cgdir;
1714 } else {
1715 path1 = cgdir;
1716 path2 = last;
1717 }
1718
1719 k = cgfs_get_key(controller, path1, path2);
1720 if (!k) {
1721 ret = -EINVAL;
1722 goto out;
1723 }
1724 free_key(k);
1725
1726 pid_t initpid = lookup_initpid_in_store(fc->pid);
1727 if (initpid <= 0)
1728 initpid = fc->pid;
1729 if (!caller_may_see_dir(initpid, controller, path1)) {
1730 ret = -ENOENT;
1731 goto out;
1732 }
1733 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1734 // should never get here
1735 ret = -EACCES;
1736 goto out;
1737 }
1738
1739 /* we'll free this at cg_release */
1740 file_info = malloc(sizeof(*file_info));
1741 if (!file_info) {
1742 ret = -ENOMEM;
1743 goto out;
1744 }
1745 file_info->controller = must_copy_string(controller);
1746 file_info->cgroup = must_copy_string(path1);
1747 file_info->file = must_copy_string(path2);
1748 file_info->type = LXC_TYPE_CGFILE;
1749 file_info->buf = NULL;
1750 file_info->buflen = 0;
1751
1752 fi->fh = (unsigned long)file_info;
1753 ret = 0;
1754
1755 out:
1756 free(cgdir);
1757 return ret;
1758 }
1759
1760 int cg_release(const char *path, struct fuse_file_info *fi)
1761 {
1762 struct file_info *f = (struct file_info *)fi->fh;
1763
1764 do_release_file_info(f);
1765 return 0;
1766 }
1767
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock reports one of the POLLIN_SET events, for at most about
 * @timeout seconds.
 *
 * The deadline is computed once from time(NULL); when epoll_wait() is
 * interrupted by a signal the wait is restarted with the remaining time.
 *
 * Returns true when an event arrived.  Returns false on timeout or error;
 * errno is set to 0 when the deadline had already passed before waiting,
 * and otherwise holds the value left by epoll_wait().
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	starttime = time(NULL);
	if (starttime < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		fprintf(stderr, "Failed to create epoll socket: %m\n");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		fprintf(stderr, "Failed adding socket to epoll: %m\n");
		close(epfd);
		return false;
	}

	do {
		now = time(NULL);
		if (now < 0) {
			close(epfd);
			return false;
		}

		deltatime = (starttime + timeout) - now;
		if (deltatime < 0) { // timeout
			errno = 0;
			close(epfd);
			return false;
		}

		/* epoll_wait() takes milliseconds */
		ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	} while (ret < 0 && errno == EINTR);

	/* close() may overwrite errno, keep the one from epoll_wait() */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
1815
/*
 * Receive up to @len bytes from @sockfd into @buf, after waiting up to two
 * seconds (wait_for_sock) for the socket to become readable.
 *
 * Returns the result of the non-blocking recv(), or -1 if the wait failed.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
1822
1823 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
1824 {
1825 struct msghdr msg = { 0 };
1826 struct iovec iov;
1827 struct cmsghdr *cmsg;
1828 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
1829 char buf[1];
1830 buf[0] = 'p';
1831
1832 if (pingfirst) {
1833 if (msgrecv(sock, buf, 1) != 1) {
1834 fprintf(stderr, "%s: Error getting reply from server over socketpair\n",
1835 __func__);
1836 return SEND_CREDS_FAIL;
1837 }
1838 }
1839
1840 msg.msg_control = cmsgbuf;
1841 msg.msg_controllen = sizeof(cmsgbuf);
1842
1843 cmsg = CMSG_FIRSTHDR(&msg);
1844 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
1845 cmsg->cmsg_level = SOL_SOCKET;
1846 cmsg->cmsg_type = SCM_CREDENTIALS;
1847 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
1848
1849 msg.msg_name = NULL;
1850 msg.msg_namelen = 0;
1851
1852 buf[0] = v;
1853 iov.iov_base = buf;
1854 iov.iov_len = sizeof(buf);
1855 msg.msg_iov = &iov;
1856 msg.msg_iovlen = 1;
1857
1858 if (sendmsg(sock, &msg, 0) < 0) {
1859 fprintf(stderr, "%s: failed at sendmsg: %s\n", __func__,
1860 strerror(errno));
1861 if (errno == 3)
1862 return SEND_CREDS_NOTSK;
1863 return SEND_CREDS_FAIL;
1864 }
1865
1866 return SEND_CREDS_OK;
1867 }
1868
/*
 * Receive one byte plus an SCM_CREDENTIALS control message from @sock.
 *
 * SO_PASSCRED is enabled on @sock and a single '1' byte is written to it
 * first; then up to two seconds are spent waiting for the reply.
 *
 * @cred: set to pid/uid/gid of -1 up front and overwritten only if the
 *        received message carries a ucred-sized SCM_CREDENTIALS header.
 * @v:    set to '1' up front and to the received payload byte on success.
 *
 * Returns true if a message was received, false on any error or timeout.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	struct iovec iov;
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	int optval = 1;
	int ret;

	/* defaults handed back if anything below fails */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		fprintf(stderr, "Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* one-byte ping, matching the pingfirst wait in send_creds() */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		fprintf(stderr, "Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	if (!wait_for_sock(sock, 2)) {
		fprintf(stderr, "Timed out waiting for scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		fprintf(stderr, "Failed to receive scm_cred: %s\n",
			  strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = buf[0];
	return true;
}
1928
1929
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * Never returns: exits with status 0 once recv_creds() fails or a message
 * with payload '1' arrives, and with status 1 if writing a pid back fails.
 * @tpid is not used here.
 */
static void pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		/* payload '1' asks us to stop */
		if (v == '1')
			_exit(0);
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			_exit(1);
	}
	_exit(0);
}
1948
/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you fork will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then forks a child to
 * actually convert pids.
 *
 * The child acknowledges its start by writing '1' to a pipe and then runs
 * pid_to_ns(@sock, @tpid).  This function waits for that acknowledgement
 * and for the child to finish.
 *
 * Never returns: exits with status 0 on success and 1 on any failure.
 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int cpipe[2];
	int nsfd, ret;
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	nsfd = open(fnam, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	cpid = fork();
	if (cpid < 0)
		_exit(1);

	if (cpid == 0) {
		char b = '1';

		close(cpipe[0]);
		if (write(cpipe[1], &b, sizeof(char)) < 0) {
			fprintf(stderr, "%s (child): erorr on write: %s\n",
				__func__, strerror(errno));
		}
		close(cpipe[1]);
		pid_to_ns(sock, tpid);
		_exit(1); // not reached
	}

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	_exit(wait_for_pid(cpid) ? 0 : 1);
}
2002
2003 /*
2004 * To read cgroup files with a particular pid, we will setns into the child
2005 * pidns, open a pipe, fork a child - which will be the first to really be in
2006 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2007 */
2008 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2009 {
2010 int sock[2] = {-1, -1};
2011 char *tmpdata = NULL;
2012 int ret;
2013 pid_t qpid, cpid = -1;
2014 bool answer = false;
2015 char v = '0';
2016 struct ucred cred;
2017 size_t sz = 0, asz = 0;
2018
2019 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2020 return false;
2021
2022 /*
2023 * Now we read the pids from returned data one by one, pass
2024 * them into a child in the target namespace, read back the
2025 * translated pids, and put them into our to-return data
2026 */
2027
2028 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2029 perror("socketpair");
2030 free(tmpdata);
2031 return false;
2032 }
2033
2034 cpid = fork();
2035 if (cpid == -1)
2036 goto out;
2037
2038 if (!cpid) // child - exits when done
2039 pid_to_ns_wrapper(sock[1], tpid);
2040
2041 char *ptr = tmpdata;
2042 cred.uid = 0;
2043 cred.gid = 0;
2044 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2045 cred.pid = qpid;
2046 ret = send_creds(sock[0], &cred, v, true);
2047
2048 if (ret == SEND_CREDS_NOTSK)
2049 goto next;
2050 if (ret == SEND_CREDS_FAIL)
2051 goto out;
2052
2053 // read converted results
2054 if (!wait_for_sock(sock[0], 2)) {
2055 fprintf(stderr, "%s: timed out waiting for pid from child: %s\n",
2056 __func__, strerror(errno));
2057 goto out;
2058 }
2059 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2060 fprintf(stderr, "%s: error reading pid from child: %s\n",
2061 __func__, strerror(errno));
2062 goto out;
2063 }
2064 must_strcat_pid(d, &sz, &asz, qpid);
2065 next:
2066 ptr = strchr(ptr, '\n');
2067 if (!ptr)
2068 break;
2069 ptr++;
2070 }
2071
2072 cred.pid = getpid();
2073 v = '1';
2074 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2075 // failed to ask child to exit
2076 fprintf(stderr, "%s: failed to ask child to exit: %s\n",
2077 __func__, strerror(errno));
2078 goto out;
2079 }
2080
2081 answer = true;
2082
2083 out:
2084 free(tmpdata);
2085 if (cpid != -1)
2086 wait_for_pid(cpid);
2087 if (sock[0] != -1) {
2088 close(sock[0]);
2089 close(sock[1]);
2090 }
2091 return answer;
2092 }
2093
2094 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2095 struct fuse_file_info *fi)
2096 {
2097 struct fuse_context *fc = fuse_get_context();
2098 struct file_info *f = (struct file_info *)fi->fh;
2099 struct cgfs_files *k = NULL;
2100 char *data = NULL;
2101 int ret, s;
2102 bool r;
2103
2104 if (f->type != LXC_TYPE_CGFILE) {
2105 fprintf(stderr, "Internal error: directory cache info used in cg_read\n");
2106 return -EIO;
2107 }
2108
2109 if (offset)
2110 return 0;
2111
2112 if (!fc)
2113 return -EIO;
2114
2115 if (!f->controller)
2116 return -EINVAL;
2117
2118 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2119 return -EINVAL;
2120 }
2121 free_key(k);
2122
2123
2124 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) { // should never get here
2125 ret = -EACCES;
2126 goto out;
2127 }
2128
2129 if (strcmp(f->file, "tasks") == 0 ||
2130 strcmp(f->file, "/tasks") == 0 ||
2131 strcmp(f->file, "/cgroup.procs") == 0 ||
2132 strcmp(f->file, "cgroup.procs") == 0)
2133 // special case - we have to translate the pids
2134 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2135 else
2136 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2137
2138 if (!r) {
2139 ret = -EINVAL;
2140 goto out;
2141 }
2142
2143 if (!data) {
2144 ret = 0;
2145 goto out;
2146 }
2147 s = strlen(data);
2148 if (s > size)
2149 s = size;
2150 memcpy(buf, data, s);
2151 if (s > 0 && s < size && data[s-1] != '\n')
2152 buf[s++] = '\n';
2153
2154 ret = s;
2155
2156 out:
2157 free(data);
2158 return ret;
2159 }
2160
2161 static void pid_from_ns(int sock, pid_t tpid)
2162 {
2163 pid_t vpid;
2164 struct ucred cred;
2165 char v;
2166 int ret;
2167
2168 cred.uid = 0;
2169 cred.gid = 0;
2170 while (1) {
2171 if (!wait_for_sock(sock, 2)) {
2172 fprintf(stderr, "%s: timeout reading from parent\n", __func__);
2173 _exit(1);
2174 }
2175 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2176 fprintf(stderr, "%s: bad read from parent: %s\n",
2177 __func__, strerror(errno));
2178 _exit(1);
2179 }
2180 if (vpid == -1) // done
2181 break;
2182 v = '0';
2183 cred.pid = vpid;
2184 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2185 v = '1';
2186 cred.pid = getpid();
2187 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2188 _exit(1);
2189 }
2190 }
2191 _exit(0);
2192 }
2193
/*
 * Enter the pid namespace of @tpid and fork a child that runs
 * pid_from_ns(@sock, @tpid).
 *
 * The child acknowledges its start by writing '1' to a pipe.  If that
 * acknowledgement does not arrive within a second (or is malformed) the
 * child is killed, reaped, and a new one is forked; this is repeated until
 * a child acknowledges.
 *
 * Never returns: exits with status 0 if the acknowledged child is waited
 * for successfully, 1 otherwise or on any setup failure.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	char fnam[100];
	int cpipe[2];
	int nsfd, ret;
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	nsfd = open(fnam, O_RDONLY);
	if (nsfd < 0)
		_exit(1);
	if (setns(nsfd, 0) < 0)
		_exit(1);
	close(nsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	for (;;) {
		cpid = fork();
		if (cpid < 0)
			_exit(1);

		if (cpid == 0) {
			char b = '1';

			close(cpipe[0]);
			if (write(cpipe[1], &b, sizeof(char)) < 0) {
				fprintf(stderr, "%s (child): erorr on write: %s\n",
					__func__, strerror(errno));
			}
			close(cpipe[1]);
			pid_from_ns(sock, tpid);
		}

		// give the child 1 second to be done forking and
		// write its ack
		if (wait_for_sock(cpipe[0], 1)) {
			ret = read(cpipe[0], &v, 1);
			if (ret == sizeof(char) && v == '1')
				break;
		}

		/* no valid ack: get rid of this child and try again */
		kill(cpid, SIGKILL);
		wait_for_pid(cpid);
	}

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2249
/*
 * Given host @uid, look up the uid it maps to according to
 * /proc/@pid/uid_map and store it in *@answer.
 *
 * Returns true when a mapping was found.  Returns false when the map file
 * cannot be opened (*@answer is then untouched) or when convert_id_to_ns()
 * yields -1 (*@answer then holds that -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char path[400];
	FILE *map;

	sprintf(path, "/proc/%d/uid_map", pid);
	map = fopen(path, "r");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != -1;
}
2271
/*
 * get_pid_creds: get the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number on the "Uid:" line and the first number on the "Gid:"
 * line are used.
 * (XXX should we use euid here?)
 *
 * *@uid and *@gid are set to -1 first and only overwritten by values that
 * were parsed successfully; parsing stops at the first malformed line.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char path[400];
	char line[400];
	FILE *status;
	uid_t u;
	gid_t g;

	*uid = -1;
	*gid = -1;

	sprintf(path, "/proc/%d/status", pid);
	status = fopen(path, "r");
	if (status == NULL) {
		fprintf(stderr, "Error opening %s: %s\n", path, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				fprintf(stderr, "bad uid line for pid %u\n", pid);
				break;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				fprintf(stderr, "bad gid line for pid %u\n", pid);
				break;
			}
			*gid = g;
		}
	}
	fclose(status);
}
2310
/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . @r is root on the host (@r_uid is 0)
 *   . they are owned by the same uid, or
 *   . @r_uid maps to 0 in @r's uid map and @v's uid is mapped there too.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, tmpuid;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	return hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0
		&& hostuid_to_ns(v_uid, r, &tmpuid);
}
2336
/*
 * Move the pids listed in @buf into cgroup @cg of controller @contrl on
 * behalf of the writer @tpid (whose uid is @tuid).
 *
 * The pids in @buf are written one by one to a helper child that runs
 * pid_from_ns_wrapper() for @tpid.  The pid carried by the ucred the helper
 * sends back is checked with may_move_pid() and, if allowed, written to the
 * file returned by open_pids_file().  A final pid of -1 asks the helper to
 * exit.  @file is not used here.
 *
 * Returns true if nothing failed, false otherwise.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file;
	const char *ptr;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (cpid == 0) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	for (ptr = buf; sscanf(ptr, "%d", &qpid) == 1; ptr++) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			fprintf(stderr, "%s: error writing pid to child: %s\n",
				__func__, strerror(errno));
			goto out;
		}

		/* payload '0' means cred.pid is the pid the helper sent back */
		if (recv_creds(sock[0], &cred, &v) && v == '0') {
			if (!may_move_pid(tpid, tuid, cred.pid)) {
				fail = true;
				break;
			}
			if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
				fail = true;
		}

		/* continue after the next newline, if there is one */
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
	}

	/* a pid of -1 asks the helper to exit */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		fprintf(stderr, "Warning: failed to ask child to exit\n");

	answer = !fail;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* a failing fclose() means the write did not make it */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
2416
2417 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2418 struct fuse_file_info *fi)
2419 {
2420 struct fuse_context *fc = fuse_get_context();
2421 char *localbuf = NULL;
2422 struct cgfs_files *k = NULL;
2423 struct file_info *f = (struct file_info *)fi->fh;
2424 bool r;
2425
2426 if (f->type != LXC_TYPE_CGFILE) {
2427 fprintf(stderr, "Internal error: directory cache info used in cg_write\n");
2428 return -EIO;
2429 }
2430
2431 if (offset)
2432 return 0;
2433
2434 if (!fc)
2435 return -EIO;
2436
2437 localbuf = alloca(size+1);
2438 localbuf[size] = '\0';
2439 memcpy(localbuf, buf, size);
2440
2441 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2442 size = -EINVAL;
2443 goto out;
2444 }
2445
2446 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2447 size = -EACCES;
2448 goto out;
2449 }
2450
2451 if (strcmp(f->file, "tasks") == 0 ||
2452 strcmp(f->file, "/tasks") == 0 ||
2453 strcmp(f->file, "/cgroup.procs") == 0 ||
2454 strcmp(f->file, "cgroup.procs") == 0)
2455 // special case - we have to translate the pids
2456 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2457 else
2458 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2459
2460 if (!r)
2461 size = -EINVAL;
2462
2463 out:
2464 free_key(k);
2465 return size;
2466 }
2467
2468 int cg_chown(const char *path, uid_t uid, gid_t gid)
2469 {
2470 struct fuse_context *fc = fuse_get_context();
2471 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2472 struct cgfs_files *k = NULL;
2473 const char *cgroup;
2474 int ret;
2475
2476 if (!fc)
2477 return -EIO;
2478
2479 if (strcmp(path, "/cgroup") == 0)
2480 return -EINVAL;
2481
2482 controller = pick_controller_from_path(fc, path);
2483 if (!controller)
2484 return -EINVAL;
2485 cgroup = find_cgroup_in_path(path);
2486 if (!cgroup)
2487 /* this is just /cgroup/controller */
2488 return -EINVAL;
2489
2490 get_cgdir_and_path(cgroup, &cgdir, &last);
2491
2492 if (!last) {
2493 path1 = "/";
2494 path2 = cgdir;
2495 } else {
2496 path1 = cgdir;
2497 path2 = last;
2498 }
2499
2500 if (is_child_cgroup(controller, path1, path2)) {
2501 // get uid, gid, from '/tasks' file and make up a mode
2502 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2503 k = cgfs_get_key(controller, cgroup, "tasks");
2504
2505 } else
2506 k = cgfs_get_key(controller, path1, path2);
2507
2508 if (!k) {
2509 ret = -EINVAL;
2510 goto out;
2511 }
2512
2513 /*
2514 * This being a fuse request, the uid and gid must be valid
2515 * in the caller's namespace. So we can just check to make
2516 * sure that the caller is root in his uid, and privileged
2517 * over the file's current owner.
2518 */
2519 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2520 ret = -EACCES;
2521 goto out;
2522 }
2523
2524 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2525
2526 out:
2527 free_key(k);
2528 free(cgdir);
2529
2530 return ret;
2531 }
2532
2533 int cg_chmod(const char *path, mode_t mode)
2534 {
2535 struct fuse_context *fc = fuse_get_context();
2536 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2537 struct cgfs_files *k = NULL;
2538 const char *cgroup;
2539 int ret;
2540
2541 if (!fc)
2542 return -EIO;
2543
2544 if (strcmp(path, "/cgroup") == 0)
2545 return -EINVAL;
2546
2547 controller = pick_controller_from_path(fc, path);
2548 if (!controller)
2549 return -EINVAL;
2550 cgroup = find_cgroup_in_path(path);
2551 if (!cgroup)
2552 /* this is just /cgroup/controller */
2553 return -EINVAL;
2554
2555 get_cgdir_and_path(cgroup, &cgdir, &last);
2556
2557 if (!last) {
2558 path1 = "/";
2559 path2 = cgdir;
2560 } else {
2561 path1 = cgdir;
2562 path2 = last;
2563 }
2564
2565 if (is_child_cgroup(controller, path1, path2)) {
2566 // get uid, gid, from '/tasks' file and make up a mode
2567 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2568 k = cgfs_get_key(controller, cgroup, "tasks");
2569
2570 } else
2571 k = cgfs_get_key(controller, path1, path2);
2572
2573 if (!k) {
2574 ret = -EINVAL;
2575 goto out;
2576 }
2577
2578 /*
2579 * This being a fuse request, the uid and gid must be valid
2580 * in the caller's namespace. So we can just check to make
2581 * sure that the caller is root in his uid, and privileged
2582 * over the file's current owner.
2583 */
2584 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2585 ret = -EPERM;
2586 goto out;
2587 }
2588
2589 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2590 ret = -EINVAL;
2591 goto out;
2592 }
2593
2594 ret = 0;
2595 out:
2596 free_key(k);
2597 free(cgdir);
2598 return ret;
2599 }
2600
2601 int cg_mkdir(const char *path, mode_t mode)
2602 {
2603 struct fuse_context *fc = fuse_get_context();
2604 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2605 const char *cgroup;
2606 int ret;
2607
2608 if (!fc)
2609 return -EIO;
2610
2611
2612 controller = pick_controller_from_path(fc, path);
2613 if (!controller)
2614 return -EINVAL;
2615
2616 cgroup = find_cgroup_in_path(path);
2617 if (!cgroup)
2618 return -EINVAL;
2619
2620 get_cgdir_and_path(cgroup, &cgdir, &last);
2621 if (!last)
2622 path1 = "/";
2623 else
2624 path1 = cgdir;
2625
2626 pid_t initpid = lookup_initpid_in_store(fc->pid);
2627 if (initpid <= 0)
2628 initpid = fc->pid;
2629 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2630 if (!next)
2631 ret = -EINVAL;
2632 else if (last && strcmp(next, last) == 0)
2633 ret = -EEXIST;
2634 else
2635 ret = -ENOENT;
2636 goto out;
2637 }
2638
2639 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2640 ret = -EACCES;
2641 goto out;
2642 }
2643 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2644 ret = -EACCES;
2645 goto out;
2646 }
2647
2648 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2649
2650 out:
2651 free(cgdir);
2652 free(next);
2653 return ret;
2654 }
2655
2656 int cg_rmdir(const char *path)
2657 {
2658 struct fuse_context *fc = fuse_get_context();
2659 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2660 const char *cgroup;
2661 int ret;
2662
2663 if (!fc)
2664 return -EIO;
2665
2666 controller = pick_controller_from_path(fc, path);
2667 if (!controller)
2668 return -EINVAL;
2669
2670 cgroup = find_cgroup_in_path(path);
2671 if (!cgroup)
2672 return -EINVAL;
2673
2674 get_cgdir_and_path(cgroup, &cgdir, &last);
2675 if (!last) {
2676 ret = -EINVAL;
2677 goto out;
2678 }
2679
2680 pid_t initpid = lookup_initpid_in_store(fc->pid);
2681 if (initpid <= 0)
2682 initpid = fc->pid;
2683 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2684 if (!last || strcmp(next, last) == 0)
2685 ret = -EBUSY;
2686 else
2687 ret = -ENOENT;
2688 goto out;
2689 }
2690
2691 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2692 ret = -EACCES;
2693 goto out;
2694 }
2695 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2696 ret = -EACCES;
2697 goto out;
2698 }
2699
2700 if (!cgfs_remove(controller, cgroup)) {
2701 ret = -EINVAL;
2702 goto out;
2703 }
2704
2705 ret = 0;
2706
2707 out:
2708 free(cgdir);
2709 free(next);
2710 return ret;
2711 }
2712
/* Return true when @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2719
/*
 * Scan the newline-separated text @memstat for the first line that starts
 * with "total_cache", parse the unsigned number following that key and
 * store it, divided by 1024, in *@v. *@v is 0 when no such line exists.
 */
static void get_mem_cached(char *memstat, unsigned long *v)
{
	char *cur = memstat;

	*v = 0;
	while (*cur) {
		char *nl;

		if (startswith(cur, "total_cache")) {
			/* 11 == strlen("total_cache") */
			sscanf(cur + 11, "%lu", v);
			*v /= 1024;
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2737
/*
 * Look up one counter in a multi-line blkio value.
 *
 * Builds the key "<major>:<minor> <iotype>", finds the first line of @str
 * starting with it and parses the unsigned number after the key into *@v.
 * *@v is 0 when no line matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = { 0 };
	size_t keylen;
	char *cur = str;

	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;
	while (*cur) {
		char *nl;

		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		nl = strchr(cur, '\n');
		if (!nl)
			return;
		cur = nl + 1;
	}
}
2760
2761 static int read_file(const char *path, char *buf, size_t size,
2762 struct file_info *d)
2763 {
2764 size_t linelen = 0, total_len = 0, rv = 0;
2765 char *line = NULL;
2766 char *cache = d->buf;
2767 size_t cache_size = d->buflen;
2768 FILE *f = fopen(path, "r");
2769 if (!f)
2770 return 0;
2771
2772 while (getline(&line, &linelen, f) != -1) {
2773 size_t l = snprintf(cache, cache_size, "%s", line);
2774 if (l < 0) {
2775 perror("Error writing to cache");
2776 rv = 0;
2777 goto err;
2778 }
2779 if (l >= cache_size) {
2780 fprintf(stderr, "Internal error: truncated write to cache\n");
2781 rv = 0;
2782 goto err;
2783 }
2784 cache += l;
2785 cache_size -= l;
2786 total_len += l;
2787 }
2788
2789 d->size = total_len;
2790 if (total_len > size ) total_len = size;
2791
2792 /* read from off 0 */
2793 memcpy(buf, d->buf, total_len);
2794 rv = total_len;
2795 err:
2796 fclose(f);
2797 free(line);
2798 return rv;
2799 }
2800
2801 /*
2802 * FUSE ops for /proc
2803 */
2804
/*
 * Return the "memory.limit_in_bytes" value of @cgroup parsed as a base-10
 * unsigned long, or (unsigned long)-1 when the value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	unsigned long limit = -1;
	char *raw = NULL;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &raw))
		limit = strtoul(raw, NULL, 10);
	free(raw);

	return limit;
}
2817
/*
 * Return the smallest memory limit found on the way from @cgroup up to "/".
 *
 * Starts with the limit of @cgroup itself and replaces it whenever a parent
 * (obtained with dirname() on a stack copy of the path) reports a readable,
 * smaller limit.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	char *cur = strdupa(cgroup);
	unsigned long lowest = get_memlimit(cur);

	while (strcmp(cur, "/") != 0) {
		unsigned long limit;

		cur = dirname(cur);
		limit = get_memlimit(cur);
		/* -1 means "could not be read" and never lowers the result */
		if (limit != -1 && limit < lowest)
			lowest = limit;
	}

	return lowest;
}
2834
2835 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
2836 struct fuse_file_info *fi)
2837 {
2838 struct fuse_context *fc = fuse_get_context();
2839 struct file_info *d = (struct file_info *)fi->fh;
2840 char *cg;
2841 char *memusage_str = NULL, *memstat_str = NULL,
2842 *memswlimit_str = NULL, *memswusage_str = NULL,
2843 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
2844 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
2845 cached = 0, hosttotal = 0;
2846 char *line = NULL;
2847 size_t linelen = 0, total_len = 0, rv = 0;
2848 char *cache = d->buf;
2849 size_t cache_size = d->buflen;
2850 FILE *f = NULL;
2851
2852 if (offset){
2853 if (offset > d->size)
2854 return -EINVAL;
2855 if (!d->cached)
2856 return 0;
2857 int left = d->size - offset;
2858 total_len = left > size ? size: left;
2859 memcpy(buf, cache + offset, total_len);
2860 return total_len;
2861 }
2862
2863 pid_t initpid = lookup_initpid_in_store(fc->pid);
2864 if (initpid <= 0)
2865 initpid = fc->pid;
2866 cg = get_pid_cgroup(initpid, "memory");
2867 if (!cg)
2868 return read_file("/proc/meminfo", buf, size, d);
2869
2870 memlimit = get_min_memlimit(cg);
2871 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
2872 goto err;
2873 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
2874 goto err;
2875
2876 // Following values are allowed to fail, because swapaccount might be turned
2877 // off for current kernel
2878 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
2879 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
2880 {
2881 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
2882 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
2883 goto err;
2884 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
2885 goto err;
2886
2887 memswlimit = strtoul(memswlimit_str, NULL, 10);
2888 memswusage = strtoul(memswusage_str, NULL, 10);
2889
2890 if (!strcmp(memswlimit_str, memswlimit_default_str))
2891 memswlimit = 0;
2892 if (!strcmp(memswusage_str, memswusage_default_str))
2893 memswusage = 0;
2894
2895 memswlimit = memswlimit / 1024;
2896 memswusage = memswusage / 1024;
2897 }
2898
2899 memusage = strtoul(memusage_str, NULL, 10);
2900 memlimit /= 1024;
2901 memusage /= 1024;
2902
2903 get_mem_cached(memstat_str, &cached);
2904
2905 f = fopen("/proc/meminfo", "r");
2906 if (!f)
2907 goto err;
2908
2909 while (getline(&line, &linelen, f) != -1) {
2910 size_t l;
2911 char *printme, lbuf[100];
2912
2913 memset(lbuf, 0, 100);
2914 if (startswith(line, "MemTotal:")) {
2915 sscanf(line+14, "%lu", &hosttotal);
2916 if (hosttotal < memlimit)
2917 memlimit = hosttotal;
2918 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
2919 printme = lbuf;
2920 } else if (startswith(line, "MemFree:")) {
2921 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
2922 printme = lbuf;
2923 } else if (startswith(line, "MemAvailable:")) {
2924 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
2925 printme = lbuf;
2926 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
2927 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
2928 printme = lbuf;
2929 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
2930 snprintf(lbuf, 100, "SwapFree: %8lu kB\n",
2931 (memswlimit - memlimit) - (memswusage - memusage));
2932 printme = lbuf;
2933 } else if (startswith(line, "Buffers:")) {
2934 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
2935 printme = lbuf;
2936 } else if (startswith(line, "Cached:")) {
2937 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
2938 printme = lbuf;
2939 } else if (startswith(line, "SwapCached:")) {
2940 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
2941 printme = lbuf;
2942 } else
2943 printme = line;
2944
2945 l = snprintf(cache, cache_size, "%s", printme);
2946 if (l < 0) {
2947 perror("Error writing to cache");
2948 rv = 0;
2949 goto err;
2950
2951 }
2952 if (l >= cache_size) {
2953 fprintf(stderr, "Internal error: truncated write to cache\n");
2954 rv = 0;
2955 goto err;
2956 }
2957
2958 cache += l;
2959 cache_size -= l;
2960 total_len += l;
2961 }
2962
2963 d->cached = 1;
2964 d->size = total_len;
2965 if (total_len > size ) total_len = size;
2966 memcpy(buf, d->buf, total_len);
2967
2968 rv = total_len;
2969 err:
2970 if (f)
2971 fclose(f);
2972 free(line);
2973 free(cg);
2974 free(memusage_str);
2975 free(memswlimit_str);
2976 free(memswusage_str);
2977 free(memstat_str);
2978 free(memswlimit_default_str);
2979 free(memswusage_default_str);
2980 return rv;
2981 }
2982
/*
 * Fetch the "cpuset.cpus" value of cgroup @cg.
 * On success the result is a newly allocated string that the caller must
 * free; NULL is returned when the value cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
2995
2996 bool cpu_in_cpuset(int cpu, const char *cpuset);
2997
/*
 * Return true when @line is a "processor : N" line and cpu N is accepted by
 * cpu_in_cpuset() for @cpuset; any other line yields false.
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3006
/*
 * Tell whether @line is a "processor : N" line as found in /proc/cpuinfo,
 * i.e. whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3018
3019 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3020 struct fuse_file_info *fi)
3021 {
3022 struct fuse_context *fc = fuse_get_context();
3023 struct file_info *d = (struct file_info *)fi->fh;
3024 char *cg;
3025 char *cpuset = NULL;
3026 char *line = NULL;
3027 size_t linelen = 0, total_len = 0, rv = 0;
3028 bool am_printing = false;
3029 int curcpu = -1;
3030 char *cache = d->buf;
3031 size_t cache_size = d->buflen;
3032 FILE *f = NULL;
3033
3034 if (offset){
3035 if (offset > d->size)
3036 return -EINVAL;
3037 if (!d->cached)
3038 return 0;
3039 int left = d->size - offset;
3040 total_len = left > size ? size: left;
3041 memcpy(buf, cache + offset, total_len);
3042 return total_len;
3043 }
3044
3045 pid_t initpid = lookup_initpid_in_store(fc->pid);
3046 if (initpid <= 0)
3047 initpid = fc->pid;
3048 cg = get_pid_cgroup(initpid, "cpuset");
3049 if (!cg)
3050 return read_file("proc/cpuinfo", buf, size, d);
3051
3052 cpuset = get_cpuset(cg);
3053 if (!cpuset)
3054 goto err;
3055
3056 f = fopen("/proc/cpuinfo", "r");
3057 if (!f)
3058 goto err;
3059
3060 while (getline(&line, &linelen, f) != -1) {
3061 size_t l;
3062 if (is_processor_line(line)) {
3063 am_printing = cpuline_in_cpuset(line, cpuset);
3064 if (am_printing) {
3065 curcpu ++;
3066 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3067 if (l < 0) {
3068 perror("Error writing to cache");
3069 rv = 0;
3070 goto err;
3071 }
3072 if (l >= cache_size) {
3073 fprintf(stderr, "Internal error: truncated write to cache\n");
3074 rv = 0;
3075 goto err;
3076 }
3077 cache += l;
3078 cache_size -= l;
3079 total_len += l;
3080 }
3081 continue;
3082 }
3083 if (am_printing) {
3084 l = snprintf(cache, cache_size, "%s", line);
3085 if (l < 0) {
3086 perror("Error writing to cache");
3087 rv = 0;
3088 goto err;
3089 }
3090 if (l >= cache_size) {
3091 fprintf(stderr, "Internal error: truncated write to cache\n");
3092 rv = 0;
3093 goto err;
3094 }
3095 cache += l;
3096 cache_size -= l;
3097 total_len += l;
3098 }
3099 }
3100
3101 d->cached = 1;
3102 d->size = total_len;
3103 if (total_len > size ) total_len = size;
3104
3105 /* read from off 0 */
3106 memcpy(buf, d->buf, total_len);
3107 rv = total_len;
3108 err:
3109 if (f)
3110 fclose(f);
3111 free(line);
3112 free(cpuset);
3113 free(cg);
3114 return rv;
3115 }
3116
3117 static int proc_stat_read(char *buf, size_t size, off_t offset,
3118 struct fuse_file_info *fi)
3119 {
3120 struct fuse_context *fc = fuse_get_context();
3121 struct file_info *d = (struct file_info *)fi->fh;
3122 char *cg;
3123 char *cpuset = NULL;
3124 char *line = NULL;
3125 size_t linelen = 0, total_len = 0, rv = 0;
3126 int curcpu = -1; /* cpu numbering starts at 0 */
3127 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3128 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3129 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3130 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3131 char cpuall[CPUALL_MAX_SIZE];
3132 /* reserve for cpu all */
3133 char *cache = d->buf + CPUALL_MAX_SIZE;
3134 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3135 FILE *f = NULL;
3136
3137 if (offset){
3138 if (offset > d->size)
3139 return -EINVAL;
3140 if (!d->cached)
3141 return 0;
3142 int left = d->size - offset;
3143 total_len = left > size ? size: left;
3144 memcpy(buf, d->buf + offset, total_len);
3145 return total_len;
3146 }
3147
3148 pid_t initpid = lookup_initpid_in_store(fc->pid);
3149 if (initpid <= 0)
3150 initpid = fc->pid;
3151 cg = get_pid_cgroup(initpid, "cpuset");
3152 if (!cg)
3153 return read_file("/proc/stat", buf, size, d);
3154
3155 cpuset = get_cpuset(cg);
3156 if (!cpuset)
3157 goto err;
3158
3159 f = fopen("/proc/stat", "r");
3160 if (!f)
3161 goto err;
3162
3163 //skip first line
3164 if (getline(&line, &linelen, f) < 0) {
3165 fprintf(stderr, "proc_stat_read read first line failed\n");
3166 goto err;
3167 }
3168
3169 while (getline(&line, &linelen, f) != -1) {
3170 size_t l;
3171 int cpu;
3172 char cpu_char[10]; /* That's a lot of cores */
3173 char *c;
3174
3175 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3176 /* not a ^cpuN line containing a number N, just print it */
3177 l = snprintf(cache, cache_size, "%s", line);
3178 if (l < 0) {
3179 perror("Error writing to cache");
3180 rv = 0;
3181 goto err;
3182 }
3183 if (l >= cache_size) {
3184 fprintf(stderr, "Internal error: truncated write to cache\n");
3185 rv = 0;
3186 goto err;
3187 }
3188 cache += l;
3189 cache_size -= l;
3190 total_len += l;
3191 continue;
3192 }
3193
3194 if (sscanf(cpu_char, "%d", &cpu) != 1)
3195 continue;
3196 if (!cpu_in_cpuset(cpu, cpuset))
3197 continue;
3198 curcpu ++;
3199
3200 c = strchr(line, ' ');
3201 if (!c)
3202 continue;
3203 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3204 if (l < 0) {
3205 perror("Error writing to cache");
3206 rv = 0;
3207 goto err;
3208
3209 }
3210 if (l >= cache_size) {
3211 fprintf(stderr, "Internal error: truncated write to cache\n");
3212 rv = 0;
3213 goto err;
3214 }
3215
3216 cache += l;
3217 cache_size -= l;
3218 total_len += l;
3219
3220 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3221 &softirq, &steal, &guest) != 9)
3222 continue;
3223 user_sum += user;
3224 nice_sum += nice;
3225 system_sum += system;
3226 idle_sum += idle;
3227 iowait_sum += iowait;
3228 irq_sum += irq;
3229 softirq_sum += softirq;
3230 steal_sum += steal;
3231 guest_sum += guest;
3232 }
3233
3234 cache = d->buf;
3235
3236 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3237 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3238 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3239 memcpy(cache, cpuall, cpuall_len);
3240 cache += cpuall_len;
3241 } else{
3242 /* shouldn't happen */
3243 fprintf(stderr, "proc_stat_read copy cpuall failed, cpuall_len=%d\n", cpuall_len);
3244 cpuall_len = 0;
3245 }
3246
3247 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3248 total_len += cpuall_len;
3249 d->cached = 1;
3250 d->size = total_len;
3251 if (total_len > size ) total_len = size;
3252
3253 memcpy(buf, d->buf, total_len);
3254 rv = total_len;
3255
3256 err:
3257 if (f)
3258 fclose(f);
3259 free(line);
3260 free(cpuset);
3261 free(cg);
3262 return rv;
3263 }
3264
/*
 * Return how many seconds ago /proc/<initpid> last had its status changed
 * (st_ctime), where <initpid> is the reaper recorded for @pid in the initpid
 * store. Returns 0 when no reaper is known or the directory cannot be
 * examined.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	int n;
	pid_t reaper = lookup_initpid_in_store(pid);

	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, sizeof(procdir), "/proc/%d", reaper);
	if (n < 0 || n >= (int)sizeof(procdir))
		return 0;

	if (lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3285
/*
 * Return the "cpuacct.usage" value of the cpuacct cgroup of @task's reaper,
 * divided by 1000000000. Returns 0 when the reaper, its cgroup or the value
 * cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cg = NULL, *raw = NULL;
	unsigned long busy = 0;

	if (initpid <= 0)
		return 0;

	cg = get_pid_cgroup(initpid, "cpuacct");
	if (cg && cgfs_get_value("cpuacct", cg, "cpuacct.usage", &raw))
		busy = strtoul(raw, NULL, 10) / 1000000000;

	free(cg);
	free(raw);
	return busy;
}
3308
#if RELOADTEST
/*
 * Reload-test helper: create (or truncate) an empty file called "iwashere"
 * in the current working directory. Exits the process with status 1 when
 * the working directory cannot be determined.
 */
void iwashere(void)
{
	char *cwd = get_current_dir_name();
	char *marker;
	size_t marker_len;
	int fd;

	if (!cwd)
		exit(1);

	marker_len = strlen(cwd) + strlen("/iwashere") + 1;
	marker = alloca(marker_len);
	snprintf(marker, marker_len, "%s/iwashere", cwd);
	free(cwd);

	fd = creat(marker, 0755);
	if (fd < 0)
		return;
	close(fd);
}
#endif
3327
3328 /*
3329 * We read /proc/uptime and reuse its second field.
3330 * For the first field, we use the mtime for the reaper for
3331 * the calling pid as returned by getreaperage
3332 */
3333 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3334 struct fuse_file_info *fi)
3335 {
3336 struct fuse_context *fc = fuse_get_context();
3337 struct file_info *d = (struct file_info *)fi->fh;
3338 long int reaperage = getreaperage(fc->pid);
3339 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3340 char *cache = d->buf;
3341 size_t total_len = 0;
3342
3343 #if RELOADTEST
3344 iwashere();
3345 #endif
3346
3347 if (offset){
3348 if (offset > d->size)
3349 return -EINVAL;
3350 if (!d->cached)
3351 return 0;
3352 int left = d->size - offset;
3353 total_len = left > size ? size: left;
3354 memcpy(buf, cache + offset, total_len);
3355 return total_len;
3356 }
3357
3358 idletime = reaperage - busytime;
3359 if (idletime > reaperage)
3360 idletime = reaperage;
3361
3362 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3363 if (total_len < 0){
3364 perror("Error writing to cache");
3365 return 0;
3366 }
3367
3368 d->size = (int)total_len;
3369 d->cached = 1;
3370
3371 if (total_len > size) total_len = size;
3372
3373 memcpy(buf, d->buf, total_len);
3374 return total_len;
3375 }
3376
3377 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3378 struct fuse_file_info *fi)
3379 {
3380 char dev_name[72];
3381 struct fuse_context *fc = fuse_get_context();
3382 struct file_info *d = (struct file_info *)fi->fh;
3383 char *cg;
3384 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3385 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3386 unsigned long read = 0, write = 0;
3387 unsigned long read_merged = 0, write_merged = 0;
3388 unsigned long read_sectors = 0, write_sectors = 0;
3389 unsigned long read_ticks = 0, write_ticks = 0;
3390 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3391 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3392 char *cache = d->buf;
3393 size_t cache_size = d->buflen;
3394 char *line = NULL;
3395 size_t linelen = 0, total_len = 0, rv = 0;
3396 unsigned int major = 0, minor = 0;
3397 int i = 0;
3398 FILE *f = NULL;
3399
3400 if (offset){
3401 if (offset > d->size)
3402 return -EINVAL;
3403 if (!d->cached)
3404 return 0;
3405 int left = d->size - offset;
3406 total_len = left > size ? size: left;
3407 memcpy(buf, cache + offset, total_len);
3408 return total_len;
3409 }
3410
3411 pid_t initpid = lookup_initpid_in_store(fc->pid);
3412 if (initpid <= 0)
3413 initpid = fc->pid;
3414 cg = get_pid_cgroup(initpid, "blkio");
3415 if (!cg)
3416 return read_file("/proc/diskstats", buf, size, d);
3417
3418 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced", &io_serviced_str))
3419 goto err;
3420 if (!cgfs_get_value("blkio", cg, "blkio.io_merged", &io_merged_str))
3421 goto err;
3422 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes", &io_service_bytes_str))
3423 goto err;
3424 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time", &io_wait_time_str))
3425 goto err;
3426 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time", &io_service_time_str))
3427 goto err;
3428
3429
3430 f = fopen("/proc/diskstats", "r");
3431 if (!f)
3432 goto err;
3433
3434 while (getline(&line, &linelen, f) != -1) {
3435 size_t l;
3436 char *printme, lbuf[256];
3437
3438 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3439 if(i == 3){
3440 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3441 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3442 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3443 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3444 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3445 read_sectors = read_sectors/512;
3446 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3447 write_sectors = write_sectors/512;
3448
3449 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3450 rd_svctm = rd_svctm/1000000;
3451 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3452 rd_wait = rd_wait/1000000;
3453 read_ticks = rd_svctm + rd_wait;
3454
3455 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3456 wr_svctm = wr_svctm/1000000;
3457 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3458 wr_wait = wr_wait/1000000;
3459 write_ticks = wr_svctm + wr_wait;
3460
3461 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3462 tot_ticks = tot_ticks/1000000;
3463 }else{
3464 continue;
3465 }
3466
3467 memset(lbuf, 0, 256);
3468 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks) {
3469 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3470 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3471 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3472 printme = lbuf;
3473 } else
3474 continue;
3475
3476 l = snprintf(cache, cache_size, "%s", printme);
3477 if (l < 0) {
3478 perror("Error writing to fuse buf");
3479 rv = 0;
3480 goto err;
3481 }
3482 if (l >= cache_size) {
3483 fprintf(stderr, "Internal error: truncated write to cache\n");
3484 rv = 0;
3485 goto err;
3486 }
3487 cache += l;
3488 cache_size -= l;
3489 total_len += l;
3490 }
3491
3492 d->cached = 1;
3493 d->size = total_len;
3494 if (total_len > size ) total_len = size;
3495 memcpy(buf, d->buf, total_len);
3496
3497 rv = total_len;
3498 err:
3499 free(cg);
3500 if (f)
3501 fclose(f);
3502 free(line);
3503 free(io_serviced_str);
3504 free(io_merged_str);
3505 free(io_service_bytes_str);
3506 free(io_wait_time_str);
3507 free(io_service_time_str);
3508 return rv;
3509 }
3510
/*
 * Return the number of bytes in the file @which, determined by reading it
 * line by line and adding up the line lengths. Returns 0 when the file
 * cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	FILE *fp = fopen(which, "r");
	char *linebuf = NULL;
	size_t cap = 0;
	ssize_t total = 0;

	if (!fp)
		return 0;

	for (;;) {
		ssize_t n = getline(&linebuf, &cap, fp);

		if (n == -1)
			break;
		total += n;
	}

	fclose(fp);
	free(linebuf);

	return total;
}
3527
/*
 * FUSE getattr handler for /proc.
 *
 * Fills @sb for the directory "/proc" (mode 0555, two links) and for the
 * regular files meminfo, cpuinfo, uptime, stat and diskstats below it (mode
 * 0444, one link, size 0). Owner and group are 0 and all three timestamps
 * are set to the current time.
 *
 * Returns 0 on success, -EINVAL when the clock cannot be read and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(files) / sizeof(files[0]); idx++) {
		if (strcmp(path, files[idx]) != 0)
			continue;
		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
3555
3556 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
3557 struct fuse_file_info *fi)
3558 {
3559 if (filler(buf, "cpuinfo", NULL, 0) != 0 ||
3560 filler(buf, "meminfo", NULL, 0) != 0 ||
3561 filler(buf, "stat", NULL, 0) != 0 ||
3562 filler(buf, "uptime", NULL, 0) != 0 ||
3563 filler(buf, "diskstats", NULL, 0) != 0)
3564 return -EINVAL;
3565 return 0;
3566 }
3567
3568 int proc_open(const char *path, struct fuse_file_info *fi)
3569 {
3570 int type = -1;
3571 struct file_info *info;
3572
3573 if (strcmp(path, "/proc/meminfo") == 0)
3574 type = LXC_TYPE_PROC_MEMINFO;
3575 else if (strcmp(path, "/proc/cpuinfo") == 0)
3576 type = LXC_TYPE_PROC_CPUINFO;
3577 else if (strcmp(path, "/proc/uptime") == 0)
3578 type = LXC_TYPE_PROC_UPTIME;
3579 else if (strcmp(path, "/proc/stat") == 0)
3580 type = LXC_TYPE_PROC_STAT;
3581 else if (strcmp(path, "/proc/diskstats") == 0)
3582 type = LXC_TYPE_PROC_DISKSTATS;
3583 if (type == -1)
3584 return -ENOENT;
3585
3586 info = malloc(sizeof(*info));
3587 if (!info)
3588 return -ENOMEM;
3589
3590 memset(info, 0, sizeof(*info));
3591 info->type = type;
3592
3593 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
3594 do {
3595 info->buf = malloc(info->buflen);
3596 } while (!info->buf);
3597 memset(info->buf, 0, info->buflen);
3598 /* set actual size to buffer size */
3599 info->size = info->buflen;
3600
3601 fi->fh = (unsigned long)info;
3602 return 0;
3603 }
3604
3605 int proc_release(const char *path, struct fuse_file_info *fi)
3606 {
3607 struct file_info *f = (struct file_info *)fi->fh;
3608
3609 do_release_file_info(f);
3610 return 0;
3611 }
3612
3613 int proc_read(const char *path, char *buf, size_t size, off_t offset,
3614 struct fuse_file_info *fi)
3615 {
3616 struct file_info *f = (struct file_info *) fi->fh;
3617
3618 switch (f->type) {
3619 case LXC_TYPE_PROC_MEMINFO:
3620 return proc_meminfo_read(buf, size, offset, fi);
3621 case LXC_TYPE_PROC_CPUINFO:
3622 return proc_cpuinfo_read(buf, size, offset, fi);
3623 case LXC_TYPE_PROC_UPTIME:
3624 return proc_uptime_read(buf, size, offset, fi);
3625 case LXC_TYPE_PROC_STAT:
3626 return proc_stat_read(buf, size, offset, fi);
3627 case LXC_TYPE_PROC_DISKSTATS:
3628 return proc_diskstats_read(buf, size, offset, fi);
3629 default:
3630 return -EINVAL;
3631 }
3632 }
3633
/*
 * Constructor: read /proc/self/cgroup and register every hierarchy found
 * there with store_hierarchy().
 *
 * Each line is split at its first and at its last ':'; the text before the
 * first colon and the text between the two colons are passed on. Parsing
 * stops at the first malformed line or store_hierarchy() failure, in which
 * case print_subsystems() is skipped.
 */
static void __attribute__((constructor)) collect_subsystems(void)
{
	FILE *f = fopen("/proc/self/cgroup", "r");
	char *line = NULL;
	size_t len = 0;

	if (!f) {
		fprintf(stderr, "Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	while (getline(&line, &len, f) != -1) {
		char *sep, *controllers, *tail;

		sep = strchr(line, ':');
		if (!sep)
			goto out;
		*sep = '\0';
		controllers = sep + 1;

		tail = strrchr(controllers, ':');
		if (!tail)
			goto out;
		*tail = '\0';

		if (!store_hierarchy(line, controllers))
			goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
}
3667
3668 static void __attribute__((destructor)) free_subsystems(void)
3669 {
3670 int i;
3671
3672 for (i = 0; i < num_hierarchies; i++)
3673 if (hierarchies[i])
3674 free(hierarchies[i]);
3675 free(hierarchies);
3676 }