]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
Merge pull request #138 from brauner/2016-08-29/libtool_module
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #include <dirent.h>
12 #include <errno.h>
13 #include <fcntl.h>
14 #include <fuse.h>
15 #include <libgen.h>
16 #include <pthread.h>
17 #include <sched.h>
18 #include <stdbool.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <time.h>
23 #include <unistd.h>
24 #include <wait.h>
25 #include <linux/sched.h>
26 #include <sys/epoll.h>
27 #include <sys/mman.h>
28 #include <sys/mount.h>
29 #include <sys/param.h>
30 #include <sys/socket.h>
31 #include <sys/syscall.h>
32
33 #include "bindings.h"
34 #include "config.h" // for VERSION
35
/*
 * pivot_root() shim.
 *
 * When the C library already provides pivot_root() (HAVE_PIVOT_ROOT), only
 * declare it.  Otherwise supply a local wrapper that issues the raw system
 * call, or fails with ENOSYS when the syscall number is not known at build
 * time.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
50
/*
 * Tags for the kinds of entries handled by this file: cgroup directories and
 * cgroup files, plus one tag per emulated /proc file.
 * NOTE(review): presumably these are the values stored in file_info.type;
 * the assignments are not visible in this part of the file - confirm.
 */
enum {
	LXC_TYPE_CGDIR,
	LXC_TYPE_CGFILE,
	LXC_TYPE_PROC_MEMINFO,
	LXC_TYPE_PROC_CPUINFO,
	LXC_TYPE_PROC_UPTIME,
	LXC_TYPE_PROC_STAT,
	LXC_TYPE_PROC_DISKSTATS,
	LXC_TYPE_PROC_SWAPS,
};
61
/*
 * Per-file bookkeeping record.
 * NOTE(review): the code that fills and consumes this struct is outside this
 * part of the file; the field notes below that go beyond the original
 * comments are assumptions to confirm.
 */
struct file_info {
	char *controller;	/* presumably the cgroup controller name - confirm */
	char *cgroup;		/* presumably the cgroup path - confirm */
	char *file;		/* presumably the file name inside the cgroup - confirm */
	int type;		/* presumably one of the LXC_TYPE_* tags - confirm */
	char *buf;  // unused as of yet
	int buflen;
	int size; //actual data size
	int cached;
};
72
73 /* reserve buffer size, for cpuall in /proc/stat */
74 #define BUF_RESERVE_SIZE 256
75
76 /*
77 * A table caching which pid is init for a pid namespace.
78 * When looking up which pid is init for $qpid, we first
79 * 1. Stat /proc/$qpid/ns/pid.
80 * 2. Check whether the ino_t is in our store.
81 * a. if not, fork a child in qpid's ns to send us
82 * ucred.pid = 1, and read the initpid. Cache
83 * initpid and creation time for /proc/initpid
84 * in a new store entry.
85 * b. if so, verify that /proc/initpid still matches
86 * what we have saved. If not, clear the store
87 * entry and go back to a. If so, return the
88 * cached initpid.
89 */
/*
 * One cache entry mapping a pid namespace to its init process.  Entries are
 * chained per hash bucket through @next.
 */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // st_ctime of /proc/$initpid when the entry was saved
	struct pidns_init_store *next;	// next entry in the same hash bucket
	long int lastcheck; // time(NULL) of the last save or successful lookup; used for pruning
};
97
98 /* lol - look at how they are allocated in the kernel */
99 #define PIDNS_HASH_SIZE 4096
100 #define HASH(x) ((x) % PIDNS_HASH_SIZE)
101
102 static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
103 static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Acquire @l.  A failure of pthread_mutex_lock() is treated as fatal: the
 * error is logged and the process exits with status 1.
 */
static void lock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_lock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
113
114 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
115 * Number of hierarchies mounted. */
116 static int num_hierarchies;
117
118 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
120 * Initialized via __constructor__ collect_and_mount_subsystems(). */
121 static char **hierarchies;
122
123 /* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
124 * Open file descriptors:
125 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
126 * private mount namespace.
127 * Initialized via __constructor__ collect_and_mount_subsystems().
128 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
129 * mounts and respective files in the private namespace even when located in
130 * another namespace using the *at() family of functions
131 * {openat(), fchownat(), ...}. */
132 static int *fd_hierarchies;
133
/*
 * Release @l.  A failure of pthread_mutex_unlock() is treated as fatal: the
 * error is logged and the process exits with status 1.
 */
static void unlock_mutex(pthread_mutex_t *l)
{
	int err = pthread_mutex_unlock(l);

	if (err == 0)
		return;

	lxcfs_error("returned:%d %s\n", err, strerror(err));
	exit(1);
}
143
144 static void store_lock(void)
145 {
146 lock_mutex(&pidns_store_mutex);
147 }
148
149 static void store_unlock(void)
150 {
151 unlock_mutex(&pidns_store_mutex);
152 }
153
154 /* Must be called under store_lock */
155 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
156 {
157 struct stat initsb;
158 char fnam[100];
159
160 snprintf(fnam, 100, "/proc/%d", e->initpid);
161 if (stat(fnam, &initsb) < 0)
162 return false;
163
164 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
165 initsb.st_ctime, e->initpid);
166
167 if (e->ctime != initsb.st_ctime)
168 return false;
169 return true;
170 }
171
172 /* Must be called under store_lock */
173 static void remove_initpid(struct pidns_init_store *e)
174 {
175 struct pidns_init_store *tmp;
176 int h;
177
178 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
179
180 h = HASH(e->ino);
181 if (pidns_hash_table[h] == e) {
182 pidns_hash_table[h] = e->next;
183 free(e);
184 return;
185 }
186
187 tmp = pidns_hash_table[h];
188 while (tmp) {
189 if (tmp->next == e) {
190 tmp->next = e->next;
191 free(e);
192 return;
193 }
194 tmp = tmp->next;
195 }
196 }
197
198 #define PURGE_SECS 5
199 /* Must be called under store_lock */
200 static void prune_initpid_store(void)
201 {
202 static long int last_prune = 0;
203 struct pidns_init_store *e, *prev, *delme;
204 long int now, threshold;
205 int i;
206
207 if (!last_prune) {
208 last_prune = time(NULL);
209 return;
210 }
211 now = time(NULL);
212 if (now < last_prune + PURGE_SECS)
213 return;
214
215 lxcfs_debug("%s\n", "Pruning.");
216
217 last_prune = now;
218 threshold = now - 2 * PURGE_SECS;
219
220 for (i = 0; i < PIDNS_HASH_SIZE; i++) {
221 for (prev = NULL, e = pidns_hash_table[i]; e; ) {
222 if (e->lastcheck < threshold) {
223
224 lxcfs_debug("Removing cached entry for %d.\n", e->initpid);
225
226 delme = e;
227 if (prev)
228 prev->next = e->next;
229 else
230 pidns_hash_table[i] = e->next;
231 e = e->next;
232 free(delme);
233 } else {
234 prev = e;
235 e = e->next;
236 }
237 }
238 }
239 }
240
241 /* Must be called under store_lock */
242 static void save_initpid(struct stat *sb, pid_t pid)
243 {
244 struct pidns_init_store *e;
245 char fpath[100];
246 struct stat procsb;
247 int h;
248
249 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
250
251 snprintf(fpath, 100, "/proc/%d", pid);
252 if (stat(fpath, &procsb) < 0)
253 return;
254 do {
255 e = malloc(sizeof(*e));
256 } while (!e);
257 e->ino = sb->st_ino;
258 e->initpid = pid;
259 e->ctime = procsb.st_ctime;
260 h = HASH(e->ino);
261 e->next = pidns_hash_table[h];
262 e->lastcheck = time(NULL);
263 pidns_hash_table[h] = e;
264 }
265
266 /*
267 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
268 * entry for the inode number and creation time. Verify that the init pid
269 * is still valid. If not, remove it. Return the entry if valid, NULL
270 * otherwise.
271 * Must be called under store_lock
272 */
273 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
274 {
275 int h = HASH(sb->st_ino);
276 struct pidns_init_store *e = pidns_hash_table[h];
277
278 while (e) {
279 if (e->ino == sb->st_ino) {
280 if (initpid_still_valid(e, sb)) {
281 e->lastcheck = time(NULL);
282 return e;
283 }
284 remove_initpid(e);
285 return NULL;
286 }
287 e = e->next;
288 }
289
290 return NULL;
291 }
292
/*
 * Report whether @path, resolved relative to the directory descriptor @fd,
 * is a directory.
 *
 * Returns 1 if fstatat() succeeds and the entry is a directory, 0 otherwise
 * (including on any stat failure).
 */
static int is_dir(const char *path, int fd)
{
	struct stat statbuf;
	int ret;

	/* The last argument of fstatat() is a flags word, not a descriptor.
	 * Passing @fd there made the call fail with EINVAL for almost every
	 * descriptor value, so directories were never detected. */
	ret = fstatat(fd, path, &statbuf, 0);
	if (ret == 0 && S_ISDIR(statbuf.st_mode))
		return 1;
	return 0;
}
301
/*
 * Duplicate @str on the heap, retrying strdup() until it succeeds.
 * Returns NULL only when @str itself is NULL.  The caller frees the result.
 */
static char *must_copy_string(const char *str)
{
	char *copy;

	if (!str)
		return NULL;

	while (!(copy = strdup(str)))
		;

	return copy;
}
313
/* Strip every '\n' at the end of @s, in place. */
static inline void drop_trailing_newlines(char *s)
{
	int end = strlen(s);

	while (end > 0 && s[end - 1] == '\n') {
		end--;
		s[end] = '\0';
	}
}
321
#define BATCH_SIZE 50
/*
 * Make sure *@mem can hold @newlen bytes, growing in BATCH_SIZE steps.
 *
 * The buffer is (re)allocated when it does not exist yet or when @newlen
 * falls into a later batch than @oldlen; otherwise it is left untouched.
 * realloc() is retried until it succeeds.
 */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int have = (oldlen / BATCH_SIZE) + 1;
	int want = (newlen / BATCH_SIZE) + 1;
	char *grown;

	if (*mem && want <= have)
		return;

	do {
		grown = realloc(*mem, want * BATCH_SIZE);
	} while (!grown);

	*mem = grown;
}
/*
 * Append @line (@linelen bytes plus its terminating NUL) to *@contents,
 * whose current length is *@len, growing the buffer as needed and updating
 * *@len to the new length (terminator not counted).
 */
static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
{
	size_t oldlen = *len;
	size_t total = oldlen + linelen;

	dorealloc(contents, oldlen, total + 1);
	memcpy(*contents + oldlen, line, linelen + 1);
	*len = total;
}
343
/*
 * Read everything available from @fd into a newly allocated string.
 *
 * @from is not used by this function.
 * @fd is consumed: it is closed on every path, either through fclose() or,
 * if the stream cannot be created, directly.
 *
 * Trailing newlines are stripped from the result.  Returns NULL if the
 * stream cannot be opened or nothing could be read; otherwise the caller
 * frees the returned buffer.
 */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	size_t len = 0, fulllen = 0;
	ssize_t linelen;
	FILE *f;

	f = fdopen(fd, "r");
	if (!f) {
		/* fdopen() does not close the descriptor on failure; without
		 * this the caller's fd would leak. */
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
365
/*
 * Write @string to the already-open descriptor @fd.
 *
 * @fnam is not used by this function.
 * @fd is consumed: it is closed on every path, either through fclose() or,
 * if the stream cannot be created, directly.
 *
 * Returns true only if the whole string was written and the stream was
 * closed without error.
 */
static bool write_string(const char *fnam, const char *string, int fd)
{
	FILE *f;
	size_t len, ret;

	if (!(f = fdopen(fd, "w"))) {
		/* fdopen() leaves the descriptor open on failure */
		close(fd);
		return false;
	}
	len = strlen(string);
	ret = fwrite(string, 1, len, f);
	if (ret != len) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		fclose(f);
		return false;
	}
	/* fclose() flushes, so a write error may only surface here */
	if (fclose(f) < 0) {
		lxcfs_error("Error writing to file: %s\n", strerror(errno));
		return false;
	}
	return true;
}
386
/*
 * Name and ownership/mode information for one cgroup file or directory, as
 * filled in by cgfs_get_key() from an fstatat() result.  Released with
 * free_key().
 */
struct cgfs_files {
	char *name;		/* heap-allocated entry name */
	uint32_t uid, gid;	/* st_uid / st_gid of the entry */
	uint32_t mode;		/* st_mode of the entry */
};
392
393 #define ALLOC_NUM 20
394 static bool store_hierarchy(char *stridx, char *h)
395 {
396 if (num_hierarchies % ALLOC_NUM == 0) {
397 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
398 n *= ALLOC_NUM;
399 char **tmp = realloc(hierarchies, n * sizeof(char *));
400 if (!tmp) {
401 lxcfs_error("%s\n", strerror(errno));
402 exit(1);
403 }
404 hierarchies = tmp;
405 }
406
407 hierarchies[num_hierarchies++] = must_copy_string(h);
408 return true;
409 }
410
411 static void print_subsystems(void)
412 {
413 int i;
414
415 fprintf(stderr, "hierarchies:\n");
416 for (i = 0; i < num_hierarchies; i++) {
417 if (hierarchies[i])
418 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
419 fd_hierarchies[i], hierarchies[i]);
420 }
421 }
422
/*
 * Return true if @needle is exactly one of the comma-separated items in
 * @haystack.
 */
static bool in_comma_list(const char *needle, const char *haystack)
{
	size_t nlen = strlen(needle);
	const char *item = haystack;
	const char *comma;

	/* every item that is followed by a comma */
	for (; *item && (comma = strchr(item, ',')); item = comma + 1) {
		if ((size_t)(comma - item) == nlen && strncmp(needle, item, nlen) == 0)
			return true;
	}

	/* whatever is left after the last comma (or the whole string) */
	return strcmp(needle, item) == 0;
}
441
442 /* do we need to do any massaging here? I'm not sure... */
443 /* Return the mounted controller and store the corresponding open file descriptor
444 * referring to the controller mountpoint in the private lxcfs namespace in
445 * @cfd.
446 */
447 static char *find_mounted_controller(const char *controller, int *cfd)
448 {
449 int i;
450
451 for (i = 0; i < num_hierarchies; i++) {
452 if (!hierarchies[i])
453 continue;
454 if (strcmp(hierarchies[i], controller) == 0) {
455 *cfd = fd_hierarchies[i];
456 return hierarchies[i];
457 }
458 if (in_comma_list(controller, hierarchies[i])) {
459 *cfd = fd_hierarchies[i];
460 return hierarchies[i];
461 }
462 }
463
464 return NULL;
465 }
466
/*
 * Write @value to @file inside @cgroup of @controller.
 * Returns false if the controller is not mounted, the path cannot be built,
 * the file cannot be opened for writing, or the write fails.
 */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int cfd, fd, n;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() calls need a relative path:
	 * "." + /cgroup + "/" + file + NUL
	 */
	pathlen = strlen(cgroup) + strlen(file) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	return write_string(path, value, fd);
}
493
/*
 * Chown all the files in the cgroup directory.  We do this when we create
 * a cgroup on behalf of a user.
 *
 * @dirname is resolved relative to the directory descriptor @fd.  Every
 * entry except "." and ".." is chowned to @uid:@gid.  This is best effort:
 * failures are logged and the remaining entries are still processed.
 */
static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	struct dirent *direntp;
	char path[MAXPATHLEN];
	size_t len;
	DIR *d;
	int fd1, ret;

	len = strlen(dirname);
	if (len >= MAXPATHLEN) {
		lxcfs_error("Pathname too long: %s\n", dirname);
		return;
	}

	fd1 = openat(fd, dirname, O_DIRECTORY);
	if (fd1 < 0)
		return;

	d = fdopendir(fd1);
	if (!d) {
		lxcfs_error("Failed to open %s\n", dirname);
		/* fdopendir() did not take ownership, so close it ourselves */
		close(fd1);
		return;
	}

	while ((direntp = readdir(d))) {
		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;
		ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (ret < 0 || ret >= MAXPATHLEN) {
			lxcfs_error("Pathname too long under %s\n", dirname);
			continue;
		}
		if (fchownat(fd, path, uid, gid, 0) < 0)
			lxcfs_error("Failed to chown file %s to %u:%u\n", path, uid, gid);
	}
	/* closedir() also closes fd1 */
	closedir(d);
}
533
/*
 * Create cgroup @cg under @controller with mode 0755 and, unless both @uid
 * and @gid are 0, chown the new directory and the files inside it to
 * @uid:@gid.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or
 * -errno from the failing mkdirat()/fchownat().
 */
int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* The *at() calls need a relative path: "." + /cg + NUL */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	if (mkdirat(cfd, path, 0755) < 0)
		return -errno;

	/* nothing to hand over when the owner is to remain root:root */
	if (uid == 0 && gid == 0)
		return 0;

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	chown_all_cgroup_files(path, uid, gid, cfd);

	return 0;
}
564
/*
 * Remove the cgroup directory @dirname together with all directories
 * nested below it.
 *
 * @dirname: path of the directory, relative to @cfd.
 * @fd:      open descriptor for @dirname itself; it is only dup()ed here and
 *           stays owned by the caller.
 * @cfd:     descriptor of the controller mountpoint that all paths are
 *           resolved against.
 *
 * Non-directory entries are left alone; only nested directories are
 * descended into.  Returns true if the directory stream was closed cleanly
 * and @dirname itself was removed.  Failures on nested directories are only
 * logged.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc, childfd;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!S_ISDIR(mystat.st_mode))
			continue;

		/* Descend with a descriptor for the child itself.  Reusing @fd
		 * would re-read the parent directory through a shared file
		 * offset, so the child's own entries would never be visited. */
		childfd = openat(cfd, pathname, O_DIRECTORY);
		if (childfd < 0) {
			lxcfs_debug("Failed to open %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		if (!recursive_rmdir(pathname, childfd, cfd))
			lxcfs_debug("Error removing %s.\n", pathname);
		close(childfd);
	}

	ret = true;
	/* closedir() also closes dupfd; closing it again afterwards could hit
	 * an unrelated descriptor that reused the same number. */
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}
623
/*
 * Remove cgroup @cg (and the directories nested in it) from @controller.
 * Returns false if the controller is not mounted, the directory cannot be
 * opened, or the removal fails.
 */
bool cgfs_remove(const char *controller, const char *cg)
{
	int cfd, dirfd;
	size_t pathlen;
	char *path;
	bool removed;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() calls need a relative path: "." + /cg + NUL */
	pathlen = strlen(cg) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *cg == '/' ? "." : "", cg);

	dirfd = openat(cfd, path, O_DIRECTORY);
	if (dirfd < 0)
		return false;

	removed = recursive_rmdir(path, dirfd, cfd);
	close(dirfd);
	return removed;
}
650
/*
 * Change the mode of @file (a path inside @controller's hierarchy) to @mode.
 * Returns false if the controller is not mounted or fchmodat() fails.
 */
bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() calls need a relative path: "." + /file + NUL */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	return fchmodat(cfd, path, mode, 0) >= 0;
}
671
/*
 * Chown "tasks" and then "cgroup.procs" inside @dirname (relative to @fd)
 * to @uid:@gid.  Stops at the first failure.
 * Returns 0 on success or -errno from the failing fchownat().
 */
static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
{
	/* sized for the longer of the two names */
	size_t bufsz = strlen(dirname) + strlen("/cgroup.procs") + 1;
	char *target = alloca(bufsz);

	snprintf(target, bufsz, "%s/tasks", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	snprintf(target, bufsz, "%s/cgroup.procs", dirname);
	if (fchownat(fd, target, uid, gid, 0) != 0)
		return -errno;

	return 0;
}
687
/*
 * Chown @file (a path inside @controller's hierarchy) to @uid:@gid.  When
 * the path is a directory, its "tasks" and "cgroup.procs" files are chowned
 * too, like cgmanager did.
 *
 * Returns 0 on success, -EINVAL if the controller is not mounted, or -errno
 * from a failing fchownat().
 */
int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
{
	int cfd;
	size_t pathlen;
	char *path;

	if (!find_mounted_controller(controller, &cfd))
		return -EINVAL;

	/* The *at() calls need a relative path: "." + /file + NUL */
	pathlen = strlen(file) + 2;
	path = alloca(pathlen);
	snprintf(path, pathlen, "%s%s", *file == '/' ? "." : "", file);

	if (fchownat(cfd, path, uid, gid, 0) < 0)
		return -errno;

	if (!is_dir(path, cfd))
		return 0;

	return chown_tasks_files(path, uid, gid, cfd);
}
713
/*
 * Open the "cgroup.procs" file of @cgroup under @controller for writing.
 *
 * Returns a stdio stream the caller must fclose(), or NULL if the controller
 * is not mounted, the file cannot be opened, or the stream cannot be
 * created.
 */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *pathname, *tmpc;
	FILE *f;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	pathname = alloca(len);
	snprintf(pathname, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, pathname, O_WRONLY);
	if (fd < 0)
		return NULL;

	f = fdopen(fd, "w");
	if (!f)
		close(fd);	/* fdopen() leaves the descriptor open on failure */
	return f;
}
737
738 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
739 void ***list, size_t typesize,
740 void* (*iterator)(const char*, const char*, const char*))
741 {
742 int cfd, fd, ret;
743 size_t len;
744 char *cg, *tmpc;
745 char pathname[MAXPATHLEN];
746 size_t sz = 0, asz = 0;
747 struct dirent *dirent;
748 DIR *dir;
749
750 tmpc = find_mounted_controller(controller, &cfd);
751 *list = NULL;
752 if (!tmpc)
753 return false;
754
755 /* Make sure we pass a relative path to *at() family of functions. */
756 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
757 cg = alloca(len);
758 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
759 if (ret < 0 || (size_t)ret >= len) {
760 lxcfs_error("Pathname too long under %s\n", cgroup);
761 return false;
762 }
763
764 fd = openat(cfd, cg, O_DIRECTORY);
765 if (fd < 0)
766 return false;
767
768 dir = fdopendir(fd);
769 if (!dir)
770 return false;
771
772 while ((dirent = readdir(dir))) {
773 struct stat mystat;
774
775 if (!strcmp(dirent->d_name, ".") ||
776 !strcmp(dirent->d_name, ".."))
777 continue;
778
779 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
780 if (ret < 0 || ret >= MAXPATHLEN) {
781 lxcfs_error("Pathname too long under %s\n", cg);
782 continue;
783 }
784
785 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
786 if (ret) {
787 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
788 continue;
789 }
790 if ((!directories && !S_ISREG(mystat.st_mode)) ||
791 (directories && !S_ISDIR(mystat.st_mode)))
792 continue;
793
794 if (sz+2 >= asz) {
795 void **tmp;
796 asz += BATCH_SIZE;
797 do {
798 tmp = realloc(*list, asz * typesize);
799 } while (!tmp);
800 *list = tmp;
801 }
802 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
803 (*list)[sz+1] = NULL;
804 sz++;
805 }
806 if (closedir(dir) < 0) {
807 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
808 return false;
809 }
810 return true;
811 }
812
/*
 * Iterator callback for cgfs_list_children(): the list element is simply a
 * heap copy of the directory entry name.  @controller and @cgroup are not
 * used.  strdup() is retried until it succeeds.
 */
static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	char *copy;

	while (!(copy = strdup(dir_entry)))
		;

	return copy;
}
821
822 bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
823 {
824 return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
825 }
826
827 void free_key(struct cgfs_files *k)
828 {
829 if (!k)
830 return;
831 free(k->name);
832 free(k);
833 }
834
/*
 * Release a NULL-terminated array of keys: every element, then the array
 * itself.  A NULL @keys is ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	struct cgfs_files **cursor;

	if (!keys)
		return;

	for (cursor = keys; *cursor; cursor++)
		free_key(*cursor);

	free(keys);
}
846
/*
 * Read the contents of @file inside @cgroup of @controller.
 *
 * On success *@value receives a newly allocated string (trailing newlines
 * stripped) that the caller must free, and true is returned.  Returns false
 * if the controller is not mounted, the path cannot be built, the file
 * cannot be opened, or nothing could be read.  *@value is only written once
 * the file has been opened.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int ret, fd, cfd;
	size_t len;
	char *fnam, *tmpc;

	tmpc = find_mounted_controller(controller, &cfd);
	if (!tmpc)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	fnam = alloca(len);
	ret = snprintf(fnam, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	/* this function returns bool, so report failure as false, not NULL */
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, fnam, O_RDONLY);
	if (fd < 0)
		return false;

	*value = slurp_file(fnam, fd);
	return *value != NULL;
}
873
874 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
875 {
876 int ret, cfd;
877 size_t len;
878 char *fnam, *tmpc;
879 struct stat sb;
880 struct cgfs_files *newkey;
881
882 tmpc = find_mounted_controller(controller, &cfd);
883 if (!tmpc)
884 return false;
885
886 if (file && *file == '/')
887 file++;
888
889 if (file && strchr(file, '/'))
890 return NULL;
891
892 /* Make sure we pass a relative path to *at() family of functions.
893 * . + /cgroup + / + file + \0
894 */
895 len = strlen(cgroup) + 3;
896 if (file)
897 len += strlen(file) + 1;
898 fnam = alloca(len);
899 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
900 file ? "/" : "", file ? file : "");
901
902 ret = fstatat(cfd, fnam, &sb, 0);
903 if (ret < 0)
904 return NULL;
905
906 do {
907 newkey = malloc(sizeof(struct cgfs_files));
908 } while (!newkey);
909 if (file)
910 newkey->name = must_copy_string(file);
911 else if (strrchr(cgroup, '/'))
912 newkey->name = must_copy_string(strrchr(cgroup, '/'));
913 else
914 newkey->name = must_copy_string(cgroup);
915 newkey->uid = sb.st_uid;
916 newkey->gid = sb.st_gid;
917 newkey->mode = sb.st_mode;
918
919 return newkey;
920 }
921
/*
 * Iterator callback for cgfs_list_keys(): the list element is the
 * cgfs_files record for @dir_entry inside @cgroup.  A lookup failure is
 * logged and NULL is returned.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *key = cgfs_get_key(controller, cgroup, dir_entry);

	if (key)
		return key;

	lxcfs_error("Error getting files under %s:%s\n", controller,
		    cgroup);
	return key;
}
931
932 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
933 {
934 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
935 }
936
/*
 * Return true if @f names a directory directly inside @cgroup under
 * @controller.  Any failure (controller not mounted, path too long, stat
 * error) yields false.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, n;
	size_t pathlen;
	char *path;
	struct stat sb;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* The *at() calls need a relative path:
	 * "." + /cgroup + "/" + f + NUL
	 */
	pathlen = strlen(cgroup) + strlen(f) + 3;
	path = alloca(pathlen);
	n = snprintf(path, pathlen, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (n < 0 || (size_t)n >= pathlen)
		return false;

	if (fstatat(cfd, path, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode) ? true : false;
}
964
965 #define SEND_CREDS_OK 0
966 #define SEND_CREDS_NOTSK 1
967 #define SEND_CREDS_FAIL 2
968 static bool recv_creds(int sock, struct ucred *cred, char *v);
969 static int wait_for_pid(pid_t pid);
970 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
971 static int send_creds_clone_wrapper(void *arg);
972
973 /*
974 * clone a task which switches to @task's namespace and writes '1'.
975 * over a unix sock so we can read the task's reaper's pid in our
976 * namespace
977 *
978 * Note: glibc's fork() does not respect pidns, which can lead to failed
979 * assertions inside glibc (and thus failed forks) if the child's pid in
980 * the pidns and the parent pid outside are identical. Using clone prevents
981 * this issue.
982 */
983 static void write_task_init_pid_exit(int sock, pid_t target)
984 {
985 char fnam[100];
986 pid_t pid;
987 int fd, ret;
988 size_t stack_size = sysconf(_SC_PAGESIZE);
989 void *stack = alloca(stack_size);
990
991 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
992 if (ret < 0 || ret >= sizeof(fnam))
993 _exit(1);
994
995 fd = open(fnam, O_RDONLY);
996 if (fd < 0) {
997 perror("write_task_init_pid_exit open of ns/pid");
998 _exit(1);
999 }
1000 if (setns(fd, 0)) {
1001 perror("write_task_init_pid_exit setns 1");
1002 close(fd);
1003 _exit(1);
1004 }
1005 pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
1006 if (pid < 0)
1007 _exit(1);
1008 if (pid != 0) {
1009 if (!wait_for_pid(pid))
1010 _exit(1);
1011 _exit(0);
1012 }
1013 }
1014
1015 static int send_creds_clone_wrapper(void *arg) {
1016 struct ucred cred;
1017 char v;
1018 int sock = *(int *)arg;
1019
1020 /* we are the child */
1021 cred.uid = 0;
1022 cred.gid = 0;
1023 cred.pid = 1;
1024 v = '1';
1025 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1026 return 1;
1027 return 0;
1028 }
1029
/*
 * Determine, in our own pid namespace, the pid of the init process of the
 * pid namespace that @task lives in.
 *
 * A helper is forked that enters @task's namespace and sends credentials for
 * pid 1 over a socketpair; the pid received with those credentials is the
 * answer.  Returns -1 if the socketpair, the fork or the receive fails.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t helper;
	pid_t initpid = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	helper = fork();
	if (helper == 0) {
		/* helper: keep only our end and never return from here */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	if (helper > 0 && recv_creds(sock[1], &cred, &v))
		initpid = cred.pid;

	close(sock[0]);
	close(sock[1]);
	if (helper > 0)
		wait_for_pid(helper);
	return initpid;
}
1063
1064 static pid_t lookup_initpid_in_store(pid_t qpid)
1065 {
1066 pid_t answer = 0;
1067 struct stat sb;
1068 struct pidns_init_store *e;
1069 char fnam[100];
1070
1071 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1072 store_lock();
1073 if (stat(fnam, &sb) < 0)
1074 goto out;
1075 e = lookup_verify_initpid(&sb);
1076 if (e) {
1077 answer = e->initpid;
1078 goto out;
1079 }
1080 answer = get_init_pid_for_task(qpid);
1081 if (answer > 0)
1082 save_initpid(&sb, answer);
1083
1084 out:
1085 /* we prune at end in case we are returning
1086 * the value we were about to return */
1087 prune_initpid_store();
1088 store_unlock();
1089 return answer;
1090 }
1091
/*
 * Wait for child @pid to terminate, retrying when interrupted by a signal.
 * Returns 0 only if the child exited normally with status 0; returns -1 for
 * a non-positive @pid, a waitpid() error, or any other termination.
 */
static int wait_for_pid(pid_t pid)
{
	int status;
	int got;

	if (pid <= 0)
		return -1;

	for (;;) {
		got = waitpid(pid, &status, 0);
		if (got == pid)
			break;
		if (got == -1 && errno != EINTR)
			return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;
	return -1;
}
1112
1113
/*
 * Append "<pid>\n" to the string *src.
 * src: a pointer to a char* to which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 *
 * The buffer grows by BUF_RESERVE_SIZE when it does not exist yet or is too
 * small; realloc() is retried until it succeeds.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	char digits[30];
	int ndigits = sprintf(digits, "%d\n", (int)pid);

	if (!*src || ndigits + *sz + 1 >= *asz) {
		char *grown;

		do {
			grown = realloc(*src, *asz + BUF_RESERVE_SIZE);
		} while (!grown);
		*src = grown;
		*asz += BUF_RESERVE_SIZE;
	}

	memcpy(*src + *sz, digits, ndigits + 1); /* include the \0 */
	*sz += ndigits;
}
1138
/*
 * Given an open FILE * for /proc/pid/{u,g}id_map and an id valid in the
 * caller's namespace, return that id as seen inside pid's namespace.
 *
 * The file is rewound and scanned line by line; lines that do not hold
 * three unsigned numbers are skipped.
 * Returns the mapped id, or -1 (as unsigned) if no range contains @in_id or
 * a range wraps around.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int ns_base,   // first id of the range inside the namespace
		     host_base, // first id of the range in the caller's namespace
		     range;     // number of ids in the range
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * ids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    ns_base, host_base, range, line);
			return -1;
		}

		if (in_id < host_base || in_id >= host_base + range)
			continue;

		/*
		 * host_base <= in_id < host_base + range and neither range
		 * end wraps, so ns_base + (in_id - host_base) is below
		 * ns_base + range and cannot wrap either.
		 */
		return (in_id - host_base) + ns_base;
	}

	// no answer found
	return -1;
}
1182
1183 /*
1184 * for is_privileged_over,
1185 * specify whether we require the calling uid to be root in his
1186 * namespace
1187 */
1188 #define NS_ROOT_REQD true
1189 #define NS_ROOT_OPT false
1190
1191 #define PROCLEN 100
1192
1193 static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
1194 {
1195 char fpath[PROCLEN];
1196 int ret;
1197 bool answer = false;
1198 uid_t nsuid;
1199
1200 if (victim == -1 || uid == -1)
1201 return false;
1202
1203 /*
1204 * If the request is one not requiring root in the namespace,
1205 * then having the same uid suffices. (i.e. uid 1000 has write
1206 * access to files owned by uid 1000
1207 */
1208 if (!req_ns_root && uid == victim)
1209 return true;
1210
1211 ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
1212 if (ret < 0 || ret >= PROCLEN)
1213 return false;
1214 FILE *f = fopen(fpath, "r");
1215 if (!f)
1216 return false;
1217
1218 /* if caller's not root in his namespace, reject */
1219 nsuid = convert_id_to_ns(f, uid);
1220 if (nsuid)
1221 goto out;
1222
1223 /*
1224 * If victim is not mapped into caller's ns, reject.
1225 * XXX I'm not sure this check is needed given that fuse
1226 * will be sending requests where the vfs has converted
1227 */
1228 nsuid = convert_id_to_ns(f, victim);
1229 if (nsuid == -1)
1230 goto out;
1231
1232 answer = true;
1233
1234 out:
1235 fclose(f);
1236 return answer;
1237 }
1238
/*
 * Check whether the "other" permission bits of @fmode allow the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR).  Any other access
 * mode is refused.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t access = req_mode & O_ACCMODE;
	mode_t needed;

	if (access == O_RDONLY)
		needed = S_IROTH;
	else if (access == O_WRONLY)
		needed = S_IWOTH;
	else if (access == O_RDWR)
		needed = S_IROTH | S_IWOTH;
	else
		return false;

	return (fmode & needed) == needed;
}
1258
1259
/*
 * Return the first path component of @taskcg that follows the prefix
 * @querycg, e.g. taskcg "a/b/c/d/e" with querycg "a/b/c" yields "d".
 * When @querycg is "/" or "./" only the first character of @taskcg is
 * skipped.
 *
 * @taskcg must be strictly longer than @querycg, otherwise an error is
 * logged and NULL is returned. The result is newly allocated and must be
 * freed by the caller; NULL is also returned when allocation fails.
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	size_t skip;
	char *next, *sep;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* Keep only the first component of the remainder. */
	sep = strchr(next, '/');
	if (sep)
		*sep = '\0';

	return next;
}
1285
/* Remove one trailing '\n' from @x in place, if there is one. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len == 0)
		return;

	if (x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1292
1293 static char *get_pid_cgroup(pid_t pid, const char *contrl)
1294 {
1295 int cfd;
1296 char fnam[PROCLEN];
1297 FILE *f;
1298 char *answer = NULL;
1299 char *line = NULL;
1300 size_t len = 0;
1301 int ret;
1302 const char *h = find_mounted_controller(contrl, &cfd);
1303 if (!h)
1304 return NULL;
1305
1306 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1307 if (ret < 0 || ret >= PROCLEN)
1308 return NULL;
1309 if (!(f = fopen(fnam, "r")))
1310 return NULL;
1311
1312 while (getline(&line, &len, f) != -1) {
1313 char *c1, *c2;
1314 if (!line[0])
1315 continue;
1316 c1 = strchr(line, ':');
1317 if (!c1)
1318 goto out;
1319 c1++;
1320 c2 = strchr(c1, ':');
1321 if (!c2)
1322 goto out;
1323 *c2 = '\0';
1324 if (strcmp(c1, h) != 0)
1325 continue;
1326 c2++;
1327 stripnewline(c2);
1328 do {
1329 answer = strdup(c2);
1330 } while (!answer);
1331 break;
1332 }
1333
1334 out:
1335 fclose(f);
1336 free(line);
1337 return answer;
1338 }
1339
1340 /*
1341 * check whether a fuse context may access a cgroup dir or file
1342 *
1343 * If file is not null, it is a cgroup file to check under cg.
1344 * If file is null, then we are checking perms on cg itself.
1345 *
1346 * For files we can check the mode of the list_keys result.
1347 * For cgroups, we must make assumptions based on the files under the
1348 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1349 * yet.
1350 */
1351 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1352 {
1353 struct cgfs_files *k = NULL;
1354 bool ret = false;
1355
1356 k = cgfs_get_key(contrl, cg, file);
1357 if (!k)
1358 return false;
1359
1360 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1361 if (perms_include(k->mode >> 6, mode)) {
1362 ret = true;
1363 goto out;
1364 }
1365 }
1366 if (fc->gid == k->gid) {
1367 if (perms_include(k->mode >> 3, mode)) {
1368 ret = true;
1369 goto out;
1370 }
1371 }
1372 ret = perms_include(k->mode, mode);
1373
1374 out:
1375 free_key(k);
1376 return ret;
1377 }
1378
#define INITSCOPE "/init.scope"

/*
 * Strip a trailing "/init.scope" component from @cg in place.
 * If @cg consists of nothing but "/init.scope" it is reduced to "/".
 * Strings that do not end in "/init.scope" are left untouched.
 */
static void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + (cg_len - suffix_len);
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	/* Keep the leading '/' when the suffix is the whole string. */
	if (suffix == cg)
		suffix++;
	*suffix = '\0';
}
1396
/*
 * Check whether the cgroup of @pid (for controller @contrl, with a
 * trailing "/init.scope" pruned) is a string prefix of @cg.
 *
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 *
 * Callers pass "/" or "./" for the root cgroup and a path without leading
 * '/' otherwise. To compare like with like, the leading '/' of the task's
 * cgroup is kept when @cg starts with '/' or "./" and skipped otherwise.
 *
 * When the answer is false and @nextcg is not NULL, *nextcg is set to the
 * result of get_next_cgroup_dir() (the next directory below @cg on the way
 * to the task's cgroup, possibly NULL); the caller must free it. Also
 * returns false, leaving *nextcg untouched, when the task's cgroup cannot
 * be determined.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	char *task_cg, *cmp;
	bool in_ancestor;

	task_cg = get_pid_cgroup(pid, contrl);
	if (!task_cg)
		return false;
	prune_init_slice(task_cg);

	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = task_cg;
	else
		cmp = task_cg + 1;

	in_ancestor = strncmp(cmp, cg, strlen(cmp)) == 0;
	if (!in_ancestor && nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cg);
	return in_ancestor;
}
1439
/*
 * Decide whether @pid may see that the cgroup directory @cg exists in the
 * hierarchy of controller @contrl.
 *
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 * In detail, @cg is visible when
 *   - it is the root ("/" or "./"),
 *   - the task itself sits in the root cgroup,
 *   - it equals the task's cgroup,
 *   - it is a parent directory of the task's cgroup, or
 *   - it is a directory below the task's cgroup.
 * Parent/child matches require a '/' right after the common prefix so
 * that "a/b" does not match "a/bc".
 *
 * Returns false when the task's cgroup cannot be determined.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	char *raw, *task_cg;
	size_t cg_len, task_len;
	bool visible;

	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	raw = get_pid_cgroup(pid, contrl);
	if (!raw)
		return false;
	prune_init_slice(raw);

	/* @cg comes without a leading '/', so drop it from the task's too. */
	task_cg = raw + 1;
	cg_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/*
		 * Task is in the root cgroup and can see everything. The
		 * prefix tests below look for a separating '/', but for
		 * the root that '/' is the one skipped just above.
		 */
		visible = true;
	} else if (strcmp(cg, task_cg) == 0) {
		visible = true;
	} else if (cg_len < task_len) {
		/* looking up a parent dir */
		visible = strncmp(task_cg, cg, cg_len) == 0 &&
			  task_cg[cg_len] == '/';
	} else if (cg_len > task_len) {
		/* looking up a child dir */
		visible = strncmp(task_cg, cg, task_len) == 0 &&
			  cg[task_len] == '/';
	} else {
		/* same length, different name */
		visible = false;
	}

	free(raw);
	return visible;
}
1490
1491 /*
1492 * given /cgroup/freezer/a/b, return "freezer".
1493 * the returned char* should NOT be freed.
1494 */
1495 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1496 {
1497 const char *p1;
1498 char *contr, *slash;
1499
1500 if (strlen(path) < 9) {
1501 errno = EACCES;
1502 return NULL;
1503 }
1504 if (*(path + 7) != '/') {
1505 errno = EINVAL;
1506 return NULL;
1507 }
1508 p1 = path + 8;
1509 contr = strdupa(p1);
1510 if (!contr) {
1511 errno = ENOMEM;
1512 return NULL;
1513 }
1514 slash = strstr(contr, "/");
1515 if (slash)
1516 *slash = '\0';
1517
1518 int i;
1519 for (i = 0; i < num_hierarchies; i++) {
1520 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1521 return hierarchies[i];
1522 }
1523 errno = ENOENT;
1524 return NULL;
1525 }
1526
/*
 * Find the start of the cgroup part in /cgroup/controller/the/cgroup/path,
 * i.e. the text after the first '/' found at or beyond offset 8.
 * Note that the returned value may include files (keynames) etc.
 *
 * Returns a pointer into @path and sets errno to 0 on success. Returns
 * NULL with errno EACCES when @path is shorter than 9 characters, or with
 * errno EINVAL when there is nothing after the controller name.
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *sep;

	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	sep = strchr(path + 8, '/');
	if (sep) {
		errno = 0;
		return sep + 1;
	}

	errno = EINVAL;
	return NULL;
}
1547
/*
 * Split the last path element off the path in @cg.
 *
 * *dir receives a newly allocated copy of @cg cut at its last '/'; it must
 * be freed by the caller. *last points at that last '/' inside @cg itself
 * (so it still includes the slash) and must not be freed.
 *
 * When @cg contains no '/', *dir is a full copy of @cg and *last is NULL.
 */
static void get_cgdir_and_path(const char *cg, char **dir, char **last)
{
	char *copy;
	char *sep;

	/* Retry until the copy succeeds, as the original code does. */
	do {
		copy = strdup(cg);
	} while (!copy);
	*dir = copy;

	sep = strrchr(cg, '/');
	*last = sep;
	if (!sep)
		return;

	/* The copy has its last '/' at the same offset as @cg. */
	copy[sep - cg] = '\0';
}
1567
1568 /*
1569 * FUSE ops for /cgroup
1570 */
1571
1572 int cg_getattr(const char *path, struct stat *sb)
1573 {
1574 struct timespec now;
1575 struct fuse_context *fc = fuse_get_context();
1576 char * cgdir = NULL;
1577 char *last = NULL, *path1, *path2;
1578 struct cgfs_files *k = NULL;
1579 const char *cgroup;
1580 const char *controller = NULL;
1581 int ret = -ENOENT;
1582
1583
1584 if (!fc)
1585 return -EIO;
1586
1587 memset(sb, 0, sizeof(struct stat));
1588
1589 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
1590 return -EINVAL;
1591
1592 sb->st_uid = sb->st_gid = 0;
1593 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
1594 sb->st_size = 0;
1595
1596 if (strcmp(path, "/cgroup") == 0) {
1597 sb->st_mode = S_IFDIR | 00755;
1598 sb->st_nlink = 2;
1599 return 0;
1600 }
1601
1602 controller = pick_controller_from_path(fc, path);
1603 if (!controller)
1604 return -errno;
1605 cgroup = find_cgroup_in_path(path);
1606 if (!cgroup) {
1607 /* this is just /cgroup/controller, return it as a dir */
1608 sb->st_mode = S_IFDIR | 00755;
1609 sb->st_nlink = 2;
1610 return 0;
1611 }
1612
1613 get_cgdir_and_path(cgroup, &cgdir, &last);
1614
1615 if (!last) {
1616 path1 = "/";
1617 path2 = cgdir;
1618 } else {
1619 path1 = cgdir;
1620 path2 = last;
1621 }
1622
1623 pid_t initpid = lookup_initpid_in_store(fc->pid);
1624 if (initpid <= 0)
1625 initpid = fc->pid;
1626 /* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
1627 * Then check that caller's cgroup is under path if last is a child
1628 * cgroup, or cgdir if last is a file */
1629
1630 if (is_child_cgroup(controller, path1, path2)) {
1631 if (!caller_may_see_dir(initpid, controller, cgroup)) {
1632 ret = -ENOENT;
1633 goto out;
1634 }
1635 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
1636 /* this is just /cgroup/controller, return it as a dir */
1637 sb->st_mode = S_IFDIR | 00555;
1638 sb->st_nlink = 2;
1639 ret = 0;
1640 goto out;
1641 }
1642 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
1643 ret = -EACCES;
1644 goto out;
1645 }
1646
1647 // get uid, gid, from '/tasks' file and make up a mode
1648 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
1649 sb->st_mode = S_IFDIR | 00755;
1650 k = cgfs_get_key(controller, cgroup, NULL);
1651 if (!k) {
1652 sb->st_uid = sb->st_gid = 0;
1653 } else {
1654 sb->st_uid = k->uid;
1655 sb->st_gid = k->gid;
1656 }
1657 free_key(k);
1658 sb->st_nlink = 2;
1659 ret = 0;
1660 goto out;
1661 }
1662
1663 if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
1664 sb->st_mode = S_IFREG | k->mode;
1665 sb->st_nlink = 1;
1666 sb->st_uid = k->uid;
1667 sb->st_gid = k->gid;
1668 sb->st_size = 0;
1669 free_key(k);
1670 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
1671 ret = -ENOENT;
1672 goto out;
1673 }
1674 if (!fc_may_access(fc, controller, path1, path2, O_RDONLY)) {
1675 ret = -EACCES;
1676 goto out;
1677 }
1678
1679 ret = 0;
1680 }
1681
1682 out:
1683 free(cgdir);
1684 return ret;
1685 }
1686
1687 int cg_opendir(const char *path, struct fuse_file_info *fi)
1688 {
1689 struct fuse_context *fc = fuse_get_context();
1690 const char *cgroup;
1691 struct file_info *dir_info;
1692 char *controller = NULL;
1693
1694 if (!fc)
1695 return -EIO;
1696
1697 if (strcmp(path, "/cgroup") == 0) {
1698 cgroup = NULL;
1699 controller = NULL;
1700 } else {
1701 // return list of keys for the controller, and list of child cgroups
1702 controller = pick_controller_from_path(fc, path);
1703 if (!controller)
1704 return -errno;
1705
1706 cgroup = find_cgroup_in_path(path);
1707 if (!cgroup) {
1708 /* this is just /cgroup/controller, return its contents */
1709 cgroup = "/";
1710 }
1711 }
1712
1713 pid_t initpid = lookup_initpid_in_store(fc->pid);
1714 if (initpid <= 0)
1715 initpid = fc->pid;
1716 if (cgroup) {
1717 if (!caller_may_see_dir(initpid, controller, cgroup))
1718 return -ENOENT;
1719 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
1720 return -EACCES;
1721 }
1722
1723 /* we'll free this at cg_releasedir */
1724 dir_info = malloc(sizeof(*dir_info));
1725 if (!dir_info)
1726 return -ENOMEM;
1727 dir_info->controller = must_copy_string(controller);
1728 dir_info->cgroup = must_copy_string(cgroup);
1729 dir_info->type = LXC_TYPE_CGDIR;
1730 dir_info->buf = NULL;
1731 dir_info->file = NULL;
1732 dir_info->buflen = 0;
1733
1734 fi->fh = (unsigned long)dir_info;
1735 return 0;
1736 }
1737
1738 int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
1739 struct fuse_file_info *fi)
1740 {
1741 struct file_info *d = (struct file_info *)fi->fh;
1742 struct cgfs_files **list = NULL;
1743 int i, ret;
1744 char *nextcg = NULL;
1745 struct fuse_context *fc = fuse_get_context();
1746 char **clist = NULL;
1747
1748 if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
1749 return -EIO;
1750
1751 if (d->type != LXC_TYPE_CGDIR) {
1752 lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
1753 return -EIO;
1754 }
1755 if (!d->cgroup && !d->controller) {
1756 // ls /var/lib/lxcfs/cgroup - just show list of controllers
1757 int i;
1758
1759 for (i = 0; i < num_hierarchies; i++) {
1760 if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
1761 return -EIO;
1762 }
1763 }
1764 return 0;
1765 }
1766
1767 if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
1768 // not a valid cgroup
1769 ret = -EINVAL;
1770 goto out;
1771 }
1772
1773 pid_t initpid = lookup_initpid_in_store(fc->pid);
1774 if (initpid <= 0)
1775 initpid = fc->pid;
1776 if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
1777 if (nextcg) {
1778 ret = filler(buf, nextcg, NULL, 0);
1779 free(nextcg);
1780 if (ret != 0) {
1781 ret = -EIO;
1782 goto out;
1783 }
1784 }
1785 ret = 0;
1786 goto out;
1787 }
1788
1789 for (i = 0; list[i]; i++) {
1790 if (filler(buf, list[i]->name, NULL, 0) != 0) {
1791 ret = -EIO;
1792 goto out;
1793 }
1794 }
1795
1796 // now get the list of child cgroups
1797
1798 if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
1799 ret = 0;
1800 goto out;
1801 }
1802 if (clist) {
1803 for (i = 0; clist[i]; i++) {
1804 if (filler(buf, clist[i], NULL, 0) != 0) {
1805 ret = -EIO;
1806 goto out;
1807 }
1808 }
1809 }
1810 ret = 0;
1811
1812 out:
1813 free_keys(list);
1814 if (clist) {
1815 for (i = 0; clist[i]; i++)
1816 free(clist[i]);
1817 free(clist);
1818 }
1819 return ret;
1820 }
1821
1822 static void do_release_file_info(struct fuse_file_info *fi)
1823 {
1824 struct file_info *f = (struct file_info *)fi->fh;
1825
1826 if (!f)
1827 return;
1828
1829 fi->fh = 0;
1830
1831 free(f->controller);
1832 f->controller = NULL;
1833 free(f->cgroup);
1834 f->cgroup = NULL;
1835 free(f->file);
1836 f->file = NULL;
1837 free(f->buf);
1838 f->buf = NULL;
1839 free(f);
1840 }
1841
/*
 * FUSE releasedir handler: frees the directory handle that cg_opendir()
 * stored in fi->fh. @path is not used. Always returns 0.
 */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1847
1848 int cg_open(const char *path, struct fuse_file_info *fi)
1849 {
1850 const char *cgroup;
1851 char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
1852 struct cgfs_files *k = NULL;
1853 struct file_info *file_info;
1854 struct fuse_context *fc = fuse_get_context();
1855 int ret;
1856
1857 if (!fc)
1858 return -EIO;
1859
1860 controller = pick_controller_from_path(fc, path);
1861 if (!controller)
1862 return -errno;
1863 cgroup = find_cgroup_in_path(path);
1864 if (!cgroup)
1865 return -errno;
1866
1867 get_cgdir_and_path(cgroup, &cgdir, &last);
1868 if (!last) {
1869 path1 = "/";
1870 path2 = cgdir;
1871 } else {
1872 path1 = cgdir;
1873 path2 = last;
1874 }
1875
1876 k = cgfs_get_key(controller, path1, path2);
1877 if (!k) {
1878 ret = -EINVAL;
1879 goto out;
1880 }
1881 free_key(k);
1882
1883 pid_t initpid = lookup_initpid_in_store(fc->pid);
1884 if (initpid <= 0)
1885 initpid = fc->pid;
1886 if (!caller_may_see_dir(initpid, controller, path1)) {
1887 ret = -ENOENT;
1888 goto out;
1889 }
1890 if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
1891 ret = -EACCES;
1892 goto out;
1893 }
1894
1895 /* we'll free this at cg_release */
1896 file_info = malloc(sizeof(*file_info));
1897 if (!file_info) {
1898 ret = -ENOMEM;
1899 goto out;
1900 }
1901 file_info->controller = must_copy_string(controller);
1902 file_info->cgroup = must_copy_string(path1);
1903 file_info->file = must_copy_string(path2);
1904 file_info->type = LXC_TYPE_CGFILE;
1905 file_info->buf = NULL;
1906 file_info->buflen = 0;
1907
1908 fi->fh = (unsigned long)file_info;
1909 ret = 0;
1910
1911 out:
1912 free(cgdir);
1913 return ret;
1914 }
1915
1916 int cg_access(const char *path, int mode)
1917 {
1918 int ret;
1919 const char *cgroup;
1920 char *path1, *path2, *controller;
1921 char *last = NULL, *cgdir = NULL;
1922 struct cgfs_files *k = NULL;
1923 struct fuse_context *fc = fuse_get_context();
1924
1925 if (strcmp(path, "/cgroup") == 0)
1926 return 0;
1927
1928 if (!fc)
1929 return -EIO;
1930
1931 controller = pick_controller_from_path(fc, path);
1932 if (!controller)
1933 return -errno;
1934 cgroup = find_cgroup_in_path(path);
1935 if (!cgroup) {
1936 // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
1937 if ((mode & W_OK) == 0)
1938 return 0;
1939 return -EACCES;
1940 }
1941
1942 get_cgdir_and_path(cgroup, &cgdir, &last);
1943 if (!last) {
1944 path1 = "/";
1945 path2 = cgdir;
1946 } else {
1947 path1 = cgdir;
1948 path2 = last;
1949 }
1950
1951 k = cgfs_get_key(controller, path1, path2);
1952 if (!k) {
1953 if ((mode & W_OK) == 0)
1954 ret = 0;
1955 else
1956 ret = -EACCES;
1957 goto out;
1958 }
1959 free_key(k);
1960
1961 pid_t initpid = lookup_initpid_in_store(fc->pid);
1962 if (initpid <= 0)
1963 initpid = fc->pid;
1964 if (!caller_may_see_dir(initpid, controller, path1)) {
1965 ret = -ENOENT;
1966 goto out;
1967 }
1968 if (!fc_may_access(fc, controller, path1, path2, mode)) {
1969 ret = -EACCES;
1970 goto out;
1971 }
1972
1973 ret = 0;
1974
1975 out:
1976 free(cgdir);
1977 return ret;
1978 }
1979
/*
 * FUSE release handler: frees the file handle that cg_open() stored in
 * fi->fh. @path is not used. Always returns 0.
 */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);
	return 0;
}
1985
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait until @sock is readable or has been hung up, for at most about
 * @timeout seconds (measured with one-second resolution via time()).
 *
 * An interrupted wait (EINTR) is restarted with the remaining time.
 *
 * Returns true when epoll reports an event on @sock. Returns false on
 * error or timeout; errno is preserved from the failing call, and it is
 * set to 0 when the deadline had already passed before waiting.
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	if ((epfd = epoll_create(1)) < 0) {
		/*
		 * Format the error here: "%m" inside a "%s" argument is
		 * never expanded. Keep errno intact for the caller, which
		 * may report it as well.
		 */
		saved_errno = errno;
		lxcfs_error("Failed to create epoll socket: %s.\n", strerror(saved_errno));
		errno = saved_errno;
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		saved_errno = errno;
		lxcfs_error("Failed adding socket to epoll: %s.\n", strerror(saved_errno));
		close(epfd);
		errno = saved_errno;
		return false;
	}

again:
	if ((now = time(NULL)) < 0) {
		close(epfd);
		return false;
	}

	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}

	/* +1 ms so that a remaining time of 0 s still polls once. */
	ret = epoll_wait(epfd, &ev, 1, 1000*deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;

	/* close() may clobber errno, so carry it across. */
	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}
	return true;
}
2033
/*
 * Receive up to @len bytes from @sockfd into @buf, waiting at most about
 * two seconds for data to arrive. Returns the result of recv(), or -1
 * when nothing became readable in time.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	bool ready = wait_for_sock(sockfd, 2);

	if (ready)
		return recv(sockfd, buf, len, MSG_DONTWAIT);
	return -1;
}
2040
2041 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2042 {
2043 struct msghdr msg = { 0 };
2044 struct iovec iov;
2045 struct cmsghdr *cmsg;
2046 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2047 char buf[1];
2048 buf[0] = 'p';
2049
2050 if (pingfirst) {
2051 if (msgrecv(sock, buf, 1) != 1) {
2052 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2053 return SEND_CREDS_FAIL;
2054 }
2055 }
2056
2057 msg.msg_control = cmsgbuf;
2058 msg.msg_controllen = sizeof(cmsgbuf);
2059
2060 cmsg = CMSG_FIRSTHDR(&msg);
2061 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2062 cmsg->cmsg_level = SOL_SOCKET;
2063 cmsg->cmsg_type = SCM_CREDENTIALS;
2064 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2065
2066 msg.msg_name = NULL;
2067 msg.msg_namelen = 0;
2068
2069 buf[0] = v;
2070 iov.iov_base = buf;
2071 iov.iov_len = sizeof(buf);
2072 msg.msg_iov = &iov;
2073 msg.msg_iovlen = 1;
2074
2075 if (sendmsg(sock, &msg, 0) < 0) {
2076 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2077 if (errno == 3)
2078 return SEND_CREDS_NOTSK;
2079 return SEND_CREDS_FAIL;
2080 }
2081
2082 return SEND_CREDS_OK;
2083 }
2084
/*
 * Receive one SCM_CREDENTIALS message from @sock.
 *
 * SO_PASSCRED is enabled on @sock, a '1' byte is written to tell the peer
 * we are ready, and then up to about two seconds are spent waiting for
 * the reply. The one-byte payload of the reply is stored in *v. @cred is
 * filled from the control message if it has the expected length, level
 * and type; otherwise its fields stay at -1.
 *
 * Returns false (with *v == '1') when any step fails or times out.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;
	struct iovec iov;
	char payload[1];
	int optval = 1;
	int ret;

	/* Defaults reported to the caller if anything goes wrong. */
	*v = '1';
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}

	/* Ping the sender so it knows we are listening. */
	payload[0] = '1';
	if (write(sock, payload, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	iov.iov_base = payload;
	iov.iov_len = sizeof(payload);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}

	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);
	if (cmsg &&
	    cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
	    cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_CREDENTIALS)
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));

	*v = payload[0];
	return true;
}
2142
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	/* pipe pair; the child closes [0] and writes its ACK to [1] */
	int *cpipe;
	/* socket passed on to the wrapped function */
	int sock;
	/* pid passed on to the wrapped function */
	pid_t tpid;
	/* function to run in the child: pid_from_ns or pid_to_ns */
	int (*wrapped) (int, pid_t);
};
2149
2150 /*
2151 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2152 * with clone(). This simply writes '1' as ACK back to the parent
2153 * before calling the actual wrapped function.
2154 */
2155 static int pid_ns_clone_wrapper(void *arg) {
2156 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2157 char b = '1';
2158
2159 close(args->cpipe[0]);
2160 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2161 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2162 close(args->cpipe[1]);
2163 return args->wrapped(args->sock, args->tpid);
2164 }
2165
/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket. This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * The loop ends when receiving fails or when the sender's payload byte is
 * '1' (the "please exit" marker); both return 0. Returns 1 when writing a
 * pid back fails. @tpid is not used here.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
	struct ucred cred;
	char v = '0';

	for (;;) {
		if (!recv_creds(sock, &cred, &v))
			break;
		if (v == '1')
			break;
		if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			return 1;
	}

	return 0;
}
2184
2185
2186 /*
2187 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2188 * in your old pidns. Only children which you clone will be in the target
2189 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2190 * actually convert pids.
2191 *
2192 * Note: glibc's fork() does not respect pidns, which can lead to failed
2193 * assertions inside glibc (and thus failed forks) if the child's pid in
2194 * the pidns and the parent pid outside are identical. Using clone prevents
2195 * this issue.
2196 */
2197 static void pid_to_ns_wrapper(int sock, pid_t tpid)
2198 {
2199 int newnsfd = -1, ret, cpipe[2];
2200 char fnam[100];
2201 pid_t cpid;
2202 char v;
2203
2204 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2205 if (ret < 0 || ret >= sizeof(fnam))
2206 _exit(1);
2207 newnsfd = open(fnam, O_RDONLY);
2208 if (newnsfd < 0)
2209 _exit(1);
2210 if (setns(newnsfd, 0) < 0)
2211 _exit(1);
2212 close(newnsfd);
2213
2214 if (pipe(cpipe) < 0)
2215 _exit(1);
2216
2217 struct pid_ns_clone_args args = {
2218 .cpipe = cpipe,
2219 .sock = sock,
2220 .tpid = tpid,
2221 .wrapped = &pid_to_ns
2222 };
2223 size_t stack_size = sysconf(_SC_PAGESIZE);
2224 void *stack = alloca(stack_size);
2225
2226 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2227 if (cpid < 0)
2228 _exit(1);
2229
2230 // give the child 1 second to be done forking and
2231 // write its ack
2232 if (!wait_for_sock(cpipe[0], 1))
2233 _exit(1);
2234 ret = read(cpipe[0], &v, 1);
2235 if (ret != sizeof(char) || v != '1')
2236 _exit(1);
2237
2238 if (!wait_for_pid(cpid))
2239 _exit(1);
2240 _exit(0);
2241 }
2242
2243 /*
2244 * To read cgroup files with a particular pid, we will setns into the child
2245 * pidns, open a pipe, fork a child - which will be the first to really be in
2246 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2247 */
2248 bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
2249 {
2250 int sock[2] = {-1, -1};
2251 char *tmpdata = NULL;
2252 int ret;
2253 pid_t qpid, cpid = -1;
2254 bool answer = false;
2255 char v = '0';
2256 struct ucred cred;
2257 size_t sz = 0, asz = 0;
2258
2259 if (!cgfs_get_value(contrl, cg, file, &tmpdata))
2260 return false;
2261
2262 /*
2263 * Now we read the pids from returned data one by one, pass
2264 * them into a child in the target namespace, read back the
2265 * translated pids, and put them into our to-return data
2266 */
2267
2268 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2269 perror("socketpair");
2270 free(tmpdata);
2271 return false;
2272 }
2273
2274 cpid = fork();
2275 if (cpid == -1)
2276 goto out;
2277
2278 if (!cpid) // child - exits when done
2279 pid_to_ns_wrapper(sock[1], tpid);
2280
2281 char *ptr = tmpdata;
2282 cred.uid = 0;
2283 cred.gid = 0;
2284 while (sscanf(ptr, "%d\n", &qpid) == 1) {
2285 cred.pid = qpid;
2286 ret = send_creds(sock[0], &cred, v, true);
2287
2288 if (ret == SEND_CREDS_NOTSK)
2289 goto next;
2290 if (ret == SEND_CREDS_FAIL)
2291 goto out;
2292
2293 // read converted results
2294 if (!wait_for_sock(sock[0], 2)) {
2295 lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
2296 goto out;
2297 }
2298 if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2299 lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
2300 goto out;
2301 }
2302 must_strcat_pid(d, &sz, &asz, qpid);
2303 next:
2304 ptr = strchr(ptr, '\n');
2305 if (!ptr)
2306 break;
2307 ptr++;
2308 }
2309
2310 cred.pid = getpid();
2311 v = '1';
2312 if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
2313 // failed to ask child to exit
2314 lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
2315 goto out;
2316 }
2317
2318 answer = true;
2319
2320 out:
2321 free(tmpdata);
2322 if (cpid != -1)
2323 wait_for_pid(cpid);
2324 if (sock[0] != -1) {
2325 close(sock[0]);
2326 close(sock[1]);
2327 }
2328 return answer;
2329 }
2330
2331 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2332 struct fuse_file_info *fi)
2333 {
2334 struct fuse_context *fc = fuse_get_context();
2335 struct file_info *f = (struct file_info *)fi->fh;
2336 struct cgfs_files *k = NULL;
2337 char *data = NULL;
2338 int ret, s;
2339 bool r;
2340
2341 if (f->type != LXC_TYPE_CGFILE) {
2342 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2343 return -EIO;
2344 }
2345
2346 if (offset)
2347 return 0;
2348
2349 if (!fc)
2350 return -EIO;
2351
2352 if (!f->controller)
2353 return -EINVAL;
2354
2355 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2356 return -EINVAL;
2357 }
2358 free_key(k);
2359
2360
2361 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2362 ret = -EACCES;
2363 goto out;
2364 }
2365
2366 if (strcmp(f->file, "tasks") == 0 ||
2367 strcmp(f->file, "/tasks") == 0 ||
2368 strcmp(f->file, "/cgroup.procs") == 0 ||
2369 strcmp(f->file, "cgroup.procs") == 0)
2370 // special case - we have to translate the pids
2371 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2372 else
2373 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2374
2375 if (!r) {
2376 ret = -EINVAL;
2377 goto out;
2378 }
2379
2380 if (!data) {
2381 ret = 0;
2382 goto out;
2383 }
2384 s = strlen(data);
2385 if (s > size)
2386 s = size;
2387 memcpy(buf, data, s);
2388 if (s > 0 && s < size && data[s-1] != '\n')
2389 buf[s++] = '\n';
2390
2391 ret = s;
2392
2393 out:
2394 free(data);
2395 return ret;
2396 }
2397
2398 static int pid_from_ns(int sock, pid_t tpid)
2399 {
2400 pid_t vpid;
2401 struct ucred cred;
2402 char v;
2403 int ret;
2404
2405 cred.uid = 0;
2406 cred.gid = 0;
2407 while (1) {
2408 if (!wait_for_sock(sock, 2)) {
2409 lxcfs_error("%s\n", "Timeout reading from parent.");
2410 return 1;
2411 }
2412 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2413 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2414 return 1;
2415 }
2416 if (vpid == -1) // done
2417 break;
2418 v = '0';
2419 cred.pid = vpid;
2420 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2421 v = '1';
2422 cred.pid = getpid();
2423 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2424 return 1;
2425 }
2426 }
2427 return 0;
2428 }
2429
2430 static void pid_from_ns_wrapper(int sock, pid_t tpid)
2431 {
2432 int newnsfd = -1, ret, cpipe[2];
2433 char fnam[100];
2434 pid_t cpid;
2435 char v;
2436
2437 ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
2438 if (ret < 0 || ret >= sizeof(fnam))
2439 _exit(1);
2440 newnsfd = open(fnam, O_RDONLY);
2441 if (newnsfd < 0)
2442 _exit(1);
2443 if (setns(newnsfd, 0) < 0)
2444 _exit(1);
2445 close(newnsfd);
2446
2447 if (pipe(cpipe) < 0)
2448 _exit(1);
2449
2450 struct pid_ns_clone_args args = {
2451 .cpipe = cpipe,
2452 .sock = sock,
2453 .tpid = tpid,
2454 .wrapped = &pid_from_ns
2455 };
2456 size_t stack_size = sysconf(_SC_PAGESIZE);
2457 void *stack = alloca(stack_size);
2458
2459 cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
2460 if (cpid < 0)
2461 _exit(1);
2462
2463 // give the child 1 second to be done forking and
2464 // write its ack
2465 if (!wait_for_sock(cpipe[0], 1))
2466 _exit(1);
2467 ret = read(cpipe[0], &v, 1);
2468 if (ret != sizeof(char) || v != '1')
2469 _exit(1);
2470
2471 if (!wait_for_pid(cpid))
2472 _exit(1);
2473 _exit(0);
2474 }
2475
/*
 * Translate host @uid into the user namespace of @pid using
 * /proc/@pid/uid_map.
 *
 * On success the mapped uid is stored in *answer and true is returned.
 * Returns false when the map cannot be opened (*answer untouched) or
 * when @uid has no mapping in that namespace (*answer is then -1).
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];
	int ret;

	/* Bounded and checked, like the other /proc path builders here. */
	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return false;

	f = fopen(line, "r");
	if (f == NULL)
		return false;

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	return *answer != -1;
}
2497
/*
 * get_pid_creds: read the uid and gid of @pid from /proc/@pid/status.
 *
 * The first number on the "Uid:" and "Gid:" lines is used.
 * (XXX should we use euid here?)
 *
 * *uid and *gid are set to -1 up front and keep that value when the file
 * cannot be opened or the corresponding line is missing. A line that
 * cannot be parsed is logged and ends the scan.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;
	int ret;

	*uid = -1;
	*gid = -1;

	/* The buffer first holds the path, then each line of the file. */
	ret = snprintf(line, sizeof(line), "/proc/%d/status", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return;

	f = fopen(line, "r");
	if (f == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				/* pid_t is signed, so print it with %d */
				lxcfs_error("bad uid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %d\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2536
/*
 * May the requestor @r (with host uid @r_uid) move victim @v to a new
 * cgroup? This is allowed if
 *   . they are the same task,
 *   . @r is root on the host,
 *   . they are owned by the same uid, or
 *   . @r is root in its own user namespace and @v's uid is mapped into
 *     that namespace.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, ns_uid;
	gid_t v_gid;

	if (r == v || r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (r_uid == v_uid)
		return true;

	/* The requestor has to map to uid 0 in its own namespace ... */
	if (!hostuid_to_ns(r_uid, r, &ns_uid) || ns_uid != 0)
		return false;

	/* ... and the victim's uid has to be mapped there as well. */
	return hostuid_to_ns(v_uid, r, &ns_uid);
}
2562
2563 static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
2564 const char *file, const char *buf)
2565 {
2566 int sock[2] = {-1, -1};
2567 pid_t qpid, cpid = -1;
2568 FILE *pids_file = NULL;
2569 bool answer = false, fail = false;
2570
2571 pids_file = open_pids_file(contrl, cg);
2572 if (!pids_file)
2573 return false;
2574
2575 /*
2576 * write the pids to a socket, have helper in writer's pidns
2577 * call movepid for us
2578 */
2579 if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
2580 perror("socketpair");
2581 goto out;
2582 }
2583
2584 cpid = fork();
2585 if (cpid == -1)
2586 goto out;
2587
2588 if (!cpid) { // child
2589 fclose(pids_file);
2590 pid_from_ns_wrapper(sock[1], tpid);
2591 }
2592
2593 const char *ptr = buf;
2594 while (sscanf(ptr, "%d", &qpid) == 1) {
2595 struct ucred cred;
2596 char v;
2597
2598 if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
2599 lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
2600 goto out;
2601 }
2602
2603 if (recv_creds(sock[0], &cred, &v)) {
2604 if (v == '0') {
2605 if (!may_move_pid(tpid, tuid, cred.pid)) {
2606 fail = true;
2607 break;
2608 }
2609 if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
2610 fail = true;
2611 }
2612 }
2613
2614 ptr = strchr(ptr, '\n');
2615 if (!ptr)
2616 break;
2617 ptr++;
2618 }
2619
2620 /* All good, write the value */
2621 qpid = -1;
2622 if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
2623 lxcfs_error("%s\n", "Warning: failed to ask child to exit.");
2624
2625 if (!fail)
2626 answer = true;
2627
2628 out:
2629 if (cpid != -1)
2630 wait_for_pid(cpid);
2631 if (sock[0] != -1) {
2632 close(sock[0]);
2633 close(sock[1]);
2634 }
2635 if (pids_file) {
2636 if (fclose(pids_file) != 0)
2637 answer = false;
2638 }
2639 return answer;
2640 }
2641
2642 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
2643 struct fuse_file_info *fi)
2644 {
2645 struct fuse_context *fc = fuse_get_context();
2646 char *localbuf = NULL;
2647 struct cgfs_files *k = NULL;
2648 struct file_info *f = (struct file_info *)fi->fh;
2649 bool r;
2650
2651 if (f->type != LXC_TYPE_CGFILE) {
2652 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
2653 return -EIO;
2654 }
2655
2656 if (offset)
2657 return 0;
2658
2659 if (!fc)
2660 return -EIO;
2661
2662 localbuf = alloca(size+1);
2663 localbuf[size] = '\0';
2664 memcpy(localbuf, buf, size);
2665
2666 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2667 size = -EINVAL;
2668 goto out;
2669 }
2670
2671 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
2672 size = -EACCES;
2673 goto out;
2674 }
2675
2676 if (strcmp(f->file, "tasks") == 0 ||
2677 strcmp(f->file, "/tasks") == 0 ||
2678 strcmp(f->file, "/cgroup.procs") == 0 ||
2679 strcmp(f->file, "cgroup.procs") == 0)
2680 // special case - we have to translate the pids
2681 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
2682 else
2683 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
2684
2685 if (!r)
2686 size = -EINVAL;
2687
2688 out:
2689 free_key(k);
2690 return size;
2691 }
2692
2693 int cg_chown(const char *path, uid_t uid, gid_t gid)
2694 {
2695 struct fuse_context *fc = fuse_get_context();
2696 char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2697 struct cgfs_files *k = NULL;
2698 const char *cgroup;
2699 int ret;
2700
2701 if (!fc)
2702 return -EIO;
2703
2704 if (strcmp(path, "/cgroup") == 0)
2705 return -EPERM;
2706
2707 controller = pick_controller_from_path(fc, path);
2708 if (!controller)
2709 return errno == ENOENT ? -EPERM : -errno;
2710
2711 cgroup = find_cgroup_in_path(path);
2712 if (!cgroup)
2713 /* this is just /cgroup/controller */
2714 return -EPERM;
2715
2716 get_cgdir_and_path(cgroup, &cgdir, &last);
2717
2718 if (!last) {
2719 path1 = "/";
2720 path2 = cgdir;
2721 } else {
2722 path1 = cgdir;
2723 path2 = last;
2724 }
2725
2726 if (is_child_cgroup(controller, path1, path2)) {
2727 // get uid, gid, from '/tasks' file and make up a mode
2728 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2729 k = cgfs_get_key(controller, cgroup, "tasks");
2730
2731 } else
2732 k = cgfs_get_key(controller, path1, path2);
2733
2734 if (!k) {
2735 ret = -EINVAL;
2736 goto out;
2737 }
2738
2739 /*
2740 * This being a fuse request, the uid and gid must be valid
2741 * in the caller's namespace. So we can just check to make
2742 * sure that the caller is root in his uid, and privileged
2743 * over the file's current owner.
2744 */
2745 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
2746 ret = -EACCES;
2747 goto out;
2748 }
2749
2750 ret = cgfs_chown_file(controller, cgroup, uid, gid);
2751
2752 out:
2753 free_key(k);
2754 free(cgdir);
2755
2756 return ret;
2757 }
2758
2759 int cg_chmod(const char *path, mode_t mode)
2760 {
2761 struct fuse_context *fc = fuse_get_context();
2762 char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
2763 struct cgfs_files *k = NULL;
2764 const char *cgroup;
2765 int ret;
2766
2767 if (!fc)
2768 return -EIO;
2769
2770 if (strcmp(path, "/cgroup") == 0)
2771 return -EPERM;
2772
2773 controller = pick_controller_from_path(fc, path);
2774 if (!controller)
2775 return errno == ENOENT ? -EPERM : -errno;
2776
2777 cgroup = find_cgroup_in_path(path);
2778 if (!cgroup)
2779 /* this is just /cgroup/controller */
2780 return -EPERM;
2781
2782 get_cgdir_and_path(cgroup, &cgdir, &last);
2783
2784 if (!last) {
2785 path1 = "/";
2786 path2 = cgdir;
2787 } else {
2788 path1 = cgdir;
2789 path2 = last;
2790 }
2791
2792 if (is_child_cgroup(controller, path1, path2)) {
2793 // get uid, gid, from '/tasks' file and make up a mode
2794 // That is a hack, until cgmanager gains a GetCgroupPerms fn.
2795 k = cgfs_get_key(controller, cgroup, "tasks");
2796
2797 } else
2798 k = cgfs_get_key(controller, path1, path2);
2799
2800 if (!k) {
2801 ret = -EINVAL;
2802 goto out;
2803 }
2804
2805 /*
2806 * This being a fuse request, the uid and gid must be valid
2807 * in the caller's namespace. So we can just check to make
2808 * sure that the caller is root in his uid, and privileged
2809 * over the file's current owner.
2810 */
2811 if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
2812 ret = -EPERM;
2813 goto out;
2814 }
2815
2816 if (!cgfs_chmod_file(controller, cgroup, mode)) {
2817 ret = -EINVAL;
2818 goto out;
2819 }
2820
2821 ret = 0;
2822 out:
2823 free_key(k);
2824 free(cgdir);
2825 return ret;
2826 }
2827
2828 int cg_mkdir(const char *path, mode_t mode)
2829 {
2830 struct fuse_context *fc = fuse_get_context();
2831 char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
2832 const char *cgroup;
2833 int ret;
2834
2835 if (!fc)
2836 return -EIO;
2837
2838 controller = pick_controller_from_path(fc, path);
2839 if (!controller)
2840 return errno == ENOENT ? -EPERM : -errno;
2841
2842 cgroup = find_cgroup_in_path(path);
2843 if (!cgroup)
2844 return -errno;
2845
2846 get_cgdir_and_path(cgroup, &cgdir, &last);
2847 if (!last)
2848 path1 = "/";
2849 else
2850 path1 = cgdir;
2851
2852 pid_t initpid = lookup_initpid_in_store(fc->pid);
2853 if (initpid <= 0)
2854 initpid = fc->pid;
2855 if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
2856 if (!next)
2857 ret = -EINVAL;
2858 else if (last && strcmp(next, last) == 0)
2859 ret = -EEXIST;
2860 else
2861 ret = -EPERM;
2862 goto out;
2863 }
2864
2865 if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
2866 ret = -EACCES;
2867 goto out;
2868 }
2869 if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
2870 ret = -EACCES;
2871 goto out;
2872 }
2873
2874 ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);
2875
2876 out:
2877 free(cgdir);
2878 free(next);
2879 return ret;
2880 }
2881
2882 int cg_rmdir(const char *path)
2883 {
2884 struct fuse_context *fc = fuse_get_context();
2885 char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
2886 const char *cgroup;
2887 int ret;
2888
2889 if (!fc)
2890 return -EIO;
2891
2892 controller = pick_controller_from_path(fc, path);
2893 if (!controller) /* Someone's trying to delete "/cgroup". */
2894 return -EPERM;
2895
2896 cgroup = find_cgroup_in_path(path);
2897 if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
2898 return -EPERM;
2899
2900 get_cgdir_and_path(cgroup, &cgdir, &last);
2901 if (!last) {
2902 /* Someone's trying to delete a cgroup on the same level as the
2903 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
2904 * rmdir "/cgroup/blkio/init.slice".
2905 */
2906 ret = -EPERM;
2907 goto out;
2908 }
2909
2910 pid_t initpid = lookup_initpid_in_store(fc->pid);
2911 if (initpid <= 0)
2912 initpid = fc->pid;
2913 if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
2914 if (!last || strcmp(next, last) == 0)
2915 ret = -EBUSY;
2916 else
2917 ret = -ENOENT;
2918 goto out;
2919 }
2920
2921 if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
2922 ret = -EACCES;
2923 goto out;
2924 }
2925 if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
2926 ret = -EACCES;
2927 goto out;
2928 }
2929
2930 if (!cgfs_remove(controller, cgroup)) {
2931 ret = -EINVAL;
2932 goto out;
2933 }
2934
2935 ret = 0;
2936
2937 out:
2938 free(cgdir);
2939 free(next);
2940 return ret;
2941 }
2942
/* Return true if @line begins with the string @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
2949
/*
 * Extract selected counters from the text of a cgroup memory.stat file.
 *
 * @memstat is scanned line by line. For each line of the form
 * "<key> <value>" whose key is one of cache, active_anon, inactive_anon,
 * active_file, inactive_file or unevictable, the value is parsed and stored
 * in the matching output, divided by 1024.
 *
 * Outputs for keys that are absent or whose value cannot be parsed are
 * left untouched, so callers should pre-initialise them.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable)
{
	const struct {
		const char *key;
		unsigned long *out;
	} fields[] = {
		{ "cache",         cached        },
		{ "active_anon",   active_anon   },
		{ "inactive_anon", inactive_anon },
		{ "active_file",   active_file   },
		{ "inactive_file", inactive_file },
		{ "unevictable",   unevictable   },
	};
	const size_t nfields = sizeof(fields) / sizeof(fields[0]);
	char *eol;
	size_t i;

	while (*memstat) {
		for (i = 0; i < nfields; i++) {
			/*
			 * The value starts right after the key, so the offset
			 * must be the length of *this* key rather than a fixed
			 * constant. Requiring the separating blank keeps a key
			 * from matching a longer one that shares its prefix.
			 */
			size_t klen = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, klen) != 0 ||
			    memstat[klen] != ' ')
				continue;

			if (sscanf(memstat + klen, "%lu", fields[i].out) == 1)
				*fields[i].out /= 1024;
			break;
		}

		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
2983
/*
 * Find the line "<major>:<minor> <iotype> <value>" in the blkio statistics
 * text @str and store <value> in @v.
 *
 * @v is set to 0 first and stays 0 when no matching line exists or its
 * value cannot be parsed. Only the first matching line is considered.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32] = {0};
	size_t keylen;
	char *cur;

	snprintf(key, 32, "%u:%u %s", major, minor, iotype);
	keylen = strlen(key);

	*v = 0;

	/* Each pass looks at one line; the increment steps over its '\n'. */
	for (cur = str; *cur; cur++) {
		if (startswith(cur, key)) {
			sscanf(cur + keylen, "%lu", v);
			return;
		}

		cur = strchr(cur, '\n');
		if (!cur)
			return;
	}
}
3006
3007 static int read_file(const char *path, char *buf, size_t size,
3008 struct file_info *d)
3009 {
3010 size_t linelen = 0, total_len = 0, rv = 0;
3011 char *line = NULL;
3012 char *cache = d->buf;
3013 size_t cache_size = d->buflen;
3014 FILE *f = fopen(path, "r");
3015 if (!f)
3016 return 0;
3017
3018 while (getline(&line, &linelen, f) != -1) {
3019 ssize_t l = snprintf(cache, cache_size, "%s", line);
3020 if (l < 0) {
3021 perror("Error writing to cache");
3022 rv = 0;
3023 goto err;
3024 }
3025 if (l >= cache_size) {
3026 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3027 rv = 0;
3028 goto err;
3029 }
3030 cache += l;
3031 cache_size -= l;
3032 total_len += l;
3033 }
3034
3035 d->size = total_len;
3036 if (total_len > size)
3037 total_len = size;
3038
3039 /* read from off 0 */
3040 memcpy(buf, d->buf, total_len);
3041 rv = total_len;
3042 err:
3043 fclose(f);
3044 free(line);
3045 return rv;
3046 }
3047
3048 /*
3049 * FUSE ops for /proc
3050 */
3051
/*
 * Return memory.limit_in_bytes of @cgroup, or (unsigned long)-1 when the
 * value cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup)
{
	char *value = NULL;
	unsigned long limit;

	if (cgfs_get_value("memory", cgroup, "memory.limit_in_bytes", &value))
		limit = strtoul(value, NULL, 10);
	else
		limit = (unsigned long)-1;

	free(value);
	return limit;
}
3064
/*
 * Return the smallest memory limit found on @cgroup or any of its
 * ancestors up to and including "/".
 *
 * Ancestors whose limit cannot be read (get_memlimit() == -1) are ignored.
 */
static unsigned long get_min_memlimit(const char *cgroup)
{
	/* dirname() may modify its argument, so walk a private copy. */
	char *walk = strdupa(cgroup);
	unsigned long lowest = get_memlimit(walk);

	while (strcmp(walk, "/") != 0) {
		unsigned long cur;

		walk = dirname(walk);
		cur = get_memlimit(walk);
		if (cur != (unsigned long)-1 && cur < lowest)
			lowest = cur;
	}

	return lowest;
}
3081
3082 static int proc_meminfo_read(char *buf, size_t size, off_t offset,
3083 struct fuse_file_info *fi)
3084 {
3085 struct fuse_context *fc = fuse_get_context();
3086 struct file_info *d = (struct file_info *)fi->fh;
3087 char *cg;
3088 char *memusage_str = NULL, *memstat_str = NULL,
3089 *memswlimit_str = NULL, *memswusage_str = NULL,
3090 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3091 unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
3092 cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
3093 active_file = 0, inactive_file = 0, unevictable = 0;
3094 char *line = NULL;
3095 size_t linelen = 0, total_len = 0, rv = 0;
3096 char *cache = d->buf;
3097 size_t cache_size = d->buflen;
3098 FILE *f = NULL;
3099
3100 if (offset){
3101 if (offset > d->size)
3102 return -EINVAL;
3103 if (!d->cached)
3104 return 0;
3105 int left = d->size - offset;
3106 total_len = left > size ? size: left;
3107 memcpy(buf, cache + offset, total_len);
3108 return total_len;
3109 }
3110
3111 pid_t initpid = lookup_initpid_in_store(fc->pid);
3112 if (initpid <= 0)
3113 initpid = fc->pid;
3114 cg = get_pid_cgroup(initpid, "memory");
3115 if (!cg)
3116 return read_file("/proc/meminfo", buf, size, d);
3117 prune_init_slice(cg);
3118
3119 memlimit = get_min_memlimit(cg);
3120 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3121 goto err;
3122 if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
3123 goto err;
3124
3125 // Following values are allowed to fail, because swapaccount might be turned
3126 // off for current kernel
3127 if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
3128 cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
3129 {
3130 /* If swapaccounting is turned on, then default value is assumed to be that of cgroup / */
3131 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3132 goto err;
3133 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3134 goto err;
3135
3136 memswlimit = strtoul(memswlimit_str, NULL, 10);
3137 memswusage = strtoul(memswusage_str, NULL, 10);
3138
3139 if (!strcmp(memswlimit_str, memswlimit_default_str))
3140 memswlimit = 0;
3141 if (!strcmp(memswusage_str, memswusage_default_str))
3142 memswusage = 0;
3143
3144 memswlimit = memswlimit / 1024;
3145 memswusage = memswusage / 1024;
3146 }
3147
3148 memusage = strtoul(memusage_str, NULL, 10);
3149 memlimit /= 1024;
3150 memusage /= 1024;
3151
3152 parse_memstat(memstat_str, &cached, &active_anon,
3153 &inactive_anon, &active_file, &inactive_file,
3154 &unevictable);
3155
3156 f = fopen("/proc/meminfo", "r");
3157 if (!f)
3158 goto err;
3159
3160 while (getline(&line, &linelen, f) != -1) {
3161 ssize_t l;
3162 char *printme, lbuf[100];
3163
3164 memset(lbuf, 0, 100);
3165 if (startswith(line, "MemTotal:")) {
3166 sscanf(line+14, "%lu", &hosttotal);
3167 if (hosttotal < memlimit)
3168 memlimit = hosttotal;
3169 snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
3170 printme = lbuf;
3171 } else if (startswith(line, "MemFree:")) {
3172 snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
3173 printme = lbuf;
3174 } else if (startswith(line, "MemAvailable:")) {
3175 snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage);
3176 printme = lbuf;
3177 } else if (startswith(line, "SwapTotal:") && memswlimit > 0) {
3178 snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit - memlimit);
3179 printme = lbuf;
3180 } else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0) {
3181 unsigned long swaptotal = memswlimit - memlimit,
3182 swapusage = memswusage - memusage,
3183 swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
3184 snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
3185 printme = lbuf;
3186 } else if (startswith(line, "Slab:")) {
3187 snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
3188 printme = lbuf;
3189 } else if (startswith(line, "Buffers:")) {
3190 snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
3191 printme = lbuf;
3192 } else if (startswith(line, "Cached:")) {
3193 snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
3194 printme = lbuf;
3195 } else if (startswith(line, "SwapCached:")) {
3196 snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
3197 printme = lbuf;
3198 } else if (startswith(line, "Active")) {
3199 snprintf(lbuf, 100, "Active: %8lu kB\n",
3200 active_anon + active_file);
3201 printme = lbuf;
3202 } else if (startswith(line, "Inactive")) {
3203 snprintf(lbuf, 100, "Inactive: %8lu kB\n",
3204 inactive_anon + inactive_file);
3205 printme = lbuf;
3206 } else if (startswith(line, "Active(anon)")) {
3207 snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
3208 printme = lbuf;
3209 } else if (startswith(line, "Inactive(anon)")) {
3210 snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
3211 printme = lbuf;
3212 } else if (startswith(line, "Active(file)")) {
3213 snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
3214 printme = lbuf;
3215 } else if (startswith(line, "Inactive(file)")) {
3216 snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
3217 printme = lbuf;
3218 } else if (startswith(line, "Unevictable")) {
3219 snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
3220 printme = lbuf;
3221 } else if (startswith(line, "SReclaimable")) {
3222 snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
3223 printme = lbuf;
3224 } else if (startswith(line, "SUnreclaim")) {
3225 snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
3226 printme = lbuf;
3227 } else
3228 printme = line;
3229
3230 l = snprintf(cache, cache_size, "%s", printme);
3231 if (l < 0) {
3232 perror("Error writing to cache");
3233 rv = 0;
3234 goto err;
3235
3236 }
3237 if (l >= cache_size) {
3238 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3239 rv = 0;
3240 goto err;
3241 }
3242
3243 cache += l;
3244 cache_size -= l;
3245 total_len += l;
3246 }
3247
3248 d->cached = 1;
3249 d->size = total_len;
3250 if (total_len > size ) total_len = size;
3251 memcpy(buf, d->buf, total_len);
3252
3253 rv = total_len;
3254 err:
3255 if (f)
3256 fclose(f);
3257 free(line);
3258 free(cg);
3259 free(memusage_str);
3260 free(memswlimit_str);
3261 free(memswusage_str);
3262 free(memstat_str);
3263 free(memswlimit_default_str);
3264 free(memswusage_default_str);
3265 return rv;
3266 }
3267
/*
 * Read cpuset.cpus of cgroup @cg.
 *
 * Returns the value as obtained from cgfs_get_value(), which the caller
 * must free, or NULL when it cannot be read.
 */
static char *get_cpuset(const char *cg)
{
	char *cpus;

	return cgfs_get_value("cpuset", cg, "cpuset.cpus", &cpus) ? cpus : NULL;
}
3280
bool cpu_in_cpuset(int cpu, const char *cpuset);

/*
 * Return true if @line is a "processor : N" line of /proc/cpuinfo and
 * cpu N is contained in @cpuset according to cpu_in_cpuset().
 */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1 &&
	       cpu_in_cpuset(cpu, cpuset);
}
3291
/*
 * Check whether @line is a "processor : N" line as found in
 * /proc/cpuinfo, i.e. whether a cpu number can be parsed from it.
 */
static bool is_processor_line(const char *line)
{
	int cpu;

	return sscanf(line, "processor : %d", &cpu) == 1;
}
3303
3304 static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
3305 struct fuse_file_info *fi)
3306 {
3307 struct fuse_context *fc = fuse_get_context();
3308 struct file_info *d = (struct file_info *)fi->fh;
3309 char *cg;
3310 char *cpuset = NULL;
3311 char *line = NULL;
3312 size_t linelen = 0, total_len = 0, rv = 0;
3313 bool am_printing = false, firstline = true, is_s390x = false;
3314 int curcpu = -1, cpu;
3315 char *cache = d->buf;
3316 size_t cache_size = d->buflen;
3317 FILE *f = NULL;
3318
3319 if (offset){
3320 if (offset > d->size)
3321 return -EINVAL;
3322 if (!d->cached)
3323 return 0;
3324 int left = d->size - offset;
3325 total_len = left > size ? size: left;
3326 memcpy(buf, cache + offset, total_len);
3327 return total_len;
3328 }
3329
3330 pid_t initpid = lookup_initpid_in_store(fc->pid);
3331 if (initpid <= 0)
3332 initpid = fc->pid;
3333 cg = get_pid_cgroup(initpid, "cpuset");
3334 if (!cg)
3335 return read_file("proc/cpuinfo", buf, size, d);
3336 prune_init_slice(cg);
3337
3338 cpuset = get_cpuset(cg);
3339 if (!cpuset)
3340 goto err;
3341
3342 f = fopen("/proc/cpuinfo", "r");
3343 if (!f)
3344 goto err;
3345
3346 while (getline(&line, &linelen, f) != -1) {
3347 ssize_t l;
3348 if (firstline) {
3349 firstline = false;
3350 if (strstr(line, "IBM/S390") != NULL) {
3351 is_s390x = true;
3352 am_printing = true;
3353 continue;
3354 }
3355 }
3356 if (strncmp(line, "# processors:", 12) == 0)
3357 continue;
3358 if (is_processor_line(line)) {
3359 am_printing = cpuline_in_cpuset(line, cpuset);
3360 if (am_printing) {
3361 curcpu ++;
3362 l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
3363 if (l < 0) {
3364 perror("Error writing to cache");
3365 rv = 0;
3366 goto err;
3367 }
3368 if (l >= cache_size) {
3369 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3370 rv = 0;
3371 goto err;
3372 }
3373 cache += l;
3374 cache_size -= l;
3375 total_len += l;
3376 }
3377 continue;
3378 } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
3379 char *p;
3380 if (!cpu_in_cpuset(cpu, cpuset))
3381 continue;
3382 curcpu ++;
3383 p = strchr(line, ':');
3384 if (!p || !*p)
3385 goto err;
3386 p++;
3387 l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
3388 if (l < 0) {
3389 perror("Error writing to cache");
3390 rv = 0;
3391 goto err;
3392 }
3393 if (l >= cache_size) {
3394 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3395 rv = 0;
3396 goto err;
3397 }
3398 cache += l;
3399 cache_size -= l;
3400 total_len += l;
3401 continue;
3402
3403 }
3404 if (am_printing) {
3405 l = snprintf(cache, cache_size, "%s", line);
3406 if (l < 0) {
3407 perror("Error writing to cache");
3408 rv = 0;
3409 goto err;
3410 }
3411 if (l >= cache_size) {
3412 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3413 rv = 0;
3414 goto err;
3415 }
3416 cache += l;
3417 cache_size -= l;
3418 total_len += l;
3419 }
3420 }
3421
3422 if (is_s390x) {
3423 char *origcache = d->buf;
3424 ssize_t l;
3425 do {
3426 d->buf = malloc(d->buflen);
3427 } while (!d->buf);
3428 cache = d->buf;
3429 cache_size = d->buflen;
3430 total_len = 0;
3431 l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n");
3432 if (l < 0 || l >= cache_size) {
3433 free(origcache);
3434 goto err;
3435 }
3436 cache_size -= l;
3437 cache += l;
3438 total_len += l;
3439 l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1);
3440 if (l < 0 || l >= cache_size) {
3441 free(origcache);
3442 goto err;
3443 }
3444 cache_size -= l;
3445 cache += l;
3446 total_len += l;
3447 l = snprintf(cache, cache_size, "%s", origcache);
3448 free(origcache);
3449 if (l < 0 || l >= cache_size)
3450 goto err;
3451 total_len += l;
3452 }
3453
3454 d->cached = 1;
3455 d->size = total_len;
3456 if (total_len > size ) total_len = size;
3457
3458 /* read from off 0 */
3459 memcpy(buf, d->buf, total_len);
3460 rv = total_len;
3461 err:
3462 if (f)
3463 fclose(f);
3464 free(line);
3465 free(cpuset);
3466 free(cg);
3467 return rv;
3468 }
3469
3470 static int proc_stat_read(char *buf, size_t size, off_t offset,
3471 struct fuse_file_info *fi)
3472 {
3473 struct fuse_context *fc = fuse_get_context();
3474 struct file_info *d = (struct file_info *)fi->fh;
3475 char *cg;
3476 char *cpuset = NULL;
3477 char *line = NULL;
3478 size_t linelen = 0, total_len = 0, rv = 0;
3479 int curcpu = -1; /* cpu numbering starts at 0 */
3480 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0;
3481 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
3482 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0;
3483 #define CPUALL_MAX_SIZE BUF_RESERVE_SIZE
3484 char cpuall[CPUALL_MAX_SIZE];
3485 /* reserve for cpu all */
3486 char *cache = d->buf + CPUALL_MAX_SIZE;
3487 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
3488 FILE *f = NULL;
3489
3490 if (offset){
3491 if (offset > d->size)
3492 return -EINVAL;
3493 if (!d->cached)
3494 return 0;
3495 int left = d->size - offset;
3496 total_len = left > size ? size: left;
3497 memcpy(buf, d->buf + offset, total_len);
3498 return total_len;
3499 }
3500
3501 pid_t initpid = lookup_initpid_in_store(fc->pid);
3502 if (initpid <= 0)
3503 initpid = fc->pid;
3504 cg = get_pid_cgroup(initpid, "cpuset");
3505 if (!cg)
3506 return read_file("/proc/stat", buf, size, d);
3507 prune_init_slice(cg);
3508
3509 cpuset = get_cpuset(cg);
3510 if (!cpuset)
3511 goto err;
3512
3513 f = fopen("/proc/stat", "r");
3514 if (!f)
3515 goto err;
3516
3517 //skip first line
3518 if (getline(&line, &linelen, f) < 0) {
3519 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
3520 goto err;
3521 }
3522
3523 while (getline(&line, &linelen, f) != -1) {
3524 ssize_t l;
3525 int cpu;
3526 char cpu_char[10]; /* That's a lot of cores */
3527 char *c;
3528
3529 if (strlen(line) == 0)
3530 continue;
3531 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
3532 /* not a ^cpuN line containing a number N, just print it */
3533 l = snprintf(cache, cache_size, "%s", line);
3534 if (l < 0) {
3535 perror("Error writing to cache");
3536 rv = 0;
3537 goto err;
3538 }
3539 if (l >= cache_size) {
3540 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3541 rv = 0;
3542 goto err;
3543 }
3544 cache += l;
3545 cache_size -= l;
3546 total_len += l;
3547 continue;
3548 }
3549
3550 if (sscanf(cpu_char, "%d", &cpu) != 1)
3551 continue;
3552 if (!cpu_in_cpuset(cpu, cpuset))
3553 continue;
3554 curcpu ++;
3555
3556 c = strchr(line, ' ');
3557 if (!c)
3558 continue;
3559 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
3560 if (l < 0) {
3561 perror("Error writing to cache");
3562 rv = 0;
3563 goto err;
3564
3565 }
3566 if (l >= cache_size) {
3567 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3568 rv = 0;
3569 goto err;
3570 }
3571
3572 cache += l;
3573 cache_size -= l;
3574 total_len += l;
3575
3576 if (sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &system, &idle, &iowait, &irq,
3577 &softirq, &steal, &guest) != 9)
3578 continue;
3579 user_sum += user;
3580 nice_sum += nice;
3581 system_sum += system;
3582 idle_sum += idle;
3583 iowait_sum += iowait;
3584 irq_sum += irq;
3585 softirq_sum += softirq;
3586 steal_sum += steal;
3587 guest_sum += guest;
3588 }
3589
3590 cache = d->buf;
3591
3592 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "%s %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3593 "cpu ", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum);
3594 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE){
3595 memcpy(cache, cpuall, cpuall_len);
3596 cache += cpuall_len;
3597 } else{
3598 /* shouldn't happen */
3599 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
3600 cpuall_len = 0;
3601 }
3602
3603 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
3604 total_len += cpuall_len;
3605 d->cached = 1;
3606 d->size = total_len;
3607 if (total_len > size ) total_len = size;
3608
3609 memcpy(buf, d->buf, total_len);
3610 rv = total_len;
3611
3612 err:
3613 if (f)
3614 fclose(f);
3615 free(line);
3616 free(cpuset);
3617 free(cg);
3618 return rv;
3619 }
3620
/*
 * Return the number of seconds elapsed since the ctime of /proc/<init>,
 * where <init> is the init pid recorded for @pid's pid namespace.
 *
 * Returns 0 when the init pid is unknown or its /proc entry cannot be
 * examined.
 */
static long int getreaperage(pid_t pid)
{
	char procdir[100];
	struct stat st;
	pid_t reaper;
	int n;

	reaper = lookup_initpid_in_store(pid);
	if (reaper <= 0)
		return 0;

	n = snprintf(procdir, 100, "/proc/%d", reaper);
	if (n < 0 || n >= 100 || lstat(procdir, &st) < 0)
		return 0;

	return time(NULL) - st.st_ctime;
}
3641
/*
 * Return cpuacct.usage of the cgroup of @task's init, divided by
 * 1000000000.
 *
 * Returns 0 when the init pid, its cpuacct cgroup or the usage value
 * cannot be determined.
 */
static unsigned long get_reaper_busy(pid_t task)
{
	pid_t initpid = lookup_initpid_in_store(task);
	char *cg, *usage_str = NULL;
	unsigned long busy = 0;

	if (initpid <= 0)
		return 0;

	cg = get_pid_cgroup(initpid, "cpuacct");
	if (cg) {
		prune_init_slice(cg);
		if (cgfs_get_value("cpuacct", cg, "cpuacct.usage", &usage_str))
			busy = strtoul(usage_str, NULL, 10) / 1000000000;
	}

	free(cg);
	free(usage_str);
	return busy;
}
3665
#if RELOADTEST
/*
 * Leave a marker file behind; only built when RELOADTEST is set.
 * Failure to create the file is ignored.
 */
void iwashere(void)
{
	int fd = creat("/tmp/lxcfs-iwashere", 0644);

	if (fd < 0)
		return;
	close(fd);
}
#endif
3676
3677 /*
3678 * We read /proc/uptime and reuse its second field.
3679 * For the first field, we use the mtime for the reaper for
3680 * the calling pid as returned by getreaperage
3681 */
3682 static int proc_uptime_read(char *buf, size_t size, off_t offset,
3683 struct fuse_file_info *fi)
3684 {
3685 struct fuse_context *fc = fuse_get_context();
3686 struct file_info *d = (struct file_info *)fi->fh;
3687 long int reaperage = getreaperage(fc->pid);
3688 unsigned long int busytime = get_reaper_busy(fc->pid), idletime;
3689 char *cache = d->buf;
3690 ssize_t total_len = 0;
3691
3692 #if RELOADTEST
3693 iwashere();
3694 #endif
3695
3696 if (offset){
3697 if (offset > d->size)
3698 return -EINVAL;
3699 if (!d->cached)
3700 return 0;
3701 int left = d->size - offset;
3702 total_len = left > size ? size: left;
3703 memcpy(buf, cache + offset, total_len);
3704 return total_len;
3705 }
3706
3707 idletime = reaperage - busytime;
3708 if (idletime > reaperage)
3709 idletime = reaperage;
3710
3711 total_len = snprintf(d->buf, d->size, "%ld.0 %lu.0\n", reaperage, idletime);
3712 if (total_len < 0){
3713 perror("Error writing to cache");
3714 return 0;
3715 }
3716
3717 d->size = (int)total_len;
3718 d->cached = 1;
3719
3720 if (total_len > size) total_len = size;
3721
3722 memcpy(buf, d->buf, total_len);
3723 return total_len;
3724 }
3725
3726 static int proc_diskstats_read(char *buf, size_t size, off_t offset,
3727 struct fuse_file_info *fi)
3728 {
3729 char dev_name[72];
3730 struct fuse_context *fc = fuse_get_context();
3731 struct file_info *d = (struct file_info *)fi->fh;
3732 char *cg;
3733 char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
3734 *io_wait_time_str = NULL, *io_service_time_str = NULL;
3735 unsigned long read = 0, write = 0;
3736 unsigned long read_merged = 0, write_merged = 0;
3737 unsigned long read_sectors = 0, write_sectors = 0;
3738 unsigned long read_ticks = 0, write_ticks = 0;
3739 unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
3740 unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
3741 char *cache = d->buf;
3742 size_t cache_size = d->buflen;
3743 char *line = NULL;
3744 size_t linelen = 0, total_len = 0, rv = 0;
3745 unsigned int major = 0, minor = 0;
3746 int i = 0;
3747 FILE *f = NULL;
3748
3749 if (offset){
3750 if (offset > d->size)
3751 return -EINVAL;
3752 if (!d->cached)
3753 return 0;
3754 int left = d->size - offset;
3755 total_len = left > size ? size: left;
3756 memcpy(buf, cache + offset, total_len);
3757 return total_len;
3758 }
3759
3760 pid_t initpid = lookup_initpid_in_store(fc->pid);
3761 if (initpid <= 0)
3762 initpid = fc->pid;
3763 cg = get_pid_cgroup(initpid, "blkio");
3764 if (!cg)
3765 return read_file("/proc/diskstats", buf, size, d);
3766 prune_init_slice(cg);
3767
3768 if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
3769 goto err;
3770 if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
3771 goto err;
3772 if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
3773 goto err;
3774 if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
3775 goto err;
3776 if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
3777 goto err;
3778
3779
3780 f = fopen("/proc/diskstats", "r");
3781 if (!f)
3782 goto err;
3783
3784 while (getline(&line, &linelen, f) != -1) {
3785 ssize_t l;
3786 char lbuf[256];
3787
3788 i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
3789 if (i != 3)
3790 continue;
3791
3792 get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
3793 get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
3794 get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
3795 get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
3796 get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
3797 read_sectors = read_sectors/512;
3798 get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
3799 write_sectors = write_sectors/512;
3800
3801 get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
3802 rd_svctm = rd_svctm/1000000;
3803 get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
3804 rd_wait = rd_wait/1000000;
3805 read_ticks = rd_svctm + rd_wait;
3806
3807 get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
3808 wr_svctm = wr_svctm/1000000;
3809 get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
3810 wr_wait = wr_wait/1000000;
3811 write_ticks = wr_svctm + wr_wait;
3812
3813 get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
3814 tot_ticks = tot_ticks/1000000;
3815
3816 memset(lbuf, 0, 256);
3817 if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
3818 snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
3819 major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
3820 write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
3821 else
3822 continue;
3823
3824 l = snprintf(cache, cache_size, "%s", lbuf);
3825 if (l < 0) {
3826 perror("Error writing to fuse buf");
3827 rv = 0;
3828 goto err;
3829 }
3830 if (l >= cache_size) {
3831 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
3832 rv = 0;
3833 goto err;
3834 }
3835 cache += l;
3836 cache_size -= l;
3837 total_len += l;
3838 }
3839
3840 d->cached = 1;
3841 d->size = total_len;
3842 if (total_len > size ) total_len = size;
3843 memcpy(buf, d->buf, total_len);
3844
3845 rv = total_len;
3846 err:
3847 free(cg);
3848 if (f)
3849 fclose(f);
3850 free(line);
3851 free(io_serviced_str);
3852 free(io_merged_str);
3853 free(io_service_bytes_str);
3854 free(io_wait_time_str);
3855 free(io_service_time_str);
3856 return rv;
3857 }
3858
3859 static int proc_swaps_read(char *buf, size_t size, off_t offset,
3860 struct fuse_file_info *fi)
3861 {
3862 struct fuse_context *fc = fuse_get_context();
3863 struct file_info *d = (struct file_info *)fi->fh;
3864 char *cg = NULL;
3865 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL,
3866 *memswlimit_default_str = NULL, *memswusage_default_str = NULL;
3867 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
3868 ssize_t total_len = 0, rv = 0;
3869 ssize_t l = 0;
3870 char *cache = d->buf;
3871
3872 if (offset) {
3873 if (offset > d->size)
3874 return -EINVAL;
3875 if (!d->cached)
3876 return 0;
3877 int left = d->size - offset;
3878 total_len = left > size ? size: left;
3879 memcpy(buf, cache + offset, total_len);
3880 return total_len;
3881 }
3882
3883 pid_t initpid = lookup_initpid_in_store(fc->pid);
3884 if (initpid <= 0)
3885 initpid = fc->pid;
3886 cg = get_pid_cgroup(initpid, "memory");
3887 if (!cg)
3888 return read_file("/proc/swaps", buf, size, d);
3889 prune_init_slice(cg);
3890
3891 if (!cgfs_get_value("memory", cg, "memory.limit_in_bytes", &memlimit_str))
3892 goto err;
3893
3894 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
3895 goto err;
3896
3897 memlimit = strtoul(memlimit_str, NULL, 10);
3898 memusage = strtoul(memusage_str, NULL, 10);
3899
3900 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
3901 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
3902
3903 /* If swap accounting is turned on, then default value is assumed to be that of cgroup / */
3904 if (!cgfs_get_value("memory", "/", "memory.memsw.limit_in_bytes", &memswlimit_default_str))
3905 goto err;
3906 if (!cgfs_get_value("memory", "/", "memory.memsw.usage_in_bytes", &memswusage_default_str))
3907 goto err;
3908
3909 memswlimit = strtoul(memswlimit_str, NULL, 10);
3910 memswusage = strtoul(memswusage_str, NULL, 10);
3911
3912 if (!strcmp(memswlimit_str, memswlimit_default_str))
3913 memswlimit = 0;
3914 if (!strcmp(memswusage_str, memswusage_default_str))
3915 memswusage = 0;
3916
3917 swap_total = (memswlimit - memlimit) / 1024;
3918 swap_free = (memswusage - memusage) / 1024;
3919 }
3920
3921 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
3922
3923 /* When no mem + swap limit is specified or swapaccount=0*/
3924 if (!memswlimit) {
3925 char *line = NULL;
3926 size_t linelen = 0;
3927 FILE *f = fopen("/proc/meminfo", "r");
3928
3929 if (!f)
3930 goto err;
3931
3932 while (getline(&line, &linelen, f) != -1) {
3933 if (startswith(line, "SwapTotal:")) {
3934 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
3935 } else if (startswith(line, "SwapFree:")) {
3936 sscanf(line, "SwapFree: %8lu kB", &swap_free);
3937 }
3938 }
3939
3940 free(line);
3941 fclose(f);
3942 }
3943
3944 if (swap_total > 0) {
3945 l = snprintf(d->buf + total_len, d->size - total_len,
3946 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
3947 swap_total, swap_free);
3948 total_len += l;
3949 }
3950
3951 if (total_len < 0 || l < 0) {
3952 perror("Error writing to cache");
3953 rv = 0;
3954 goto err;
3955 }
3956
3957 d->cached = 1;
3958 d->size = (int)total_len;
3959
3960 if (total_len > size) total_len = size;
3961 memcpy(buf, d->buf, total_len);
3962 rv = total_len;
3963
3964 err:
3965 free(cg);
3966 free(memswlimit_str);
3967 free(memlimit_str);
3968 free(memusage_str);
3969 free(memswusage_str);
3970 free(memswusage_default_str);
3971 free(memswlimit_default_str);
3972 return rv;
3973 }
3974
/*
 * Count the bytes that can be read from @which by reading it to the end.
 *
 * Returns the byte count, or 0 if the file cannot be opened.
 */
static off_t get_procfile_size(const char *which)
{
	char chunk[4096];
	ssize_t total = 0;
	size_t got;
	FILE *stream = fopen(which, "r");

	if (stream == NULL)
		return 0;

	/* Drain the stream; only the number of bytes matters, not the data. */
	while ((got = fread(chunk, 1, sizeof(chunk), stream)) > 0)
		total += got;

	fclose(stream);

	return total;
}
3991
/*
 * Fill @sb for a path under the virtual /proc tree.
 *
 * "/proc" is reported as a directory (mode 0555, nlink 2); each of the
 * emulated files is reported as a regular file (mode 0444, nlink 1, size 0).
 * Owner and group are 0 and all three timestamps are the current time.
 *
 * Returns 0 on success, -EINVAL if the clock cannot be read, and -ENOENT
 * for any other path.
 */
int proc_getattr(const char *path, struct stat *sb)
{
	static const char *const regular_files[] = {
		"/proc/meminfo",
		"/proc/cpuinfo",
		"/proc/uptime",
		"/proc/stat",
		"/proc/diskstats",
		"/proc/swaps",
	};
	struct timespec now;
	size_t idx;

	memset(sb, 0, sizeof(struct stat));
	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	sb->st_uid = 0;
	sb->st_gid = 0;
	sb->st_atim = now;
	sb->st_mtim = now;
	sb->st_ctim = now;

	if (!strcmp(path, "/proc")) {
		sb->st_mode = S_IFDIR | 00555;
		sb->st_nlink = 2;
		return 0;
	}

	for (idx = 0; idx < sizeof(regular_files) / sizeof(regular_files[0]); idx++) {
		if (strcmp(path, regular_files[idx]) != 0)
			continue;

		sb->st_size = 0;
		sb->st_mode = S_IFREG | 00444;
		sb->st_nlink = 1;
		return 0;
	}

	return -ENOENT;
}
4020
4021 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
4022 struct fuse_file_info *fi)
4023 {
4024 if (filler(buf, ".", NULL, 0) != 0 ||
4025 filler(buf, "..", NULL, 0) != 0 ||
4026 filler(buf, "cpuinfo", NULL, 0) != 0 ||
4027 filler(buf, "meminfo", NULL, 0) != 0 ||
4028 filler(buf, "stat", NULL, 0) != 0 ||
4029 filler(buf, "uptime", NULL, 0) != 0 ||
4030 filler(buf, "diskstats", NULL, 0) != 0 ||
4031 filler(buf, "swaps", NULL, 0) != 0)
4032 return -EINVAL;
4033 return 0;
4034 }
4035
4036 int proc_open(const char *path, struct fuse_file_info *fi)
4037 {
4038 int type = -1;
4039 struct file_info *info;
4040
4041 if (strcmp(path, "/proc/meminfo") == 0)
4042 type = LXC_TYPE_PROC_MEMINFO;
4043 else if (strcmp(path, "/proc/cpuinfo") == 0)
4044 type = LXC_TYPE_PROC_CPUINFO;
4045 else if (strcmp(path, "/proc/uptime") == 0)
4046 type = LXC_TYPE_PROC_UPTIME;
4047 else if (strcmp(path, "/proc/stat") == 0)
4048 type = LXC_TYPE_PROC_STAT;
4049 else if (strcmp(path, "/proc/diskstats") == 0)
4050 type = LXC_TYPE_PROC_DISKSTATS;
4051 else if (strcmp(path, "/proc/swaps") == 0)
4052 type = LXC_TYPE_PROC_SWAPS;
4053 if (type == -1)
4054 return -ENOENT;
4055
4056 info = malloc(sizeof(*info));
4057 if (!info)
4058 return -ENOMEM;
4059
4060 memset(info, 0, sizeof(*info));
4061 info->type = type;
4062
4063 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
4064 do {
4065 info->buf = malloc(info->buflen);
4066 } while (!info->buf);
4067 memset(info->buf, 0, info->buflen);
4068 /* set actual size to buffer size */
4069 info->size = info->buflen;
4070
4071 fi->fh = (unsigned long)info;
4072 return 0;
4073 }
4074
/*
 * access() handler for the virtual /proc tree.
 *
 * "/proc" itself is allowed whenever the real access(path, R_OK) check
 * succeeds. For everything else, any requested bit other than R_OK is
 * refused because the files are read-only.
 *
 * Returns 0 if access is permitted, -EACCES otherwise.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0) {
		if (access(path, R_OK) == 0)
			return 0;
	}

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
4085
/*
 * Release handler for the emulated /proc files: hands @fi to
 * do_release_file_info(). @path is not used. Always returns 0.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	(void)path;

	do_release_file_info(fi);

	return 0;
}
4091
4092 int proc_read(const char *path, char *buf, size_t size, off_t offset,
4093 struct fuse_file_info *fi)
4094 {
4095 struct file_info *f = (struct file_info *) fi->fh;
4096
4097 switch (f->type) {
4098 case LXC_TYPE_PROC_MEMINFO:
4099 return proc_meminfo_read(buf, size, offset, fi);
4100 case LXC_TYPE_PROC_CPUINFO:
4101 return proc_cpuinfo_read(buf, size, offset, fi);
4102 case LXC_TYPE_PROC_UPTIME:
4103 return proc_uptime_read(buf, size, offset, fi);
4104 case LXC_TYPE_PROC_STAT:
4105 return proc_stat_read(buf, size, offset, fi);
4106 case LXC_TYPE_PROC_DISKSTATS:
4107 return proc_diskstats_read(buf, size, offset, fi);
4108 case LXC_TYPE_PROC_SWAPS:
4109 return proc_swaps_read(buf, size, offset, fi);
4110 default:
4111 return -EINVAL;
4112 }
4113 }
4114
4115 /*
4116 * Functions needed to setup cgroups in the __constructor__.
4117 */
4118
/*
 * Create @dir and every missing parent directory, each with @mode.
 *
 * The path is walked one component at a time; on each step the prefix of
 * the original string up to the start of the current component is created,
 * and the final step creates the full path. Directories that already exist
 * are not an error.
 *
 * Returns true on success, false if an allocation or a mkdir() fails.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		/* Skip separators, then advance to the end of the component. */
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		/*
		 * For a relative path the first prefix is the empty string;
		 * mkdir("") fails with ENOENT, so skip empty prefixes rather
		 * than treating them as an error.
		 */
		if (*makeme && mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while (tmp != dir);

	return true;
}
4142
4143 static bool umount_if_mounted(void)
4144 {
4145 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
4146 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
4147 return false;
4148 }
4149 return true;
4150 }
4151
4152 static int pivot_enter(void)
4153 {
4154 int ret = -1, oldroot = -1, newroot = -1;
4155
4156 oldroot = open("/", O_DIRECTORY | O_RDONLY);
4157 if (oldroot < 0) {
4158 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
4159 return ret;
4160 }
4161
4162 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
4163 if (newroot < 0) {
4164 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
4165 goto err;
4166 }
4167
4168 /* change into new root fs */
4169 if (fchdir(newroot) < 0) {
4170 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
4171 goto err;
4172 }
4173
4174 /* pivot_root into our new root fs */
4175 if (pivot_root(".", ".") < 0) {
4176 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
4177 goto err;
4178 }
4179
4180 /*
4181 * At this point the old-root is mounted on top of our new-root.
4182 * To unmounted it we must not be chdir'd into it, so escape back
4183 * to the old-root.
4184 */
4185 if (fchdir(oldroot) < 0) {
4186 lxcfs_error("%s\n", "Failed to enter old root.");
4187 goto err;
4188 }
4189 if (umount2(".", MNT_DETACH) < 0) {
4190 lxcfs_error("%s\n", "Failed to detach old root.");
4191 goto err;
4192 }
4193
4194 if (fchdir(newroot) < 0) {
4195 lxcfs_error("%s\n", "Failed to re-enter new root.");
4196 goto err;
4197 }
4198
4199 ret = 0;
4200
4201 err:
4202 if (oldroot > 0)
4203 close(oldroot);
4204 if (newroot > 0)
4205 close(newroot);
4206 return ret;
4207 }
4208
4209 /* Prepare our new clean root. */
4210 static int pivot_prepare(void)
4211 {
4212 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
4213 lxcfs_error("%s\n", "Failed to create directory for new root.");
4214 return -1;
4215 }
4216
4217 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
4218 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
4219 return -1;
4220 }
4221
4222 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
4223 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
4224 return -1;
4225 }
4226
4227 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
4228 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
4229 return -1;
4230 }
4231
4232 return 0;
4233 }
4234
/*
 * Build the new root with pivot_prepare() and, only if that succeeded,
 * switch into it with pivot_enter().
 *
 * Returns true when both steps succeed.
 */
static bool pivot_new_root(void)
{
	return pivot_prepare() >= 0 && pivot_enter() >= 0;
}
4247
4248 static bool setup_cgfs_dir(void)
4249 {
4250 if (!mkdir_p(BASEDIR, 0700)) {
4251 lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
4252 return false;
4253 }
4254
4255 if (!umount_if_mounted()) {
4256 lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
4257 return false;
4258 }
4259
4260 if (unshare(CLONE_NEWNS) < 0) {
4261 lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
4262 return false;
4263 }
4264
4265 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
4266 lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
4267 return false;
4268 }
4269
4270 if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
4271 lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
4272 return false;
4273 }
4274
4275 return true;
4276 }
4277
4278 static bool do_mount_cgroups(void)
4279 {
4280 char *target;
4281 size_t clen, len;
4282 int i, ret;
4283
4284 for (i = 0; i < num_hierarchies; i++) {
4285 char *controller = hierarchies[i];
4286 clen = strlen(controller);
4287 len = strlen(BASEDIR) + clen + 2;
4288 target = malloc(len);
4289 if (!target)
4290 return false;
4291 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
4292 if (ret < 0 || ret >= len) {
4293 free(target);
4294 return false;
4295 }
4296 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
4297 free(target);
4298 return false;
4299 }
4300 if (mount(controller, target, "cgroup", 0, controller) < 0) {
4301 lxcfs_error("Failed mounting cgroup %s\n", controller);
4302 free(target);
4303 return false;
4304 }
4305
4306 fd_hierarchies[i] = open(target, O_DIRECTORY);
4307 if (fd_hierarchies[i] < 0) {
4308 free(target);
4309 return false;
4310 }
4311 free(target);
4312 }
4313 return true;
4314 }
4315
/*
 * Set up the private cgroup mounts: prepare the mount directory, mount the
 * hierarchies, then pivot into the new root.
 *
 * Returns true only if all three steps succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!setup_cgfs_dir())
		return false;

	if (!do_mount_cgroups()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return pivot_new_root();
}
4331
/*
 * Open /proc/<pid>/ns/mnt read-only with close-on-exec set.
 *
 * Returns the file descriptor, or -1 if the path cannot be built or opened.
 */
static int preserve_ns(int pid)
{
	/* "/proc" (5) + "/<int as string>" (21) + "/ns/mnt" (7) + NUL (1) */
	char path[5 + 21 + 7 + 1];
	int written;

	written = snprintf(path, sizeof(path), "/proc/%d/ns/mnt", pid);
	if (written < 0 || (size_t)written >= sizeof(path))
		return -1;

	return open(path, O_RDONLY | O_CLOEXEC);
}
4344
4345 static void __attribute__((constructor)) collect_and_mount_subsystems(void)
4346 {
4347 FILE *f;
4348 char *line = NULL;
4349 size_t len = 0;
4350 int i, init_ns = -1;
4351
4352 if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
4353 lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
4354 return;
4355 }
4356 while (getline(&line, &len, f) != -1) {
4357 char *p, *p2;
4358
4359 p = strchr(line, ':');
4360 if (!p)
4361 goto out;
4362 *(p++) = '\0';
4363
4364 p2 = strrchr(p, ':');
4365 if (!p2)
4366 goto out;
4367 *p2 = '\0';
4368
4369 /* With cgroupv2 /proc/self/cgroup can contain entries of the
4370 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
4371 * because it parses out the empty string "" and later on passes
4372 * it to mount(). Let's skip such entries.
4373 */
4374 if (!strcmp(p, ""))
4375 continue;
4376
4377 if (!store_hierarchy(line, p))
4378 goto out;
4379 }
4380
4381 /* Preserve initial namespace. */
4382 init_ns = preserve_ns(getpid());
4383 if (init_ns < 0) {
4384 lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
4385 goto out;
4386 }
4387
4388 fd_hierarchies = malloc(sizeof(int *) * num_hierarchies);
4389 if (!fd_hierarchies) {
4390 lxcfs_error("%s\n", strerror(errno));
4391 goto out;
4392 }
4393
4394 for (i = 0; i < num_hierarchies; i++)
4395 fd_hierarchies[i] = -1;
4396
4397 /* This function calls unshare(CLONE_NEWNS) our initial mount namespace
4398 * to privately mount lxcfs cgroups. */
4399 if (!cgfs_setup_controllers()) {
4400 lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
4401 goto out;
4402 }
4403
4404 if (setns(init_ns, 0) < 0) {
4405 lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
4406 goto out;
4407 }
4408
4409 print_subsystems();
4410
4411 out:
4412 free(line);
4413 fclose(f);
4414 if (init_ns >= 0)
4415 close(init_ns);
4416 }
4417
4418 static void __attribute__((destructor)) free_subsystems(void)
4419 {
4420 int i;
4421
4422 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
4423
4424 for (i = 0; i < num_hierarchies; i++) {
4425 if (hierarchies[i])
4426 free(hierarchies[i]);
4427 if (fd_hierarchies && fd_hierarchies[i] >= 0)
4428 close(fd_hierarchies[i]);
4429 }
4430 free(hierarchies);
4431 free(fd_hierarchies);
4432 }