]> git.proxmox.com Git - mirror_lxcfs.git/blob - bindings.c
use is_shared_pidns helper
[mirror_lxcfs.git] / bindings.c
1 /* lxcfs
2 *
3 * Copyright © 2014-2016 Canonical, Inc
4 * Author: Serge Hallyn <serge.hallyn@ubuntu.com>
5 *
6 * See COPYING file for details.
7 */
8
9 #define FUSE_USE_VERSION 26
10
11 #define __STDC_FORMAT_MACROS
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <fuse.h>
16 #include <inttypes.h>
17 #include <libgen.h>
18 #include <pthread.h>
19 #include <sched.h>
20 #include <stdarg.h>
21 #include <stdbool.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <time.h>
27 #include <unistd.h>
28 #include <wait.h>
29 #include <linux/magic.h>
30 #include <linux/sched.h>
31 #include <sys/epoll.h>
32 #include <sys/mman.h>
33 #include <sys/mount.h>
34 #include <sys/param.h>
35 #include <sys/socket.h>
36 #include <sys/syscall.h>
37 #include <sys/sysinfo.h>
38 #include <sys/vfs.h>
39
40 #include "bindings.h"
41 #include "memory_utils.h"
42 #include "config.h"
43
/* Define pivot_root() if missing from the C library */
#ifndef HAVE_PIVOT_ROOT
/* Fallback wrapper: issue the raw syscall when libc does not export it. */
static int pivot_root(const char * new_root, const char * put_old)
{
#ifdef __NR_pivot_root
	return syscall(__NR_pivot_root, new_root, put_old);
#else
	/* Kernel headers lack the syscall number: report "not implemented". */
	errno = ENOSYS;
	return -1;
#endif
}
#else
extern int pivot_root(const char * new_root, const char * put_old);
#endif
58
/* Per-CPU usage sample as gathered from the cpuacct cgroup controller.
 * Units are presumably kernel clock ticks / nanoseconds depending on the
 * source file parsed — TODO confirm at the call sites. */
struct cpuacct_usage {
	uint64_t user;
	uint64_t system;
	uint64_t idle;
	bool online;	/* whether this CPU is currently online */
};

/* Tunables for the per-cgroup loadavg hash table. */
#define LOAD_SIZE 100 /*the size of hash_table */
#define FLUSH_TIME 5 /*the flush rate */
#define DEPTH_DIR 3 /*the depth of per cgroup */
/* Fixed-point constants used to compute load averages (mirrors the kernel). */
#define FSHIFT		11		/* nr of bits of precision */
#define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
#define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
#define EXP_5		2014		/* 1/exp(5sec/5min) */
#define EXP_15		2037		/* 1/exp(5sec/15min) */
#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
/*
 * This parameter is used for proc_loadavg_read().
 * 1 means use loadavg, 0 means not use.
 */
static int loadavg = 0;
/* Set from a signal/stop path to terminate the loadavg refresh loop.
 * NOTE(review): sig_atomic_t comes from <signal.h>, which is not among the
 * visible includes — presumably pulled in transitively; verify. */
static volatile sig_atomic_t loadavg_stop = 0;
/* ELF-style string hash: shift-and-add each byte, folding the high nibble
 * back into the accumulator so long keys stay well distributed.
 * Returns a non-negative value (top bit masked off). */
static int calc_hash(const char *name)
{
	unsigned int h = 0;
	unsigned int top;
	const char *p = name;

	while (*p) {
		h = (h << 4) + *p++;
		top = h & 0xf0000000;
		if (top != 0)
			h ^= (top >> 24);
		h &= ~top;
	}

	return (h & 0x7fffffff);
}
98
/* One cached loadavg entry for a single cgroup, linked into a hash bucket. */
struct load_node {
	char *cg;  /*cg */
	unsigned long avenrun[3];		/* Load averages */
	unsigned int run_pid;		/* number of running tasks */
	unsigned int total_pid;		/* total number of tasks */
	unsigned int last_pid;
	int cfd; /* The file descriptor of the mounted cgroup */
	struct load_node *next;
	/* Address of the pointer that points at this node (bucket head's
	 * `next` or the previous node's `next`) — enables O(1) unlink. */
	struct load_node **pre;
};

/* Head of one hash bucket, with the three locks protecting it. */
struct load_head {
	/*
	 * The lock is about insert load_node and refresh load_node.To the first
	 * load_node of each hash bucket, insert and refresh in this hash bucket is
	 * mutually exclusive.
	 */
	pthread_mutex_t lock;
	/*
	 * The rdlock is about read loadavg and delete load_node.To each hash
	 * bucket, read and delete is mutually exclusive. But at the same time, we
	 * allow paratactic read operation. This rdlock is at list level.
	 */
	pthread_rwlock_t rdlock;
	/*
	 * The rilock is about read loadavg and insert load_node.To the first
	 * load_node of each hash bucket, read and insert is mutually exclusive.
	 * But at the same time, we allow paratactic read operation.
	 */
	pthread_rwlock_t rilock;
	struct load_node *next;
};
131
static struct load_head load_hash[LOAD_SIZE]; /* hash table */
/*
 * init_load initialize the hash table.
 * Return 0 on success, return -1 on failure.
 * On failure every lock initialized so far is destroyed again.
 */
static int init_load(void)
{
	int i;
	int ret;

	for (i = 0; i < LOAD_SIZE; i++) {
		load_hash[i].next = NULL;
		ret = pthread_mutex_init(&load_hash[i].lock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize lock");
			goto out3;
		}
		ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rdlock");
			goto out2;
		}
		ret = pthread_rwlock_init(&load_hash[i].rilock, NULL);
		if (ret != 0) {
			lxcfs_error("%s\n", "Failed to initialize rilock");
			goto out1;
		}
	}
	return 0;
/* Unwind bucket i: destroy only what was successfully initialized. */
out1:
	pthread_rwlock_destroy(&load_hash[i].rdlock);
out2:
	pthread_mutex_destroy(&load_hash[i].lock);
/* Unwind all fully-initialized buckets before i. */
out3:
	while (i > 0) {
		i--;
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
	}
	return -1;
}
174
/* Push *n onto the front of hash bucket `locate`, fixing up the back
 * pointers. Takes both the bucket mutex and the read/insert rwlock so it
 * cannot race with refresh or concurrent readers of the bucket head. */
static void insert_node(struct load_node **n, int locate)
{
	struct load_node *f;

	pthread_mutex_lock(&load_hash[locate].lock);
	pthread_rwlock_wrlock(&load_hash[locate].rilock);
	f = load_hash[locate].next;
	load_hash[locate].next = *n;

	(*n)->pre = &(load_hash[locate].next);
	if (f)
		f->pre = &((*n)->next);
	(*n)->next = f;
	pthread_mutex_unlock(&load_hash[locate].lock);
	pthread_rwlock_unlock(&load_hash[locate].rilock);
}
/*
 * locate_node() looks up the entry for cgroup `cg` in bucket `locate`.
 * A non-NULL return means success.
 * Note: rdlock is deliberately NOT released before returning — even when
 * NULL is returned — because the caller is about to read the node and a
 * concurrent delete must be blocked until the read finishes.
 * rdlock is released only in proc_loadavg_read().
 */
static struct load_node *locate_node(char *cg, int locate)
{
	struct load_node *f = NULL;
	int i = 0;

	pthread_rwlock_rdlock(&load_hash[locate].rilock);
	pthread_rwlock_rdlock(&load_hash[locate].rdlock);
	if (load_hash[locate].next == NULL) {
		pthread_rwlock_unlock(&load_hash[locate].rilock);
		return f;
	}
	f = load_hash[locate].next;
	/* Head pointer captured; inserts may proceed while we walk the list. */
	pthread_rwlock_unlock(&load_hash[locate].rilock);
	while (f && ((i = strcmp(f->cg, cg)) != 0))
		f = f->next;
	return f;
}
/* Delete the load_node n and return the next node of it.
 * Takes the bucket's rdlock exclusively so no reader can hold a reference
 * to n while it is freed. Relies on n->pre (address of the pointer that
 * points at n) for O(1) unlinking. */
static struct load_node *del_node(struct load_node *n, int locate)
{
	struct load_node *g;

	pthread_rwlock_wrlock(&load_hash[locate].rdlock);
	if (n->next == NULL) {
		*(n->pre) = NULL;
	} else {
		*(n->pre) = n->next;
		n->next->pre = n->pre;
	}
	g = n->next;
	free(n->cg);
	free(n);
	pthread_rwlock_unlock(&load_hash[locate].rdlock);
	return g;
}
233
/* Tear down the whole loadavg hash table: free every node and destroy the
 * three locks of every bucket. Called on shutdown; assumes no other thread
 * will touch the table afterwards. */
static void load_free(void)
{
	int i;
	struct load_node *f, *p;

	for (i = 0; i < LOAD_SIZE; i++) {
		pthread_mutex_lock(&load_hash[i].lock);
		pthread_rwlock_wrlock(&load_hash[i].rilock);
		pthread_rwlock_wrlock(&load_hash[i].rdlock);
		if (load_hash[i].next == NULL) {
			/* Empty bucket: just release and destroy the locks. */
			pthread_mutex_unlock(&load_hash[i].lock);
			pthread_mutex_destroy(&load_hash[i].lock);
			pthread_rwlock_unlock(&load_hash[i].rilock);
			pthread_rwlock_destroy(&load_hash[i].rilock);
			pthread_rwlock_unlock(&load_hash[i].rdlock);
			pthread_rwlock_destroy(&load_hash[i].rdlock);
			continue;
		}
		/* Free every node in the chain. */
		for (f = load_hash[i].next; f; ) {
			free(f->cg);
			p = f->next;
			free(f);
			f = p;
		}
		pthread_mutex_unlock(&load_hash[i].lock);
		pthread_mutex_destroy(&load_hash[i].lock);
		pthread_rwlock_unlock(&load_hash[i].rilock);
		pthread_rwlock_destroy(&load_hash[i].rilock);
		pthread_rwlock_unlock(&load_hash[i].rdlock);
		pthread_rwlock_destroy(&load_hash[i].rdlock);
	}
}
266
/* Data for CPU view */
/* Cached per-cgroup CPU statistics; one node per cgroup, hash-chained. */
struct cg_proc_stat {
	char *cg;
	struct cpuacct_usage *usage; // Real usage as read from the host's /proc/stat
	struct cpuacct_usage *view; // Usage stats reported to the container
	int cpu_count;
	pthread_mutex_t lock; // For node manipulation
	struct cg_proc_stat *next;
};

/* Head of one cpuview hash bucket. */
struct cg_proc_stat_head {
	struct cg_proc_stat *next;
	time_t lastcheck;	// last time this bucket was pruned

	/*
	 * For access to the list. Reading can be parallel, pruning is exclusive.
	 */
	pthread_rwlock_t lock;
};

#define CPUVIEW_HASH_SIZE 100
static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE];
289
290 static bool cpuview_init_head(struct cg_proc_stat_head **head)
291 {
292 *head = malloc(sizeof(struct cg_proc_stat_head));
293 if (!(*head)) {
294 lxcfs_error("%s\n", strerror(errno));
295 return false;
296 }
297
298 (*head)->lastcheck = time(NULL);
299 (*head)->next = NULL;
300
301 if (pthread_rwlock_init(&(*head)->lock, NULL) != 0) {
302 lxcfs_error("%s\n", "Failed to initialize list lock");
303 free(*head);
304 return false;
305 }
306
307 return true;
308 }
309
310 static bool init_cpuview()
311 {
312 int i;
313
314 for (i = 0; i < CPUVIEW_HASH_SIZE; i++)
315 proc_stat_history[i] = NULL;
316
317 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
318 if (!cpuview_init_head(&proc_stat_history[i]))
319 goto err;
320 }
321
322 return true;
323
324 err:
325 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
326 if (proc_stat_history[i]) {
327 free(proc_stat_history[i]);
328 proc_stat_history[i] = NULL;
329 }
330 }
331
332 return false;
333 }
334
335 static void free_proc_stat_node(struct cg_proc_stat *node)
336 {
337 pthread_mutex_destroy(&node->lock);
338 free(node->cg);
339 free(node->usage);
340 free(node->view);
341 free(node);
342 }
343
344 static void cpuview_free_head(struct cg_proc_stat_head *head)
345 {
346 struct cg_proc_stat *node, *tmp;
347
348 if (head->next) {
349 node = head->next;
350
351 for (;;) {
352 tmp = node;
353 node = node->next;
354 free_proc_stat_node(tmp);
355
356 if (!node)
357 break;
358 }
359 }
360
361 pthread_rwlock_destroy(&head->lock);
362 free(head);
363 }
364
365 static void free_cpuview()
366 {
367 int i;
368
369 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
370 if (proc_stat_history[i])
371 cpuview_free_head(proc_stat_history[i]);
372 }
373 }
374
/*
 * A table caching which pid is init for a pid namespace.
 * When looking up which pid is init for $qpid, we first
 * 1. Stat /proc/$qpid/ns/pid.
 * 2. Check whether the ino_t is in our store.
 *   a. if not, fork a child in qpid's ns to send us
 *	 ucred.pid = 1, and read the initpid.  Cache
 *	 initpid and creation time for /proc/initpid
 *	 in a new store entry.
 *   b. if so, verify that /proc/initpid still matches
 *	 what we have saved.  If not, clear the store
 *	 entry and go back to a.  If so, return the
 *	 cached initpid.
 */
struct pidns_init_store {
	ino_t ino;          // inode number for /proc/$pid/ns/pid
	pid_t initpid;      // the pid of init in that ns
	long int ctime;     // the time at which /proc/$initpid was created
	struct pidns_init_store *next;
	long int lastcheck; // last time this entry was validated (for pruning)
};
396
/* lol - look at how they are allocated in the kernel */
#define PIDNS_HASH_SIZE 4096
#define HASH(x) ((x) % PIDNS_HASH_SIZE)

static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE];
static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Acquire the given mutex; a failure here is unrecoverable, so abort. */
static void lock_mutex(pthread_mutex_t *l)
{
	int ret;

	ret = pthread_mutex_lock(l);
	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
412
/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Number of hierarchies mounted. */
static int num_hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Hierarchies mounted {cpuset, blkio, ...}:
 * Initialized via __constructor__ collect_and_mount_subsystems(). */
static char **hierarchies;

/* READ-ONLY after __constructor__ collect_and_mount_subsystems() has run.
 * Open file descriptors:
 * @fd_hierarchies[i] refers to cgroup @hierarchies[i]. They are mounted in a
 * private mount namespace.
 * Initialized via __constructor__ collect_and_mount_subsystems().
 * @fd_hierarchies[i] can be used to perform file operations on the cgroup
 * mounts and respective files in the private namespace even when located in
 * another namespace using the *at() family of functions
 * {openat(), fchownat(), ...}. */
static int *fd_hierarchies;
/* fd of the private cgroup mount namespace; -1 until initialized. */
static int cgroup_mount_ns_fd = -1;
433
/* Release the given mutex; a failure here is unrecoverable, so abort. */
static void unlock_mutex(pthread_mutex_t *l)
{
	int ret;

	ret = pthread_mutex_unlock(l);
	if (ret != 0) {
		lxcfs_error("returned:%d %s\n", ret, strerror(ret));
		exit(1);
	}
}
443
/* Serialize all access to the pidns init-pid store. */
static void store_lock(void)
{
	lock_mutex(&pidns_store_mutex);
}

static void store_unlock(void)
{
	unlock_mutex(&pidns_store_mutex);
}
453
454 /* Must be called under store_lock */
455 static bool initpid_still_valid(struct pidns_init_store *e, struct stat *nsfdsb)
456 {
457 struct stat initsb;
458 char fnam[100];
459
460 snprintf(fnam, 100, "/proc/%d", e->initpid);
461 if (stat(fnam, &initsb) < 0)
462 return false;
463
464 lxcfs_debug("Comparing ctime %ld == %ld for pid %d.\n", e->ctime,
465 initsb.st_ctime, e->initpid);
466
467 if (e->ctime != initsb.st_ctime)
468 return false;
469 return true;
470 }
471
472 /* Must be called under store_lock */
473 static void remove_initpid(struct pidns_init_store *e)
474 {
475 struct pidns_init_store *tmp;
476 int h;
477
478 lxcfs_debug("Remove_initpid: removing entry for %d.\n", e->initpid);
479
480 h = HASH(e->ino);
481 if (pidns_hash_table[h] == e) {
482 pidns_hash_table[h] = e->next;
483 free(e);
484 return;
485 }
486
487 tmp = pidns_hash_table[h];
488 while (tmp) {
489 if (tmp->next == e) {
490 tmp->next = e->next;
491 free(e);
492 return;
493 }
494 tmp = tmp->next;
495 }
496 }
497
#define PURGE_SECS 5
/* Must be called under store_lock */
/* Drop cached init-pid entries that have not been validated within
 * 2*PURGE_SECS. Runs at most once every PURGE_SECS; the first call only
 * arms the timer. */
static void prune_initpid_store(void)
{
	static long int last_prune = 0;
	struct pidns_init_store *e, *prev, *delme;
	long int now, threshold;
	int i;

	if (!last_prune) {
		/* First call: just record the time, prune on a later call. */
		last_prune = time(NULL);
		return;
	}
	now = time(NULL);
	if (now < last_prune + PURGE_SECS)
		return;

	lxcfs_debug("%s\n", "Pruning.");

	last_prune = now;
	threshold = now - 2 * PURGE_SECS;

	for (i = 0; i < PIDNS_HASH_SIZE; i++) {
		for (prev = NULL, e = pidns_hash_table[i]; e; ) {
			if (e->lastcheck < threshold) {

				lxcfs_debug("Removing cached entry for %d.\n", e->initpid);

				delme = e;
				if (prev)
					prev->next = e->next;
				else
					pidns_hash_table[i] = e->next;
				e = e->next;
				free(delme);
			} else {
				prev = e;
				e = e->next;
			}
		}
	}
}
540
541 /* Must be called under store_lock */
542 static void save_initpid(struct stat *sb, pid_t pid)
543 {
544 struct pidns_init_store *e;
545 char fpath[100];
546 struct stat procsb;
547 int h;
548
549 lxcfs_debug("Save_initpid: adding entry for %d.\n", pid);
550
551 snprintf(fpath, 100, "/proc/%d", pid);
552 if (stat(fpath, &procsb) < 0)
553 return;
554 do {
555 e = malloc(sizeof(*e));
556 } while (!e);
557 e->ino = sb->st_ino;
558 e->initpid = pid;
559 e->ctime = procsb.st_ctime;
560 h = HASH(e->ino);
561 e->next = pidns_hash_table[h];
562 e->lastcheck = time(NULL);
563 pidns_hash_table[h] = e;
564 }
565
566 /*
567 * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store
568 * entry for the inode number and creation time. Verify that the init pid
569 * is still valid. If not, remove it. Return the entry if valid, NULL
570 * otherwise.
571 * Must be called under store_lock
572 */
573 static struct pidns_init_store *lookup_verify_initpid(struct stat *sb)
574 {
575 int h = HASH(sb->st_ino);
576 struct pidns_init_store *e = pidns_hash_table[h];
577
578 while (e) {
579 if (e->ino == sb->st_ino) {
580 if (initpid_still_valid(e, sb)) {
581 e->lastcheck = time(NULL);
582 return e;
583 }
584 remove_initpid(e);
585 return NULL;
586 }
587 e = e->next;
588 }
589
590 return NULL;
591 }
592
593 static int is_dir(const char *path, int fd)
594 {
595 struct stat statbuf;
596 int ret = fstatat(fd, path, &statbuf, fd);
597 if (ret == 0 && S_ISDIR(statbuf.st_mode))
598 return 1;
599 return 0;
600 }
601
602 static char *must_copy_string(const char *str)
603 {
604 char *dup = NULL;
605 if (!str)
606 return NULL;
607 do {
608 dup = strdup(str);
609 } while (!dup);
610
611 return dup;
612 }
613
/* Strip all trailing '\n' characters from @s in place. */
static inline void drop_trailing_newlines(char *s)
{
	size_t n = strlen(s);

	while (n > 0 && s[n - 1] == '\n')
		s[--n] = '\0';
}
621
#define BATCH_SIZE 50
/* Grow *mem (in BATCH_SIZE-sized batches) so it can hold @newlen bytes.
 * No-op when the current batch count already suffices. The realloc is
 * retried until it succeeds, so *mem is always valid on return. */
static void dorealloc(char **mem, size_t oldlen, size_t newlen)
{
	int want = (newlen / BATCH_SIZE) + 1;
	int have = (oldlen / BATCH_SIZE) + 1;
	char *tmp;

	if (*mem && want <= have)
		return;

	do {
		tmp = realloc(*mem, want * BATCH_SIZE);
	} while (!tmp);
	*mem = tmp;
}
636 static void append_line(char **contents, size_t *len, char *line, ssize_t linelen)
637 {
638 size_t newlen = *len + linelen;
639 dorealloc(contents, *len, newlen + 1);
640 memcpy(*contents + *len, line, linelen+1);
641 *len = newlen;
642 }
643
/* Read the whole file behind @fd into a freshly allocated NUL-terminated
 * buffer with trailing newlines stripped; returns NULL on failure.
 * Takes ownership of @fd: it is closed (via fclose, or explicitly) in all
 * cases. @from is used for context only.
 *
 * Bug fix: when fdopen() failed the descriptor was leaked — callers such
 * as cgfs_get_value() never close it themselves. Close it explicitly on
 * that path. */
static char *slurp_file(const char *from, int fd)
{
	char *line = NULL;
	char *contents = NULL;
	FILE *f = fdopen(fd, "r");
	size_t len = 0, fulllen = 0;
	ssize_t linelen;

	if (!f) {
		close(fd);
		return NULL;
	}

	while ((linelen = getline(&line, &len, f)) != -1) {
		append_line(&contents, &fulllen, line, linelen);
	}
	fclose(f);

	if (contents)
		drop_trailing_newlines(contents);
	free(line);
	return contents;
}
665
666 static int preserve_ns(const int pid, const char *ns)
667 {
668 int ret;
669 /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */
670 #define __NS_PATH_LEN 50
671 char path[__NS_PATH_LEN];
672
673 /* This way we can use this function to also check whether namespaces
674 * are supported by the kernel by passing in the NULL or the empty
675 * string.
676 */
677 ret = snprintf(path, __NS_PATH_LEN, "/proc/%d/ns%s%s", pid,
678 !ns || strcmp(ns, "") == 0 ? "" : "/",
679 !ns || strcmp(ns, "") == 0 ? "" : ns);
680 if (ret < 0 || (size_t)ret >= __NS_PATH_LEN) {
681 errno = EFBIG;
682 return -1;
683 }
684
685 return open(path, O_RDONLY | O_CLOEXEC);
686 }
687
/**
 * in_same_namespace - Check whether two processes are in the same namespace.
 * @pid1 - PID of the first process.
 * @pid2 - PID of the second process.
 * @ns   - Name of the namespace to check. Must correspond to one of the names
 *         for the namespaces as shown in /proc/<pid/ns/
 *
 * If the two processes are not in the same namespace returns an fd to the
 * namespace of the second process identified by @pid2. If the two processes are
 * in the same namespace returns -EINVAL, -1 if an error occurred.
 */
static int in_same_namespace(pid_t pid1, pid_t pid2, const char *ns)
{
	/* __do_close_prot_errno (memory_utils.h) presumably closes these fds
	 * on scope exit while preserving errno — confirm against the macro
	 * definition. move_fd() transfers ownership out before cleanup. */
	__do_close_prot_errno int ns_fd1 = -1, ns_fd2 = -1;
	int ret = -1;
	struct stat ns_st1, ns_st2;

	ns_fd1 = preserve_ns(pid1, ns);
	if (ns_fd1 < 0) {
		/* The kernel does not support this namespace. This is not an
		 * error.
		 */
		if (errno == ENOENT)
			return -EINVAL;

		return -1;
	}

	ns_fd2 = preserve_ns(pid2, ns);
	if (ns_fd2 < 0)
		return -1;

	ret = fstat(ns_fd1, &ns_st1);
	if (ret < 0)
		return -1;

	ret = fstat(ns_fd2, &ns_st2);
	if (ret < 0)
		return -1;

	/* processes are in the same namespace */
	/* Same device + inode of the ns symlink target means same namespace. */
	if ((ns_st1.st_dev == ns_st2.st_dev) && (ns_st1.st_ino == ns_st2.st_ino))
		return -EINVAL;

	/* processes are in different namespaces */
	return move_fd(ns_fd2);
}
735
736 static bool is_shared_pidns(pid_t pid)
737 {
738 if (pid != 1)
739 return false;
740
741 if (in_same_namespace(pid, getpid(), "pid") == -EINVAL)
742 return true;
743
744 return false;
745 }
746
/* Write @string to @fd (a stream is wrapped around it, so the fd is always
 * consumed). @fnam is used only for error messages. Returns true when the
 * full string was written and the stream closed cleanly. */
static bool write_string(const char *fnam, const char *string, int fd)
{
	size_t len;
	FILE *f;

	f = fdopen(fd, "w");
	if (!f)
		return false;

	len = strlen(string);
	if (fwrite(string, 1, len, f) != len) {
		lxcfs_error("%s - Error writing \"%s\" to \"%s\"\n",
			    strerror(errno), string, fnam);
		fclose(f);
		return false;
	}

	/* fclose() flushes; a failure here means the write did not land. */
	if (fclose(f) < 0) {
		lxcfs_error("%s - Failed to close \"%s\"\n", strerror(errno), fnam);
		return false;
	}

	return true;
}
772
/* Metadata for one file inside a cgroup directory. */
struct cgfs_files {
	char *name;
	uint32_t uid, gid;
	uint32_t mode;
};

/* Growth step for the hierarchies array in store_hierarchy(). */
#define ALLOC_NUM 20
780 static bool store_hierarchy(char *stridx, char *h)
781 {
782 if (num_hierarchies % ALLOC_NUM == 0) {
783 size_t n = (num_hierarchies / ALLOC_NUM) + 1;
784 n *= ALLOC_NUM;
785 char **tmp = realloc(hierarchies, n * sizeof(char *));
786 if (!tmp) {
787 lxcfs_error("%s\n", strerror(errno));
788 exit(1);
789 }
790 hierarchies = tmp;
791 }
792
793 hierarchies[num_hierarchies++] = must_copy_string(h);
794 return true;
795 }
796
797 static void print_subsystems(void)
798 {
799 int i;
800
801 fprintf(stderr, "mount namespace: %d\n", cgroup_mount_ns_fd);
802 fprintf(stderr, "hierarchies:\n");
803 for (i = 0; i < num_hierarchies; i++) {
804 if (hierarchies[i])
805 fprintf(stderr, " %2d: fd: %3d: %s\n", i,
806 fd_hierarchies[i], hierarchies[i]);
807 }
808 }
809
/* Return true iff @needle appears as a whole element of the comma-separated
 * list @haystack (e.g. "cpu" matches "cpu,cpuacct" but not "cpuset"). */
static bool in_comma_list(const char *needle, const char *haystack)
{
	const char *cur = haystack;
	const char *comma;
	size_t nlen = strlen(needle);

	/* Compare against every comma-terminated element. */
	for (; *cur && (comma = strchr(cur, ',')); cur = comma + 1) {
		if ((size_t)(comma - cur) != nlen)
			continue;
		if (strncmp(needle, cur, nlen) == 0)
			return true;
	}

	/* Final (or only) element has no trailing comma. */
	return strcmp(needle, cur) == 0;
}
828
829 /* do we need to do any massaging here? I'm not sure... */
830 /* Return the mounted controller and store the corresponding open file descriptor
831 * referring to the controller mountpoint in the private lxcfs namespace in
832 * @cfd.
833 */
834 static char *find_mounted_controller(const char *controller, int *cfd)
835 {
836 int i;
837
838 for (i = 0; i < num_hierarchies; i++) {
839 if (!hierarchies[i])
840 continue;
841 if (strcmp(hierarchies[i], controller) == 0) {
842 *cfd = fd_hierarchies[i];
843 return hierarchies[i];
844 }
845 if (in_comma_list(controller, hierarchies[i])) {
846 *cfd = fd_hierarchies[i];
847 return hierarchies[i];
848 }
849 }
850
851 return NULL;
852 }
853
/* Write @value into @file of @cgroup under the hierarchy that provides
 * @controller. Returns true on success. */
bool cgfs_set_value(const char *controller, const char *cgroup, const char *file,
		const char *value)
{
	int ret, fd, cfd;
	size_t len;
	char *path, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return false;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / + file + \0
	 */
	len = strlen(cgroup) + strlen(file) + 3;
	path = alloca(len);
	ret = snprintf(path, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, path, O_WRONLY);
	if (fd < 0)
		return false;

	/* write_string() consumes fd in all cases. */
	return write_string(path, value, fd);
}
880
881 // Chown all the files in the cgroup directory. We do this when we create
882 // a cgroup on behalf of a user.
883 static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd)
884 {
885 struct dirent *direntp;
886 char path[MAXPATHLEN];
887 size_t len;
888 DIR *d;
889 int fd1, ret;
890
891 len = strlen(dirname);
892 if (len >= MAXPATHLEN) {
893 lxcfs_error("Pathname too long: %s\n", dirname);
894 return;
895 }
896
897 fd1 = openat(fd, dirname, O_DIRECTORY);
898 if (fd1 < 0)
899 return;
900
901 d = fdopendir(fd1);
902 if (!d) {
903 lxcfs_error("Failed to open %s\n", dirname);
904 return;
905 }
906
907 while ((direntp = readdir(d))) {
908 if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
909 continue;
910 ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
911 if (ret < 0 || ret >= MAXPATHLEN) {
912 lxcfs_error("Pathname too long under %s\n", dirname);
913 continue;
914 }
915 if (fchownat(fd, path, uid, gid, 0) < 0)
916 lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid);
917 }
918 closedir(d);
919 }
920
921 int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid)
922 {
923 int cfd;
924 size_t len;
925 char *dirnam, *tmpc;
926
927 tmpc = find_mounted_controller(controller, &cfd);
928 if (!tmpc)
929 return -EINVAL;
930
931 /* Make sure we pass a relative path to *at() family of functions.
932 * . + /cg + \0
933 */
934 len = strlen(cg) + 2;
935 dirnam = alloca(len);
936 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
937
938 if (mkdirat(cfd, dirnam, 0755) < 0)
939 return -errno;
940
941 if (uid == 0 && gid == 0)
942 return 0;
943
944 if (fchownat(cfd, dirnam, uid, gid, 0) < 0)
945 return -errno;
946
947 chown_all_cgroup_files(dirnam, uid, gid, cfd);
948
949 return 0;
950 }
951
/* Recursively delete cgroup directory @dirname.
 * @fd is an open fd on @dirname itself (borrowed — never closed here, the
 * dup below exists because fdopendir() takes ownership of the fd it gets);
 * @cfd is the hierarchy mount fd that @dirname is relative to.
 * Returns true when the whole subtree was removed. */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	struct dirent *direntp;
	DIR *dir;
	bool ret = false;
	char pathname[MAXPATHLEN];
	int dupfd;

	dupfd = dup(fd); // fdopendir() does bad things once it uses an fd.
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		close(dupfd);
		return false;
	}

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") ||
		    !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}
		/* NOTE(review): the recursive call reuses @fd (the parent's
		 * fd) rather than opening the child dir; the child path is
		 * resolved via @cfd in the *at() calls — confirm intended. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
				lxcfs_debug("Error removing %s.\n", pathname);
	}

	ret = true;
	if (closedir(dir) < 0) {
		lxcfs_error("Failed to close directory %s: %s\n", dirname, strerror(errno));
		ret = false;
	}

	/* Children gone (best effort): remove the directory itself. */
	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	close(dupfd);

	return ret;
}
1010
1011 bool cgfs_remove(const char *controller, const char *cg)
1012 {
1013 int fd, cfd;
1014 size_t len;
1015 char *dirnam, *tmpc;
1016 bool bret;
1017
1018 tmpc = find_mounted_controller(controller, &cfd);
1019 if (!tmpc)
1020 return false;
1021
1022 /* Make sure we pass a relative path to *at() family of functions.
1023 * . + /cg + \0
1024 */
1025 len = strlen(cg) + 2;
1026 dirnam = alloca(len);
1027 snprintf(dirnam, len, "%s%s", *cg == '/' ? "." : "", cg);
1028
1029 fd = openat(cfd, dirnam, O_DIRECTORY);
1030 if (fd < 0)
1031 return false;
1032
1033 bret = recursive_rmdir(dirnam, fd, cfd);
1034 close(fd);
1035 return bret;
1036 }
1037
1038 bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
1039 {
1040 int cfd;
1041 size_t len;
1042 char *pathname, *tmpc;
1043
1044 tmpc = find_mounted_controller(controller, &cfd);
1045 if (!tmpc)
1046 return false;
1047
1048 /* Make sure we pass a relative path to *at() family of functions.
1049 * . + /file + \0
1050 */
1051 len = strlen(file) + 2;
1052 pathname = alloca(len);
1053 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1054 if (fchmodat(cfd, pathname, mode, 0) < 0)
1055 return false;
1056 return true;
1057 }
1058
1059 static int chown_tasks_files(const char *dirname, uid_t uid, gid_t gid, int fd)
1060 {
1061 size_t len;
1062 char *fname;
1063
1064 len = strlen(dirname) + strlen("/cgroup.procs") + 1;
1065 fname = alloca(len);
1066 snprintf(fname, len, "%s/tasks", dirname);
1067 if (fchownat(fd, fname, uid, gid, 0) != 0)
1068 return -errno;
1069 snprintf(fname, len, "%s/cgroup.procs", dirname);
1070 if (fchownat(fd, fname, uid, gid, 0) != 0)
1071 return -errno;
1072 return 0;
1073 }
1074
1075 int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid)
1076 {
1077 int cfd;
1078 size_t len;
1079 char *pathname, *tmpc;
1080
1081 tmpc = find_mounted_controller(controller, &cfd);
1082 if (!tmpc)
1083 return -EINVAL;
1084
1085 /* Make sure we pass a relative path to *at() family of functions.
1086 * . + /file + \0
1087 */
1088 len = strlen(file) + 2;
1089 pathname = alloca(len);
1090 snprintf(pathname, len, "%s%s", *file == '/' ? "." : "", file);
1091 if (fchownat(cfd, pathname, uid, gid, 0) < 0)
1092 return -errno;
1093
1094 if (is_dir(pathname, cfd))
1095 // like cgmanager did, we want to chown the tasks file as well
1096 return chown_tasks_files(pathname, uid, gid, cfd);
1097
1098 return 0;
1099 }
1100
/* Open @cgroup's cgroup.procs file (under the hierarchy providing
 * @controller) for writing. Returns a FILE* or NULL on failure. */
FILE *open_pids_file(const char *controller, const char *cgroup)
{
	int fd, cfd;
	size_t len;
	char *relpath, *mnt;

	mnt = find_mounted_controller(controller, &cfd);
	if (!mnt)
		return NULL;

	/* Make sure we pass a relative path to *at() family of functions.
	 * . + /cgroup + / "cgroup.procs" + \0
	 */
	len = strlen(cgroup) + strlen("cgroup.procs") + 3;
	relpath = alloca(len);
	snprintf(relpath, len, "%s%s/cgroup.procs", *cgroup == '/' ? "." : "", cgroup);

	fd = openat(cfd, relpath, O_WRONLY);
	if (fd < 0)
		return NULL;

	return fdopen(fd, "w");
}
1124
1125 static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories,
1126 void ***list, size_t typesize,
1127 void* (*iterator)(const char*, const char*, const char*))
1128 {
1129 int cfd, fd, ret;
1130 size_t len;
1131 char *cg, *tmpc;
1132 char pathname[MAXPATHLEN];
1133 size_t sz = 0, asz = 0;
1134 struct dirent *dirent;
1135 DIR *dir;
1136
1137 tmpc = find_mounted_controller(controller, &cfd);
1138 *list = NULL;
1139 if (!tmpc)
1140 return false;
1141
1142 /* Make sure we pass a relative path to *at() family of functions. */
1143 len = strlen(cgroup) + 1 /* . */ + 1 /* \0 */;
1144 cg = alloca(len);
1145 ret = snprintf(cg, len, "%s%s", *cgroup == '/' ? "." : "", cgroup);
1146 if (ret < 0 || (size_t)ret >= len) {
1147 lxcfs_error("Pathname too long under %s\n", cgroup);
1148 return false;
1149 }
1150
1151 fd = openat(cfd, cg, O_DIRECTORY);
1152 if (fd < 0)
1153 return false;
1154
1155 dir = fdopendir(fd);
1156 if (!dir)
1157 return false;
1158
1159 while ((dirent = readdir(dir))) {
1160 struct stat mystat;
1161
1162 if (!strcmp(dirent->d_name, ".") ||
1163 !strcmp(dirent->d_name, ".."))
1164 continue;
1165
1166 ret = snprintf(pathname, MAXPATHLEN, "%s/%s", cg, dirent->d_name);
1167 if (ret < 0 || ret >= MAXPATHLEN) {
1168 lxcfs_error("Pathname too long under %s\n", cg);
1169 continue;
1170 }
1171
1172 ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
1173 if (ret) {
1174 lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno));
1175 continue;
1176 }
1177 if ((!directories && !S_ISREG(mystat.st_mode)) ||
1178 (directories && !S_ISDIR(mystat.st_mode)))
1179 continue;
1180
1181 if (sz+2 >= asz) {
1182 void **tmp;
1183 asz += BATCH_SIZE;
1184 do {
1185 tmp = realloc(*list, asz * typesize);
1186 } while (!tmp);
1187 *list = tmp;
1188 }
1189 (*list)[sz] = (*iterator)(controller, cg, dirent->d_name);
1190 (*list)[sz+1] = NULL;
1191 sz++;
1192 }
1193 if (closedir(dir) < 0) {
1194 lxcfs_error("Failed closedir for %s: %s\n", cgroup, strerror(errno));
1195 return false;
1196 }
1197 return true;
1198 }
1199
1200 static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
1201 {
1202 char *dup;
1203 do {
1204 dup = strdup(dir_entry);
1205 } while (!dup);
1206 return dup;
1207 }
1208
/* List the child cgroup names of @cgroup (under @controller) into the
 * NULL-terminated, caller-owned array *list. Returns true on success. */
bool cgfs_list_children(const char *controller, const char *cgroup, char ***list)
{
	return cgfs_iterate_cgroup(controller, cgroup, true, (void***)list, sizeof(*list), &make_children_list_entry);
}
1213
1214 void free_key(struct cgfs_files *k)
1215 {
1216 if (!k)
1217 return;
1218 free(k->name);
1219 free(k);
1220 }
1221
/* Release a NULL-terminated array of cgfs_files entries along with the
 * array itself.  NULL is accepted and ignored.
 */
void free_keys(struct cgfs_files **keys)
{
	size_t i = 0;

	if (!keys)
		return;

	while (keys[i])
		free_key(keys[i++]);

	free(keys);
}
1233
/* Read the full contents of cgroup file @file under @cgroup into a
 * newly allocated string stored in *@value (caller frees).  Returns
 * false if the controller is not mounted, the path cannot be built, or
 * the file cannot be opened/read.
 */
bool cgfs_get_value(const char *controller, const char *cgroup, const char *file, char **value)
{
	int fd, cfd, ret;
	size_t len;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Relative path for the *at() calls: "." + cgroup + "/" + file + "\0". */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	fd = openat(cfd, relpath, O_RDONLY);
	if (fd < 0)
		return false;

	/* NOTE(review): assumes slurp_file() consumes/closes fd — confirm. */
	*value = slurp_file(relpath, fd);
	return *value != NULL;
}
1260
/* Return true if cgroup file @file exists under @cgroup on the mounted
 * @controller (checked with faccessat(F_OK)).
 */
bool cgfs_param_exist(const char *controller, const char *cgroup, const char *file)
{
	int cfd, ret;
	size_t len;
	char *relpath;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Relative path for the *at() calls: "." + cgroup + "/" + file + "\0". */
	len = strlen(cgroup) + strlen(file) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, file);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	return faccessat(cfd, relpath, F_OK, 0) == 0;
}
1282
1283 struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file)
1284 {
1285 int ret, cfd;
1286 size_t len;
1287 char *fnam, *tmpc;
1288 struct stat sb;
1289 struct cgfs_files *newkey;
1290
1291 tmpc = find_mounted_controller(controller, &cfd);
1292 if (!tmpc)
1293 return false;
1294
1295 if (file && *file == '/')
1296 file++;
1297
1298 if (file && strchr(file, '/'))
1299 return NULL;
1300
1301 /* Make sure we pass a relative path to *at() family of functions.
1302 * . + /cgroup + / + file + \0
1303 */
1304 len = strlen(cgroup) + 3;
1305 if (file)
1306 len += strlen(file) + 1;
1307 fnam = alloca(len);
1308 snprintf(fnam, len, "%s%s%s%s", *cgroup == '/' ? "." : "", cgroup,
1309 file ? "/" : "", file ? file : "");
1310
1311 ret = fstatat(cfd, fnam, &sb, 0);
1312 if (ret < 0)
1313 return NULL;
1314
1315 do {
1316 newkey = malloc(sizeof(struct cgfs_files));
1317 } while (!newkey);
1318 if (file)
1319 newkey->name = must_copy_string(file);
1320 else if (strrchr(cgroup, '/'))
1321 newkey->name = must_copy_string(strrchr(cgroup, '/'));
1322 else
1323 newkey->name = must_copy_string(cgroup);
1324 newkey->uid = sb.st_uid;
1325 newkey->gid = sb.st_gid;
1326 newkey->mode = sb.st_mode;
1327
1328 return newkey;
1329 }
1330
/* Iterator callback for cgfs_iterate_cgroup(): build a cgfs_files entry
 * for @dir_entry.  Logs (and returns NULL) on failure so the caller can
 * skip the entry.
 */
static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry)
{
	struct cgfs_files *entry;

	entry = cgfs_get_key(controller, cgroup, dir_entry);
	if (!entry)
		lxcfs_error("Error getting files under %s:%s\n", controller,
			    cgroup);

	return entry;
}
1340
1341 bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys)
1342 {
1343 return cgfs_iterate_cgroup(controller, cgroup, false, (void***)keys, sizeof(*keys), &make_key_list_entry);
1344 }
1345
/* Return true if @f names a child cgroup (i.e. a directory) of @cgroup
 * under the mounted @controller.
 */
bool is_child_cgroup(const char *controller, const char *cgroup, const char *f)
{
	int cfd, ret;
	size_t len;
	char *relpath;
	struct stat sb;

	if (!find_mounted_controller(controller, &cfd))
		return false;

	/* Relative path for the *at() calls: "." + cgroup + "/" + f + "\0". */
	len = strlen(cgroup) + strlen(f) + 3;
	relpath = alloca(len);
	ret = snprintf(relpath, len, "%s%s/%s", *cgroup == '/' ? "." : "", cgroup, f);
	if (ret < 0 || (size_t)ret >= len)
		return false;

	if (fstatat(cfd, relpath, &sb, 0) < 0)
		return false;

	return S_ISDIR(sb.st_mode);
}
1373
1374 #define SEND_CREDS_OK 0
1375 #define SEND_CREDS_NOTSK 1
1376 #define SEND_CREDS_FAIL 2
1377 static bool recv_creds(int sock, struct ucred *cred, char *v);
1378 static int wait_for_pid(pid_t pid);
1379 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst);
1380 static int send_creds_clone_wrapper(void *arg);
1381
/*
 * clone a task which switches to @task's namespace and writes '1'
 * over a unix sock so we can read the task's reaper's pid in our
 * namespace
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
 *
 * Runs in a forked child: every path terminates via _exit() once the
 * clone child has been created and reaped.
 */
static void write_task_init_pid_exit(int sock, pid_t target)
{
	char fnam[100];
	pid_t pid;
	int fd, ret;
	/* Stack for clone(); passing stack + stack_size assumes a
	 * downward-growing stack on this architecture. */
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", (int)target);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);

	fd = open(fnam, O_RDONLY);
	if (fd < 0) {
		perror("write_task_init_pid_exit open of ns/pid");
		_exit(1);
	}
	/* Join the target's pid namespace so the clone child below is
	 * created inside it. */
	if (setns(fd, 0)) {
		perror("write_task_init_pid_exit setns 1");
		close(fd);
		_exit(1);
	}
	pid = clone(send_creds_clone_wrapper, stack + stack_size, SIGCHLD, &sock);
	if (pid < 0)
		_exit(1);
	if (pid != 0) {
		/* NOTE(review): wait_for_pid() returns 0 on success, so this
		 * exits with status 1 when the child *succeeded*.  The caller
		 * (get_init_pid_for_task) ignores our exit status, so this is
		 * harmless, but confirm the intent. */
		if (!wait_for_pid(pid))
			_exit(1);
		_exit(0);
	}
}
1423
1424 static int send_creds_clone_wrapper(void *arg) {
1425 struct ucred cred;
1426 char v;
1427 int sock = *(int *)arg;
1428
1429 /* we are the child */
1430 cred.uid = 0;
1431 cred.gid = 0;
1432 cred.pid = 1;
1433 v = '1';
1434 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK)
1435 return 1;
1436 return 0;
1437 }
1438
/*
 * Discover what pid 1 of @task's pid namespace looks like from our
 * namespace: fork a helper that joins the target pidns and sends
 * SCM_CREDENTIALS claiming pid 1; the pid we read back has been
 * translated by the kernel into our namespace.
 *
 * Returns the translated init pid, or -1 on error.
 */
static pid_t get_init_pid_for_task(pid_t task)
{
	int sock[2];
	pid_t pid;
	pid_t ret = -1;
	char v = '0';
	struct ucred cred;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		return -1;
	}

	pid = fork();
	if (pid < 0)
		goto out;
	if (!pid) {
		/* Child: talks over sock[0]; write_task_init_pid_exit()
		 * never returns, the _exit(0) is a safeguard. */
		close(sock[1]);
		write_task_init_pid_exit(sock[0], task);
		_exit(0);
	}

	/* Parent: receive the kernel-translated credentials on sock[1]. */
	if (!recv_creds(sock[1], &cred, &v))
		goto out;
	ret = cred.pid;

out:
	close(sock[0]);
	close(sock[1]);
	if (pid > 0)
		wait_for_pid(pid);
	return ret;
}
1472
1473 pid_t lookup_initpid_in_store(pid_t qpid)
1474 {
1475 pid_t answer = 0;
1476 struct stat sb;
1477 struct pidns_init_store *e;
1478 char fnam[100];
1479
1480 snprintf(fnam, 100, "/proc/%d/ns/pid", qpid);
1481 store_lock();
1482 if (stat(fnam, &sb) < 0)
1483 goto out;
1484 e = lookup_verify_initpid(&sb);
1485 if (e) {
1486 answer = e->initpid;
1487 goto out;
1488 }
1489 answer = get_init_pid_for_task(qpid);
1490 if (answer > 0)
1491 save_initpid(&sb, answer);
1492
1493 out:
1494 /* we prune at end in case we are returning
1495 * the value we were about to return */
1496 prune_initpid_store();
1497 store_unlock();
1498 return answer;
1499 }
1500
/* Reap child @pid, retrying on EINTR.  Returns 0 if the child exited
 * normally with status 0, -1 on any error or non-zero exit.
 */
static int wait_for_pid(pid_t pid)
{
	int status;

	if (pid <= 0)
		return -1;

	for (;;) {
		pid_t w = waitpid(pid, &status, 0);

		if (w == pid)
			break;
		if (w == -1 && errno != EINTR)
			return -1;
		/* Interrupted or unexpected pid: try again. */
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
		return 0;

	return -1;
}
1521
1522 /*
1523 * append the given formatted string to *src.
1524 * src: a pointer to a char* in which to append the formatted string.
1525 * sz: the number of characters printed so far, minus trailing \0.
1526 * asz: the allocated size so far
1527 * format: string format. See printf for details.
1528 * ...: varargs. See printf for details.
1529 */
1530 static void must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...)
1531 {
1532 char tmp[BUF_RESERVE_SIZE];
1533 va_list args;
1534
1535 va_start (args, format);
1536 int tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args);
1537 va_end(args);
1538
1539 if (!*src || tmplen + *sz + 1 >= *asz) {
1540 char *tmp;
1541 do {
1542 tmp = realloc(*src, *asz + BUF_RESERVE_SIZE);
1543 } while (!tmp);
1544 *src = tmp;
1545 *asz += BUF_RESERVE_SIZE;
1546 }
1547 memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */
1548 *sz += tmplen;
1549 }
1550
/*
 * append pid to *src as a decimal number followed by '\n'; see
 * must_strcat() for the src/sz/asz buffer contract.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	int pid_as_int = (int)pid;

	must_strcat(src, sz, asz, "%d\n", pid_as_int);
}
1562
/*
 * Given a open file * to /proc/pid/{u,g}id_map, and an id
 * valid in the caller's namespace, return the id mapped into
 * pid's namespace.  Each map line is "<ns-base> <host-base> <count>".
 * Returns the mapped id, or -1 on error.
 */
unsigned int
convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	char line[400];

	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, sizeof(line), idfile)) {
		unsigned int ns_base, host_base, range;

		if (sscanf(line, "%u %u %u\n", &ns_base, &host_base, &range) != 3)
			continue;

		if (host_base + range < host_base || ns_base + range < ns_base) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				ns_base, host_base, range, line);
			return -1;
		}

		if (host_base <= in_id && in_id < host_base + range) {
			/*
			 * in_id - host_base < range, and ns_base + range does
			 * not wrap (checked above), so the sum below cannot
			 * wrap either.
			 */
			return (in_id - host_base) + ns_base;
		}
	}

	// no answer found
	return -1;
}
1606
/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in his
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

#define PROCLEN 100

/*
 * May a caller running as @uid in the namespace of @pid act on an
 * object owned by @victim?  With NS_ROOT_OPT, owning the same uid is
 * sufficient; otherwise the caller must map to root in his namespace
 * and @victim must be mapped there as well.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	char fpath[PROCLEN];
	bool answer = false;
	uid_t mapped;
	FILE *f;
	int ret;

	if (victim == -1 || uid == -1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices. (i.e. uid 1000 has write
	 * access to files owned by uid 1000
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;

	f = fopen(fpath, "r");
	if (!f)
		return false;

	/* if caller's not root in his namespace, reject */
	mapped = convert_id_to_ns(f, uid);
	if (mapped != 0)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
	 * XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	mapped = convert_id_to_ns(f, victim);
	if (mapped == -1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}
1662
/* Does permission triplet @fmode (rwx in the "other" position) satisfy
 * an open(2)-style access mode @req_mode (O_RDONLY/O_WRONLY/O_RDWR)?
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t needed;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		needed = S_IROTH;
		break;
	case O_WRONLY:
		needed = S_IWOTH;
		break;
	case O_RDWR:
		needed = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}

	return (fmode & needed) == needed;
}
1682
1683
/*
 * taskcg is a/b/c
 * querycg is /a/b/c/d/e
 * we return 'd'
 */
static char *get_next_cgroup_dir(const char *taskcg, const char *querycg)
{
	char *next, *slash;
	size_t skip;

	if (strlen(taskcg) <= strlen(querycg)) {
		lxcfs_error("%s\n", "I was fed bad input.");
		return NULL;
	}

	/* Root queries ("/" or "./") only strip the leading slash. */
	if (strcmp(querycg, "/") == 0 || strcmp(querycg, "./") == 0)
		skip = 1;
	else
		skip = strlen(querycg) + 1;

	next = strdup(taskcg + skip);
	if (!next)
		return NULL;

	/* Cut the copy at the first component boundary. */
	slash = strchr(next, '/');
	if (slash)
		*slash = '\0';

	return next;
}
1709
/* Remove a single trailing '\n' from @x, in place, if present. */
static void stripnewline(char *x)
{
	size_t len = strlen(x);

	if (len > 0 && x[len - 1] == '\n')
		x[len - 1] = '\0';
}
1716
1717 char *get_pid_cgroup(pid_t pid, const char *contrl)
1718 {
1719 int cfd;
1720 char fnam[PROCLEN];
1721 FILE *f;
1722 char *answer = NULL;
1723 char *line = NULL;
1724 size_t len = 0;
1725 int ret;
1726 const char *h = find_mounted_controller(contrl, &cfd);
1727 if (!h)
1728 return NULL;
1729
1730 ret = snprintf(fnam, PROCLEN, "/proc/%d/cgroup", pid);
1731 if (ret < 0 || ret >= PROCLEN)
1732 return NULL;
1733 if (!(f = fopen(fnam, "r")))
1734 return NULL;
1735
1736 while (getline(&line, &len, f) != -1) {
1737 char *c1, *c2;
1738 if (!line[0])
1739 continue;
1740 c1 = strchr(line, ':');
1741 if (!c1)
1742 goto out;
1743 c1++;
1744 c2 = strchr(c1, ':');
1745 if (!c2)
1746 goto out;
1747 *c2 = '\0';
1748 if (strcmp(c1, h) != 0)
1749 continue;
1750 c2++;
1751 stripnewline(c2);
1752 do {
1753 answer = strdup(c2);
1754 } while (!answer);
1755 break;
1756 }
1757
1758 out:
1759 fclose(f);
1760 free(line);
1761 return answer;
1762 }
1763
1764 /*
1765 * check whether a fuse context may access a cgroup dir or file
1766 *
1767 * If file is not null, it is a cgroup file to check under cg.
1768 * If file is null, then we are checking perms on cg itself.
1769 *
1770 * For files we can check the mode of the list_keys result.
1771 * For cgroups, we must make assumptions based on the files under the
1772 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
1773 * yet.
1774 */
1775 static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
1776 {
1777 struct cgfs_files *k = NULL;
1778 bool ret = false;
1779
1780 k = cgfs_get_key(contrl, cg, file);
1781 if (!k)
1782 return false;
1783
1784 if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
1785 if (perms_include(k->mode >> 6, mode)) {
1786 ret = true;
1787 goto out;
1788 }
1789 }
1790 if (fc->gid == k->gid) {
1791 if (perms_include(k->mode >> 3, mode)) {
1792 ret = true;
1793 goto out;
1794 }
1795 }
1796 ret = perms_include(k->mode, mode);
1797
1798 out:
1799 free_key(k);
1800 return ret;
1801 }
1802
#define INITSCOPE "/init.scope"
/* Strip a trailing "/init.scope" component from @cg in place;
 * "/init.scope" by itself becomes "/".  Other strings are unchanged.
 */
void prune_init_slice(char *cg)
{
	size_t cg_len = strlen(cg);
	size_t suffix_len = strlen(INITSCOPE);
	char *suffix;

	if (cg_len < suffix_len)
		return;

	suffix = cg + cg_len - suffix_len;
	if (strcmp(suffix, INITSCOPE) != 0)
		return;

	if (suffix == cg)
		suffix[1] = '\0';	/* keep the leading '/' */
	else
		suffix[0] = '\0';
}
1820
/*
 * If pid is in /a/b/c/d, he may only act on things under cg=/a/b/c/d.
 * If pid is in /a, he may act on /a/b, but not on /b.
 * if the answer is false and nextcg is not NULL, then *nextcg will point
 * to a string containing the next cgroup directory under cg, which must be
 * freed by the caller.
 */
static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg)
{
	bool answer = false;
	char *task_cgroup, *cmp;

	task_cgroup = get_pid_cgroup(pid, contrl);
	if (!task_cgroup)
		return false;
	prune_init_slice(task_cgroup);

	/*
	 * callers pass in '/' or './' (openat()) for root cgroup, otherwise
	 * they pass in a cgroup without leading '/'
	 *
	 * The original line here was:
	 * linecmp = *cg == '/' ? c2 : c2+1;
	 * TODO: I'm not sure why you'd want to increment when *cg != '/'?
	 * Serge, do you know?
	 */
	if (*cg == '/' || strncmp(cg, "./", 2) == 0)
		cmp = task_cgroup;
	else
		cmp = task_cgroup + 1;

	if (strncmp(cmp, cg, strlen(cmp)) == 0)
		answer = true;
	else if (nextcg)
		*nextcg = get_next_cgroup_dir(cmp, cg);

	free(task_cgroup);
	return answer;
}
1863
/*
 * If pid is in /a/b/c, he may see that /a exists, but not /b or /a/c.
 */
static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg)
{
	bool answer = false;
	char *pid_cgroup, *task_cg;
	size_t target_len, task_len;

	/* Everyone can see the hierarchy root. */
	if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0)
		return true;

	pid_cgroup = get_pid_cgroup(pid, contrl);
	if (!pid_cgroup)
		return false;
	prune_init_slice(pid_cgroup);

	task_cg = pid_cgroup + 1;	/* drop leading '/' */
	target_len = strlen(cg);
	task_len = strlen(task_cg);

	if (task_len == 0) {
		/* Task is in the root cg, it can see everything. This case is
		 * not handled by the strmcps below, since they test for the
		 * last /, but that is the first / that we've chopped off
		 * above.
		 */
		answer = true;
	} else if (strcmp(cg, task_cg) == 0) {
		/* Same cgroup. */
		answer = true;
	} else if (target_len < task_len) {
		/* looking up a parent dir */
		answer = strncmp(task_cg, cg, target_len) == 0 &&
			 task_cg[target_len] == '/';
	} else if (target_len > task_len) {
		/* looking up a child dir */
		answer = strncmp(task_cg, cg, task_len) == 0 &&
			 cg[task_len] == '/';
	}

	free(pid_cgroup);
	return answer;
}
1914
1915 /*
1916 * given /cgroup/freezer/a/b, return "freezer".
1917 * the returned char* should NOT be freed.
1918 */
1919 static char *pick_controller_from_path(struct fuse_context *fc, const char *path)
1920 {
1921 const char *p1;
1922 char *contr, *slash;
1923
1924 if (strlen(path) < 9) {
1925 errno = EACCES;
1926 return NULL;
1927 }
1928 if (*(path + 7) != '/') {
1929 errno = EINVAL;
1930 return NULL;
1931 }
1932 p1 = path + 8;
1933 contr = strdupa(p1);
1934 if (!contr) {
1935 errno = ENOMEM;
1936 return NULL;
1937 }
1938 slash = strstr(contr, "/");
1939 if (slash)
1940 *slash = '\0';
1941
1942 int i;
1943 for (i = 0; i < num_hierarchies; i++) {
1944 if (hierarchies[i] && strcmp(hierarchies[i], contr) == 0)
1945 return hierarchies[i];
1946 }
1947 errno = ENOENT;
1948 return NULL;
1949 }
1950
/*
 * Find the start of cgroup in /cgroup/controller/the/cgroup/path
 * Note that the returned value may include files (keynames) etc
 */
static const char *find_cgroup_in_path(const char *path)
{
	const char *slash;

	/* Shortest valid path is "/cgroup/X". */
	if (strlen(path) < 9) {
		errno = EACCES;
		return NULL;
	}

	slash = strstr(path + 8, "/");
	if (!slash) {
		errno = EINVAL;
		return NULL;
	}

	errno = 0;
	return slash + 1;
}
1971
1972 /*
1973 * split the last path element from the path in @cg.
1974 * @dir is newly allocated and should be freed, @last not
1975 */
1976 static void get_cgdir_and_path(const char *cg, char **dir, char **last)
1977 {
1978 char *p;
1979
1980 do {
1981 *dir = strdup(cg);
1982 } while (!*dir);
1983 *last = strrchr(cg, '/');
1984 if (!*last) {
1985 *last = NULL;
1986 return;
1987 }
1988 p = strrchr(*dir, '/');
1989 *p = '\0';
1990 }
1991
1992 /*
1993 * FUSE ops for /cgroup
1994 */
1995
/*
 * FUSE getattr for /cgroup paths: fill @sb for the virtual root, a
 * controller directory, a child cgroup directory, or a cgroup file,
 * applying visibility and permission checks from the caller's point of
 * view.  Returns 0 or a negative errno.
 */
int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;


	if (!fc)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Every entry is reported root-owned, zero-sized and timestamped
	 * "now". */
	sb->st_uid = sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	if (strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	/* Split into parent dir (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);

	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Judge visibility from the caller's pidns init; fall back to the
	 * caller itself when that cannot be resolved or is shared. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		/* ENOENT (not visible) is checked before EACCES. */
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/* this is just /cgroup/controller, return it as a dir */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		sb->st_mode = S_IFDIR | 00755;
		k = cgfs_get_key(controller, cgroup, NULL);
		if (!k) {
			sb->st_uid = sb->st_gid = 0;
		} else {
			sb->st_uid = k->uid;
			sb->st_gid = k->gid;
		}
		free_key(k);
		sb->st_nlink = 2;
		ret = 0;
		goto out;
	}

	if ((k = cgfs_get_key(controller, path1, path2)) != NULL) {
		sb->st_mode = S_IFREG | k->mode;
		sb->st_nlink = 1;
		sb->st_uid = k->uid;
		sb->st_gid = k->gid;
		sb->st_size = 0;
		free_key(k);
		/* The key exists, but report ENOENT if the caller's cgroup
		 * is not an ancestor of the file's cgroup. */
		if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
			ret = -ENOENT;
			goto out;
		}
		ret = 0;
	}

out:
	free(cgdir);
	return ret;
}
2105
2106 int cg_opendir(const char *path, struct fuse_file_info *fi)
2107 {
2108 struct fuse_context *fc = fuse_get_context();
2109 const char *cgroup;
2110 struct file_info *dir_info;
2111 char *controller = NULL;
2112
2113 if (!fc)
2114 return -EIO;
2115
2116 if (strcmp(path, "/cgroup") == 0) {
2117 cgroup = NULL;
2118 controller = NULL;
2119 } else {
2120 // return list of keys for the controller, and list of child cgroups
2121 controller = pick_controller_from_path(fc, path);
2122 if (!controller)
2123 return -errno;
2124
2125 cgroup = find_cgroup_in_path(path);
2126 if (!cgroup) {
2127 /* this is just /cgroup/controller, return its contents */
2128 cgroup = "/";
2129 }
2130 }
2131
2132 pid_t initpid = lookup_initpid_in_store(fc->pid);
2133 if (initpid <= 1 || is_shared_pidns(initpid))
2134 initpid = fc->pid;
2135 if (cgroup) {
2136 if (!caller_may_see_dir(initpid, controller, cgroup))
2137 return -ENOENT;
2138 if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY))
2139 return -EACCES;
2140 }
2141
2142 /* we'll free this at cg_releasedir */
2143 dir_info = malloc(sizeof(*dir_info));
2144 if (!dir_info)
2145 return -ENOMEM;
2146 dir_info->controller = must_copy_string(controller);
2147 dir_info->cgroup = must_copy_string(cgroup);
2148 dir_info->type = LXC_TYPE_CGDIR;
2149 dir_info->buf = NULL;
2150 dir_info->file = NULL;
2151 dir_info->buflen = 0;
2152
2153 fi->fh = (unsigned long)dir_info;
2154 return 0;
2155 }
2156
/*
 * FUSE readdir for /cgroup paths: emit "." and "..", then either the
 * list of controllers (top level) or the keys and child cgroups of the
 * opened directory, filtered by what the caller may see.
 */
int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
		struct fuse_file_info *fi)
{
	struct file_info *d = (struct file_info *)fi->fh;
	struct cgfs_files **list = NULL;
	int i, ret;
	char *nextcg = NULL;
	struct fuse_context *fc = fuse_get_context();
	char **clist = NULL;

	if (filler(buf, ".", NULL, 0) != 0 || filler(buf, "..", NULL, 0) != 0)
		return -EIO;

	if (d->type != LXC_TYPE_CGDIR) {
		lxcfs_error("%s\n", "Internal error: file cache info used in readdir.");
		return -EIO;
	}
	if (!d->cgroup && !d->controller) {
		// ls /var/lib/lxcfs/cgroup - just show list of controllers
		int i;

		for (i = 0; i < num_hierarchies; i++) {
			if (hierarchies[i] && filler(buf, hierarchies[i], NULL, 0) != 0) {
				return -EIO;
			}
		}
		return 0;
	}

	if (!cgfs_list_keys(d->controller, d->cgroup, &list)) {
		// not a valid cgroup
		ret = -EINVAL;
		goto out;
	}

	/* Judge visibility from the caller's pidns init; fall back to the
	 * caller itself. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) {
		/* Not an ancestor: show at most the next path component
		 * leading toward the caller's own cgroup. */
		if (nextcg) {
			ret = filler(buf, nextcg, NULL, 0);
			free(nextcg);
			if (ret != 0) {
				ret = -EIO;
				goto out;
			}
		}
		ret = 0;
		goto out;
	}

	/* Emit every key (file) of this cgroup... */
	for (i = 0; list && list[i]; i++) {
		if (filler(buf, list[i]->name, NULL, 0) != 0) {
			ret = -EIO;
			goto out;
		}
	}

	// now get the list of child cgroups

	if (!cgfs_list_children(d->controller, d->cgroup, &clist)) {
		ret = 0;
		goto out;
	}
	if (clist) {
		for (i = 0; clist[i]; i++) {
			if (filler(buf, clist[i], NULL, 0) != 0) {
				ret = -EIO;
				goto out;
			}
		}
	}
	ret = 0;

out:
	/* Both lists are freed on every exit path. */
	free_keys(list);
	if (clist) {
		for (i = 0; clist[i]; i++)
			free(clist[i]);
		free(clist);
	}
	return ret;
}
2240
2241 void do_release_file_info(struct fuse_file_info *fi)
2242 {
2243 struct file_info *f = (struct file_info *)fi->fh;
2244
2245 if (!f)
2246 return;
2247
2248 fi->fh = 0;
2249
2250 free(f->controller);
2251 f->controller = NULL;
2252 free(f->cgroup);
2253 f->cgroup = NULL;
2254 free(f->file);
2255 f->file = NULL;
2256 free(f->buf);
2257 f->buf = NULL;
2258 free(f);
2259 f = NULL;
2260 }
2261
/* FUSE releasedir for /cgroup: drop the state created by cg_opendir(). */
int cg_releasedir(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);

	return 0;
}
2267
/*
 * FUSE open for /cgroup files: verify the target key exists, check
 * visibility and access for the caller, and stash a file_info in
 * fi->fh (released in cg_release()).  Returns 0 or a negative errno.
 */
int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* Split into parent cgroup (path1) and file name (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* The file must exist as a key of its parent cgroup. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	/* Judge visibility from the caller's pidns init; fall back to the
	 * caller itself. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	/* ENOENT (not visible) is checked before EACCES. */
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = (unsigned long)file_info;
	ret = 0;

out:
	free(cgdir);
	return ret;
}
2335
/*
 * FUSE access for /cgroup paths: mirror the visibility and permission
 * rules of cg_open()/cg_getattr().  Paths whose key cannot be resolved
 * allow read/execute checks but deny write.  Returns 0 or a negative
 * errno.
 */
int cg_access(const char *path, int mode)
{
	int ret;
	const char *cgroup;
	char *path1, *path2, *controller;
	char *last = NULL, *cgdir = NULL;
	struct cgfs_files *k = NULL;
	struct fuse_context *fc = fuse_get_context();

	/* The virtual root itself is always accessible. */
	if (strcmp(path, "/cgroup") == 0)
		return 0;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;
	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		// access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not
		if ((mode & W_OK) == 0)
			return 0;
		return -EACCES;
	}

	/* Split into parent dir (path1) and final component (path2). */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		/* No such key: allow read-only checks, deny write. */
		if ((mode & W_OK) == 0)
			ret = 0;
		else
			ret = -EACCES;
		goto out;
	}
	free_key(k);

	/* Judge visibility from the caller's pidns init; fall back to the
	 * caller itself.  ENOENT is checked before EACCES. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, mode)) {
		ret = -EACCES;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	return ret;
}
2399
/* FUSE release for /cgroup files: drop the state created by cg_open(). */
int cg_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);

	return 0;
}
2405
#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * Wait up to @timeout seconds for @sock to become readable (or hang
 * up).  Returns true when input/hangup is pending; false on timeout
 * (with errno set to 0) or error (errno from epoll preserved).
 */
static bool wait_for_sock(int sock, int timeout)
{
	struct epoll_event ev;
	int epfd, ret, now, starttime, deltatime, saved_errno;

	if ((starttime = time(NULL)) < 0)
		return false;

	epfd = epoll_create(1);
	if (epfd < 0) {
		lxcfs_error("%s\n", "Failed to create epoll socket: %m.");
		return false;
	}

	ev.events = POLLIN_SET;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) {
		lxcfs_error("%s\n", "Failed adding socket to epoll: %m.");
		close(epfd);
		return false;
	}

again:
	now = time(NULL);
	if (now < 0) {
		close(epfd);
		return false;
	}

	/* Remaining budget toward the absolute deadline. */
	deltatime = (starttime + timeout) - now;
	if (deltatime < 0) { // timeout
		errno = 0;
		close(epfd);
		return false;
	}

	ret = epoll_wait(epfd, &ev, 1, 1000 * deltatime + 1);
	if (ret < 0 && errno == EINTR)
		goto again;

	saved_errno = errno;
	close(epfd);

	if (ret <= 0) {
		errno = saved_errno;
		return false;
	}

	return true;
}
2453
/* recv() from @sockfd with a 2-second readability timeout; returns -1
 * on timeout, otherwise recv()'s result.
 */
static int msgrecv(int sockfd, void *buf, size_t len)
{
	if (!wait_for_sock(sockfd, 2))
		return -1;

	return recv(sockfd, buf, len, MSG_DONTWAIT);
}
2460
2461 static int send_creds(int sock, struct ucred *cred, char v, bool pingfirst)
2462 {
2463 struct msghdr msg = { 0 };
2464 struct iovec iov;
2465 struct cmsghdr *cmsg;
2466 char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
2467 char buf[1];
2468 buf[0] = 'p';
2469
2470 if (pingfirst) {
2471 if (msgrecv(sock, buf, 1) != 1) {
2472 lxcfs_error("%s\n", "Error getting reply from server over socketpair.");
2473 return SEND_CREDS_FAIL;
2474 }
2475 }
2476
2477 msg.msg_control = cmsgbuf;
2478 msg.msg_controllen = sizeof(cmsgbuf);
2479
2480 cmsg = CMSG_FIRSTHDR(&msg);
2481 cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
2482 cmsg->cmsg_level = SOL_SOCKET;
2483 cmsg->cmsg_type = SCM_CREDENTIALS;
2484 memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred));
2485
2486 msg.msg_name = NULL;
2487 msg.msg_namelen = 0;
2488
2489 buf[0] = v;
2490 iov.iov_base = buf;
2491 iov.iov_len = sizeof(buf);
2492 msg.msg_iov = &iov;
2493 msg.msg_iovlen = 1;
2494
2495 if (sendmsg(sock, &msg, 0) < 0) {
2496 lxcfs_error("Failed at sendmsg: %s.\n",strerror(errno));
2497 if (errno == 3)
2498 return SEND_CREDS_NOTSK;
2499 return SEND_CREDS_FAIL;
2500 }
2501
2502 return SEND_CREDS_OK;
2503 }
2504
/*
 * recv_creds: receive one byte plus the sender's SCM_CREDENTIALS over
 * @sock.  The byte is stored in *@v and the (kernel-translated)
 * credentials in *@cred.  We first write a '1' to the socket so the
 * peer knows SO_PASSCRED is enabled and it may send.
 * Returns true on success; on failure *@cred holds -1/-1/-1.
 */
static bool recv_creds(int sock, struct ucred *cred, char *v)
{
	struct msghdr msg = { 0 };
	struct iovec iov;
	struct cmsghdr *cmsg;
	char cmsgbuf[CMSG_SPACE(sizeof(*cred))];
	char buf[1];
	int ret;
	int optval = 1;

	*v = '1';

	/* Defaults in case no SCM_CREDENTIALS message arrives. */
	cred->pid = -1;
	cred->uid = -1;
	cred->gid = -1;

	if (setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)) == -1) {
		lxcfs_error("Failed to set passcred: %s\n", strerror(errno));
		return false;
	}
	/* Ping the peer: it waits for this before sending the creds. */
	buf[0] = '1';
	if (write(sock, buf, 1) != 1) {
		lxcfs_error("Failed to start write on scm fd: %s\n", strerror(errno));
		return false;
	}

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	if (!wait_for_sock(sock, 2)) {
		lxcfs_error("Timed out waiting for scm_cred: %s\n", strerror(errno));
		return false;
	}
	ret = recvmsg(sock, &msg, MSG_DONTWAIT);
	if (ret < 0) {
		lxcfs_error("Failed to receive scm_cred: %s\n", strerror(errno));
		return false;
	}

	cmsg = CMSG_FIRSTHDR(&msg);

	/* Only copy the creds when a well-formed SCM_CREDENTIALS
	 * control message was actually attached. */
	if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)) &&
			cmsg->cmsg_level == SOL_SOCKET &&
			cmsg->cmsg_type == SCM_CREDENTIALS) {
		memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred));
	}
	*v = buf[0];

	return true;
}
2562
/* Arguments handed to pid_ns_clone_wrapper() through clone(). */
struct pid_ns_clone_args {
	int *cpipe;	/* pipe the child uses to ACK that it started */
	int sock;	/* socket passed on to the wrapped function */
	pid_t tpid;	/* target pid whose pidns we are working in */
	int (*wrapped) (int, pid_t); // pid_from_ns or pid_to_ns
};
2569
2570 /*
2571 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
2572 * with clone(). This simply writes '1' as ACK back to the parent
2573 * before calling the actual wrapped function.
2574 */
2575 static int pid_ns_clone_wrapper(void *arg) {
2576 struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
2577 char b = '1';
2578
2579 close(args->cpipe[0]);
2580 if (write(args->cpipe[1], &b, sizeof(char)) < 0)
2581 lxcfs_error("(child): error on write: %s.\n", strerror(errno));
2582 close(args->cpipe[1]);
2583 return args->wrapped(args->sock, args->tpid);
2584 }
2585
2586 /*
2587 * pid_to_ns - reads pids from a ucred over a socket, then writes the
2588 * int value back over the socket. This shifts the pid from the
2589 * sender's pidns into tpid's pidns.
2590 */
2591 static int pid_to_ns(int sock, pid_t tpid)
2592 {
2593 char v = '0';
2594 struct ucred cred;
2595
2596 while (recv_creds(sock, &cred, &v)) {
2597 if (v == '1')
2598 return 0;
2599 if (write(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
2600 return 1;
2601 }
2602 return 0;
2603 }
2604
2605
2606 /*
2607 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
2608 * in your old pidns. Only children which you clone will be in the target
2609 * pidns. So the pid_to_ns_wrapper does the setns, then clones a child to
2610 * actually convert pids.
2611 *
2612 * Note: glibc's fork() does not respect pidns, which can lead to failed
2613 * assertions inside glibc (and thus failed forks) if the child's pid in
2614 * the pidns and the parent pid outside are identical. Using clone prevents
2615 * this issue.
2616 */
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the target task's pid namespace. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	/* Pipe over which the clone()d grandchild ACKs that it started. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* The stack grows down, so pass the end of the alloca'd region.
	 * clone() (not fork) so the child is truly inside the pidns. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2662
2663 /*
2664 * To read cgroup files with a particular pid, we will setns into the child
2665 * pidns, open a pipe, fork a child - which will be the first to really be in
2666 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
2667 */
/*
 * do_read_pids: read a pid-list cgroup file (tasks/cgroup.procs) and
 * translate each pid into @tpid's pid namespace.  The translated list
 * is appended to *@d (heap-grown by must_strcat_pid).  Returns true
 * on success.
 */
bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	if (!cgfs_get_value(contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		/* Ship the pid as SCM_CREDENTIALS so the kernel translates
		 * it into the child's pidns for us. */
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		if (ret == SEND_CREDS_NOTSK)
			goto next;	/* pid vanished; just skip it */
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* v == '1' tells the helper child to exit. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}
2750
2751 int cg_read(const char *path, char *buf, size_t size, off_t offset,
2752 struct fuse_file_info *fi)
2753 {
2754 struct fuse_context *fc = fuse_get_context();
2755 struct file_info *f = (struct file_info *)fi->fh;
2756 struct cgfs_files *k = NULL;
2757 char *data = NULL;
2758 int ret, s;
2759 bool r;
2760
2761 if (f->type != LXC_TYPE_CGFILE) {
2762 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
2763 return -EIO;
2764 }
2765
2766 if (offset)
2767 return 0;
2768
2769 if (!fc)
2770 return -EIO;
2771
2772 if (!f->controller)
2773 return -EINVAL;
2774
2775 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
2776 return -EINVAL;
2777 }
2778 free_key(k);
2779
2780
2781 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
2782 ret = -EACCES;
2783 goto out;
2784 }
2785
2786 if (strcmp(f->file, "tasks") == 0 ||
2787 strcmp(f->file, "/tasks") == 0 ||
2788 strcmp(f->file, "/cgroup.procs") == 0 ||
2789 strcmp(f->file, "cgroup.procs") == 0)
2790 // special case - we have to translate the pids
2791 r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
2792 else
2793 r = cgfs_get_value(f->controller, f->cgroup, f->file, &data);
2794
2795 if (!r) {
2796 ret = -EINVAL;
2797 goto out;
2798 }
2799
2800 if (!data) {
2801 ret = 0;
2802 goto out;
2803 }
2804 s = strlen(data);
2805 if (s > size)
2806 s = size;
2807 memcpy(buf, data, s);
2808 if (s > 0 && s < size && data[s-1] != '\n')
2809 buf[s++] = '\n';
2810
2811 ret = s;
2812
2813 out:
2814 free(data);
2815 return ret;
2816 }
2817
2818 static int pid_from_ns(int sock, pid_t tpid)
2819 {
2820 pid_t vpid;
2821 struct ucred cred;
2822 char v;
2823 int ret;
2824
2825 cred.uid = 0;
2826 cred.gid = 0;
2827 while (1) {
2828 if (!wait_for_sock(sock, 2)) {
2829 lxcfs_error("%s\n", "Timeout reading from parent.");
2830 return 1;
2831 }
2832 if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
2833 lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
2834 return 1;
2835 }
2836 if (vpid == -1) // done
2837 break;
2838 v = '0';
2839 cred.pid = vpid;
2840 if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
2841 v = '1';
2842 cred.pid = getpid();
2843 if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
2844 return 1;
2845 }
2846 }
2847 return 0;
2848 }
2849
/*
 * pid_from_ns_wrapper: setns() into @tpid's pidns, then clone() a
 * child (the first task really inside that pidns) which runs
 * pid_from_ns() on @sock.  Mirrors pid_to_ns_wrapper(); never returns.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	/* Join the target task's pid namespace. */
	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || ret >= sizeof(fnam))
		_exit(1);
	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);
	if (setns(newnsfd, 0) < 0)
		_exit(1);
	close(newnsfd);

	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};
	size_t stack_size = sysconf(_SC_PAGESIZE);
	void *stack = alloca(stack_size);

	/* Stack grows down: pass the top of the alloca'd region. */
	cpid = clone(pid_ns_clone_wrapper, stack + stack_size, SIGCHLD, &args);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);
	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);
	_exit(0);
}
2895
2896 /*
2897 * Given host @uid, return the uid to which it maps in
2898 * @pid's user namespace, or -1 if none.
2899 */
/*
 * Given host @uid, return the uid to which it maps in
 * @pid's user namespace, or -1 if none.
 * Fix: bounded snprintf instead of sprintf, and compare against an
 * explicit (uid_t)-1 rather than relying on implicit conversion.
 */
bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	FILE *f;
	char line[400];
	int ret;

	ret = snprintf(line, sizeof(line), "/proc/%d/uid_map", pid);
	if (ret < 0 || (size_t)ret >= sizeof(line))
		return false;

	if ((f = fopen(line, "r")) == NULL) {
		return false;
	}

	*answer = convert_id_to_ns(f, uid);
	fclose(f);

	if (*answer == (uid_t)-1)
		return false;
	return true;
}
2917
2918 /*
2919 * get_pid_creds: get the real uid and gid of @pid from
2920 * /proc/$$/status
2921 * (XXX should we use euid here?)
2922 */
/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status.  On any failure *uid/*gid remain -1.
 * (XXX should we use euid here?)
 * Fix: bounded snprintf instead of sprintf; fgets bounded by
 * sizeof(line) instead of a repeated magic 400.
 */
void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t u;
	gid_t g;
	FILE *f;

	*uid = -1;
	*gid = -1;

	if (snprintf(line, sizeof(line), "/proc/%d/status", pid) >= (int)sizeof(line))
		return;
	if ((f = fopen(line, "r")) == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			if (sscanf(line+4, "%u", &u) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*uid = u;
		} else if (strncmp(line, "Gid:", 4) == 0) {
			if (sscanf(line+4, "%u", &g) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				fclose(f);
				return;
			}
			*gid = g;
		}
	}
	fclose(f);
}
2956
2957 /*
2958 * May the requestor @r move victim @v to a new cgroup?
2959 * This is allowed if
2960 * . they are the same task
 * . they are owned by the same uid
2962 * . @r is root on the host, or
2963 * . @v's uid is mapped into @r's where @r is root.
2964 */
/*
 * may_move_pid: may requestor @r (owned by @r_uid) move victim @v to a
 * new cgroup?  Allowed when they are the same task, when @r is host
 * root, when both share an owner, or when @v's owner maps into a user
 * namespace in which @r is root.
 */
bool may_move_pid(pid_t r, uid_t r_uid, pid_t v)
{
	uid_t v_uid, mapped;
	gid_t v_gid;

	if (r == v)
		return true;
	if (r_uid == 0)
		return true;

	get_pid_creds(v, &v_uid, &v_gid);
	if (v_uid == r_uid)
		return true;

	/* @r must be root in its userns and @v's owner must map there. */
	return hostuid_to_ns(r_uid, r, &mapped) && mapped == 0 &&
	       hostuid_to_ns(v_uid, r, &mapped);
}
2982
/*
 * do_write_pids: write a list of pids (text in @buf, one per line,
 * given in @tpid's pidns) into the cgroup pid file for @contrl/@cg.
 * A helper child in the writer's pidns translates each pid back to
 * our view before it is written.  Returns true on success.
 */
static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg,
		const char *file, const char *buf)
{
	int sock[2] = {-1, -1};
	pid_t qpid, cpid = -1;
	FILE *pids_file = NULL;
	bool answer = false, fail = false;

	pids_file = open_pids_file(contrl, cg);
	if (!pids_file)
		return false;

	/*
	 * write the pids to a socket, have helper in writer's pidns
	 * call movepid for us
	 */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		goto out;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) { // child
		fclose(pids_file);
		pid_from_ns_wrapper(sock[1], tpid);
	}

	const char *ptr = buf;
	while (sscanf(ptr, "%d", &qpid) == 1) {
		struct ucred cred;
		char v;

		if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error writing pid to child: %s.\n", strerror(errno));
			goto out;
		}

		/* v == '0' means the child translated the pid into cred.pid. */
		if (recv_creds(sock[0], &cred, &v)) {
			if (v == '0') {
				if (!may_move_pid(tpid, tuid, cred.pid)) {
					fail = true;
					break;
				}
				if (fprintf(pids_file, "%d", (int) cred.pid) < 0)
					fail = true;
			}
		}

		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* All good, write the value */
	qpid = -1;
	if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid))
		lxcfs_error("%s\n", "Warning: failed to ask child to exit.");

	if (!fail)
		answer = true;

out:
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	if (pids_file) {
		/* fclose flushes; a failed flush means the move failed. */
		if (fclose(pids_file) != 0)
			answer = false;
	}
	return answer;
}
3061
3062 int cg_write(const char *path, const char *buf, size_t size, off_t offset,
3063 struct fuse_file_info *fi)
3064 {
3065 struct fuse_context *fc = fuse_get_context();
3066 char *localbuf = NULL;
3067 struct cgfs_files *k = NULL;
3068 struct file_info *f = (struct file_info *)fi->fh;
3069 bool r;
3070
3071 if (f->type != LXC_TYPE_CGFILE) {
3072 lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write.");
3073 return -EIO;
3074 }
3075
3076 if (offset)
3077 return 0;
3078
3079 if (!fc)
3080 return -EIO;
3081
3082 localbuf = alloca(size+1);
3083 localbuf[size] = '\0';
3084 memcpy(localbuf, buf, size);
3085
3086 if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
3087 size = -EINVAL;
3088 goto out;
3089 }
3090
3091 if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) {
3092 size = -EACCES;
3093 goto out;
3094 }
3095
3096 if (strcmp(f->file, "tasks") == 0 ||
3097 strcmp(f->file, "/tasks") == 0 ||
3098 strcmp(f->file, "/cgroup.procs") == 0 ||
3099 strcmp(f->file, "cgroup.procs") == 0)
3100 // special case - we have to translate the pids
3101 r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf);
3102 else
3103 r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf);
3104
3105 if (!r)
3106 size = -EINVAL;
3107
3108 out:
3109 free_key(k);
3110 return size;
3111 }
3112
/*
 * cg_chown - FUSE chown handler for cgroup files/directories.  The
 * caller must be privileged (root in its userns) over the current
 * owner of the entry.  Returns 0 or a negative errno.
 */
int cg_chown(const char *path, uid_t uid, gid_t gid)
{
	struct fuse_context *fc = fuse_get_context();
	char *cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* Split into parent dir + final component for the key lookup. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}
3178
/*
 * cg_chmod - FUSE chmod handler for cgroup files/directories.  The
 * caller must be privileged over the entry's current owner (root in
 * the owner's userns is optional here, unlike cg_chown).
 * Returns 0 or a negative errno.
 */
int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* Split into parent dir + final component for the key lookup. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace. So we can just check to make
	 * sure that the caller is root in his uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		ret = -EPERM;
		goto out;
	}

	if (!cgfs_chmod_file(controller, cgroup, mode)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;
out:
	free_key(k);
	free(cgdir);
	return ret;
}
3247
/*
 * cg_mkdir - FUSE mkdir handler: create a new cgroup on behalf of the
 * caller, provided the caller's init task lives in an ancestor cgroup
 * and has write access to the parent.  Returns 0 or a negative errno.
 */
int cg_mkdir(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Resolve the caller's container init; fall back to the caller
	 * itself when the pidns is shared with the host. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}
3301
/*
 * cg_rmdir - FUSE rmdir handler: remove a cgroup on behalf of the
 * caller, provided the caller's init task is in an ancestor cgroup
 * with write access.  Returns 0 or a negative errno.
 */
int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!fc)
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
		 */
		ret = -EPERM;
		goto out;
	}

	/* Resolve the caller's container init; fall back to the caller
	 * itself when the pidns is shared with the host. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}
3362
/* Return true when @line begins with the prefix @pref. */
static bool startswith(const char *line, const char *pref)
{
	return strncmp(line, pref, strlen(pref)) == 0;
}
3369
/*
 * parse_memstat: scan a memory.stat blob line by line and pull out the
 * total_* counters of interest.  Values are reported by the kernel in
 * bytes and converted here to kB.  Fields not present are left as the
 * caller initialized them.
 */
static void parse_memstat(char *memstat, unsigned long *cached,
		unsigned long *active_anon, unsigned long *inactive_anon,
		unsigned long *active_file, unsigned long *inactive_file,
		unsigned long *unevictable, unsigned long *shmem)
{
	/* Keys of interest and their destinations; order mirrors the
	 * original if/else chain — first match wins per line. */
	const struct {
		const char *key;
		unsigned long *dest;
	} fields[] = {
		{ "total_cache",         cached        },
		{ "total_active_anon",   active_anon   },
		{ "total_inactive_anon", inactive_anon },
		{ "total_active_file",   active_file   },
		{ "total_inactive_file", inactive_file },
		{ "total_unevictable",   unevictable   },
		{ "total_shmem",         shmem         },
	};
	size_t i;
	char *eol;

	while (*memstat) {
		for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
			size_t klen = strlen(fields[i].key);

			if (strncmp(memstat, fields[i].key, klen) != 0)
				continue;
			sscanf(memstat + klen, "%lu", fields[i].dest);
			*fields[i].dest /= 1024;	/* bytes -> kB */
			break;
		}
		eol = strchr(memstat, '\n');
		if (!eol)
			return;
		memstat = eol + 1;
	}
}
3406
/*
 * get_blkio_io_value: search the blkio stat text @str for the line
 * "major:minor iotype value" and store the value in *@v.  *@v is set
 * to 0 when no matching line exists.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor, char *iotype, unsigned long *v)
{
	char key[32];
	size_t klen;
	char *eol;

	memset(key, 0, sizeof(key));
	snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	klen = strlen(key);

	*v = 0;

	for (; *str; str = eol + 1) {
		if (strncmp(str, key, klen) == 0) {
			sscanf(str + klen, "%lu", v);
			return;
		}
		eol = strchr(str, '\n');
		if (!eol)
			return;
	}
}
3429
/*
 * read_file - slurp @path into @d's cache buffer, then copy at most
 * @size bytes (always from offset 0) into @buf.  Returns the number
 * of bytes copied, or 0 on open failure or cache overflow.
 */
int read_file(const char *path, char *buf, size_t size, struct file_info *d)
{
	size_t linelen = 0, total_len = 0, rv = 0;
	char *line = NULL;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = fopen(path, "r");
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l = snprintf(cache, cache_size, "%s", line);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;
		}
		/* snprintf would have truncated: the cache is full. */
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->size = total_len;
	if (total_len > size)
		total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	fclose(f);
	free(line);
	return rv;
}
3469
3470 /*
3471 * FUSE ops for /proc
3472 */
3473
/*
 * get_memlimit: read @file (e.g. memory.limit_in_bytes) for @cgroup.
 * Returns the parsed value, or (unsigned long)-1 when it cannot be read.
 */
static unsigned long get_memlimit(const char *cgroup, const char *file)
{
	unsigned long limit = -1;
	char *value = NULL;

	if (cgfs_get_value("memory", cgroup, file, &value))
		limit = strtoul(value, NULL, 10);

	free(value);
	return limit;
}
3486
/*
 * get_min_memlimit: walk from @cgroup up to the root and return the
 * smallest limit found in @file along the way (the effective limit).
 */
static unsigned long get_min_memlimit(const char *cgroup, const char *file)
{
	char *dir = strdupa(cgroup);
	unsigned long smallest, cur;

	smallest = get_memlimit(dir, file);
	while (strcmp(dir, "/") != 0) {
		dir = dirname(dir);
		cur = get_memlimit(dir, file);
		/* -1 means "no limit readable here"; skip those. */
		if (cur != -1 && cur < smallest)
			smallest = cur;
	}

	return smallest;
}
3503
/*
 * proc_meminfo_read - FUSE read handler for /proc/meminfo.  Renders a
 * container-scoped meminfo by overlaying the caller's memory cgroup
 * limits/usage onto the host's /proc/meminfo, caching the result in
 * @d so later reads at non-zero @offset are served from the cache.
 * Returns bytes copied, 0, or a negative errno.
 */
static int proc_meminfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct lxcfs_opts *opts = (struct lxcfs_opts *) fuse_get_context()->private_data;
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *memusage_str = NULL, *memstat_str = NULL,
		*memswlimit_str = NULL, *memswusage_str = NULL;
	unsigned long memlimit = 0, memusage = 0, memswlimit = 0, memswusage = 0,
		cached = 0, hosttotal = 0, active_anon = 0, inactive_anon = 0,
		active_file = 0, inactive_file = 0, unevictable = 0, shmem = 0,
		hostswtotal = 0;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Non-zero offset: serve the remainder of the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the caller's container init; fall back to the caller
	 * itself when the pidns is shared with the host. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "memory");
	if (!cg)
		return read_file("/proc/meminfo", buf, size, d);
	prune_init_slice(cg);

	memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
	if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
		goto err;
	if (!cgfs_get_value("memory", cg, "memory.stat", &memstat_str))
		goto err;

	// Following values are allowed to fail, because swapaccount might be turned
	// off for current kernel
	if(cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str) &&
		cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str))
	{
		memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
		memswusage = strtoul(memswusage_str, NULL, 10);

		/* bytes -> kB */
		memswlimit = memswlimit / 1024;
		memswusage = memswusage / 1024;
	}

	memusage = strtoul(memusage_str, NULL, 10);
	memlimit /= 1024;
	memusage /= 1024;

	parse_memstat(memstat_str, &cached, &active_anon,
			&inactive_anon, &active_file, &inactive_file,
			&unevictable, &shmem);

	f = fopen("/proc/meminfo", "r");
	if (!f)
		goto err;

	/* Rewrite the host meminfo line by line, substituting the
	 * cgroup-derived numbers where applicable. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char *printme, lbuf[100];

		memset(lbuf, 0, 100);
		if (startswith(line, "MemTotal:")) {
			sscanf(line+sizeof("MemTotal:")-1, "%lu", &hosttotal);
			/* never report more than the host actually has */
			if (hosttotal < memlimit)
				memlimit = hosttotal;
			snprintf(lbuf, 100, "MemTotal: %8lu kB\n", memlimit);
			printme = lbuf;
		} else if (startswith(line, "MemFree:")) {
			snprintf(lbuf, 100, "MemFree: %8lu kB\n", memlimit - memusage);
			printme = lbuf;
		} else if (startswith(line, "MemAvailable:")) {
			snprintf(lbuf, 100, "MemAvailable: %8lu kB\n", memlimit - memusage + cached);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && memswlimit > 0 && opts && opts->swap_off == false) {
			sscanf(line+sizeof("SwapTotal:")-1, "%lu", &hostswtotal);
			if (hostswtotal < memswlimit)
				memswlimit = hostswtotal;
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", memswlimit);
			printme = lbuf;
		} else if (startswith(line, "SwapTotal:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapTotal: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && memswlimit > 0 && memswusage > 0 && opts && opts->swap_off == false) {
			unsigned long swaptotal = memswlimit,
				swapusage = memswusage - memusage,
				swapfree = swapusage < swaptotal ? swaptotal - swapusage : 0;
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", swapfree);
			printme = lbuf;
		} else if (startswith(line, "SwapFree:") && opts && opts->swap_off == true) {
			snprintf(lbuf, 100, "SwapFree: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Slab:")) {
			snprintf(lbuf, 100, "Slab: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Buffers:")) {
			snprintf(lbuf, 100, "Buffers: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Cached:")) {
			snprintf(lbuf, 100, "Cached: %8lu kB\n", cached);
			printme = lbuf;
		} else if (startswith(line, "SwapCached:")) {
			snprintf(lbuf, 100, "SwapCached: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Active:")) {
			snprintf(lbuf, 100, "Active: %8lu kB\n",
				active_anon + active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive:")) {
			snprintf(lbuf, 100, "Inactive: %8lu kB\n",
				inactive_anon + inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Active(anon)")) {
			snprintf(lbuf, 100, "Active(anon): %8lu kB\n", active_anon);
			printme = lbuf;
		} else if (startswith(line, "Inactive(anon)")) {
			snprintf(lbuf, 100, "Inactive(anon): %8lu kB\n", inactive_anon);
			printme = lbuf;
		} else if (startswith(line, "Active(file)")) {
			snprintf(lbuf, 100, "Active(file): %8lu kB\n", active_file);
			printme = lbuf;
		} else if (startswith(line, "Inactive(file)")) {
			snprintf(lbuf, 100, "Inactive(file): %8lu kB\n", inactive_file);
			printme = lbuf;
		} else if (startswith(line, "Unevictable")) {
			snprintf(lbuf, 100, "Unevictable: %8lu kB\n", unevictable);
			printme = lbuf;
		} else if (startswith(line, "SReclaimable")) {
			snprintf(lbuf, 100, "SReclaimable: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "SUnreclaim")) {
			snprintf(lbuf, 100, "SUnreclaim: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "Shmem:")) {
			snprintf(lbuf, 100, "Shmem: %8lu kB\n", shmem);
			printme = lbuf;
		} else if (startswith(line, "ShmemHugePages")) {
			snprintf(lbuf, 100, "ShmemHugePages: %8lu kB\n", 0UL);
			printme = lbuf;
		} else if (startswith(line, "ShmemPmdMapped")) {
			snprintf(lbuf, 100, "ShmemPmdMapped: %8lu kB\n", 0UL);
			printme = lbuf;
		} else
			printme = line;

		l = snprintf(cache, cache_size, "%s", printme);
		if (l < 0) {
			perror("Error writing to cache");
			rv = 0;
			goto err;

		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cg);
	free(memusage_str);
	free(memswlimit_str);
	free(memswusage_str);
	free(memstat_str);
	return rv;
}
3695
/*
 * Fetch the "cpuset.cpus" value for cgroup @cg.
 * On success the result is a newly allocated string the caller must free;
 * on failure NULL is returned.
 */
char *get_cpuset(const char *cg)
{
	char *value = NULL;

	if (!cgfs_get_value("cpuset", cg, "cpuset.cpus", &value))
		return NULL;

	return value;
}
3708
3709 bool cpu_in_cpuset(int cpu, const char *cpuset);
3710
/* Return true when @line is a "processor : N" line and CPU N is a member
 * of @cpuset. Non-processor lines are never considered members. */
static bool cpuline_in_cpuset(const char *line, const char *cpuset)
{
	int cpu;

	if (sscanf(line, "processor : %d", &cpu) == 1)
		return cpu_in_cpuset(cpu, cpuset);

	return false;
}
3719
/*
 * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or
 * `cpu.cfs_period_us`, depending on `param` ("quota" or "period").
 * The parsed parameter value is returned through `value`.
 *
 * Returns true on success, false when the cgroup file cannot be read
 * or its contents cannot be parsed.
 */
static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value)
{
	bool rv = false;
	char file[11 + 6 + 1]; /* cpu.cfs__us + quota/period + \0 */
	char *str = NULL;

	/* Bounded write; "quota"/"period" both fit, but don't rely on it. */
	snprintf(file, sizeof(file), "cpu.cfs_%s_us", param);

	if (!cgfs_get_value("cpu", cg, file, &str))
		goto err;

	/* "%ld" does not match int64_t on 32-bit targets (UB per C11
	 * fscanf); SCNd64 from <inttypes.h> is the portable specifier. */
	if (sscanf(str, "%" SCNd64, value) != 1)
		goto err;

	rv = true;

err:
	free(str); /* free(NULL) is a no-op */
	return rv;
}
3745
/*
 * Return the maximum number of visible CPUs based on CPU quotas and the
 * cgroup's cpuset (whichever is smaller). If there is no quota set, the
 * cpuset size is used; zero is returned when neither is available.
 */
int max_cpu_count(const char *cg)
{
	int rv, nprocs;
	int64_t cfs_quota, cfs_period;
	int nr_cpus_in_cpuset = 0;
	char *cpuset = NULL;

	if (!read_cpu_cfs_param(cg, "quota", &cfs_quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &cfs_period))
		return 0;

	/* Only the CPU count is needed from the cpuset string; free it right
	 * away — the previous version leaked it on every return path. */
	cpuset = get_cpuset(cg);
	if (cpuset)
		nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset);
	free(cpuset);

	if (cfs_quota <= 0 || cfs_period <= 0) {
		/* No CFS quota configured: fall back to the cpuset size. */
		if (nr_cpus_in_cpuset > 0)
			return nr_cpus_in_cpuset;

		return 0;
	}

	rv = cfs_quota / cfs_period;

	/* In case quota/period does not yield a whole number, add one CPU for
	 * the remainder.
	 */
	if ((cfs_quota % cfs_period) > 0)
		rv += 1;

	/* Never report more CPUs than the host actually has. */
	nprocs = get_nprocs();

	if (rv > nprocs)
		rv = nprocs;

	/* use min value in cpu quota and cpuset */
	if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv)
		rv = nr_cpus_in_cpuset;

	return rv;
}
3793
/*
 * Return the exact (possibly fractional) number of visible CPUs based on
 * CPU quotas, capped at the host CPU count.
 * If there is no quota set, zero is returned.
 */
static double exact_cpu_count(const char *cg)
{
	int64_t quota, period;
	double count;
	int online;

	if (!read_cpu_cfs_param(cg, "quota", &quota))
		return 0;

	if (!read_cpu_cfs_param(cg, "period", &period))
		return 0;

	if (quota <= 0 || period <= 0)
		return 0;

	count = (double)quota / (double)period;

	/* Never exceed the number of CPUs the host has. */
	online = get_nprocs();
	if (count > online)
		count = online;

	return count;
}
3822
/*
 * Determine whether CPU views should be used: both the "cpu" and the
 * "cpuacct" cgroup controllers must be mounted.
 */
bool use_cpuview(const char *cg)
{
	int cfd;

	if (!find_mounted_controller("cpu", &cfd))
		return false;

	if (!find_mounted_controller("cpuacct", &cfd))
		return false;

	return true;
}
3841
/*
 * Check whether @line is a "^processor : N" line from /proc/cpuinfo.
 */
static bool is_processor_line(const char *line)
{
	int unused;

	return sscanf(line, "processor : %d", &unused) == 1;
}
3853
/*
 * FUSE read handler for the container view of /proc/cpuinfo.
 *
 * Filters the host's /proc/cpuinfo down to the CPUs that are in the
 * reader's cpuset — and, when CPU views are enabled, within the CFS
 * quota limit — renumbering the visible processors from 0. Falls back
 * to the unfiltered host file when the reader's cgroup cannot be
 * determined. The rendered text is cached in d->buf so that subsequent
 * reads at a non-zero offset are served straight from the cache.
 *
 * Returns the number of bytes copied into @buf, 0 on internal error,
 * or -EINVAL for an out-of-range offset.
 */
static int proc_cpuinfo_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *cpuset = NULL;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	bool am_printing = false, firstline = true, is_s390x = false;
	int curcpu = -1, cpu, max_cpus = 0;
	bool use_view;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	FILE *f = NULL;

	/* Continuation read: serve the remainder of the cached render. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the reader's pid-namespace init; fall back to the caller's
	 * own pid when it lives in the host's (shared) pid namespace. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpuset");
	if (!cg)
		return read_file("proc/cpuinfo", buf, size, d);
	prune_init_slice(cg);

	cpuset = get_cpuset(cg);
	if (!cpuset)
		goto err;

	use_view = use_cpuview(cg);

	if (use_view)
		max_cpus = max_cpu_count(cg);

	f = fopen("/proc/cpuinfo", "r");
	if (!f)
		goto err;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		if (firstline) {
			firstline = false;
			/* s390x has a completely different /proc/cpuinfo
			 * layout: a header block followed by "processor N:"
			 * lines, handled by the dedicated branch below. */
			if (strstr(line, "IBM/S390") != NULL) {
				is_s390x = true;
				am_printing = true;
				continue;
			}
		}
		/* NOTE(review): compares only the first 12 chars, i.e.
		 * "# processors" without the trailing colon — presumably so
		 * it also matches "# processors    : N"; confirm intent. */
		if (strncmp(line, "# processors:", 12) == 0)
			continue;
		if (is_processor_line(line)) {
			/* Stop once the quota-derived CPU limit is reached. */
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			am_printing = cpuline_in_cpuset(line, cpuset);
			if (am_printing) {
				curcpu ++;
				/* Renumber visible CPUs from 0. */
				l = snprintf(cache, cache_size, "processor : %d\n", curcpu);
				if (l < 0) {
					perror("Error writing to cache");
					rv = 0;
					goto err;
				}
				if (l >= cache_size) {
					lxcfs_error("%s\n", "Internal error: truncated write to cache.");
					rv = 0;
					goto err;
				}
				cache += l;
				cache_size -= l;
				total_len += l;
			}
			continue;
		} else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) {
			/* s390x per-CPU line: keep everything after the colon
			 * but renumber the CPU. */
			char *p;
			if (use_view && max_cpus > 0 && (curcpu+1) == max_cpus)
				break;
			if (!cpu_in_cpuset(cpu, cpuset))
				continue;
			curcpu ++;
			p = strchr(line, ':');
			if (!p || !*p)
				goto err;
			p++;
			l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
			continue;

		}
		/* Non-processor line: copy through while inside a visible
		 * CPU's stanza. */
		if (am_printing) {
			l = snprintf(cache, cache_size, "%s", line);
			if (l < 0) {
				perror("Error writing to cache");
				rv = 0;
				goto err;
			}
			if (l >= cache_size) {
				lxcfs_error("%s\n", "Internal error: truncated write to cache.");
				rv = 0;
				goto err;
			}
			cache += l;
			cache_size -= l;
			total_len += l;
		}
	}

	if (is_s390x) {
		/* Prepend the s390x header (vendor_id + processor count) by
		 * rendering into a fresh buffer and re-appending the body
		 * built above. */
		char *origcache = d->buf;
		ssize_t l;
		/* NOTE(review): spins until malloc succeeds. */
		do {
			d->buf = malloc(d->buflen);
		} while (!d->buf);
		cache = d->buf;
		cache_size = d->buflen;
		total_len = 0;
		l = snprintf(cache, cache_size, "vendor_id       : IBM/S390\n");
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "# processors    : %d\n", curcpu + 1);
		if (l < 0 || l >= cache_size) {
			free(origcache);
			goto err;
		}
		cache_size -= l;
		cache += l;
		total_len += l;
		l = snprintf(cache, cache_size, "%s", origcache);
		free(origcache);
		if (l < 0 || l >= cache_size)
			goto err;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;

	/* read from off 0 */
	memcpy(buf, d->buf, total_len);
	rv = total_len;
err:
	if (f)
		fclose(f);
	free(line);
	free(cpuset);
	free(cg);
	return rv;
}
4029
/*
 * Return the start time — field (22) of /proc/<pid>/stat, measured in
 * clock ticks since boot — of the registered init/reaper of @pid's pid
 * namespace.
 *
 * Returns 0 and sets errno to EINVAL on failure; callers must check
 * errno, since 0 can in principle also be a valid start time (errno is
 * cleared on success).
 */
static uint64_t get_reaper_start_time(pid_t pid)
{
	int ret;
	FILE *f;
	uint64_t starttime;
	/* strlen("/proc/") = 6
	 * +
	 * LXCFS_NUMSTRLEN64
	 * +
	 * strlen("/stat") = 5
	 * +
	 * \0 = 1
	 * */
#define __PROC_PID_STAT_LEN (6 + LXCFS_NUMSTRLEN64 + 5 + 1)
	char path[__PROC_PID_STAT_LEN];
	pid_t qpid;

	qpid = lookup_initpid_in_store(pid);
	if (qpid <= 0) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	ret = snprintf(path, __PROC_PID_STAT_LEN, "/proc/%d/stat", qpid);
	if (ret < 0 || ret >= __PROC_PID_STAT_LEN) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	f = fopen(path, "r");
	if (!f) {
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	/* Note that the *scanf() argument supression requires that length
	 * modifiers such as "l" are omitted. Otherwise some compilers will yell
	 * at us. It's like telling someone you're not married and then asking
	 * if you can bring your wife to the party.
	 */
	ret = fscanf(f, "%*d " /* (1) pid %d */
			"%*s " /* (2) comm %s */
			"%*c " /* (3) state %c */
			"%*d " /* (4) ppid %d */
			"%*d " /* (5) pgrp %d */
			"%*d " /* (6) session %d */
			"%*d " /* (7) tty_nr %d */
			"%*d " /* (8) tpgid %d */
			"%*u " /* (9) flags %u */
			"%*u " /* (10) minflt %lu */
			"%*u " /* (11) cminflt %lu */
			"%*u " /* (12) majflt %lu */
			"%*u " /* (13) cmajflt %lu */
			"%*u " /* (14) utime %lu */
			"%*u " /* (15) stime %lu */
			"%*d " /* (16) cutime %ld */
			"%*d " /* (17) cstime %ld */
			"%*d " /* (18) priority %ld */
			"%*d " /* (19) nice %ld */
			"%*d " /* (20) num_threads %ld */
			"%*d " /* (21) itrealvalue %ld */
			"%" PRIu64, /* (22) starttime %llu */
		     &starttime);
	if (ret != 1) {
		fclose(f);
		/* Caller can check for EINVAL on 0. */
		errno = EINVAL;
		return 0;
	}

	fclose(f);

	errno = 0;
	return starttime;
}
4108
4109 static double get_reaper_start_time_in_sec(pid_t pid)
4110 {
4111 uint64_t clockticks, ticks_per_sec;
4112 int64_t ret;
4113 double res = 0;
4114
4115 clockticks = get_reaper_start_time(pid);
4116 if (clockticks == 0 && errno == EINVAL) {
4117 lxcfs_debug("failed to retrieve start time of pid %d\n", pid);
4118 return 0;
4119 }
4120
4121 ret = sysconf(_SC_CLK_TCK);
4122 if (ret < 0 && errno == EINVAL) {
4123 lxcfs_debug(
4124 "%s\n",
4125 "failed to determine number of clock ticks in a second");
4126 return 0;
4127 }
4128
4129 ticks_per_sec = (uint64_t)ret;
4130 res = (double)clockticks / ticks_per_sec;
4131 return res;
4132 }
4133
4134 static double get_reaper_age(pid_t pid)
4135 {
4136 uint64_t uptime_ms;
4137 double procstart, procage;
4138
4139 /* We need to substract the time the process has started since system
4140 * boot minus the time when the system has started to get the actual
4141 * reaper age.
4142 */
4143 procstart = get_reaper_start_time_in_sec(pid);
4144 procage = procstart;
4145 if (procstart > 0) {
4146 int ret;
4147 struct timespec spec;
4148
4149 ret = clock_gettime(CLOCK_BOOTTIME, &spec);
4150 if (ret < 0)
4151 return 0;
4152
4153 /* We could make this more precise here by using the tv_nsec
4154 * field in the timespec struct and convert it to milliseconds
4155 * and then create a double for the seconds and milliseconds but
4156 * that seems more work than it is worth.
4157 */
4158 uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6);
4159 procage = (uptime_ms - (procstart * 1000)) / 1000;
4160 }
4161
4162 return procage;
4163 }
4164
/*
 * Read per-CPU user/system time (converted to USER_HZ ticks) for cgroup
 * @cg from cpuacct.usage_all, falling back to cpuacct.usage_percpu
 * (user time only) when usage_all is unavailable.
 *
 * Returns 0 on success.
 * It is the caller's responsibility to free `return_usage`, unless this
 * function returns an error.
 *
 * NOTE(review): the @cpuset parameter is unused here.
 */
static int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size)
{
	int cpucount = get_nprocs_conf();
	struct cpuacct_usage *cpu_usage;
	int rv = 0, i, j, ret;
	int cg_cpu;
	uint64_t cg_user, cg_system;
	int64_t ticks_per_sec;
	char *usage_str = NULL;

	ticks_per_sec = sysconf(_SC_CLK_TCK);

	if (ticks_per_sec < 0 && errno == EINVAL) {
		lxcfs_v(
			"%s\n",
			"read_cpuacct_usage_all failed to determine number of clock ticks "
			"in a second");
		return -1;
	}

	/* NOTE(review): error returns mix -1 and -ENOMEM; callers appear to
	 * only test for non-zero, but confirm before relying on the value. */
	cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount);
	if (!cpu_usage)
		return -ENOMEM;

	memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount);
	if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_all", &usage_str)) {
		// read cpuacct.usage_percpu instead
		lxcfs_v("failed to read cpuacct.usage_all. reading cpuacct.usage_percpu instead\n%s", "");
		if (!cgfs_get_value("cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) {
			rv = -1;
			goto err;
		}
		lxcfs_v("usage_str: %s\n", usage_str);

		// convert cpuacct.usage_percpu into cpuacct.usage_all
		lxcfs_v("converting cpuacct.usage_percpu into cpuacct.usage_all\n%s", "");

		char *data = NULL;
		size_t sz = 0, asz = 0;

		/* Synthesize the usage_all header so the parser below can
		 * consume either format; system time is reported as 0. */
		must_strcat(&data, &sz, &asz, "cpu user system\n");

		/* NOTE(review): this `i` shadows the outer loop variable, and
		 * "%lu" is paired with a uint64_t — that mismatches on 32-bit
		 * targets; confirm/portability-check. */
		int i = 0, read_pos = 0, read_cnt=0;
		while (sscanf(usage_str + read_pos, "%lu %n", &cg_user, &read_cnt) > 0) {
			lxcfs_debug("i: %d, cg_user: %lu, read_pos: %d, read_cnt: %d\n", i, cg_user, read_pos, read_cnt);
			must_strcat(&data, &sz, &asz, "%d %lu 0\n", i, cg_user);
			i++;
			read_pos += read_cnt;
		}

		free(usage_str);
		usage_str = data;

		lxcfs_v("usage_str: %s\n", usage_str);
	}

	/* Skip the "cpu user system" header line. NOTE(review): with only a
	 * %n conversion, sscanf() returns 0 whether or not the literal text
	 * matched, so this check only catches an empty string (EOF). */
	int read_pos = 0, read_cnt=0;
	if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) {
		lxcfs_error("read_cpuacct_usage_all reading first line from "
				"%s/cpuacct.usage_all failed.\n", cg);
		rv = -1;
		goto err;
	}

	read_pos += read_cnt;

	/* Parse one "<cpu> <user> <system>" line per configured CPU. */
	for (i = 0, j = 0; i < cpucount; i++) {
		ret = sscanf(usage_str + read_pos, "%d %lu %lu\n%n", &cg_cpu, &cg_user,
				&cg_system, &read_cnt);

		if (ret == EOF)
			break;

		if (ret != 3) {
			lxcfs_error("read_cpuacct_usage_all reading from %s/cpuacct.usage_all "
					"failed.\n", cg);
			rv = -1;
			goto err;
		}

		read_pos += read_cnt;

		/* Convert the time from nanoseconds to USER_HZ */
		cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec;
		cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec;
		j++;
	}

	rv = 0;
	*return_usage = cpu_usage;
	*size = cpucount;

err:
	if (usage_str)
		free(usage_str);

	if (rv != 0) {
		free(cpu_usage);
		*return_usage = NULL;
	}

	return rv;
}
4273
4274 static unsigned long diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count)
4275 {
4276 int i;
4277 unsigned long sum = 0;
4278
4279 for (i = 0; i < cpu_count; i++) {
4280 if (!newer[i].online)
4281 continue;
4282
4283 /* When cpuset is changed on the fly, the CPUs might get reordered.
4284 * We could either reset all counters, or check that the substractions
4285 * below will return expected results.
4286 */
4287 if (newer[i].user > older[i].user)
4288 diff[i].user = newer[i].user - older[i].user;
4289 else
4290 diff[i].user = 0;
4291
4292 if (newer[i].system > older[i].system)
4293 diff[i].system = newer[i].system - older[i].system;
4294 else
4295 diff[i].system = 0;
4296
4297 if (newer[i].idle > older[i].idle)
4298 diff[i].idle = newer[i].idle - older[i].idle;
4299 else
4300 diff[i].idle = 0;
4301
4302 sum += diff[i].user;
4303 sum += diff[i].system;
4304 sum += diff[i].idle;
4305 }
4306
4307 return sum;
4308 }
4309
4310 static void add_cpu_usage(unsigned long *surplus, struct cpuacct_usage *usage, unsigned long *counter, unsigned long threshold)
4311 {
4312 unsigned long free_space, to_add;
4313
4314 free_space = threshold - usage->user - usage->system;
4315
4316 if (free_space > usage->idle)
4317 free_space = usage->idle;
4318
4319 to_add = free_space > *surplus ? *surplus : free_space;
4320
4321 *counter += to_add;
4322 usage->idle -= to_add;
4323 *surplus -= to_add;
4324 }
4325
4326 static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node)
4327 {
4328 struct cg_proc_stat *first = NULL, *prev, *tmp;
4329
4330 for (prev = NULL; node; ) {
4331 if (!cgfs_param_exist("cpu", node->cg, "cpu.shares")) {
4332 tmp = node;
4333 lxcfs_debug("Removing stat node for %s\n", node->cg);
4334
4335 if (prev)
4336 prev->next = node->next;
4337 else
4338 first = node->next;
4339
4340 node = node->next;
4341 free_proc_stat_node(tmp);
4342 } else {
4343 if (!first)
4344 first = node;
4345 prev = node;
4346 node = node->next;
4347 }
4348 }
4349
4350 return first;
4351 }
4352
4353 #define PROC_STAT_PRUNE_INTERVAL 10
4354 static void prune_proc_stat_history(void)
4355 {
4356 int i;
4357 time_t now = time(NULL);
4358
4359 for (i = 0; i < CPUVIEW_HASH_SIZE; i++) {
4360 pthread_rwlock_wrlock(&proc_stat_history[i]->lock);
4361
4362 if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) {
4363 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4364 return;
4365 }
4366
4367 if (proc_stat_history[i]->next) {
4368 proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next);
4369 proc_stat_history[i]->lastcheck = now;
4370 }
4371
4372 pthread_rwlock_unlock(&proc_stat_history[i]->lock);
4373 }
4374 }
4375
4376 static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg)
4377 {
4378 struct cg_proc_stat *node;
4379
4380 pthread_rwlock_rdlock(&head->lock);
4381
4382 if (!head->next) {
4383 pthread_rwlock_unlock(&head->lock);
4384 return NULL;
4385 }
4386
4387 node = head->next;
4388
4389 do {
4390 if (strcmp(cg, node->cg) == 0)
4391 goto out;
4392 } while ((node = node->next));
4393
4394 node = NULL;
4395
4396 out:
4397 pthread_rwlock_unlock(&head->lock);
4398 prune_proc_stat_history();
4399 return node;
4400 }
4401
4402 static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4403 {
4404 struct cg_proc_stat *node;
4405 int i;
4406
4407 node = malloc(sizeof(struct cg_proc_stat));
4408 if (!node)
4409 goto err;
4410
4411 node->cg = NULL;
4412 node->usage = NULL;
4413 node->view = NULL;
4414
4415 node->cg = malloc(strlen(cg) + 1);
4416 if (!node->cg)
4417 goto err;
4418
4419 strcpy(node->cg, cg);
4420
4421 node->usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4422 if (!node->usage)
4423 goto err;
4424
4425 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4426
4427 node->view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4428 if (!node->view)
4429 goto err;
4430
4431 node->cpu_count = cpu_count;
4432 node->next = NULL;
4433
4434 if (pthread_mutex_init(&node->lock, NULL) != 0) {
4435 lxcfs_error("%s\n", "Failed to initialize node lock");
4436 goto err;
4437 }
4438
4439 for (i = 0; i < cpu_count; i++) {
4440 node->view[i].user = 0;
4441 node->view[i].system = 0;
4442 node->view[i].idle = 0;
4443 }
4444
4445 return node;
4446
4447 err:
4448 if (node && node->cg)
4449 free(node->cg);
4450 if (node && node->usage)
4451 free(node->usage);
4452 if (node && node->view)
4453 free(node->view);
4454 if (node)
4455 free(node);
4456
4457 return NULL;
4458 }
4459
4460 static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node)
4461 {
4462 int hash = calc_hash(new_node->cg) % CPUVIEW_HASH_SIZE;
4463 struct cg_proc_stat_head *head = proc_stat_history[hash];
4464 struct cg_proc_stat *node, *rv = new_node;
4465
4466 pthread_rwlock_wrlock(&head->lock);
4467
4468 if (!head->next) {
4469 head->next = new_node;
4470 goto out;
4471 }
4472
4473 node = head->next;
4474
4475 for (;;) {
4476 if (strcmp(node->cg, new_node->cg) == 0) {
4477 /* The node is already present, return it */
4478 free_proc_stat_node(new_node);
4479 rv = node;
4480 goto out;
4481 }
4482
4483 if (node->next) {
4484 node = node->next;
4485 continue;
4486 }
4487
4488 node->next = new_node;
4489 goto out;
4490 }
4491
4492 out:
4493 pthread_rwlock_unlock(&head->lock);
4494 return rv;
4495 }
4496
4497 static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count)
4498 {
4499 struct cpuacct_usage *new_usage, *new_view;
4500 int i;
4501
4502 /* Allocate new memory */
4503 new_usage = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4504 if (!new_usage)
4505 return false;
4506
4507 new_view = malloc(sizeof(struct cpuacct_usage) * cpu_count);
4508 if (!new_view) {
4509 free(new_usage);
4510 return false;
4511 }
4512
4513 /* Copy existing data & initialize new elements */
4514 for (i = 0; i < cpu_count; i++) {
4515 if (i < node->cpu_count) {
4516 new_usage[i].user = node->usage[i].user;
4517 new_usage[i].system = node->usage[i].system;
4518 new_usage[i].idle = node->usage[i].idle;
4519
4520 new_view[i].user = node->view[i].user;
4521 new_view[i].system = node->view[i].system;
4522 new_view[i].idle = node->view[i].idle;
4523 } else {
4524 new_usage[i].user = 0;
4525 new_usage[i].system = 0;
4526 new_usage[i].idle = 0;
4527
4528 new_view[i].user = 0;
4529 new_view[i].system = 0;
4530 new_view[i].idle = 0;
4531 }
4532 }
4533
4534 free(node->usage);
4535 free(node->view);
4536
4537 node->usage = new_usage;
4538 node->view = new_view;
4539 node->cpu_count = cpu_count;
4540
4541 return true;
4542 }
4543
4544 static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg)
4545 {
4546 int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE;
4547 struct cg_proc_stat_head *head = proc_stat_history[hash];
4548 struct cg_proc_stat *node;
4549
4550 node = find_proc_stat_node(head, cg);
4551
4552 if (!node) {
4553 node = new_proc_stat_node(usage, cpu_count, cg);
4554 if (!node)
4555 return NULL;
4556
4557 node = add_proc_stat_node(node);
4558 lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg);
4559 }
4560
4561 pthread_mutex_lock(&node->lock);
4562
4563 /* If additional CPUs on the host have been enabled, CPU usage counter
4564 * arrays have to be expanded */
4565 if (node->cpu_count < cpu_count) {
4566 lxcfs_debug("Expanding stat node %d->%d for %s\n",
4567 node->cpu_count, cpu_count, cg);
4568
4569 if (!expand_proc_stat_node(node, cpu_count)) {
4570 pthread_mutex_unlock(&node->lock);
4571 lxcfs_debug("Unable to expand stat node %d->%d for %s\n",
4572 node->cpu_count, cpu_count, cg);
4573 return NULL;
4574 }
4575 }
4576
4577 return node;
4578 }
4579
4580 static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count)
4581 {
4582 int i;
4583
4584 lxcfs_debug("Resetting stat node for %s\n", node->cg);
4585 memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count);
4586
4587 for (i = 0; i < cpu_count; i++) {
4588 node->view[i].user = 0;
4589 node->view[i].system = 0;
4590 node->view[i].idle = 0;
4591 }
4592
4593 node->cpu_count = cpu_count;
4594 }
4595
4596 static int cpuview_proc_stat(const char *cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size)
4597 {
4598 char *line = NULL;
4599 size_t linelen = 0, total_len = 0, rv = 0, l;
4600 int curcpu = -1; /* cpu numbering starts at 0 */
4601 int physcpu, i;
4602 int max_cpus = max_cpu_count(cg), cpu_cnt = 0;
4603 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4604 unsigned long user_sum = 0, system_sum = 0, idle_sum = 0;
4605 unsigned long user_surplus = 0, system_surplus = 0;
4606 unsigned long total_sum, threshold;
4607 struct cg_proc_stat *stat_node;
4608 struct cpuacct_usage *diff = NULL;
4609 int nprocs = get_nprocs_conf();
4610
4611 if (cg_cpu_usage_size < nprocs)
4612 nprocs = cg_cpu_usage_size;
4613
4614 /* Read all CPU stats and stop when we've encountered other lines */
4615 while (getline(&line, &linelen, f) != -1) {
4616 int ret;
4617 char cpu_char[10]; /* That's a lot of cores */
4618 uint64_t all_used, cg_used;
4619
4620 if (strlen(line) == 0)
4621 continue;
4622 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
4623 /* not a ^cpuN line containing a number N */
4624 break;
4625 }
4626
4627 if (sscanf(cpu_char, "%d", &physcpu) != 1)
4628 continue;
4629
4630 if (physcpu >= cg_cpu_usage_size)
4631 continue;
4632
4633 curcpu ++;
4634 cpu_cnt ++;
4635
4636 if (!cpu_in_cpuset(physcpu, cpuset)) {
4637 for (i = curcpu; i <= physcpu; i++) {
4638 cg_cpu_usage[i].online = false;
4639 }
4640 continue;
4641 }
4642
4643 if (curcpu < physcpu) {
4644 /* Some CPUs may be disabled */
4645 for (i = curcpu; i < physcpu; i++)
4646 cg_cpu_usage[i].online = false;
4647
4648 curcpu = physcpu;
4649 }
4650
4651 cg_cpu_usage[curcpu].online = true;
4652
4653 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
4654 &user,
4655 &nice,
4656 &system,
4657 &idle,
4658 &iowait,
4659 &irq,
4660 &softirq,
4661 &steal,
4662 &guest,
4663 &guest_nice);
4664
4665 if (ret != 10)
4666 continue;
4667
4668 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
4669 cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system;
4670
4671 if (all_used >= cg_used) {
4672 cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used);
4673
4674 } else {
4675 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
4676 "%lu in cpuacct.usage_all; unable to determine idle time\n",
4677 curcpu, cg, all_used, cg_used);
4678 cg_cpu_usage[curcpu].idle = idle;
4679 }
4680 }
4681
4682 /* Cannot use more CPUs than is available due to cpuset */
4683 if (max_cpus > cpu_cnt)
4684 max_cpus = cpu_cnt;
4685
4686 stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg);
4687
4688 if (!stat_node) {
4689 lxcfs_error("unable to find/create stat node for %s\n", cg);
4690 rv = 0;
4691 goto err;
4692 }
4693
4694 diff = malloc(sizeof(struct cpuacct_usage) * nprocs);
4695 if (!diff) {
4696 rv = 0;
4697 goto err;
4698 }
4699
4700 /*
4701 * If the new values are LOWER than values stored in memory, it means
4702 * the cgroup has been reset/recreated and we should reset too.
4703 */
4704 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4705 if (!cg_cpu_usage[curcpu].online)
4706 continue;
4707
4708 if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user)
4709 reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs);
4710
4711 break;
4712 }
4713
4714 total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs);
4715
4716 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4717 stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online;
4718
4719 if (!stat_node->usage[curcpu].online)
4720 continue;
4721
4722 i++;
4723
4724 stat_node->usage[curcpu].user += diff[curcpu].user;
4725 stat_node->usage[curcpu].system += diff[curcpu].system;
4726 stat_node->usage[curcpu].idle += diff[curcpu].idle;
4727
4728 if (max_cpus > 0 && i >= max_cpus) {
4729 user_surplus += diff[curcpu].user;
4730 system_surplus += diff[curcpu].system;
4731 }
4732 }
4733
4734 /* Calculate usage counters of visible CPUs */
4735 if (max_cpus > 0) {
4736 /* threshold = maximum usage per cpu, including idle */
4737 threshold = total_sum / cpu_cnt * max_cpus;
4738
4739 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4740 if (!stat_node->usage[curcpu].online)
4741 continue;
4742
4743 i++;
4744
4745 if (i == max_cpus)
4746 break;
4747
4748 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4749 continue;
4750
4751 /* Add user */
4752 add_cpu_usage(
4753 &user_surplus,
4754 &diff[curcpu],
4755 &diff[curcpu].user,
4756 threshold);
4757
4758 if (diff[curcpu].user + diff[curcpu].system >= threshold)
4759 continue;
4760
4761 /* If there is still room, add system */
4762 add_cpu_usage(
4763 &system_surplus,
4764 &diff[curcpu],
4765 &diff[curcpu].system,
4766 threshold);
4767 }
4768
4769 if (user_surplus > 0)
4770 lxcfs_debug("leftover user: %lu for %s\n", user_surplus, cg);
4771 if (system_surplus > 0)
4772 lxcfs_debug("leftover system: %lu for %s\n", system_surplus, cg);
4773
4774 unsigned long diff_user = 0;
4775 unsigned long diff_system = 0;
4776 unsigned long diff_idle = 0;
4777 unsigned long max_diff_idle = 0;
4778 unsigned long max_diff_idle_index = 0;
4779 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4780 if (!stat_node->usage[curcpu].online)
4781 continue;
4782
4783 i++;
4784
4785 if (i == max_cpus)
4786 break;
4787
4788 stat_node->view[curcpu].user += diff[curcpu].user;
4789 stat_node->view[curcpu].system += diff[curcpu].system;
4790 stat_node->view[curcpu].idle += diff[curcpu].idle;
4791
4792 user_sum += stat_node->view[curcpu].user;
4793 system_sum += stat_node->view[curcpu].system;
4794 idle_sum += stat_node->view[curcpu].idle;
4795
4796 diff_user += diff[curcpu].user;
4797 diff_system += diff[curcpu].system;
4798 diff_idle += diff[curcpu].idle;
4799 if (diff[curcpu].idle > max_diff_idle) {
4800 max_diff_idle = diff[curcpu].idle;
4801 max_diff_idle_index = curcpu;
4802 }
4803
4804 lxcfs_v("curcpu: %d, diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle);
4805 }
4806 lxcfs_v("total. diff_user: %lu, diff_system: %lu, diff_idle: %lu\n", diff_user, diff_system, diff_idle);
4807
4808 // revise cpu usage view to support partial cpu case
4809 double exact_cpus = exact_cpu_count(cg);
4810 if (exact_cpus < (double)max_cpus){
4811 lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus);
4812 unsigned long delta = (unsigned long)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus));
4813 lxcfs_v("delta: %lu\n", delta);
4814 lxcfs_v("idle_sum before: %lu\n", idle_sum);
4815 idle_sum = idle_sum > delta ? idle_sum - delta : 0;
4816 lxcfs_v("idle_sum after: %lu\n", idle_sum);
4817
4818 curcpu = max_diff_idle_index;
4819 lxcfs_v("curcpu: %d, idle before: %lu\n", curcpu, stat_node->view[curcpu].idle);
4820 stat_node->view[curcpu].idle = stat_node->view[curcpu].idle > delta ? stat_node->view[curcpu].idle - delta : 0;
4821 lxcfs_v("curcpu: %d, idle after: %lu\n", curcpu, stat_node->view[curcpu].idle);
4822 }
4823 } else {
4824 for (curcpu = 0; curcpu < nprocs; curcpu++) {
4825 if (!stat_node->usage[curcpu].online)
4826 continue;
4827
4828 stat_node->view[curcpu].user = stat_node->usage[curcpu].user;
4829 stat_node->view[curcpu].system = stat_node->usage[curcpu].system;
4830 stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle;
4831
4832 user_sum += stat_node->view[curcpu].user;
4833 system_sum += stat_node->view[curcpu].system;
4834 idle_sum += stat_node->view[curcpu].idle;
4835 }
4836 }
4837
4838 /* Render the file */
4839 /* cpu-all */
4840 l = snprintf(buf, buf_size, "cpu %lu 0 %lu %lu 0 0 0 0 0 0\n",
4841 user_sum,
4842 system_sum,
4843 idle_sum);
4844 lxcfs_v("cpu-all: %s\n", buf);
4845
4846 if (l < 0) {
4847 perror("Error writing to cache");
4848 rv = 0;
4849 goto err;
4850 }
4851 if (l >= buf_size) {
4852 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4853 rv = 0;
4854 goto err;
4855 }
4856
4857 buf += l;
4858 buf_size -= l;
4859 total_len += l;
4860
4861 /* Render visible CPUs */
4862 for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) {
4863 if (!stat_node->usage[curcpu].online)
4864 continue;
4865
4866 i++;
4867
4868 if (max_cpus > 0 && i == max_cpus)
4869 break;
4870
4871 l = snprintf(buf, buf_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
4872 i,
4873 stat_node->view[curcpu].user,
4874 stat_node->view[curcpu].system,
4875 stat_node->view[curcpu].idle);
4876 lxcfs_v("cpu: %s\n", buf);
4877
4878 if (l < 0) {
4879 perror("Error writing to cache");
4880 rv = 0;
4881 goto err;
4882
4883 }
4884 if (l >= buf_size) {
4885 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4886 rv = 0;
4887 goto err;
4888 }
4889
4890 buf += l;
4891 buf_size -= l;
4892 total_len += l;
4893 }
4894
4895 /* Pass the rest of /proc/stat, start with the last line read */
4896 l = snprintf(buf, buf_size, "%s", line);
4897
4898 if (l < 0) {
4899 perror("Error writing to cache");
4900 rv = 0;
4901 goto err;
4902
4903 }
4904 if (l >= buf_size) {
4905 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4906 rv = 0;
4907 goto err;
4908 }
4909
4910 buf += l;
4911 buf_size -= l;
4912 total_len += l;
4913
4914 /* Pass the rest of the host's /proc/stat */
4915 while (getline(&line, &linelen, f) != -1) {
4916 l = snprintf(buf, buf_size, "%s", line);
4917 if (l < 0) {
4918 perror("Error writing to cache");
4919 rv = 0;
4920 goto err;
4921 }
4922 if (l >= buf_size) {
4923 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
4924 rv = 0;
4925 goto err;
4926 }
4927 buf += l;
4928 buf_size -= l;
4929 total_len += l;
4930 }
4931
4932 rv = total_len;
4933
4934 err:
4935 if (stat_node)
4936 pthread_mutex_unlock(&stat_node->lock);
4937 if (line)
4938 free(line);
4939 if (diff)
4940 free(diff);
4941 return rv;
4942 }
4943
4944 #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2)
4945 static int proc_stat_read(char *buf, size_t size, off_t offset,
4946 struct fuse_file_info *fi)
4947 {
4948 struct fuse_context *fc = fuse_get_context();
4949 struct file_info *d = (struct file_info *)fi->fh;
4950 char *cg;
4951 char *cpuset = NULL;
4952 char *line = NULL;
4953 size_t linelen = 0, total_len = 0, rv = 0;
4954 int curcpu = -1; /* cpu numbering starts at 0 */
4955 int physcpu = 0;
4956 unsigned long user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0;
4957 unsigned long user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 0,
4958 irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0;
4959 char cpuall[CPUALL_MAX_SIZE];
4960 /* reserve for cpu all */
4961 char *cache = d->buf + CPUALL_MAX_SIZE;
4962 size_t cache_size = d->buflen - CPUALL_MAX_SIZE;
4963 FILE *f = NULL;
4964 struct cpuacct_usage *cg_cpu_usage = NULL;
4965 int cg_cpu_usage_size = 0;
4966
4967 if (offset){
4968 if (offset > d->size)
4969 return -EINVAL;
4970 if (!d->cached)
4971 return 0;
4972 int left = d->size - offset;
4973 total_len = left > size ? size: left;
4974 memcpy(buf, d->buf + offset, total_len);
4975 return total_len;
4976 }
4977
4978 pid_t initpid = lookup_initpid_in_store(fc->pid);
4979 lxcfs_v("initpid: %d\n", initpid);
4980 if (initpid <= 0)
4981 initpid = fc->pid;
4982
4983 /*
4984 * when container run with host pid namespace initpid == 1, cgroup will "/"
4985 * we should return host os's /proc contents.
4986 * in some case cpuacct_usage.all in "/" will larger then /proc/stat
4987 */
4988 if (initpid == 1) {
4989 return read_file("/proc/stat", buf, size, d);
4990 }
4991
4992 cg = get_pid_cgroup(initpid, "cpuset");
4993 lxcfs_v("cg: %s\n", cg);
4994 if (!cg)
4995 return read_file("/proc/stat", buf, size, d);
4996 prune_init_slice(cg);
4997
4998 cpuset = get_cpuset(cg);
4999 if (!cpuset)
5000 goto err;
5001
5002 /*
5003 * Read cpuacct.usage_all for all CPUs.
5004 * If the cpuacct cgroup is present, it is used to calculate the container's
5005 * CPU usage. If not, values from the host's /proc/stat are used.
5006 */
5007 if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) != 0) {
5008 lxcfs_v("%s\n", "proc_stat_read failed to read from cpuacct, "
5009 "falling back to the host's /proc/stat");
5010 }
5011
5012 f = fopen("/proc/stat", "r");
5013 if (!f)
5014 goto err;
5015
5016 //skip first line
5017 if (getline(&line, &linelen, f) < 0) {
5018 lxcfs_error("%s\n", "proc_stat_read read first line failed.");
5019 goto err;
5020 }
5021
5022 if (use_cpuview(cg) && cg_cpu_usage) {
5023 total_len = cpuview_proc_stat(cg, cpuset, cg_cpu_usage, cg_cpu_usage_size,
5024 f, d->buf, d->buflen);
5025 goto out;
5026 }
5027
5028 while (getline(&line, &linelen, f) != -1) {
5029 ssize_t l;
5030 char cpu_char[10]; /* That's a lot of cores */
5031 char *c;
5032 uint64_t all_used, cg_used, new_idle;
5033 int ret;
5034
5035 if (strlen(line) == 0)
5036 continue;
5037 if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) {
5038 /* not a ^cpuN line containing a number N, just print it */
5039 l = snprintf(cache, cache_size, "%s", line);
5040 if (l < 0) {
5041 perror("Error writing to cache");
5042 rv = 0;
5043 goto err;
5044 }
5045 if (l >= cache_size) {
5046 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5047 rv = 0;
5048 goto err;
5049 }
5050 cache += l;
5051 cache_size -= l;
5052 total_len += l;
5053 continue;
5054 }
5055
5056 if (sscanf(cpu_char, "%d", &physcpu) != 1)
5057 continue;
5058 if (!cpu_in_cpuset(physcpu, cpuset))
5059 continue;
5060 curcpu ++;
5061
5062 ret = sscanf(line, "%*s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
5063 &user,
5064 &nice,
5065 &system,
5066 &idle,
5067 &iowait,
5068 &irq,
5069 &softirq,
5070 &steal,
5071 &guest,
5072 &guest_nice);
5073
5074 if (ret != 10 || !cg_cpu_usage) {
5075 c = strchr(line, ' ');
5076 if (!c)
5077 continue;
5078 l = snprintf(cache, cache_size, "cpu%d%s", curcpu, c);
5079 if (l < 0) {
5080 perror("Error writing to cache");
5081 rv = 0;
5082 goto err;
5083
5084 }
5085 if (l >= cache_size) {
5086 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5087 rv = 0;
5088 goto err;
5089 }
5090
5091 cache += l;
5092 cache_size -= l;
5093 total_len += l;
5094
5095 if (ret != 10)
5096 continue;
5097 }
5098
5099 if (cg_cpu_usage) {
5100 if (physcpu >= cg_cpu_usage_size)
5101 break;
5102
5103 all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice;
5104 cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system;
5105
5106 if (all_used >= cg_used) {
5107 new_idle = idle + (all_used - cg_used);
5108
5109 } else {
5110 lxcfs_error("cpu%d from %s has unexpected cpu time: %lu in /proc/stat, "
5111 "%lu in cpuacct.usage_all; unable to determine idle time\n",
5112 curcpu, cg, all_used, cg_used);
5113 new_idle = idle;
5114 }
5115
5116 l = snprintf(cache, cache_size, "cpu%d %lu 0 %lu %lu 0 0 0 0 0 0\n",
5117 curcpu, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system,
5118 new_idle);
5119
5120 if (l < 0) {
5121 perror("Error writing to cache");
5122 rv = 0;
5123 goto err;
5124
5125 }
5126 if (l >= cache_size) {
5127 lxcfs_error("%s\n", "Internal error: truncated write to cache.");
5128 rv = 0;
5129 goto err;
5130 }
5131
5132 cache += l;
5133 cache_size -= l;
5134 total_len += l;
5135
5136 user_sum += cg_cpu_usage[physcpu].user;
5137 system_sum += cg_cpu_usage[physcpu].system;
5138 idle_sum += new_idle;
5139
5140 } else {
5141 user_sum += user;
5142 nice_sum += nice;
5143 system_sum += system;
5144 idle_sum += idle;
5145 iowait_sum += iowait;
5146 irq_sum += irq;
5147 softirq_sum += softirq;
5148 steal_sum += steal;
5149 guest_sum += guest;
5150 guest_nice_sum += guest_nice;
5151 }
5152 }
5153
5154 cache = d->buf;
5155
5156 int cpuall_len = snprintf(cpuall, CPUALL_MAX_SIZE, "cpu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
5157 user_sum,
5158 nice_sum,
5159 system_sum,
5160 idle_sum,
5161 iowait_sum,
5162 irq_sum,
5163 softirq_sum,
5164 steal_sum,
5165 guest_sum,
5166 guest_nice_sum);
5167 if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) {
5168 memcpy(cache, cpuall, cpuall_len);
5169 cache += cpuall_len;
5170 } else {
5171 /* shouldn't happen */
5172 lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d.", cpuall_len);
5173 cpuall_len = 0;
5174 }
5175
5176 memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len);
5177 total_len += cpuall_len;
5178
5179 out:
5180 d->cached = 1;
5181 d->size = total_len;
5182 if (total_len > size)
5183 total_len = size;
5184
5185 memcpy(buf, d->buf, total_len);
5186 rv = total_len;
5187
5188 err:
5189 if (f)
5190 fclose(f);
5191 if (cg_cpu_usage)
5192 free(cg_cpu_usage);
5193 free(line);
5194 free(cpuset);
5195 free(cg);
5196 return rv;
5197 }
5198
/* This function retrieves the busy time of a group of tasks by looking at
 * cpuacct.usage. Unfortunately, this only makes sense when the container has
 * been given its own cpuacct cgroup. If not, this function will take the busy
 * time of all other tasks that do not actually belong to the container into
 * account as well. If someone has a clever solution for this please send a
 * patch!
 */
/*
 * Return the reaper's cgroup cpuacct.usage, converted from nanoseconds to
 * seconds.  Returns 0 when the reaper cannot be resolved or the cpuacct
 * cgroup value cannot be read.
 */
static double get_reaper_busy(pid_t task)
{
	char *cgroup = NULL, *usage_str = NULL;
	double res = 0;
	pid_t initpid = lookup_initpid_in_store(task);

	if (initpid <= 0)
		return 0;

	cgroup = get_pid_cgroup(initpid, "cpuacct");
	if (!cgroup)
		goto out;
	prune_init_slice(cgroup);

	if (!cgfs_get_value("cpuacct", cgroup, "cpuacct.usage", &usage_str))
		goto out;

	/* cpuacct.usage is reported in nanoseconds. */
	res = (double)strtoul(usage_str, NULL, 10) / 1000000000;

out:
	free(usage_str);
	free(cgroup);
	return res;
}
5230
#if RELOADTEST
/*
 * Drop a marker file so the reload test can detect that this library was
 * (re)loaded.  Best effort: failure to create the file is ignored.
 */
void iwashere(void)
{
	int fd = open("/tmp/lxcfs-iwashere", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd >= 0)
		close(fd);
}
#endif
5241
5242 /*
5243 * We read /proc/uptime and reuse its second field.
5244 * For the first field, we use the mtime for the reaper for
5245 * the calling pid as returned by getreaperage
5246 */
5247 static int proc_uptime_read(char *buf, size_t size, off_t offset,
5248 struct fuse_file_info *fi)
5249 {
5250 struct fuse_context *fc = fuse_get_context();
5251 struct file_info *d = (struct file_info *)fi->fh;
5252 double busytime = get_reaper_busy(fc->pid);
5253 char *cache = d->buf;
5254 ssize_t total_len = 0;
5255 double idletime, reaperage;
5256
5257 #if RELOADTEST
5258 iwashere();
5259 #endif
5260
5261 if (offset){
5262 if (!d->cached)
5263 return 0;
5264 if (offset > d->size)
5265 return -EINVAL;
5266 int left = d->size - offset;
5267 total_len = left > size ? size: left;
5268 memcpy(buf, cache + offset, total_len);
5269 return total_len;
5270 }
5271
5272 reaperage = get_reaper_age(fc->pid);
5273 /* To understand why this is done, please read the comment to the
5274 * get_reaper_busy() function.
5275 */
5276 idletime = reaperage;
5277 if (reaperage >= busytime)
5278 idletime = reaperage - busytime;
5279
5280 total_len = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime);
5281 if (total_len < 0 || total_len >= d->buflen){
5282 lxcfs_error("%s\n", "failed to write to cache");
5283 return 0;
5284 }
5285
5286 d->size = (int)total_len;
5287 d->cached = 1;
5288
5289 if (total_len > size) total_len = size;
5290
5291 memcpy(buf, d->buf, total_len);
5292 return total_len;
5293 }
5294
/*
 * Synthesize /proc/diskstats for the requesting container from the blkio
 * cgroup's recursive io statistics.  Fields the cgroup cannot provide
 * (in-flight ios, weighted ticks) are reported as 0.  The rendered file is
 * cached in d->buf for continuation reads.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	char dev_name[72];
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	char *cg;
	char *io_serviced_str = NULL, *io_merged_str = NULL, *io_service_bytes_str = NULL,
			*io_wait_time_str = NULL, *io_service_time_str = NULL;
	unsigned long read = 0, write = 0;
	unsigned long read_merged = 0, write_merged = 0;
	unsigned long read_sectors = 0, write_sectors = 0;
	unsigned long read_ticks = 0, write_ticks = 0;
	unsigned long ios_pgr = 0, tot_ticks = 0, rq_ticks = 0;
	unsigned long rd_svctm = 0, wr_svctm = 0, rd_wait = 0, wr_wait = 0;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	char *line = NULL;
	size_t linelen = 0, total_len = 0, rv = 0;
	unsigned int major = 0, minor = 0;
	int i = 0;
	FILE *f = NULL;

	/* Continuation of an earlier read: serve it from the cache. */
	if (offset){
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}

	/* Resolve the caller's pid-namespace init; fall back to the host file
	 * when the container shares the host's pid namespace. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/* Pull the recursive blkio statistics once; each one is a multi-line
	 * "major:minor op value" text blob parsed per device below. */
	if (!cgfs_get_value("blkio", cg, "blkio.io_serviced_recursive", &io_serviced_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_merged_recursive", &io_merged_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_bytes_recursive", &io_service_bytes_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_wait_time_recursive", &io_wait_time_str))
		goto err;
	if (!cgfs_get_value("blkio", cg, "blkio.io_service_time_recursive", &io_service_time_str))
		goto err;


	f = fopen("/proc/diskstats", "r");
	if (!f)
		goto err;

	/* Walk the host's device list but substitute the cgroup's counters. */
	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		i = sscanf(line, "%u %u %71s", &major, &minor, dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, major, minor, "Read", &read);
		get_blkio_io_value(io_serviced_str, major, minor, "Write", &write);
		get_blkio_io_value(io_merged_str, major, minor, "Read", &read_merged);
		get_blkio_io_value(io_merged_str, major, minor, "Write", &write_merged);
		/* io_service_bytes is in bytes; diskstats wants 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, major, minor, "Read", &read_sectors);
		read_sectors = read_sectors/512;
		get_blkio_io_value(io_service_bytes_str, major, minor, "Write", &write_sectors);
		write_sectors = write_sectors/512;

		/* Times are in nanoseconds; diskstats reports milliseconds. */
		get_blkio_io_value(io_service_time_str, major, minor, "Read", &rd_svctm);
		rd_svctm = rd_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Read", &rd_wait);
		rd_wait = rd_wait/1000000;
		read_ticks = rd_svctm + rd_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Write", &wr_svctm);
		wr_svctm = wr_svctm/1000000;
		get_blkio_io_value(io_wait_time_str, major, minor, "Write", &wr_wait);
		wr_wait = wr_wait/1000000;
		write_ticks = wr_svctm + wr_wait;

		get_blkio_io_value(io_service_time_str, major, minor, "Total", &tot_ticks);
		tot_ticks = tot_ticks/1000000;

		/* Devices the container never touched are omitted entirely. */
		memset(lbuf, 0, 256);
		if (read || write || read_merged || write_merged || read_sectors || write_sectors || read_ticks || write_ticks)
			snprintf(lbuf, 256, "%u %u %s %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
				major, minor, dev_name, read, read_merged, read_sectors, read_ticks,
				write, write_merged, write_sectors, write_ticks, ios_pgr, tot_ticks, rq_ticks);
		else
			continue;

		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0) {
			perror("Error writing to fuse buf");
			rv = 0;
			goto err;
		}
		if (l >= cache_size) {
			lxcfs_error("%s\n", "Internal error: truncated write to cache.");
			rv = 0;
			goto err;
		}
		cache += l;
		cache_size -= l;
		total_len += l;
	}

	d->cached = 1;
	d->size = total_len;
	if (total_len > size ) total_len = size;
	memcpy(buf, d->buf, total_len);

	rv = total_len;
err:
	free(cg);
	if (f)
		fclose(f);
	free(line);
	free(io_serviced_str);
	free(io_merged_str);
	free(io_service_bytes_str);
	free(io_wait_time_str);
	free(io_service_time_str);
	return rv;
}
5427
5428 static int proc_swaps_read(char *buf, size_t size, off_t offset,
5429 struct fuse_file_info *fi)
5430 {
5431 struct fuse_context *fc = fuse_get_context();
5432 struct file_info *d = (struct file_info *)fi->fh;
5433 char *cg = NULL;
5434 char *memswlimit_str = NULL, *memlimit_str = NULL, *memusage_str = NULL, *memswusage_str = NULL;
5435 unsigned long memswlimit = 0, memlimit = 0, memusage = 0, memswusage = 0, swap_total = 0, swap_free = 0;
5436 ssize_t total_len = 0, rv = 0;
5437 ssize_t l = 0;
5438 char *cache = d->buf;
5439
5440 if (offset) {
5441 if (offset > d->size)
5442 return -EINVAL;
5443 if (!d->cached)
5444 return 0;
5445 int left = d->size - offset;
5446 total_len = left > size ? size: left;
5447 memcpy(buf, cache + offset, total_len);
5448 return total_len;
5449 }
5450
5451 pid_t initpid = lookup_initpid_in_store(fc->pid);
5452 if (initpid <= 1 || is_shared_pidns(initpid))
5453 initpid = fc->pid;
5454 cg = get_pid_cgroup(initpid, "memory");
5455 if (!cg)
5456 return read_file("/proc/swaps", buf, size, d);
5457 prune_init_slice(cg);
5458
5459 memlimit = get_min_memlimit(cg, "memory.limit_in_bytes");
5460
5461 if (!cgfs_get_value("memory", cg, "memory.usage_in_bytes", &memusage_str))
5462 goto err;
5463
5464 memusage = strtoul(memusage_str, NULL, 10);
5465
5466 if (cgfs_get_value("memory", cg, "memory.memsw.usage_in_bytes", &memswusage_str) &&
5467 cgfs_get_value("memory", cg, "memory.memsw.limit_in_bytes", &memswlimit_str)) {
5468
5469 memswlimit = get_min_memlimit(cg, "memory.memsw.limit_in_bytes");
5470 memswusage = strtoul(memswusage_str, NULL, 10);
5471
5472 swap_total = (memswlimit - memlimit) / 1024;
5473 swap_free = (memswusage - memusage) / 1024;
5474 }
5475
5476 total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
5477
5478 /* When no mem + swap limit is specified or swapaccount=0*/
5479 if (!memswlimit) {
5480 char *line = NULL;
5481 size_t linelen = 0;
5482 FILE *f = fopen("/proc/meminfo", "r");
5483
5484 if (!f)
5485 goto err;
5486
5487 while (getline(&line, &linelen, f) != -1) {
5488 if (startswith(line, "SwapTotal:")) {
5489 sscanf(line, "SwapTotal: %8lu kB", &swap_total);
5490 } else if (startswith(line, "SwapFree:")) {
5491 sscanf(line, "SwapFree: %8lu kB", &swap_free);
5492 }
5493 }
5494
5495 free(line);
5496 fclose(f);
5497 }
5498
5499 if (swap_total > 0) {
5500 l = snprintf(d->buf + total_len, d->size - total_len,
5501 "none%*svirtual\t\t%lu\t%lu\t0\n", 36, " ",
5502 swap_total, swap_free);
5503 total_len += l;
5504 }
5505
5506 if (total_len < 0 || l < 0) {
5507 perror("Error writing to cache");
5508 rv = 0;
5509 goto err;
5510 }
5511
5512 d->cached = 1;
5513 d->size = (int)total_len;
5514
5515 if (total_len > size) total_len = size;
5516 memcpy(buf, d->buf, total_len);
5517 rv = total_len;
5518
5519 err:
5520 free(cg);
5521 free(memswlimit_str);
5522 free(memlimit_str);
5523 free(memusage_str);
5524 free(memswusage_str);
5525 return rv;
5526 }
5527 /*
5528 * Find the process pid from cgroup path.
5529 * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid.
5530 * @pid_buf : put pid to pid_buf.
5531 * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ...
5532 * @depth : the depth of cgroup in container.
5533 * @sum : return the number of pid.
5534 * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu
5535 */
/* Recursively collect the pids listed in dpath's cgroup.procs files (and
 * those of child cgroups up to @depth levels down) into *pid_buf. */
static int calc_pid(char ***pid_buf, char *dpath, int depth, int sum, int cfd)
{
	DIR *dir;
	int fd;
	struct dirent *file;
	FILE *f = NULL;
	size_t linelen = 0;
	char *line = NULL;
	int pd;
	char *path_dir, *path;
	char **pid;

	/* path = dpath + "/cgroup.procs" + /0 */
	/* Busy-wait on OOM rather than fail: project-wide allocation style. */
	do {
		path = malloc(strlen(dpath) + 20);
	} while (!path);

	strcpy(path, dpath);
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	/* fdopendir() takes ownership of fd; closedir() below releases it. */
	dir = fdopendir(fd);
	if (dir == NULL) {
		close(fd);
		goto out;
	}

	/* Recurse into child cgroup directories first. */
	while (((file = readdir(dir)) != NULL) && depth > 0) {
		/* NOTE(review): the "." prefix check below also matches ".."
		 * (and any hidden entry), so the second check never fires. */
		if (strncmp(file->d_name, ".", 1) == 0)
			continue;
		if (strncmp(file->d_name, "..", 1) == 0)
			continue;
		if (file->d_type == DT_DIR) {
			/* path + '/' + d_name +/0 */
			do {
				path_dir = malloc(strlen(path) + 2 + sizeof(file->d_name));
			} while (!path_dir);
			strcpy(path_dir, path);
			strcat(path_dir, "/");
			strcat(path_dir, file->d_name);
			pd = depth - 1;
			sum = calc_pid(pid_buf, path_dir, pd, sum, cfd);
			free(path_dir);
		}
	}
	closedir(dir);

	/* Now read this level's own cgroup.procs. */
	strcat(path, "/cgroup.procs");
	fd = openat(cfd, path, O_RDONLY);
	if (fd < 0)
		goto out;

	f = fdopen(fd, "r");
	if (!f) {
		close(fd);
		goto out;
	}

	/* Append one heap-allocated pid string (newline included) per line. */
	while (getline(&line, &linelen, f) != -1) {
		do {
			pid = realloc(*pid_buf, sizeof(char *) * (sum + 1));
		} while (!pid);
		*pid_buf = pid;
		do {
			*(*pid_buf + sum) = malloc(strlen(line) + 1);
		} while (*(*pid_buf + sum) == NULL);
		strcpy(*(*pid_buf + sum), line);
		sum++;
	}
	fclose(f);
out:
	if (line)
		free(line);
	free(path);
	return sum;
}
5613 /*
5614 * calc_load calculates the load according to the following formula:
5615 * load1 = load0 * exp + active * (1 - exp)
5616 *
5617 * @load1: the new loadavg.
5618 * @load0: the former loadavg.
5619 * @active: the total number of running pid at this moment.
5620 * @exp: the fixed-point defined in the beginning.
5621 */
5622 static unsigned long
5623 calc_load(unsigned long load, unsigned long exp, unsigned long active)
5624 {
5625 unsigned long newload;
5626
5627 active = active > 0 ? active * FIXED_1 : 0;
5628 newload = load * exp + active * (FIXED_1 - exp);
5629 if (active >= load)
5630 newload += FIXED_1 - 1;
5631
5632 return newload / FIXED_1;
5633 }
5634
5635 /*
5636 * Return 0 means that container p->cg is closed.
5637 * Return -1 means that error occurred in refresh.
5638 * Positive num equals the total number of pid.
5639 */
5640 static int refresh_load(struct load_node *p, char *path)
5641 {
5642 FILE *f = NULL;
5643 char **idbuf;
5644 char proc_path[256];
5645 int i, ret, run_pid = 0, total_pid = 0, last_pid = 0;
5646 char *line = NULL;
5647 size_t linelen = 0;
5648 int sum, length;
5649 DIR *dp;
5650 struct dirent *file;
5651
5652 do {
5653 idbuf = malloc(sizeof(char *));
5654 } while (!idbuf);
5655 sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd);
5656 /* normal exit */
5657 if (sum == 0)
5658 goto out;
5659
5660 for (i = 0; i < sum; i++) {
5661 /*clean up '\n' */
5662 length = strlen(idbuf[i])-1;
5663 idbuf[i][length] = '\0';
5664 ret = snprintf(proc_path, 256, "/proc/%s/task", idbuf[i]);
5665 if (ret < 0 || ret > 255) {
5666 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5667 i = sum;
5668 sum = -1;
5669 goto err_out;
5670 }
5671
5672 dp = opendir(proc_path);
5673 if (!dp) {
5674 lxcfs_error("%s\n", "Open proc_path failed in refresh_load.");
5675 continue;
5676 }
5677 while ((file = readdir(dp)) != NULL) {
5678 if (strncmp(file->d_name, ".", 1) == 0)
5679 continue;
5680 if (strncmp(file->d_name, "..", 1) == 0)
5681 continue;
5682 total_pid++;
5683 /* We make the biggest pid become last_pid.*/
5684 ret = atof(file->d_name);
5685 last_pid = (ret > last_pid) ? ret : last_pid;
5686
5687 ret = snprintf(proc_path, 256, "/proc/%s/task/%s/status", idbuf[i], file->d_name);
5688 if (ret < 0 || ret > 255) {
5689 lxcfs_error("%s\n", "snprintf() failed in refresh_load.");
5690 i = sum;
5691 sum = -1;
5692 closedir(dp);
5693 goto err_out;
5694 }
5695 f = fopen(proc_path, "r");
5696 if (f != NULL) {
5697 while (getline(&line, &linelen, f) != -1) {
5698 /* Find State */
5699 if ((line[0] == 'S') && (line[1] == 't'))
5700 break;
5701 }
5702 if ((line[7] == 'R') || (line[7] == 'D'))
5703 run_pid++;
5704 fclose(f);
5705 }
5706 }
5707 closedir(dp);
5708 }
5709 /*Calculate the loadavg.*/
5710 p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid);
5711 p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid);
5712 p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid);
5713 p->run_pid = run_pid;
5714 p->total_pid = total_pid;
5715 p->last_pid = last_pid;
5716
5717 free(line);
5718 err_out:
5719 for (; i > 0; i--)
5720 free(idbuf[i-1]);
5721 out:
5722 free(idbuf);
5723 return sum;
5724 }
5725 /*
5726 * Traverse the hash table and update it.
5727 */
/* Background thread body: every FLUSH_TIME seconds walk the whole loadavg
 * hash table and refresh (or delete) each tracked cgroup node. */
void *load_begin(void *arg)
{

	char *path = NULL;
	int i, sum, length, ret;
	struct load_node *f;
	int first_node;
	clock_t time1, time2;

	while (1) {
		/* loadavg_stop is set by stop_load_daemon() to request exit. */
		if (loadavg_stop == 1)
			return NULL;

		time1 = clock();
		for (i = 0; i < LOAD_SIZE; i++) {
			pthread_mutex_lock(&load_hash[i].lock);
			if (load_hash[i].next == NULL) {
				pthread_mutex_unlock(&load_hash[i].lock);
				continue;
			}
			f = load_hash[i].next;
			first_node = 1;
			while (f) {
				length = strlen(f->cg) + 2;
				do {
					/* strlen(f->cg) + '.' or '' + \0 */
					path = malloc(length);
				} while (!path);

				/* Prefix a '.' so absolute cgroup paths become
				 * relative to the cgroup mount fd. */
				ret = snprintf(path, length, "%s%s", *(f->cg) == '/' ? "." : "", f->cg);
				if (ret < 0 || ret > length - 1) {
					/* snprintf failed, ignore the node.*/
					lxcfs_error("Refresh node %s failed for snprintf().\n", f->cg);
					goto out;
				}
				/* refresh_load() == 0 means the cgroup is gone:
				 * drop the node.  Note the goto above jumps into
				 * this else branch so the failed node is simply
				 * skipped (f advanced, path still freed below). */
				sum = refresh_load(f, path);
				if (sum == 0) {
					f = del_node(f, i);
				} else {
out:					f = f->next;
				}
				free(path);
				/* load_hash[i].lock locks only on the first node.*/
				if (first_node == 1) {
					first_node = 0;
					pthread_mutex_unlock(&load_hash[i].lock);
				}
			}
		}

		if (loadavg_stop == 1)
			return NULL;

		/* Sleep for the remainder of the refresh period.  clock()
		 * measures CPU time, so this is only an approximation of the
		 * wall-clock time spent refreshing. */
		time2 = clock();
		usleep(FLUSH_TIME * 1000000 - (int)((time2 - time1) * 1000000 / CLOCKS_PER_SEC));
	}
}
5785
/* Synthesize /proc/loadavg for the requesting container from the per-cgroup
 * load node maintained by the load_begin() refresh thread.  Falls back to
 * the host file when loadavg virtualization is disabled or no cgroup can be
 * resolved. */
static int proc_loadavg_read(char *buf, size_t size, off_t offset,
		struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = (struct file_info *)fi->fh;
	pid_t initpid;
	char *cg;
	size_t total_len = 0;
	char *cache = d->buf;
	struct load_node *n;
	int hash;
	int cfd, rv = 0;
	unsigned long a, b, c;

	/* Continuation of an earlier read: serve it from the cache. */
	if (offset) {
		if (offset > d->size)
			return -EINVAL;
		if (!d->cached)
			return 0;
		int left = d->size - offset;
		total_len = left > size ? size : left;
		memcpy(buf, cache + offset, total_len);
		return total_len;
	}
	if (!loadavg)
		return read_file("/proc/loadavg", buf, size, d);

	initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	cg = get_pid_cgroup(initpid, "cpu");
	if (!cg)
		return read_file("/proc/loadavg", buf, size, d);

	prune_init_slice(cg);
	hash = calc_hash(cg) % LOAD_SIZE;
	/* locate_node() returns with load_hash[hash].rdlock held; it is
	 * released below (or on the error path) once we are done with n. */
	n = locate_node(cg, hash);

	/* First time */
	if (n == NULL) {
		if (!find_mounted_controller("cpu", &cfd)) {
			/*
			 * In locate_node() above, pthread_rwlock_unlock() isn't used
			 * because delete is not allowed before read has ended.
			 */
			pthread_rwlock_unlock(&load_hash[hash].rdlock);
			rv = 0;
			goto err;
		}
		/* Busy-wait on OOM rather than fail: project-wide style. */
		do {
			n = malloc(sizeof(struct load_node));
		} while (!n);

		do {
			n->cg = malloc(strlen(cg)+1);
		} while (!n->cg);
		strcpy(n->cg, cg);
		n->avenrun[0] = 0;
		n->avenrun[1] = 0;
		n->avenrun[2] = 0;
		n->run_pid = 0;
		n->total_pid = 1;
		n->last_pid = initpid;
		n->cfd = cfd;
		insert_node(&n, hash);
	}
	/* Round the fixed-point averages (+0.005) before formatting. */
	a = n->avenrun[0] + (FIXED_1/200);
	b = n->avenrun[1] + (FIXED_1/200);
	c = n->avenrun[2] + (FIXED_1/200);
	total_len = snprintf(d->buf, d->buflen, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
		LOAD_INT(a), LOAD_FRAC(a),
		LOAD_INT(b), LOAD_FRAC(b),
		LOAD_INT(c), LOAD_FRAC(c),
		n->run_pid, n->total_pid, n->last_pid);
	pthread_rwlock_unlock(&load_hash[hash].rdlock);
	if (total_len < 0 || total_len >= d->buflen) {
		lxcfs_error("%s\n", "Failed to write to cache");
		rv = 0;
		goto err;
	}
	d->size = (int)total_len;
	d->cached = 1;

	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);
	rv = total_len;

err:
	free(cg);
	return rv;
}
5878 /* Return a positive number on success, return 0 on failure.*/
5879 pthread_t load_daemon(int load_use)
5880 {
5881 int ret;
5882 pthread_t pid;
5883
5884 ret = init_load();
5885 if (ret == -1) {
5886 lxcfs_error("%s\n", "Initialize hash_table fails in load_daemon!");
5887 return 0;
5888 }
5889 ret = pthread_create(&pid, NULL, load_begin, NULL);
5890 if (ret != 0) {
5891 lxcfs_error("%s\n", "Create pthread fails in load_daemon!");
5892 load_free();
5893 return 0;
5894 }
5895 /* use loadavg, here loadavg = 1*/
5896 loadavg = load_use;
5897 return pid;
5898 }
5899
5900 /* Returns 0 on success. */
5901 int stop_load_daemon(pthread_t pid)
5902 {
5903 int s;
5904
5905 /* Signal the thread to gracefully stop */
5906 loadavg_stop = 1;
5907
5908 s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. */
5909 if (s != 0) {
5910 lxcfs_error("%s\n", "stop_load_daemon error: failed to join");
5911 return -1;
5912 }
5913
5914 load_free();
5915 loadavg_stop = 0;
5916
5917 return 0;
5918 }
5919
/* Return the total number of bytes in the file at @which, or 0 if it
 * cannot be opened.  Used to size the cache buffer for a /proc file. */
static off_t get_procfile_size(const char *which)
{
	char *line = NULL;
	size_t cap = 0;
	ssize_t nread;
	off_t total = 0;

	FILE *f = fopen(which, "r");
	if (!f)
		return 0;

	while ((nread = getline(&line, &cap, f)) != -1)
		total += nread;

	free(line);
	fclose(f);

	return total;
}
5936
5937 int proc_getattr(const char *path, struct stat *sb)
5938 {
5939 struct timespec now;
5940
5941 memset(sb, 0, sizeof(struct stat));
5942 if (clock_gettime(CLOCK_REALTIME, &now) < 0)
5943 return -EINVAL;
5944 sb->st_uid = sb->st_gid = 0;
5945 sb->st_atim = sb->st_mtim = sb->st_ctim = now;
5946 if (strcmp(path, "/proc") == 0) {
5947 sb->st_mode = S_IFDIR | 00555;
5948 sb->st_nlink = 2;
5949 return 0;
5950 }
5951 if (strcmp(path, "/proc/meminfo") == 0 ||
5952 strcmp(path, "/proc/cpuinfo") == 0 ||
5953 strcmp(path, "/proc/uptime") == 0 ||
5954 strcmp(path, "/proc/stat") == 0 ||
5955 strcmp(path, "/proc/diskstats") == 0 ||
5956 strcmp(path, "/proc/swaps") == 0 ||
5957 strcmp(path, "/proc/loadavg") == 0) {
5958 sb->st_size = 0;
5959 sb->st_mode = S_IFREG | 00444;
5960 sb->st_nlink = 1;
5961 return 0;
5962 }
5963
5964 return -ENOENT;
5965 }
5966
5967 int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset,
5968 struct fuse_file_info *fi)
5969 {
5970 if (filler(buf, ".", NULL, 0) != 0 ||
5971 filler(buf, "..", NULL, 0) != 0 ||
5972 filler(buf, "cpuinfo", NULL, 0) != 0 ||
5973 filler(buf, "meminfo", NULL, 0) != 0 ||
5974 filler(buf, "stat", NULL, 0) != 0 ||
5975 filler(buf, "uptime", NULL, 0) != 0 ||
5976 filler(buf, "diskstats", NULL, 0) != 0 ||
5977 filler(buf, "swaps", NULL, 0) != 0 ||
5978 filler(buf, "loadavg", NULL, 0) != 0)
5979 return -EINVAL;
5980 return 0;
5981 }
5982
5983 int proc_open(const char *path, struct fuse_file_info *fi)
5984 {
5985 int type = -1;
5986 struct file_info *info;
5987
5988 if (strcmp(path, "/proc/meminfo") == 0)
5989 type = LXC_TYPE_PROC_MEMINFO;
5990 else if (strcmp(path, "/proc/cpuinfo") == 0)
5991 type = LXC_TYPE_PROC_CPUINFO;
5992 else if (strcmp(path, "/proc/uptime") == 0)
5993 type = LXC_TYPE_PROC_UPTIME;
5994 else if (strcmp(path, "/proc/stat") == 0)
5995 type = LXC_TYPE_PROC_STAT;
5996 else if (strcmp(path, "/proc/diskstats") == 0)
5997 type = LXC_TYPE_PROC_DISKSTATS;
5998 else if (strcmp(path, "/proc/swaps") == 0)
5999 type = LXC_TYPE_PROC_SWAPS;
6000 else if (strcmp(path, "/proc/loadavg") == 0)
6001 type = LXC_TYPE_PROC_LOADAVG;
6002 if (type == -1)
6003 return -ENOENT;
6004
6005 info = malloc(sizeof(*info));
6006 if (!info)
6007 return -ENOMEM;
6008
6009 memset(info, 0, sizeof(*info));
6010 info->type = type;
6011
6012 info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE;
6013 do {
6014 info->buf = malloc(info->buflen);
6015 } while (!info->buf);
6016 memset(info->buf, 0, info->buflen);
6017 /* set actual size to buffer size */
6018 info->size = info->buflen;
6019
6020 fi->fh = (unsigned long)info;
6021 return 0;
6022 }
6023
/*
 * FUSE access handler for the /proc tree. /proc itself is granted when
 * the host's /proc is readable; every other entry is read-only, so any
 * write or execute bit in the mask is refused with -EACCES.
 */
int proc_access(const char *path, int mask)
{
	if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0)
		return 0;

	/* these are all read-only */
	return (mask & ~R_OK) ? -EACCES : 0;
}
6034
/*
 * FUSE release handler for the virtualized /proc files: delegates
 * cleanup of the per-open state stored in fi->fh by proc_open() to
 * do_release_file_info(). Always reports success.
 */
int proc_release(const char *path, struct fuse_file_info *fi)
{
	do_release_file_info(fi);
	return 0;
}
6040
6041 int proc_read(const char *path, char *buf, size_t size, off_t offset,
6042 struct fuse_file_info *fi)
6043 {
6044 struct file_info *f = (struct file_info *) fi->fh;
6045
6046 switch (f->type) {
6047 case LXC_TYPE_PROC_MEMINFO:
6048 return proc_meminfo_read(buf, size, offset, fi);
6049 case LXC_TYPE_PROC_CPUINFO:
6050 return proc_cpuinfo_read(buf, size, offset, fi);
6051 case LXC_TYPE_PROC_UPTIME:
6052 return proc_uptime_read(buf, size, offset, fi);
6053 case LXC_TYPE_PROC_STAT:
6054 return proc_stat_read(buf, size, offset, fi);
6055 case LXC_TYPE_PROC_DISKSTATS:
6056 return proc_diskstats_read(buf, size, offset, fi);
6057 case LXC_TYPE_PROC_SWAPS:
6058 return proc_swaps_read(buf, size, offset, fi);
6059 case LXC_TYPE_PROC_LOADAVG:
6060 return proc_loadavg_read(buf, size, offset, fi);
6061 default:
6062 return -EINVAL;
6063 }
6064 }
6065
6066 /*
6067 * Functions needed to setup cgroups in the __constructor__.
6068 */
6069
/*
 * Create directory @dir and any missing parents (like `mkdir -p`),
 * each with mode @mode.
 *
 * Walks the path one component at a time: strspn() skips slashes,
 * strcspn() finds the end of the next component, and each successively
 * longer prefix is duplicated and handed to mkdir(). A component that
 * already exists (EEXIST) is not treated as an error.
 *
 * Returns true on success, false on allocation or mkdir() failure.
 */
static bool mkdir_p(const char *dir, mode_t mode)
{
	const char *tmp = dir;
	const char *orig = dir;
	char *makeme;

	do {
		dir = tmp + strspn(tmp, "/");
		tmp = dir + strcspn(dir, "/");
		/* Duplicate the prefix up to the start of the current
		 * component so mkdir() gets a NUL-terminated path. */
		makeme = strndup(orig, dir - orig);
		if (!makeme)
			return false;
		if (mkdir(makeme, mode) && errno != EEXIST) {
			lxcfs_error("Failed to create directory '%s': %s.\n",
				makeme, strerror(errno));
			free(makeme);
			return false;
		}
		free(makeme);
	} while(tmp != dir);

	return true;
}
6093
6094 static bool umount_if_mounted(void)
6095 {
6096 if (umount2(BASEDIR, MNT_DETACH) < 0 && errno != EINVAL) {
6097 lxcfs_error("Failed to unmount %s: %s.\n", BASEDIR, strerror(errno));
6098 return false;
6099 }
6100 return true;
6101 }
6102
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;

/* Return true when the statfs() result carries the given fs magic. */
static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val)
{
	fs_type_magic wanted = (fs_type_magic)magic_val;

	return fs->f_type == wanted;
}
6109
/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 */
static bool is_on_ramfs(void)
{
	FILE *f;
	char *p, *p2;
	char *line = NULL;
	size_t len = 0;
	int i;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return false;

	while (getline(&line, &len, f) != -1) {
		/* Skip the first four space-separated mountinfo fields so
		 * that p lands on the space preceding the mount point. */
		for (p = line, i = 0; p && i < 4; i++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;
		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		/* NUL-terminate the mount point field for the comparison. */
		*p2 = '\0';
		if (strcmp(p + 1, "/") == 0) {
			// this is '/'. is it the ramfs?
			/* The "-" separator precedes the fs type; the literal
			 * "- rootfs rootfs " marks the initramfs root. */
			p = strchr(p2 + 1, '-');
			if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) {
				free(line);
				fclose(f);
				return true;
			}
		}
	}
	free(line);
	fclose(f);
	return false;
}
6152
6153 static int pivot_enter()
6154 {
6155 int ret = -1, oldroot = -1, newroot = -1;
6156
6157 oldroot = open("/", O_DIRECTORY | O_RDONLY);
6158 if (oldroot < 0) {
6159 lxcfs_error("%s\n", "Failed to open old root for fchdir.");
6160 return ret;
6161 }
6162
6163 newroot = open(ROOTDIR, O_DIRECTORY | O_RDONLY);
6164 if (newroot < 0) {
6165 lxcfs_error("%s\n", "Failed to open new root for fchdir.");
6166 goto err;
6167 }
6168
6169 /* change into new root fs */
6170 if (fchdir(newroot) < 0) {
6171 lxcfs_error("Failed to change directory to new rootfs: %s.\n", ROOTDIR);
6172 goto err;
6173 }
6174
6175 /* pivot_root into our new root fs */
6176 if (pivot_root(".", ".") < 0) {
6177 lxcfs_error("pivot_root() syscall failed: %s.\n", strerror(errno));
6178 goto err;
6179 }
6180
6181 /*
6182 * At this point the old-root is mounted on top of our new-root.
6183 * To unmounted it we must not be chdir'd into it, so escape back
6184 * to the old-root.
6185 */
6186 if (fchdir(oldroot) < 0) {
6187 lxcfs_error("%s\n", "Failed to enter old root.");
6188 goto err;
6189 }
6190
6191 if (umount2(".", MNT_DETACH) < 0) {
6192 lxcfs_error("%s\n", "Failed to detach old root.");
6193 goto err;
6194 }
6195
6196 if (fchdir(newroot) < 0) {
6197 lxcfs_error("%s\n", "Failed to re-enter new root.");
6198 goto err;
6199 }
6200
6201 ret = 0;
6202
6203 err:
6204 if (oldroot > 0)
6205 close(oldroot);
6206 if (newroot > 0)
6207 close(newroot);
6208
6209 return ret;
6210 }
6211
/*
 * Enter ROOTDIR via chroot() rather than pivot_root(). Used on
 * ramfs/initramfs roots (see permute_and_enter()). Recursively
 * bind-mounts ROOTDIR over / first.
 *
 * Returns 0 on success, -1 on failure (errors are logged).
 */
static int chroot_enter()
{
	if (mount(ROOTDIR, "/", NULL, MS_REC | MS_BIND, NULL)) {
		lxcfs_error("Failed to recursively bind-mount %s into /.", ROOTDIR);
		return -1;
	}

	/* NOTE(review): chroot(".") uses the current working directory;
	 * presumably the caller has positioned it appropriately — verify. */
	if (chroot(".") < 0) {
		lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno));
		return -1;
	}

	if (chdir("/") < 0) {
		lxcfs_error("Failed to change directory: %s.\n", strerror(errno));
		return -1;
	}

	return 0;
}
6231
6232 static int permute_and_enter(void)
6233 {
6234 struct statfs sb;
6235
6236 if (statfs("/", &sb) < 0) {
6237 lxcfs_error("%s\n", "Could not stat / mountpoint.");
6238 return -1;
6239 }
6240
6241 /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will
6242 * likely report TMPFS_MAGIC. Hence, when it reports no we still check
6243 * /proc/1/mountinfo. */
6244 if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs())
6245 return chroot_enter();
6246
6247 if (pivot_enter() < 0) {
6248 lxcfs_error("%s\n", "Could not perform pivot root.");
6249 return -1;
6250 }
6251
6252 return 0;
6253 }
6254
6255 /* Prepare our new clean root. */
6256 static int permute_prepare(void)
6257 {
6258 if (mkdir(ROOTDIR, 0700) < 0 && errno != EEXIST) {
6259 lxcfs_error("%s\n", "Failed to create directory for new root.");
6260 return -1;
6261 }
6262
6263 if (mount("/", ROOTDIR, NULL, MS_BIND, 0) < 0) {
6264 lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno));
6265 return -1;
6266 }
6267
6268 if (mount(RUNTIME_PATH, ROOTDIR RUNTIME_PATH, NULL, MS_BIND, 0) < 0) {
6269 lxcfs_error("Failed to bind-mount /run into new root: %s.\n", strerror(errno));
6270 return -1;
6271 }
6272
6273 if (mount(BASEDIR, ROOTDIR BASEDIR, NULL, MS_REC | MS_MOVE, 0) < 0) {
6274 printf("Failed to move " BASEDIR " into new root: %s.\n", strerror(errno));
6275 return -1;
6276 }
6277
6278 return 0;
6279 }
6280
/* Calls chroot() on ramfs, pivot_root() in all other cases. */
static bool permute_root(void)
{
	/* Prepare new root. */
	if (permute_prepare() < 0)
		return false;

	/* Pivot into new root. */
	return permute_and_enter() == 0;
}
6294
6295 static int preserve_mnt_ns(int pid)
6296 {
6297 int ret;
6298 size_t len = sizeof("/proc/") + 21 + sizeof("/ns/mnt");
6299 char path[len];
6300
6301 ret = snprintf(path, len, "/proc/%d/ns/mnt", pid);
6302 if (ret < 0 || (size_t)ret >= len)
6303 return -1;
6304
6305 return open(path, O_RDONLY | O_CLOEXEC);
6306 }
6307
/*
 * Prepare a private mount namespace with a tmpfs at BASEDIR where the
 * per-controller cgroup hierarchies will later be mounted.
 *
 * Side effects: unshares the mount namespace, stores an fd to it in
 * the global cgroup_mount_ns_fd, and remounts / MS_PRIVATE so the
 * subsequent mounts do not propagate to the host.
 *
 * Returns true on success, false on any failure (errors are logged).
 */
static bool cgfs_prepare_mounts(void)
{
	if (!mkdir_p(BASEDIR, 0700)) {
		lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint.");
		return false;
	}

	if (!umount_if_mounted()) {
		lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint.");
		return false;
	}

	if (unshare(CLONE_NEWNS) < 0) {
		lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno));
		return false;
	}

	/* Keep a handle on the new namespace so it can be entered later. */
	cgroup_mount_ns_fd = preserve_mnt_ns(getpid());
	if (cgroup_mount_ns_fd < 0) {
		lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno));
		return false;
	}

	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) {
		lxcfs_error("Failed to remount / private: %s.\n", strerror(errno));
		return false;
	}

	if (mount("tmpfs", BASEDIR, "tmpfs", 0, "size=100000,mode=700") < 0) {
		lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint.");
		return false;
	}

	return true;
}
6343
6344 static bool cgfs_mount_hierarchies(void)
6345 {
6346 char *target;
6347 size_t clen, len;
6348 int i, ret;
6349
6350 for (i = 0; i < num_hierarchies; i++) {
6351 char *controller = hierarchies[i];
6352
6353 clen = strlen(controller);
6354 len = strlen(BASEDIR) + clen + 2;
6355 target = malloc(len);
6356 if (!target)
6357 return false;
6358
6359 ret = snprintf(target, len, "%s/%s", BASEDIR, controller);
6360 if (ret < 0 || ret >= len) {
6361 free(target);
6362 return false;
6363 }
6364 if (mkdir(target, 0755) < 0 && errno != EEXIST) {
6365 free(target);
6366 return false;
6367 }
6368 if (!strcmp(controller, "unified"))
6369 ret = mount("none", target, "cgroup2", 0, NULL);
6370 else
6371 ret = mount(controller, target, "cgroup", 0, controller);
6372 if (ret < 0) {
6373 lxcfs_error("Failed mounting cgroup %s: %s\n", controller, strerror(errno));
6374 free(target);
6375 return false;
6376 }
6377
6378 fd_hierarchies[i] = open(target, O_DIRECTORY);
6379 if (fd_hierarchies[i] < 0) {
6380 free(target);
6381 return false;
6382 }
6383 free(target);
6384 }
6385 return true;
6386 }
6387
/*
 * Full cgroup setup sequence: prepare the private namespace and tmpfs,
 * mount every hierarchy, then pivot/chroot into the clean root.
 * Returns true only if all three stages succeed.
 */
static bool cgfs_setup_controllers(void)
{
	if (!cgfs_prepare_mounts())
		return false;

	if (!cgfs_mount_hierarchies()) {
		lxcfs_error("%s\n", "Failed to set up private lxcfs cgroup mounts.");
		return false;
	}

	return permute_root();
}
6403
/*
 * Library constructor: discover the cgroup hierarchies listed in
 * /proc/self/cgroup, record them via store_hierarchy(), set up private
 * cgroup mounts in a separate mount namespace (cgfs_setup_controllers),
 * switch back to the initial namespace, and initialize the CPU view.
 *
 * On any failure it logs and returns early, leaving lxcfs without
 * (some of) its cgroup state.
 */
static void __attribute__((constructor)) collect_and_mount_subsystems(void)
{
	FILE *f;
	char *cret, *line = NULL;
	char cwd[MAXPATHLEN];
	size_t len = 0;
	int i, init_ns = -1;
	bool found_unified = false;

	if ((f = fopen("/proc/self/cgroup", "r")) == NULL) {
		lxcfs_error("Error opening /proc/self/cgroup: %s\n", strerror(errno));
		return;
	}

	/* Each line has the form "id:controllers:path"; split out the
	 * controller list between the first and last ':'. */
	while (getline(&line, &len, f) != -1) {
		char *idx, *p, *p2;

		p = strchr(line, ':');
		if (!p)
			goto out;
		idx = line;
		*(p++) = '\0';

		p2 = strrchr(p, ':');
		if (!p2)
			goto out;
		*p2 = '\0';

		/* With cgroupv2 /proc/self/cgroup can contain entries of the
		 * form: 0::/ This will cause lxcfs to fail the cgroup mounts
		 * because it parses out the empty string "" and later on passes
		 * it to mount(). Let's skip such entries.
		 */
		if (!strcmp(p, "") && !strcmp(idx, "0") && !found_unified) {
			found_unified = true;
			p = "unified";
		}

		if (!store_hierarchy(line, p))
			goto out;
	}

	/* Preserve initial namespace. */
	init_ns = preserve_mnt_ns(getpid());
	if (init_ns < 0) {
		lxcfs_error("%s\n", "Failed to preserve initial mount namespace.");
		goto out;
	}

	fd_hierarchies = malloc(sizeof(int) * num_hierarchies);
	if (!fd_hierarchies) {
		lxcfs_error("%s\n", strerror(errno));
		goto out;
	}

	for (i = 0; i < num_hierarchies; i++)
		fd_hierarchies[i] = -1;

	/* Remember the cwd so we can restore it after the namespace dance. */
	cret = getcwd(cwd, MAXPATHLEN);
	if (!cret)
		lxcfs_debug("Could not retrieve current working directory: %s.\n", strerror(errno));

	/* This function calls unshare(CLONE_NEWNS) our initial mount namespace
	 * to privately mount lxcfs cgroups. */
	if (!cgfs_setup_controllers()) {
		lxcfs_error("%s\n", "Failed to setup private cgroup mounts for lxcfs.");
		goto out;
	}

	if (setns(init_ns, 0) < 0) {
		lxcfs_error("Failed to switch back to initial mount namespace: %s.\n", strerror(errno));
		goto out;
	}

	if (!cret || chdir(cwd) < 0)
		lxcfs_debug("Could not change back to original working directory: %s.\n", strerror(errno));

	if (!init_cpuview()) {
		lxcfs_error("%s\n", "failed to init CPU view");
		goto out;
	}

	print_subsystems();

out:
	free(line);
	fclose(f);
	if (init_ns >= 0)
		close(init_ns);
}
6494
6495 static void __attribute__((destructor)) free_subsystems(void)
6496 {
6497 int i;
6498
6499 lxcfs_debug("%s\n", "Running destructor for liblxcfs.");
6500
6501 for (i = 0; i < num_hierarchies; i++) {
6502 if (hierarchies[i])
6503 free(hierarchies[i]);
6504 if (fd_hierarchies && fd_hierarchies[i] >= 0)
6505 close(fd_hierarchies[i]);
6506 }
6507 free(hierarchies);
6508 free(fd_hierarchies);
6509 free_cpuview();
6510
6511 if (cgroup_mount_ns_fd >= 0)
6512 close(cgroup_mount_ns_fd);
6513 }